In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# Recode the 'readmitted' labels to a binary target:
# '<30' becomes 1, while '>30' and 'NO' both become 0.
readmitted_mapping = {'<30': 1, '>30': 0, 'NO': 0}

# Read the original dataset and the five df*-v datasets, recoding the
# 'readmitted' column of each frame as soon as it is loaded.
df_original, dft1, dft2, dft3, dft4, dft5 = (
    pd.read_csv(csv_path).assign(
        readmitted=lambda frame: frame['readmitted'].map(readmitted_mapping)
    )
    for csv_path in (
        'originaldf.csv',
        'df1-v.csv',
        'df2-v.csv',
        'df3-v.csv',
        'df4-v.csv',
        'df5-v.csv',
    )
)



In [17]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import xgboost as xgb

def encode_categorical_features(df, exclude_columns=None):
    """Encode object/category columns as integer codes or dummy columns.

    Every column selected by ``select_dtypes(include=['object', 'category'])``
    that is not excluded is handled according to its number of distinct
    non-null values:

    * 1 or 2 values: replaced in place by integer codes ("Binary Encoding").
      Codes are assigned in sorted order of the values' string form, so a
      given category receives the same code in every DataFrame regardless of
      row order. A missing-value marker, if present in the column, is placed
      after the sorted categories in the mapping.
    * 3 or more values: replaced by ``pd.get_dummies`` indicator columns,
      prefixed with the column name and appended at the end of the frame
      ("Dummy Encoding").
    * 0 values (column entirely missing): left untouched and not reported.

    Args:
        df (pd.DataFrame): Input DataFrame containing features to be encoded.
            It is copied; the caller's frame is not modified.
        exclude_columns (list): List of columns to exclude from encoding.

    Returns:
        pd.DataFrame: DataFrame with encoded features.
        dict: Dictionary mapping each categorical column name to the encoding
            type used ("Excluded", "Binary Encoding" or "Dummy Encoding").
    """
    encoded_df = df.copy()
    encoding_info = {}
    if exclude_columns is None:
        exclude_columns = []

    categorical_columns = encoded_df.select_dtypes(include=['object', 'category']).columns

    # Dummy frames are collected and concatenated once at the end instead of
    # re-copying the whole frame for every dummy-encoded column.
    dummy_columns = []
    dummy_frames = []

    for column in categorical_columns:
        if column in exclude_columns:
            encoding_info[column] = "Excluded"
            continue

        unique_values = encoded_df[column].nunique()
        if unique_values in (1, 2):
            observed = list(encoded_df[column].unique())
            # Sort the real categories so codes do not depend on which value
            # happens to appear first; key=str keeps mixed-type columns sortable.
            present = sorted((value for value in observed if not pd.isna(value)), key=str)
            missing = [value for value in observed if pd.isna(value)]
            value_mapping = {label: idx for idx, label in enumerate(present + missing)}
            encoded_df[column] = encoded_df[column].map(value_mapping)
            encoding_info[column] = "Binary Encoding"
        elif unique_values >= 3:
            dummy_frames.append(pd.get_dummies(encoded_df[column], prefix=column))
            dummy_columns.append(column)
            encoding_info[column] = "Dummy Encoding"

    if dummy_frames:
        # Same final layout as before: surviving columns in their original
        # order, followed by each dummy block in column-processing order.
        encoded_df = pd.concat(
            [encoded_df.drop(columns=dummy_columns)] + dummy_frames, axis=1
        )

    return encoded_df, encoding_info

def run_multivariable_logistic_regression(encoded_df, target_column):
    """
    Fits a Logistic Regression model on an 80/20 train/test split of the
    provided encoded DataFrame and scores it on the held-out 20%.

    Parameters:
    - encoded_df: The encoded DataFrame (feature columns plus the target column).
    - target_column: The name of the target column in the DataFrame.

    Returns:
    - dict: Dictionary with keys "model" (the fitted estimator), "accuracy"
      (model.score on the test split), "predictions" (model.predict on the
      test split) and "model_name".
      Returns None if any step raises; the error is printed, not propagated.
    """
    try:
        X = encoded_df.drop(target_column, axis=1)
        y = encoded_df[target_column]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # `multi_class` is deliberately not passed: the parameter is deprecated
        # in scikit-learn 1.5+, and once it is removed the resulting TypeError
        # would be swallowed by the except below, silently dropping this model.
        # With the default setting lbfgs still uses a multinomial loss when the
        # target has three or more classes.
        # NOTE(review): the notebook output shows lbfgs hitting max_iter on
        # these features; scaling the inputs is the usual remedy.
        model = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
        model.fit(X_train, y_train)

        accuracy = model.score(X_test, y_test)
        predictions = model.predict(X_test)

        return {
            "model": model,
            "accuracy": accuracy,
            "predictions": predictions,
            "model_name": "Multivariable Logistic Regression"
        }
    except Exception as e:
        # Best-effort by design: run_ml_models skips models that return None.
        print(f"Error running Multivariable Logistic Regression: {e}")
        return None

def run_random_forest(encoded_df, target_column):
    """
    Trains a Random Forest classifier on an 80/20 split of the encoded
    DataFrame and evaluates it on the held-out portion.

    Parameters:
    - encoded_df: The encoded DataFrame (features plus the target column).
    - target_column: The name of the target column in the DataFrame.

    Returns:
    - dict: Keys "model", "accuracy", "predictions" and "model_name".
      None when any step raises; the error message is printed instead.
    """
    outcome = None
    try:
        features = encoded_df.drop(target_column, axis=1)
        labels = encoded_df[target_column]

        # Fixed seed so the split is reproducible between runs.
        train_features, test_features, train_labels, test_labels = train_test_split(
            features, labels, test_size=0.2, random_state=42
        )

        forest = RandomForestClassifier(n_estimators=100, random_state=42)
        forest.fit(train_features, train_labels)

        test_accuracy = forest.score(test_features, test_labels)
        test_predictions = forest.predict(test_features)

        outcome = dict(
            model=forest,
            accuracy=test_accuracy,
            predictions=test_predictions,
            model_name="Random Forest",
        )
    except Exception as e:
        print(f"Error running Random Forest: {e}")
    return outcome

def run_xgboost(encoded_df, target_column):
    """
    Trains an XGBoost classifier on an 80/20 split of the encoded DataFrame
    and evaluates it on the held-out portion.

    Parameters:
    - encoded_df: The encoded DataFrame (features plus the target column).
    - target_column: The name of the target column in the DataFrame.

    Returns:
    - dict: Keys "model", "accuracy", "predictions" and "model_name".
      None when any step raises; the error message is printed instead.
    """
    outcome = None
    try:
        features = encoded_df.drop(target_column, axis=1)
        labels = encoded_df[target_column]

        train_features, test_features, train_labels, test_labels = train_test_split(
            features, labels, test_size=0.2, random_state=42
        )

        # Replace the real column names with positional placeholders so that
        # every feature name is acceptable to XGBoost.
        placeholder_names = [f"feature_{i}" for i in range(train_features.shape[1])]
        train_features.columns = placeholder_names
        test_features.columns = placeholder_names

        booster = xgb.XGBClassifier(
            use_label_encoder=False,
            eval_metric='mlogloss',
            random_state=42,
        )
        booster.fit(train_features, train_labels)

        test_accuracy = booster.score(test_features, test_labels)
        test_predictions = booster.predict(test_features)

        outcome = dict(
            model=booster,
            accuracy=test_accuracy,
            predictions=test_predictions,
            model_name="XGBoost",
        )
    except Exception as e:
        print(f"Error running XGBoost: {e}")
    return outcome

def run_ml_models(encoded_df, target_column):
    """
    Runs the three model helpers in turn (logistic regression, random forest,
    XGBoost) and gathers the results of those that succeeded.

    Parameters:
    - encoded_df: The encoded DataFrame.
    - target_column: The name of the target column in the DataFrame.

    Returns:
    - dict: Maps each model's display name to the result dict returned by its
      helper. A model whose helper returned None is left out.
    """
    # Display name paired with its helper, in execution order.
    runners = (
        ("Multivariable Logistic Regression", run_multivariable_logistic_regression),
        ("Random Forest", run_random_forest),
        ("XGBoost", run_xgboost),
    )

    results = {}
    for label, runner in runners:
        outcome = runner(encoded_df, target_column)
        if outcome is None:
            # The helper has already printed its own error message.
            continue
        results[label] = outcome
        print(f"Accuracy for {label}: {outcome['accuracy']:.2f}")

    return results

# Example usage:
# Assuming 'df' is your DataFrame and 'target_column' is the name of your target column

# Encode categorical features
# encoded_df, encoding_info = encode_categorical_features(df, exclude_columns=[target_column])

# Run the models
# results = run_ml_models(encoded_df, target_column)


In [18]:
# Encode the original dataset; the target column 'readmitted' is left as-is.
encoded_df, encoding_info = encode_categorical_features(df_original, exclude_columns=['readmitted'])

# Report which encoding each categorical column received.
print("Encoding Information:")
for column, encoding_type in encoding_info.items():
    print(f"{column}: {encoding_type}")

# Preview the first rows of the encoded frame.
print("\nEncoded DataFrame:")
print(encoded_df.head())

# Encode each of the five df*-v datasets the same way, keeping only the
# encoded frame (the per-column encoding report is discarded).
encoded_dfts = [
    encode_categorical_features(dft, exclude_columns=['readmitted'])[0]
    for dft in (dft1, dft2, dft3, dft4, dft5)
]



Encoding Information:
race: Dummy Encoding
gender: Binary Encoding
age: Dummy Encoding
discharge_disposition_id: Dummy Encoding
admission_source_id: Dummy Encoding
diag_1: Dummy Encoding
diag_2: Dummy Encoding
diag_3: Dummy Encoding
max_glu_serum: Dummy Encoding
A1Cresult: Dummy Encoding
metformin: Dummy Encoding
repaglinide: Dummy Encoding
nateglinide: Dummy Encoding
chlorpropamide: Dummy Encoding
glimepiride: Dummy Encoding
acetohexamide: Binary Encoding
glipizide: Dummy Encoding
glyburide: Dummy Encoding
tolbutamide: Binary Encoding
pioglitazone: Dummy Encoding
rosiglitazone: Dummy Encoding
acarbose: Dummy Encoding
miglitol: Dummy Encoding
troglitazone: Binary Encoding
tolazamide: Dummy Encoding
examide: Binary Encoding
citoglipton: Binary Encoding
insulin: Dummy Encoding
glyburide-metformin: Dummy Encoding
glipizide-metformin: Binary Encoding
glimepiride-pioglitazone: Binary Encoding
metformin-rosiglitazone: Binary Encoding
metformin-pioglitazone: Binary Encoding
change: Binary Enc

In [19]:
target_column = 'readmitted'

# Run models on the original dataset
print("\nResults for Original Dataset:")
results_original = run_ml_models(encoded_df, target_column)

# Print results for original dataset
print("\nResults:")
for model_name, result in results_original.items():
    print(f"\n{model_name} Results:")
    print(f"Accuracy: {result['accuracy']:.5f}")
    print(f"Predictions: {result['predictions']}")

# Run the same models on each df*-v dataset.
# The accuracies are kept in `accuracy_by_dataset` (dataset name -> model name
# -> accuracy) so later cells can build their tables from it, e.g. with
# pd.DataFrame.from_dict(accuracy_by_dataset, orient='index'), instead of
# retyping the printed numbers. Only the scalar accuracies are stored, so the
# fitted models of earlier iterations can still be garbage-collected.
accuracy_by_dataset = {}
for i, encoded_dft in enumerate(encoded_dfts, start=1):
    print(f"\nResults for dft{i}:")
    dft_results = run_ml_models(encoded_dft, target_column)
    accuracy_by_dataset[f"df{i}-v"] = {
        name: outcome['accuracy'] for name, outcome in dft_results.items()
    }
    for model_name, result in dft_results.items():
        print(f"{model_name} Accuracy: {result['accuracy']:.5f}")



Results for Original Dataset:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for Multivariable Logistic Regression: 0.89
Accuracy for Random Forest: 0.89
Accuracy for XGBoost: 0.89

Results:

Multivariable Logistic Regression Results:
Accuracy: 0.88847
Predictions: [0 0 0 ... 1 0 0]

Random Forest Results:
Accuracy: 0.88916
Predictions: [0 0 0 ... 0 0 0]

XGBoost Results:
Accuracy: 0.88773
Predictions: [0 0 0 ... 0 0 0]

Results for dft1:




Accuracy for Multivariable Logistic Regression: 0.89
Accuracy for Random Forest: 0.89
Accuracy for XGBoost: 0.89
Multivariable Logistic Regression Accuracy: 0.88928
Random Forest Accuracy: 0.88943
XGBoost Accuracy: 0.88877

Results for dft2:




Accuracy for Multivariable Logistic Regression: 0.89
Accuracy for Random Forest: 0.89
Accuracy for XGBoost: 0.89
Multivariable Logistic Regression Accuracy: 0.88782
Random Forest Accuracy: 0.88812
XGBoost Accuracy: 0.88673

Results for dft3:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for Multivariable Logistic Regression: 0.89
Accuracy for Random Forest: 0.89
Accuracy for XGBoost: 0.89
Multivariable Logistic Regression Accuracy: 0.88782
Random Forest Accuracy: 0.88843
XGBoost Accuracy: 0.88680

Results for dft4:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for Multivariable Logistic Regression: 0.89
Accuracy for Random Forest: 0.89
Accuracy for XGBoost: 0.89
Multivariable Logistic Regression Accuracy: 0.88797
Random Forest Accuracy: 0.88838
XGBoost Accuracy: 0.88725

Results for dft5:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for Multivariable Logistic Regression: 0.89
Accuracy for Random Forest: 0.89
Accuracy for XGBoost: 0.89
Multivariable Logistic Regression Accuracy: 0.88757
Random Forest Accuracy: 0.88845
XGBoost Accuracy: 0.88710


In [21]:
import pandas as pd

# Hard-coded P_29 score for each df*-v dataset, one entry per dataset.
data = {
    'DataFrame': ['df1-v', 'df2-v', 'df3-v', 'df4-v', 'df5-v'],
    'P_29 Score': [
        0.8048508675661077, 0.8280657743789652, 0.832672944696338,
        0.8381942262639237, 0.8410949233514133,
    ],
}

# Tabulate the scores, fixing the column order explicitly.
df_scores = pd.DataFrame(data, columns=['DataFrame', 'P_29 Score'])

# Show the table as plain text.
print(df_scores)


  DataFrame  P_29 Score
0     df1-v    0.804851
1     df2-v    0.828066
2     df3-v    0.832673
3     df4-v    0.838194
4     df5-v    0.841095


In [22]:
import pandas as pd

# Hard-coded test accuracies of the three models on each df*-v dataset.
# Each list is ordered df1-v .. df5-v, matching the 'DataFrame' column.
data = {
    'DataFrame': ['df1-v', 'df2-v', 'df3-v', 'df4-v', 'df5-v'],
    'Multivariable Logistic Regression': [
        0.88928, 0.88782, 0.88782, 0.88797, 0.88757,
    ],
    'Random Forest': [
        0.88943, 0.88812, 0.88843, 0.88838, 0.88845,
    ],
    'XGBoost': [
        0.88877, 0.88673, 0.88680, 0.88725, 0.88710,
    ],
}

# Tabulate the accuracies, fixing the column order explicitly.
df_results = pd.DataFrame(
    data,
    columns=['DataFrame', 'Multivariable Logistic Regression', 'Random Forest', 'XGBoost'],
)

# Show the table as plain text.
print(df_results)


  DataFrame  Multivariable Logistic Regression  Random Forest  XGBoost
0     df1-v                            0.88928        0.88943  0.88877
1     df2-v                            0.88782        0.88812  0.88673
2     df3-v                            0.88782        0.88843  0.88680
3     df4-v                            0.88797        0.88838  0.88725
4     df5-v                            0.88757        0.88845  0.88710


In [24]:
import pandas as pd

# Hard-coded P_29 score for each df*-v dataset.
data_scores = {
    'DataFrame': ['df1-v', 'df2-v', 'df3-v', 'df4-v', 'df5-v'],
    'P_29 Score': [
        0.8048508675661077, 0.8280657743789652, 0.832672944696338,
        0.8381942262639237, 0.8410949233514133,
    ],
}
df_scores = pd.DataFrame(data_scores, columns=['DataFrame', 'P_29 Score'])

# Hard-coded test accuracies of the three models, ordered df1-v .. df5-v.
data_results = {
    'DataFrame': ['df1-v', 'df2-v', 'df3-v', 'df4-v', 'df5-v'],
    'Multivariable Logistic Regression': [
        0.88928, 0.88782, 0.88782, 0.88797, 0.88757,
    ],
    'Random Forest': [
        0.88943, 0.88812, 0.88843, 0.88838, 0.88845,
    ],
    'XGBoost': [
        0.88877, 0.88673, 0.88680, 0.88725, 0.88710,
    ],
}
df_results = pd.DataFrame(
    data_results,
    columns=['DataFrame', 'Multivariable Logistic Regression', 'Random Forest', 'XGBoost'],
)

# Join scores and accuracies on the dataset name (inner join, one row per dataset).
merged_df = pd.merge(df_scores, df_results, how='inner', on='DataFrame')

# Leave the merged table as the cell's last expression so it is displayed.
merged_df


Unnamed: 0,DataFrame,P_29 Score,Multivariable Logistic Regression,Random Forest,XGBoost
0,df1-v,0.804851,0.88928,0.88943,0.88877
1,df2-v,0.828066,0.88782,0.88812,0.88673
2,df3-v,0.832673,0.88782,0.88843,0.8868
3,df4-v,0.838194,0.88797,0.88838,0.88725
4,df5-v,0.841095,0.88757,0.88845,0.8871
