In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Read the CSV files into DataFrames
df_original = pd.read_csv('originaldf.csv')
df_noisy1 = pd.read_csv('df-noisy1.csv')
df_noisy2 = pd.read_csv('df-noisy2.csv')
df_noisy3 = pd.read_csv('df-noisy3.csv')
df_noisy4 = pd.read_csv('df-noisy4.csv')
df_noisy5 = pd.read_csv('df-noisy5.csv')
df_noisy6 = pd.read_csv('df-noisy6.csv')
df_noisy7 = pd.read_csv('df-noisy7.csv')
df_noisy8 = pd.read_csv('df-noisy8.csv')

dft1= pd.read_csv('df1-v.csv')
dft2= pd.read_csv('df2-v.csv')
dft3= pd.read_csv('df3-v.csv')
dft4= pd.read_csv('df4-v.csv')
dft5= pd.read_csv('df5-v.csv')


readmitted_mapping = {'<30': 1, '>30': 0, 'NO': 0}

# Apply the mapping to 'readmitted' column in each DataFrame
df_original['readmitted'] = df_original['readmitted'].map(readmitted_mapping)
df_noisy1['readmitted'] = df_noisy1['readmitted'].map(readmitted_mapping)
df_noisy2['readmitted'] = df_noisy2['readmitted'].map(readmitted_mapping)
df_noisy3['readmitted'] = df_noisy3['readmitted'].map(readmitted_mapping)
df_noisy4['readmitted'] = df_noisy4['readmitted'].map(readmitted_mapping)
df_noisy5['readmitted'] = df_noisy5['readmitted'].map(readmitted_mapping)
df_noisy6['readmitted'] = df_noisy6['readmitted'].map(readmitted_mapping)
df_noisy7['readmitted'] = df_noisy7['readmitted'].map(readmitted_mapping)
df_noisy8['readmitted'] = df_noisy8['readmitted'].map(readmitted_mapping)

dft1['readmitted'] = dft1['readmitted'].map(readmitted_mapping)
dft2['readmitted'] = dft2['readmitted'].map(readmitted_mapping)
dft3['readmitted'] = dft3['readmitted'].map(readmitted_mapping)
dft4['readmitted'] = dft4['readmitted'].map(readmitted_mapping)
dft5['readmitted'] = dft5['readmitted'].map(readmitted_mapping)



In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def encode_categorical_features(df, exclude_columns=None):
    """Encode categorical features using dummy encoding or binary encoding based on the number of unique values,
    while excluding specified columns from encoding.
    
    Args:
        df (pd.DataFrame): Input DataFrame containing features to be encoded.
        exclude_columns (list): List of columns to exclude from encoding.
    
    Returns:
        pd.DataFrame: DataFrame with encoded features.
        dict: Dictionary mapping each original column name to the encoding type used.
    """
    encoded_df = df.copy()
    encoding_info = {}
    if exclude_columns is None:
        exclude_columns = []
    
    categorical_columns = encoded_df.select_dtypes(include=['object', 'category']).columns
    
    for column in categorical_columns:
        if column in exclude_columns:
            encoding_info[column] = "Excluded"
            continue
        
        unique_values = encoded_df[column].nunique()
        if unique_values == 2 or unique_values == 1:
            value_mapping = {label: idx for idx, label in enumerate(encoded_df[column].unique())}
            encoded_df[column] = encoded_df[column].map(value_mapping)
            encoding_info[column] = "Binary Encoding"
        elif unique_values >= 3:
            dummies = pd.get_dummies(encoded_df[column], prefix=column)
            encoded_df = pd.concat([encoded_df, dummies], axis=1)
            encoded_df.drop(column, axis=1, inplace=True)
            encoding_info[column] = "Dummy Encoding"
    
    return encoded_df, encoding_info

def run_multivariable_logistic_regression(encoded_df, target_column):
    """
    Runs a Multivariable Logistic Regression model on the provided encoded DataFrame.
    
    Parameters:
    - encoded_df: The encoded DataFrame.
    - target_column: The name of the target column in the DataFrame.
    
    Returns:
    - dict: Dictionary containing the model, accuracy, and predictions.
    """
    try:
        X = encoded_df.drop(target_column, axis=1)
        y = encoded_df[target_column]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)
        model.fit(X_train, y_train)

        accuracy = model.score(X_test, y_test)
        predictions = model.predict(X_test)

        return {
            "model": model,
            "accuracy": accuracy,
            "predictions": predictions,
            "model_name": "Multivariable Logistic Regression"
        }
    except Exception as e:
        print(f"Error running Multivariable Logistic Regression: {e}")
        return None

def run_random_forest(encoded_df, target_column):
    """
    Runs a Random Forest model on the provided encoded DataFrame.
    
    Parameters:
    - encoded_df: The encoded DataFrame.
    - target_column: The name of the target column in the DataFrame.
    
    Returns:
    - dict: Dictionary containing the model, accuracy, and predictions.
    """
    try:
        X = encoded_df.drop(target_column, axis=1)
        y = encoded_df[target_column]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        accuracy = model.score(X_test, y_test)
        predictions = model.predict(X_test)

        return {
            "model": model,
            "accuracy": accuracy,
            "predictions": predictions,
            "model_name": "Random Forest"
        }
    except Exception as e:
        print(f"Error running Random Forest: {e}")
        return None

def run_ml_models(encoded_df, target_column):
    """
    Runs both Multivariable Logistic Regression and Random Forest models on the provided encoded DataFrame.
    
    Parameters:
    - encoded_df: The encoded DataFrame.
    - target_column: The name of the target column in the DataFrame.
    
    Returns:
    - dict: Dictionary containing the results for both models.
    """
    results = {}
    lr_result = run_multivariable_logistic_regression(encoded_df, target_column)
    if lr_result is not None:
        results["Multivariable Logistic Regression"] = lr_result
        print(f"Accuracy for Multivariable Logistic Regression: {lr_result['accuracy']:.2f}")

    rf_result = run_random_forest(encoded_df, target_column)
    if rf_result is not None:
        results["Random Forest"] = rf_result
        print(f"Accuracy for Random Forest: {rf_result['accuracy']:.2f}")

    return results


In [54]:
# Encode categorical features in the original dataset
encoded_df, encoding_info = encode_categorical_features(df_original, exclude_columns=['readmitted'])

# Print encoding information
print("Encoding Information:")
for column, encoding_type in encoding_info.items():
    print(f"{column}: {encoding_type}")

# Display encoded DataFrame
print("\nEncoded DataFrame:")
print(encoded_df.head())

# Optionally, encode categorical features in noisy datasets
encoded_noisy_dfs = []
for df_noisy in [df_noisy1, df_noisy2, df_noisy3, df_noisy4, df_noisy5, df_noisy6, df_noisy7, df_noisy8]:
    encoded_noisy_df, _ = encode_categorical_features(df_noisy, exclude_columns=['readmitted'])
    encoded_noisy_dfs.append(encoded_noisy_df)


Encoding Information:
race: Dummy Encoding
gender: Binary Encoding
age: Dummy Encoding
discharge_disposition_id: Dummy Encoding
admission_source_id: Dummy Encoding
diag_1: Dummy Encoding
diag_2: Dummy Encoding
diag_3: Dummy Encoding
max_glu_serum: Dummy Encoding
A1Cresult: Dummy Encoding
metformin: Dummy Encoding
repaglinide: Dummy Encoding
nateglinide: Dummy Encoding
chlorpropamide: Dummy Encoding
glimepiride: Dummy Encoding
acetohexamide: Binary Encoding
glipizide: Dummy Encoding
glyburide: Dummy Encoding
tolbutamide: Binary Encoding
pioglitazone: Dummy Encoding
rosiglitazone: Dummy Encoding
acarbose: Dummy Encoding
miglitol: Dummy Encoding
troglitazone: Binary Encoding
tolazamide: Dummy Encoding
examide: Binary Encoding
citoglipton: Binary Encoding
insulin: Dummy Encoding
glyburide-metformin: Dummy Encoding
glipizide-metformin: Binary Encoding
glimepiride-pioglitazone: Binary Encoding
metformin-rosiglitazone: Binary Encoding
metformin-pioglitazone: Binary Encoding
change: Binary Enc

In [55]:

target_column = 'readmitted'

# Run models on the original dataset
print("\nResults for Original Dataset:")
results_original = run_ml_models(encoded_df, target_column)

# Print results for original dataset
print("\nResults:")
for model_name, result in results_original.items():
    print(f"\n{model_name} Results:")
    print(f"Accuracy: {result['accuracy']:.5f}")
    print(f"Predictions: {result['predictions']}")

# Optionally, run models on each noisy dataset
for i, encoded_noisy_df in enumerate(encoded_noisy_dfs, start=1):
    print(f"\nResults for Noisy{i} Dataset:")
    noisy_results = run_ml_models(encoded_noisy_df, target_column)
    for model_name, result in noisy_results.items():
        print(f"{model_name} Accuracy: {result['accuracy']:.5f}")



Results for Original Dataset:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for Multivariable Logistic Regression: 0.89
Accuracy for Random Forest: 0.89

Results:

Multivariable Logistic Regression Results:
Accuracy: 0.88847
Predictions: [0 0 0 ... 1 0 0]

Random Forest Results:
Accuracy: 0.88916
Predictions: [0 0 0 ... 0 0 0]

Results for Noisy1 Dataset:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for Multivariable Logistic Regression: 0.89
Accuracy for Random Forest: 0.89
Multivariable Logistic Regression Accuracy: 0.88786
Random Forest Accuracy: 0.88786

Results for Noisy2 Dataset:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for Multivariable Logistic Regression: 0.89
Accuracy for Random Forest: 0.89
Multivariable Logistic Regression Accuracy: 0.88762
Random Forest Accuracy: 0.88762

Results for Noisy3 Dataset:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for Multivariable Logistic Regression: 0.89
Accuracy for Random Forest: 0.89
Multivariable Logistic Regression Accuracy: 0.88843
Random Forest Accuracy: 0.88843

Results for Noisy4 Dataset:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for Multivariable Logistic Regression: 0.89
Accuracy for Random Forest: 0.89
Multivariable Logistic Regression Accuracy: 0.88872
Random Forest Accuracy: 0.88872

Results for Noisy5 Dataset:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for Multivariable Logistic Regression: 0.89
Accuracy for Random Forest: 0.89
Multivariable Logistic Regression Accuracy: 0.88872
Random Forest Accuracy: 0.88872

Results for Noisy6 Dataset:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for Multivariable Logistic Regression: 0.89
Accuracy for Random Forest: 0.89
Multivariable Logistic Regression Accuracy: 0.88852
Random Forest Accuracy: 0.88852

Results for Noisy7 Dataset:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for Multivariable Logistic Regression: 0.89
Accuracy for Random Forest: 0.89
Multivariable Logistic Regression Accuracy: 0.88852
Random Forest Accuracy: 0.88852

Results for Noisy8 Dataset:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for Multivariable Logistic Regression: 0.89
Accuracy for Random Forest: 0.89
Multivariable Logistic Regression Accuracy: 0.88852
Random Forest Accuracy: 0.88852


In [57]:
import pandas as pd

# Create a dictionary with the results
results_dict = {
    "Dataset": ["Original", "Noisy1", "Noisy2", "Noisy3", "Noisy4", "Noisy5", "Noisy6", "Noisy7", "Noisy8"],
    "Logistic Regression": [0.88852, 0.88786, 0.88762, 0.88843, 0.88872, 0.88872, 0.88852, 0.88852, 0.88852],
    "Random Forest": [0.88916, 0.88786, 0.88762, 0.88843, 0.88872, 0.88872, 0.88852, 0.88852, 0.88852]
}

# Convert the dictionary to a DataFrame
results_df = pd.DataFrame(results_dict)

# Set 'Dataset' as index (optional, but can be useful for certain operations)
results_df.set_index('Dataset', inplace=True)

# Print the DataFrame
print(results_df)


          Logistic Regression  Random Forest
Dataset                                     
Original              0.88852        0.88916
Noisy1                0.88786        0.88786
Noisy2                0.88762        0.88762
Noisy3                0.88843        0.88843
Noisy4                0.88872        0.88872
Noisy5                0.88872        0.88872
Noisy6                0.88852        0.88852
Noisy7                0.88852        0.88852
Noisy8                0.88852        0.88852
