In [None]:
# Import necessary packages
import woodwork as ww
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler

#### This file contains a Logistic Regression model that is used to predict Type 2 Diabetes prevalence. 

In [None]:
# Read the train, validation and test splits from disk.
# Feature matrices come from the mRMR output folder (files prefixed "500selected").
X = pd.read_csv('../Processed datasets/After splitting/mrmr/500selected_four_categories.csv')
X_val = pd.read_csv('../Processed datasets/After splitting/mrmr/500selected_val_four_categories.csv')
X_test = pd.read_csv('../Processed datasets/After splitting/mrmr/500selected_test_four_categories.csv')

# Matching target files, one per split.
y = pd.read_csv('../cleaned_imputed_split/y_train.csv')
y_val = pd.read_csv('../cleaned_imputed_split/y_val.csv')
y_test = pd.read_csv('../cleaned_imputed_split/y_test.csv')

# Column names are kept separately because X is later converted to a NumPy array.
feature_names = list(X.columns)

In [None]:
# Recode the outcome in every split so that 0 means non-diabetic and 1 means diabetic:
# rows coded 2 become 1, rows coded 0 keep their value.
for outcome_frame in (y, y_val, y_test):
    coded_as_two = outcome_frame['diabetic_outcome'] == 2
    outcome_frame.loc[coded_as_two, 'diabetic_outcome'] = 1

#### Identifying columns that need rescaling

In [None]:
# Read the pickled {column name: datatype} dictionary produced by an earlier step.
# NOTE(review): pickle.load executes arbitrary code on malicious input — only load
# files that this project created itself.
import pickle

datatype_dictionary_path = '../cleaned_imputed_split/datatype_dictionary.pkl'
with open(datatype_dictionary_path, 'rb') as datatype_file:
    datatypes = pickle.load(datatype_file)

In [None]:
# Keep only the entries whose datatype equals woodwork's Double logical type
double = {
    column: logical_type
    for column, logical_type in datatypes.items()
    if logical_type == ww.logical_types.Double
}

In [None]:
# Keep only the entries whose datatype equals woodwork's Integer logical type
integers = {
    column: logical_type
    for column, logical_type in datatypes.items()
    if logical_type == ww.logical_types.Integer
}

In [None]:
# Names of all numeric base features: the Double columns first, then the Integer columns
features_to_be_rescaled = [*double, *integers]

In [None]:
# Create the list of columns that need to be rescaled.
# A column qualifies when its name contains the name of at least one numeric
# (Double/Integer) base feature, so derived columns built from a numeric feature
# are picked up as well.
# Each column is listed exactly once: the previous nested loop appended a column
# once per matching base feature, so a column derived from two numeric features
# ended up in the list twice.
# NOTE(review): this is plain substring matching, so a short base-feature name can
# also match unrelated columns that merely contain it — confirm that is acceptable.
dataframe_columns = X.columns.tolist()
rescaling = [
    column
    for column in dataframe_columns
    if any(base_feature in column for base_feature in features_to_be_rescaled)
]

In [None]:
# Rescale necessary columns.
# The scaler is fitted on the training split only; validation and test are then
# transformed with the training mean/std. Re-fitting on each split (as before)
# puts the three splits on different scales and lets information from the
# evaluation data influence preprocessing.
scaler = StandardScaler()
X[rescaling] = scaler.fit_transform(X[rescaling])
X_val[rescaling] = scaler.transform(X_val[rescaling])
X_test[rescaling] = scaler.transform(X_test[rescaling])

#### Evaluate cross-validation performance

In [None]:
# Import necessary packages
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline 
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, make_scorer
from sklearn.preprocessing import StandardScaler

# Convert the training and validation data to numpy arrays.
# np.asarray accepts both DataFrames and arrays, so this cell can be re-run after
# X / y have already been converted (DataFrame.to_numpy() would fail on an ndarray).
X = np.asarray(X)
X_val = np.asarray(X_val)
y_val = np.asarray(y_val)
y = np.asarray(y)

# Dictionary mapping "number of selected features" -> array of per-fold ROC AUC scores
roc_scores = {}

# Numbers of features to keep with RFE
n_features_list = [10, 50, 100, 200]

for n_features in n_features_list:
    # Wrap model with Recursive Feature Elimination
    rfe = RFE(estimator=LogisticRegression(max_iter=5000), n_features_to_select=n_features)
    model = LogisticRegression(max_iter=5000)

    # Create pipeline with RFE and model
    pipeline = Pipeline(steps=[('s', rfe), ('m', model)])

    # Evaluate the pipeline with 5-fold cross-validation.
    # cross_val_score clones and fits the pipeline inside every fold, so the extra
    # full-data pipeline.fit() that used to precede this call only cost time.
    scores = cross_val_score(pipeline, X, y.ravel(), cv=5, scoring='roc_auc')
    roc_scores[n_features] = scores

    # Print per-fold and average AUC score
    print(scores)
    print("Average AUC:", np.mean(scores))

#### Evaluate performance of pre-optimized model on the validation set with a confusion matrix

In [None]:
# Make sure the training and validation data are numpy arrays.
# The previous cell already rebinds X, X_val, y and y_val to ndarrays, and an
# ndarray has no .to_numpy() method, so calling it again raised AttributeError.
# np.asarray is a no-op on arrays and also converts DataFrames.
X = np.asarray(X)
X_val = np.asarray(X_val)
y_val = np.asarray(y_val)
y = np.asarray(y)

# Train model on train data (RFE keeps 50 features, then a fresh LR is fitted on them)
rfe = RFE(estimator=LogisticRegression(max_iter=5000), n_features_to_select=50)
model = LogisticRegression(max_iter=5000)
pipeline = Pipeline(steps=[('s', rfe), ('m', model)])
pipeline.fit(X, y.ravel())

# Names of the features kept by the fitted RFE step
fitted_rfe = pipeline.named_steps['s']
selected_f = [name for name, keep in zip(feature_names, fitted_rfe.support_) if keep]
print(selected_f)

# Evaluate model on validation data
y_pred_val = pipeline.predict(X_val)

# Print AUC score on validation data (probability of the positive class, column 1)
y_prob_val = pipeline.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val.ravel(), y_prob_val)
print("AUC:", auc)

# Print confusion matrix achieved on validation data
cm = confusion_matrix(y_val.ravel(), y_pred_val, labels=[0, 1])
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='g', xticklabels=['No diabetes', 'Diabetes'],
            yticklabels=['No diabetes', 'Diabetes'], ax=ax)
ax.set_ylabel('Actual', fontsize=13)
ax.set_xlabel('Predicted', fontsize=13)
ax.set_title('Confusion Matrix', fontsize=17)
plt.show()

#### Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Pipeline to tune: RFE feature selection followed by a Logistic Regression model.
# n_features_to_select=50 is only the starting value; the grid below overrides it.
rfe = RFE(estimator=LogisticRegression(max_iter=5000), n_features_to_select=50)
# random_state makes the stochastic solvers in the grid (sag, saga, liblinear)
# reproducible between runs; it has no effect on lbfgs / newton-cg.
model = LogisticRegression(max_iter=5000, random_state=42)
pipeline = Pipeline(steps=[('s', rfe), ('m', model)])

# No separate pipeline.fit() here: GridSearchCV clones the pipeline for every
# candidate/fold and (refit=True by default) refits the best one on all the data,
# so an up-front full RFE fit only added run time.

# Define a parameter grid for cross-validation grid search
param_grid = {'m__penalty': ['l2'],  # Regularization penalty
              'm__C': [0.01, 0.1, 1.0],  # Inverse regularization strength
              'm__solver': ['lbfgs', 'liblinear', 'newton-cg', 'saga', 'sag'],  # Solvers for optimization
              's__n_features_to_select': [10, 50, 100, 200]}  # Number of features to select using RFE

# Perform grid search with cross-validation (cv=5) using ROC AUC as the scoring metric
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc', verbose=1)
grid_search.fit(X, y.ravel())

# Best pipeline, refitted on the complete training data
best_model = grid_search.best_estimator_

# Convert grid search results to a DataFrame
results = pd.DataFrame(grid_search.cv_results_)

# Create a pivot table to reorganize the grid search results for visualization:
# rows are (penalty, C), columns are (solver, number of selected features)
pivot_table = results.pivot_table(
    values='mean_test_score',
    index=['param_m__penalty', 'param_m__C'],
    columns=['param_m__solver', 'param_s__n_features_to_select']
)

# Plot the grid search results in a clustermap
sns.clustermap(pivot_table, annot=True, cmap='rocket_r')

In [None]:
# Draw the clustermap of mean CV scores again and write the figure to a PNG file
clustergrid = sns.clustermap(
    pivot_table,
    annot=True,
    cmap='rocket_r',
)
clustermap_figure = clustergrid.fig
clustermap_figure.savefig('hyperparameter_tuning_LR.png')

#### Evaluate stratified k-fold cross-validation results

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline 
from sklearn.metrics import roc_auc_score, confusion_matrix

# Make sure the training and validation data are numpy arrays.
# Earlier cells already rebind these names to ndarrays, which have no .to_numpy()
# method; np.asarray works for both DataFrames and arrays.
X = np.asarray(X)
X_val = np.asarray(X_val)
y_val = np.asarray(y_val)
y = np.asarray(y)

# Flat (1-D) view of the labels for splitting, fitting and scoring
y_flat = y.ravel()

# Select 50 features using RFE
rfe = RFE(estimator=LogisticRegression(max_iter=5000), n_features_to_select=50)
# Create a Logistic Regression model
model = LogisticRegression(max_iter=5000)
pipeline = Pipeline(steps=[('s', rfe), ('m', model)])

# Split the data into 5 shuffled, stratified folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# List to store AUC scores for each fold
auc_scores = []

# Perform cross-validation.
# Fold-local names are used on purpose: the loop previously assigned to X_test /
# y_test and thereby overwrote the real hold-out test set loaded at the top of
# the notebook.
for train_index, holdout_index in skf.split(X, y_flat):
    # Split the data into training and held-out parts for the current fold
    X_fold_train, X_fold_holdout = X[train_index], X[holdout_index]
    y_fold_train, y_fold_holdout = y_flat[train_index], y_flat[holdout_index]

    # Fit the pipeline model on the fold's training part
    pipeline.fit(X_fold_train, y_fold_train)

    # Predict positive-class probabilities on the fold's held-out part
    probas_ = pipeline.predict_proba(X_fold_holdout)[:, 1]

    # Calculate the ROC-AUC score for the current fold
    auc_score = roc_auc_score(y_fold_holdout, probas_)

    # Append the AUC score to the list
    auc_scores.append(auc_score)

# Print per-fold and average AUC score across 5 folds
print(auc_scores)
average_auc = np.mean(auc_scores)
print(f'Average ROC-AUC Score: {average_auc}')

#### Evaluate performance on test set

In [None]:
# Import train, validation and test set for the final evaluation.
# All three feature files are built from the same "<N>selected" prefix. The test
# matrix used to be read from the 500selected file while train and validation
# came from the 300selected files, so the model would have been trained and
# tested on different feature sets.
# NOTE(review): assumes '300selected_test_four_categories.csv' exists next to the
# train/validation files — confirm.
N_SELECTED_FEATURES = 300
mrmr_prefix = f'../Processed datasets/After splitting/mrmr/{N_SELECTED_FEATURES}selected'

X = pd.read_csv(f'{mrmr_prefix}_four_categories.csv')
y = pd.read_csv('../cleaned_imputed_split/y_train.csv')

X_val = pd.read_csv(f'{mrmr_prefix}_val_four_categories.csv')
y_val = pd.read_csv('../cleaned_imputed_split/y_val.csv')

X_test = pd.read_csv(f'{mrmr_prefix}_test_four_categories.csv')
y_test = pd.read_csv('../cleaned_imputed_split/y_test.csv')

# Fail early, with a readable message, if the splits do not share the same columns
for split_name, split_frame in (('validation', X_val), ('test', X_test)):
    if list(split_frame.columns) != list(X.columns):
        raise ValueError(f"Columns of the {split_name} set do not match the training set")

feature_names = X.columns.tolist()

In [None]:
# Recode the freshly loaded outcomes so that 0 means non-diabetic and 1 means
# diabetic: rows coded 2 become 1, rows coded 0 keep their value.
for outcome_frame in (y, y_val, y_test):
    coded_as_two = outcome_frame['diabetic_outcome'] == 2
    outcome_frame.loc[coded_as_two, 'diabetic_outcome'] = 1

In [None]:
# Rescale the numeric columns of the reloaded splits.
# `rescaling` was derived from the columns of the earlier feature matrix, so it is
# first restricted to (unique) columns that exist in the frame loaded here;
# selecting a missing column would raise a KeyError.
rescaling_present = [column for column in dict.fromkeys(rescaling) if column in X.columns]

if rescaling_present:
    # Fit on the training split only, then apply the training mean/std to the
    # validation and test splits. Re-fitting per split (as before) scales every
    # split differently and leaks evaluation-set statistics into preprocessing.
    X[rescaling_present] = scaler.fit_transform(X[rescaling_present])
    X_val[rescaling_present] = scaler.transform(X_val[rescaling_present])
    X_test[rescaling_present] = scaler.transform(X_test[rescaling_present])

In [None]:
# Stack the validation rows underneath the training rows (features and targets alike)
# and renumber the index from 0.
X_train_full = pd.concat(
    [X, X_val],
    axis=0,
    ignore_index=True,
)
y_train_full = pd.concat(
    [y, y_val],
    axis=0,
    ignore_index=True,
)

In [None]:
# Keep a copy of the training features as they are at this point; X itself is
# rebound to a NumPy array in a later cell.
# NOTE(review): X_original is not referenced again in the visible part of the
# notebook — confirm it is still needed.
X_original = X.copy()

In [None]:
# Column names of the merged train+validation frame, kept for labelling after the
# data has been converted to arrays
feature_names = list(X_train_full.columns)

In [None]:
# Convert the training and testing data to numpy arrays.
# np.asarray is used for the test split so the cell can be re-run: after the first
# run X_test / y_test are already ndarrays, which have no .to_numpy() method.
X = X_train_full.to_numpy()
X_test = np.asarray(X_test)
y_test = np.asarray(y_test)
y = y_train_full.to_numpy()

# Fail fast (before the expensive RFE fit) if train and test have different widths
if X.shape[1] != X_test.shape[1]:
    raise ValueError(
        f"Training data has {X.shape[1]} features but test data has {X_test.shape[1]}"
    )

# Select 50 features using RFE
rfe = RFE(estimator=LogisticRegression(max_iter=5000), n_features_to_select=50)
# Create a Logistic Regression model
model = LogisticRegression(max_iter=5000)
pipeline = Pipeline(steps=[('s', rfe), ('m', model)])

# Fit the pipeline model to the merged train+validation data
pipeline.fit(X, y.ravel())

# Indices and names of the features kept by the fitted RFE step
selected_indices = np.where(pipeline.named_steps['s'].support_)[0]
selected_f = [feature_names[i] for i in selected_indices]
print(selected_f)

# Predict target for the test data
y_pred_test = pipeline.predict(X_test)

# Predict probabilities for the positive class (class 1) in the test data
y_prob_test = pipeline.predict_proba(X_test)[:, 1]

# Calculate the ROC-AUC score for the test data predictions
auc = roc_auc_score(y_test.ravel(), y_prob_test)
print("AUC:", auc)

# Generate and plot the confusion matrix for the test data predictions
cm = confusion_matrix(y_test.ravel(), y_pred_test, labels=[0, 1])
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='g', xticklabels=['No diabetes', 'Diabetes'],
            yticklabels=['No diabetes', 'Diabetes'], cmap='rocket_r', ax=ax)
ax.set_ylabel('Actual', fontsize=13)
ax.set_xlabel('Predicted', fontsize=13)
ax.set_title('Confusion Matrix', fontsize=17)
plt.show()

# Same names under the identifier used by the SHAP cell (previously recomputed
# from scratch after a no-op `feature_names = feature_names` assignment)
selected_features = list(selected_f)

In [None]:
import shap

# Transform the feature set using the RFE step in the pipeline
X_transformed = pipeline.named_steps['s'].transform(X)

# Create a SHAP explainer using the trained Logistic Regression model and the transformed feature set
explainer = shap.Explainer(pipeline.named_steps['m'], X_transformed)

# Calculate SHAP values for the transformed feature set
shap_values = explainer(X_transformed)

# Create a summary plot of the SHAP values to show the impact of each feature.
# show=False keeps the figure open so it can be saved: with the default show=True
# the plot is shown and released inside summary_plot, and the following savefig
# wrote an empty image. plot_size=None keeps the 10x8 figure created here instead
# of letting SHAP resize it.
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
shap.summary_plot(
    shap_values.values,
    X_transformed,
    feature_names=selected_features,
    plot_size=None,
    show=False,
)

# Save first, then display, then release the figure
fig.savefig('SHAP_summary_plot_LR.png', bbox_inches='tight')
plt.show()
plt.close(fig)

#### Comparison with TNO features

In [None]:
import pickle 

# Write the list of feature names chosen by the RFE-wrapped LR model to a pickle file
selected_features_path = '../optimized_model_results/50_selected_features_LR.pkl'
with open(selected_features_path, 'wb') as out_file:
    pickle.dump(selected_f, out_file)

In [None]:
# Read the pickled list of feature names chosen by the RFE-wrapped LR model back in
selected_features_path = '../optimized_model_results/50_selected_features_LR.pkl'
with open(selected_features_path, 'rb') as in_file:
    selected_f = pickle.load(in_file)

In [None]:
# Read the pickled Whitehall feature dictionary; a later cell iterates it as
# {category: iterable of column names}.
feature_dictionary_path = '../Processed datasets/feature_dictionary.pkl'
with open(feature_dictionary_path, 'rb') as dictionary_file:
    columns_whitehall = pickle.load(dictionary_file)

In [None]:
# List of transformation primitives to search for in the feature names
trans_primitives = ['SQUARE_ROOT', 'PERCENTILE', 'AND(', 'OR(', 'NOT', '*', '-', '+']

# For every primitive, collect the selected features whose name contains it.
# A feature that combines several primitives is listed under each of them.
counts = {
    primitive: [feature for feature in selected_f if primitive in feature]
    for primitive in trans_primitives
}

# Print the number of features containing each transformation primitive
for primitive, matches in counts.items():
    print(f"Number of strings containing '{primitive}': {len(matches)}")

# A raw feature is one whose name contains none of the primitives.
# Counting these directly avoids the earlier arithmetic (total minus the sum of
# the per-primitive counts), which subtracted a feature once per primitive it
# contained and could therefore under-count or even go negative.
# NOTE(review): single characters such as '-' may also occur inside raw column
# names, which would classify such a column as transformed — confirm.
raw_features = [
    feature
    for feature in selected_f
    if not any(primitive in feature for primitive in trans_primitives)
]
print("The number of raw features is:", len(raw_features))

In [None]:
import re 
from collections import defaultdict, Counter

# Extract the individual column names from a combined feature
def extract_columns(feature):
    """Split an engineered feature name into the base column names it is built from.

    Wrapper calls of the transformation primitives (``AND(``, ``OR(``, ``NOT(``,
    ``SQUARE_ROOT(``, ``PERCENTILE(``) and all closing parentheses are removed, and
    the remainder is split on ``+``, ``-``, ``*`` or ``,`` when the separator is
    followed by whitespace (so a hyphen inside a column name is not a split point).

    Previously only ``AND(`` was removed, so e.g. ``SQUARE_ROOT(x)`` came back as
    ``SQUARE_ROOT(x`` and never matched a base column.

    Parameters
    ----------
    feature : str
        Feature name, e.g. ``"AND(a, b)"`` or ``"a * b"``.

    Returns
    -------
    list of str
        Whitespace-stripped parts; a name without separators yields a one-element list.
    """
    unwrapped = re.sub(r'\b(?:SQUARE_ROOT|PERCENTILE|AND|OR|NOT)\(', '', feature)
    unwrapped = unwrapped.replace(")", "")
    return [part.strip() for part in re.split(r'\s*[\+\-\*,]\s', unwrapped)]

# Invert the Whitehall dictionary: column name -> category.
# If a column is listed under several categories, the last one iterated wins.
column_to_category = {
    column: category
    for category, columns in columns_whitehall.items()
    for column in columns
}

# Tally how often each (sorted) combination of categories occurs among the selected features
category_combinations = Counter()

for feature in selected_f:
    involved_categories = set()
    for column in extract_columns(feature):
        # Ignore empty fragments and names that are not known base columns
        if not column or column not in column_to_category:
            continue
        category = column_to_category[column]
        if category is not None:
            involved_categories.add(category)

    combination_key = tuple(sorted(involved_categories))
    category_combinations[combination_key] += 1

# Print the combinations of categories and their counts
for combination, count in category_combinations.items():
    print(f"Combination {combination}: {count} feature(s)")