In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the training, validation, and test sets into pandas DataFrames
train_data = pd.read_csv('train_data_model.csv')
val_data = pd.read_csv('val_data_model.csv')
test_data = pd.read_csv('matched_test_imputed_df.csv')

In [3]:
# Split the training set into features (X) and target (y)
X_train = train_data.iloc[:, 8:21] # Select the biomarker variables as features
y_train = train_data['diabetes..0.0_x']

# Split the validation set into features (X) and target (y)
X_val = val_data.iloc[:, 8:21] # Select the biomarker variables as features
y_val = val_data['diabetes..0.0_x']

# Split the test set into features (X) and target (y)
X_test = test_data.iloc[:, 8:21] # Select the biomarker variables as features
y_test = test_data['diabetes..0.0_x']

In [4]:
# Standardize the input features using the StandardScaler class
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Train a Logistic Regression model with L1 regularization using the training set
log_reg = LogisticRegression(penalty='l1', C=1.0, solver='liblinear', random_state=42)
log_reg.fit(X_train_scaled, y_train)

In [6]:
# Print the coefficients of the selected features
for i in range(len(X_train.columns)):
    print("{}: {}".format(X_train.columns[i], log_reg.coef_[0][i]))


total_cholesterol.0.0: -0.13737606355326787
vldl_cholesterol.0.0: -0.09272623075866386
ldl_cholesterol.0.0: 0.009515012050197856
hdl_cholesterol.0.0: -0.05151881944170578
glucose.0.0: 0.19191575344639136
c_reactive_protein.0.0: -0.06042837229398867
HbA1c.0.0: 2.184346460393028
insulin_IGF_1.0.0: 0.09389822952400192
lipoprotein.0.0: -0.02262656490572957
phosphate.0.0: 0.04395062447851713
SHBG.0.0: -0.01677969426069273
total_bilirubin.0.0: 0.17754217618789214
testosterone.0.0: 0.0


In [7]:
# Use the trained model to make predictions on the validation set
y_val_pred = log_reg.predict(X_val_scaled)

In [8]:
# Calculate the accuracy of the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation accuracy: {:.2f}".format(val_accuracy))

Validation accuracy: 0.83


In [9]:
# Use the trained model to make predictions on the test set
y_test_pred = log_reg.predict(X_test_scaled)

In [10]:
# Calculate the accuracy of the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test accuracy: {:.2f}".format(test_accuracy))

Test accuracy: 0.85


In [11]:
# Print the classification report for the test set
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.86      0.95      0.90       803
           1       0.79      0.55      0.65       272

    accuracy                           0.85      1075
   macro avg       0.83      0.75      0.78      1075
weighted avg       0.84      0.85      0.84      1075



imbalanced dataset, try over sampling

In [12]:
from imblearn.over_sampling import SMOTE

In [14]:
# Create a SMOTE object
smote = SMOTE(random_state=42)

In [15]:
# Apply SMOTE to your training data
X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train_scaled, y_train)

In [17]:
# Train a Logistic Regression model with L1 regularization using the oversampled training set
log_reg = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
log_reg.fit(X_train_oversampled, y_train_oversampled)

In [18]:
# Print the coefficients of the selected features
for i in range(len(X_train.columns)):
    print("{}: {}".format(X_train.columns[i], log_reg.coef_[0][i]))


total_cholesterol.0.0: -0.13105716414852078
vldl_cholesterol.0.0: -0.14030140229550916
ldl_cholesterol.0.0: 0.05508488484239568
hdl_cholesterol.0.0: -0.08647742009035878
glucose.0.0: 0.1812084608976216
c_reactive_protein.0.0: -0.08139787654780592
HbA1c.0.0: 2.576675772682382
insulin_IGF_1.0.0: 0.10999720106323943
lipoprotein.0.0: -0.04811031848048072
phosphate.0.0: 0.045577154361844646
SHBG.0.0: -0.014806678153561119
total_bilirubin.0.0: 0.1943680437632538
testosterone.0.0: -0.025674053369008232


In [19]:
# Evaluate the model on the validation set
y_val_pred = log_reg.predict(X_val_scaled)
print("Validation set classification report:")
print(classification_report(y_val, y_val_pred))

Validation set classification report:
              precision    recall  f1-score   support

           0       0.90      0.84      0.87       573
           1       0.65      0.76      0.70       227

    accuracy                           0.82       800
   macro avg       0.77      0.80      0.78       800
weighted avg       0.83      0.82      0.82       800



In [21]:
# Evaluate the model on the test set
y_test_pred = log_reg.predict(X_test_scaled)
print("Test set classification report:")
print(classification_report(y_test, y_test_pred))

Test set classification report:
              precision    recall  f1-score   support

           0       0.91      0.83      0.87       803
           1       0.60      0.77      0.67       272

    accuracy                           0.81      1075
   macro avg       0.76      0.80      0.77      1075
weighted avg       0.83      0.81      0.82      1075



hyperparameter tuning 

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
# Define hyperparameters to tune
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': np.logspace(-4, 4, 20),
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'fit_intercept': [True, False],
    'class_weight': [None, 'balanced']
}


In [24]:
# Initialize the logistic regression model
log_reg = LogisticRegression(random_state=42)

In [25]:
# Initialize the grid search with cross-validation
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='f1_macro', n_jobs=-1, verbose=1)

In [None]:
# Fit the grid search
grid_search.fit(X_train_scaled, y_train)


Fitting 5 folds for each of 1600 candidates, totalling 8000 fits


In [None]:
# Print the best hyperparameters
print("Best hyperparameters found:")
print(grid_search.best_params_)

In [None]:
# Evaluate the tuned model on the test set
y_test_pred = grid_search.predict(X_test_scaled)


In [None]:
# Print the classification report for the test set
print("Test set classification report:")
print(classification_report(y_test, y_test_pred))