### Model making and Selection

In [1]:
import pandas as pd
import numpy as np 

In [2]:
data=pd.read_csv('Data-set-cleaned.csv')

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
data.head(5)

Unnamed: 0,age,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,diag_1,diag_2,diag_3,max_glu_serum,metformin,repaglinide,glimepiride,glipizide,pioglitazone,insulin,diabetesMed,readmitted,preceding_year_visits,number_changes,insulin_treatment
0,5,3,2,1,41,0,1,3,14,14,2,-2,-2,-2,-2,-2,-2,0,0,0,0.0,2
1,15,0,0,3,59,0,18,5,3,5,2,-2,-2,-2,-2,-2,1,1,0,0,1.0,1
2,25,0,0,2,11,5,13,15,3,6,2,-2,-2,-2,0,-2,-2,1,0,3,0.0,3
3,35,0,0,2,44,1,16,8,3,1,2,-2,-2,-2,-2,-2,1,1,0,0,1.0,1
4,45,0,0,1,51,0,8,12,12,3,2,-2,-2,-2,0,-2,0,1,0,0,0.0,0


In [5]:
data.readmitted.value_counts()

readmitted
0    85950
1    11120
Name: count, dtype: int64

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils import class_weight

# Assuming 'data' is your DataFrame with all the necessary preprocessing done

# Separate features and target
X = data.drop('readmitted', axis=1)
y = data['readmitted']

# Handling class imbalance
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights = dict(enumerate(class_weights))

# Define models with class weights to address the imbalance
log_reg = LogisticRegression(class_weight=class_weights, max_iter=1000)
rf = RandomForestClassifier(class_weight=class_weights, n_estimators=100)

Logistic Regression 10-fold CV Accuracy: 0.65 (+/- 0.07)
Random Forest 10-fold CV Accuracy: 0.89 (+/- 0.00)


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split the dataset into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train the Logistic Regression model
log_reg.fit(X_train, y_train)
# Predict on the test set
y_pred_log_reg = log_reg.predict(X_test)

# Train the Random Forest model
rf.fit(X_train, y_train)
# Predict on the test set
y_pred_rf = rf.predict(X_test)

# Print classification report for Logistic Regression
print("Classification Report for Logistic Regression:")
print(classification_report(y_test, y_pred_log_reg))

# Print classification report for Random Forest
print("Classification Report for Random Forest:")
print(classification_report(y_test, y_pred_rf))

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.92      0.66      0.77     25785
           1       0.17      0.55      0.26      3336

    accuracy                           0.65     29121
   macro avg       0.55      0.61      0.52     29121
weighted avg       0.83      0.65      0.71     29121

Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     25785
           1       0.55      0.00      0.01      3336

    accuracy                           0.89     29121
   macro avg       0.72      0.50      0.47     29121
weighted avg       0.85      0.89      0.83     29121



In [10]:
# Evaluate models using cross-validation
log_reg_scores = cross_val_score(log_reg, X_train, y_train, cv=10, scoring='accuracy')
rf_scores = cross_val_score(rf, X_train, y_train, cv=10, scoring='accuracy')

# Print results
print("Logistic Regression 10-fold CV Accuracy: %0.2f (+/- %0.2f)" % (log_reg_scores.mean(), log_reg_scores.std() * 2))
print("Random Forest 10-fold CV Accuracy: %0.2f (+/- %0.2f)" % (rf_scores.mean(), rf_scores.std() * 2))

Logistic Regression 10-fold CV Accuracy: 0.64 (+/- 0.01)
Random Forest 10-fold CV Accuracy: 0.89 (+/- 0.00)


In [12]:
from sklearn.model_selection import GridSearchCV

# Parameter grid for Logistic Regression
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']  # 'liblinear' works well with l1 and l2 penalties
}

# Parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Grid Search for Logistic Regression
grid_search_lr = GridSearchCV(LogisticRegression(class_weight=class_weights, max_iter=1000),
                              param_grid=param_grid_lr, cv=5, scoring='f1_macro', verbose=1)

# Grid Search for Random Forest
grid_search_rf = GridSearchCV(RandomForestClassifier(class_weight=class_weights),
                              param_grid=param_grid_rf, cv=5, scoring='f1_macro', verbose=1)
# Perform Grid Search for Logistic Regression
grid_search_lr.fit(X_train, y_train)

# Perform Grid Search for Random Forest
grid_search_rf.fit(X_train, y_train)
# Best Logistic Regression model
y_pred_lr_best = grid_search_lr.best_estimator_.predict(X_test)
print("Improved Classification Report for Logistic Regression:")
print(classification_report(y_test, y_pred_lr_best))

# Best Random Forest model
y_pred_rf_best = grid_search_rf.best_estimator_.predict(X_test)
print("Improved Classification Report for Random Forest:")
print(classification_report(y_test, y_pred_rf_best))

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Improved Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.92      0.66      0.77     25785
           1       0.17      0.55      0.26      3336

    accuracy                           0.65     29121
   macro avg       0.55      0.61      0.52     29121
weighted avg       0.83      0.65      0.71     29121

Improved Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.92      0.72      0.80     25785
           1       0.18      0.49      0.27      3336

    accuracy                           0.69     29121
   macro avg       0.55      0.60      0.53     29121
weighted avg       0.83      0.69      0.74     29121

