In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import classification_report,accuracy_score, precision_score, recall_score, f1_score, confusion_matrix 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler 
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

import re
import warnings
warnings.filterwarnings("ignore")

In [20]:
# Load the encoded diabetes dataset and discard the "Unnamed: 0" column
# (presumably a saved index column -- confirm against the export step).
df = pd.read_csv('../data/diabetes_data_encoded.csv')
df = df.drop(columns=["Unnamed: 0"])

# Clean the column names in a single pass: first strip every character that
# is neither a word character nor whitespace, then turn underscores into spaces.
df.columns = [
    re.sub(r'[_]', ' ', re.sub(r'[^\w\s]', '', col))
    for col in df.columns
]

# Cell output: the distinct dtypes present in the frame.
df.dtypes.unique()

array([dtype('float64'), dtype('int64')], dtype=object)

In [14]:
# Frequency of each class in the target column.
readmit = df['readmitted'].value_counts()
n_negative, n_positive = readmit[0], readmit[1]

# Print the counts
print("Occurrences of Readmitted:")
print("Value 0:", n_negative)
print("Value 1:", n_positive)
# NOTE: despite the '%:' label, this is the positive-to-negative ratio,
# not the share of positives among all rows.
print('%:', n_positive / n_negative)

# Separate the target from the features.
y = df['readmitted'].values
X = df.drop(columns=['readmitted'])

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


Occurrences of Readmitted:
Value 0: 90406
Value 1: 11357
%: 0.12562219321726434


In [15]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied to the validation split, so no validation statistics leak
# into the transform.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_val)  # "test" here is the validation split

# Project onto 125 principal components, again fitted on the training split only.
# random_state is pinned because PCA's default solver selection can fall back
# to a randomized SVD, which would otherwise make the projection (and every
# downstream metric) vary between runs.
pca = PCA(n_components=125, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [16]:
# Random forest trained on the PCA-reduced training features and evaluated
# on the PCA-reduced validation features.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train_pca, y_train)
rfc_predictions = rfc.predict(X_test_pca)

# Human-readable report, plus the dict form and confusion matrix (each
# computed once and reused below).
print(classification_report(y_val, rfc_predictions))
report = classification_report(y_val, rfc_predictions, output_dict=True)
conf_matrix = confusion_matrix(y_val, rfc_predictions)
print(conf_matrix)

# Extract metrics from classification report.
# The F1 value is stored as `weighted_f1` rather than `f1_score` so that the
# `f1_score` function imported from sklearn.metrics is not overwritten.
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
weighted_f1 = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics (one row per model; this cell
# starts the table, later cells add to it).
data = {
    'Sampling Technique': 'RFC',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': weighted_f1,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix],  # wrapped in a list so it occupies a single cell
}

metrics_df = pd.DataFrame(data)
print(metrics_df)


              precision    recall  f1-score   support

         0.0       0.89      1.00      0.94     18084
         1.0       0.40      0.02      0.03      2269

    accuracy                           0.89     20353
   macro avg       0.65      0.51      0.49     20353
weighted avg       0.84      0.89      0.84     20353

[[18029    55]
 [ 2232    37]]
  Sampling Technique  Precision    Recall  F1 Score  Accuracy  \
0                RFC   0.835472  0.887633  0.839018  0.887633   

            Confusion Matrix  
0  [[18029, 55], [2232, 37]]  


In [17]:
# Rebuild the train/validation split and the scaled + PCA-reduced features.
# The split uses the same test_size and seed as before, so the rows are the same.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaler is fitted on the training split only, then applied to validation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_val)  # "test" here is the validation split

# Project onto 125 principal components.
# random_state is pinned because this cell refits PCA: with an unseeded
# randomized solver each refit could yield a slightly different projection,
# so the models would not be compared on identical features.
pca = PCA(n_components=125, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [18]:
# Gradient-boosted trees trained on the PCA-reduced training features and
# evaluated on the PCA-reduced validation features.
gbt = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1,
    max_features='sqrt',
    random_state=10,
)
gbt.fit(X_train_pca, y_train)
gbt_predictions = gbt.predict(X_test_pca)

# Human-readable report, plus the dict form and confusion matrix (each
# computed once and reused below).
print(classification_report(y_val, gbt_predictions))
report = classification_report(y_val, gbt_predictions, output_dict=True)
conf_matrix = confusion_matrix(y_val, gbt_predictions)
print(conf_matrix)

# Extract metrics from classification report.
# The F1 value is stored as `weighted_f1` rather than `f1_score` so that the
# `f1_score` function imported from sklearn.metrics is not overwritten.
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
weighted_f1 = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique': 'GBT',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': weighted_f1,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix],  # wrapped in a list so it occupies a single cell
}

# Add the GBT row to the running results table. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# NOTE: re-running this cell adds another GBT row to metrics_df.
gbt_row = pd.DataFrame(data)
metrics_df = pd.concat([metrics_df, gbt_row], ignore_index=True)
print(metrics_df)


              precision    recall  f1-score   support

         0.0       0.89      1.00      0.94     18084
         1.0       0.00      0.00      0.00      2269

    accuracy                           0.89     20353
   macro avg       0.44      0.50      0.47     20353
weighted avg       0.79      0.89      0.84     20353

[[18082     2]
 [ 2269     0]]
  Sampling Technique  Precision    Recall  F1 Score  Accuracy  \
0                RFC   0.835472  0.887633  0.839018  0.887633   
1                GBT   0.789454  0.888419  0.836018  0.888419   

            Confusion Matrix  
0  [[18029, 55], [2232, 37]]  
1    [[18082, 2], [2269, 0]]  


In [21]:
# Rebuild the train/validation split and the scaled + PCA-reduced features.
# The split uses the same test_size and seed as before, so the rows are the same.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaler is fitted on the training split only, then applied to validation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_val)  # "test" here is the validation split

# Project onto 125 principal components.
# random_state is pinned because this cell refits PCA: with an unseeded
# randomized solver each refit could yield a slightly different projection,
# so the models would not be compared on identical features.
pca = PCA(n_components=125, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [22]:
# Negative-to-positive class ratio of the training split.
# NOTE(review): this value is not passed to the model in this cell; it is
# kept in case later cells use it. Confirm whether it was meant to be
# supplied to LightGBM (e.g. as a class-weighting parameter).
neg_to_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()
params = {
    'objective': 'binary',
    'metric': 'mae',  # validation metric that is logged and drives early stopping
    'min_child_weight': 60,  # Adjust this value and experiment
    'random_state': 42,
    'max_delta_step': 1,
    'verbose': -1,
    'max_depth': 10,
}

train_data = lgb.Dataset(X_train_pca, label=y_train)
val_data = lgb.Dataset(X_test_pca, label=y_val)

# Train for at most `num_round` rounds, stopping once the validation metric
# has not improved for 10 consecutive rounds. Early stopping is requested via
# the callback because the `early_stopping_rounds=` keyword of lgb.train was
# removed in LightGBM 4.0; the callback form works on older releases too.
# NOTE: the same validation split is used for early stopping and for the
# metrics reported below.
num_round = 200
bst = lgb.train(
    params,
    train_data,
    num_round,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

# Predicted probability of the positive class, thresholded at 0.5.
y_pred_proba = bst.predict(X_test_pca)
y_pred_class = [1 if pred > 0.5 else 0 for pred in y_pred_proba]

# Human-readable report, plus the dict form and confusion matrix.
# All of them are computed from the LightGBM predictions (`y_pred_class`);
# previously the stored metrics were computed from `gbt_predictions`, so the
# LGBM row duplicated the GBT results.
print(classification_report(y_val, y_pred_class, zero_division=0))
report = classification_report(y_val, y_pred_class, output_dict=True, zero_division=0)
conf_matrix = confusion_matrix(y_val, y_pred_class)
print(conf_matrix)

# Extract metrics from classification report.
# The F1 value is stored as `weighted_f1` rather than `f1_score` so that the
# `f1_score` function imported from sklearn.metrics is not overwritten.
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
weighted_f1 = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique': 'LGBM',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': weighted_f1,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix],  # wrapped in a list so it occupies a single cell
}

# Add the LGBM row to the running results table. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# NOTE: re-running this cell adds another LGBM row to metrics_df.
lgbm_row = pd.DataFrame(data)
metrics_df = pd.concat([metrics_df, lgbm_row], ignore_index=True)
print(metrics_df)

# Persist the comparison table (with a header row).
metrics_df.to_csv('../data/pcaResults.csv', header=True)

[1]	valid_0's l1: 0.19804
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l1: 0.19787
[3]	valid_0's l1: 0.197673
[4]	valid_0's l1: 0.197465
[5]	valid_0's l1: 0.19733
[6]	valid_0's l1: 0.19716
[7]	valid_0's l1: 0.196994
[8]	valid_0's l1: 0.196885
[9]	valid_0's l1: 0.19675
[10]	valid_0's l1: 0.196671
[11]	valid_0's l1: 0.19659
[12]	valid_0's l1: 0.19648
[13]	valid_0's l1: 0.196354
[14]	valid_0's l1: 0.196287
[15]	valid_0's l1: 0.19621
[16]	valid_0's l1: 0.196104
[17]	valid_0's l1: 0.195995
[18]	valid_0's l1: 0.195884
[19]	valid_0's l1: 0.1958
[20]	valid_0's l1: 0.195803
[21]	valid_0's l1: 0.195702
[22]	valid_0's l1: 0.195629
[23]	valid_0's l1: 0.195557
[24]	valid_0's l1: 0.195533
[25]	valid_0's l1: 0.19543
[26]	valid_0's l1: 0.195305
[27]	valid_0's l1: 0.195269
[28]	valid_0's l1: 0.195231
[29]	valid_0's l1: 0.195186
[30]	valid_0's l1: 0.195124
[31]	valid_0's l1: 0.19508
[32]	valid_0's l1: 0.195047
[33]	valid_0's l1: 0.195041
[34]	valid_0's l1: 0.194992
[35]	val