### Measures
 - Accuracy
 - Precision
 - Recall
 - F1 Score
 - AUC-ROC

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import classification_report,accuracy_score, precision_score, recall_score, f1_score, confusion_matrix 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
import re

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.filterwarnings("ignore")

In C:\Users\Mirna Elizondo\anaconda3\envs\condaEnv\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The text.latex.unicode rcparam was deprecated in Matplotlib 3.0 and will be removed in 3.2.
In C:\Users\Mirna Elizondo\anaconda3\envs\condaEnv\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The savefig.frameon rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.
In C:\Users\Mirna Elizondo\anaconda3\envs\condaEnv\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The pgf.debug rcparam was deprecated in Matplotlib 3.0 and will be removed in 3.2.
In C:\Users\Mirna Elizondo\anaconda3\envs\condaEnv\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The verbose.level rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.
In C:\Users\Mirna Elizondo\anaconda3\envs\condaEnv\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The verbose.fileo rcparam was depr

In [2]:
df = pd.read_csv('../data/diabetes_data_encoded.csv').drop(columns=["Unnamed: 0"], axis=1)

In [3]:
df.columns = [re.sub(r'[^\w\s]', '', col) for col in df.columns]
df.columns = [re.sub(r'[_]', ' ', col) for col in df.columns]
df.dtypes.unique()

array([dtype('int64')], dtype=object)

In [4]:
print(df.shape)

(97916, 1446)


In [5]:
readmit = df['readmitted'].value_counts()

# Print the counts
print("Occurrences of Readmitted:")
print("Value 0:", readmit[0])
print("Value 1:", readmit[1])
print('%:',readmit[1]/readmit[0])

Occurrences of Readmitted:
Value 0: 86950
Value 1: 10966
%: 0.12611845888441633


In [6]:
X = df.drop('readmitted', axis=1)
y = df['readmitted'].values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

under_sampler = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = under_sampler.fit_resample(X_train, y_train)

In [8]:
pipeline = make_pipeline(SMOTE(random_state=42), RandomUnderSampler(random_state=42))
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

In [None]:
rfc= RandomForestClassifier(random_state=42)
rfc.fit(X_train_smote, y_train_smote)
rfc_predictions = rfc.predict(X_val) 
print(classification_report(y_val, rfc_predictions))
print(confusion_matrix(y_val, rfc_predictions))

In [None]:
report = classification_report(y_val, rfc_predictions, output_dict=True)
conf_matrix = confusion_matrix(y_val, rfc_predictions)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'Smote-RFC',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

metrics_df = pd.DataFrame(data)
print(metrics_df)

In [None]:
rfc= RandomForestClassifier(random_state=42)
rfc.fit(X_train_under, y_train_under)
rfc_predictions = rfc.predict(X_val) 
print(classification_report(y_val, rfc_predictions))
print(confusion_matrix(y_val, rfc_predictions))

In [None]:
report = classification_report(y_val, rfc_predictions, output_dict=True)
conf_matrix = confusion_matrix(y_val, rfc_predictions)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'Undersampled-RFC',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

df = pd.DataFrame(data)
metrics_df = metrics_df.append(df, ignore_index=True)
print(metrics_df)

In [None]:
rfc= RandomForestClassifier(random_state=42)
rfc.fit(X_train_resampled, y_train_resampled)
rfc_predictions = rfc.predict(X_val) 
print(classification_report(y_val, rfc_predictions))
print(confusion_matrix(y_val, rfc_predictions))

In [None]:
report = classification_report(y_val, rfc_predictions, output_dict=True)
conf_matrix = confusion_matrix(y_val, rfc_predictions)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'Resampled-RFC',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

df = pd.DataFrame(data)
metrics_df = metrics_df.append(df, ignore_index=True)
print(metrics_df)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
X_train_under, y_train_under = under_sampler.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

In [None]:
gbt = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100,max_depth=3, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10)
gbt.fit(X_train_smote,y_train_smote)
gbt_predictions = gbt.predict(X_val) 
print(classification_report(y_val, gbt_predictions))
print(confusion_matrix(y_val, gbt_predictions))

In [None]:
report = classification_report(y_val, gbt_predictions, output_dict=True)
conf_matrix = confusion_matrix(y_val, gbt_predictions)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'Smote-GBT',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

df = pd.DataFrame(data)
metrics_df = metrics_df.append(df, ignore_index=True)
print(metrics_df)

In [None]:
gbt = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100,max_depth=3, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10)
gbt.fit(X_train_under,y_train_under)
gbt_predictions = gbt.predict(X_val) 
print(classification_report(y_val, gbt_predictions))
print(confusion_matrix(y_val, gbt_predictions))

In [None]:
report = classification_report(y_val, gbt_predictions, output_dict=True)
conf_matrix = confusion_matrix(y_val, gbt_predictions)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'Undersampled-GBT',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

df = pd.DataFrame(data)
metrics_df = metrics_df.append(df, ignore_index=True)
print(metrics_df)

In [None]:
gbt = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100,max_depth=3, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10)
gbt.fit(X_train_resampled,y_train_resampled)
gbt_predictions = gbt.predict(X_val) 

In [None]:
report = classification_report(y_val, gbt_predictions, output_dict=True)
conf_matrix = confusion_matrix(y_val, gbt_predictions)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'Resampled-GBT',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

df = pd.DataFrame(data)
metrics_df = metrics_df.append(df, ignore_index=True)
print(metrics_df)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
X_train_under, y_train_under = under_sampler.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

In [None]:
neg_to_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()
params = {
    'objective': 'binary',
    'metric': 'mae',
    'min_child_weight': 60,  # Adjust this value and experiment
    'random_state': 42,
    'max_delta_step': 1,
    'verbose': -1,
    'max_depth': 10, 
}

train_data = lgb.Dataset(X_train_under, label=y_train_under)
val_data = lgb.Dataset(X_val, label=y_val)

num_round = 200
bst = lgb.train(params, train_data, num_round, valid_sets=[val_data], early_stopping_rounds=10)

y_pred_proba = bst.predict(X_val)

y_pred_class = [1 if pred > 0.5 else 0 for pred in y_pred_proba]

In [None]:
print(classification_report(y_val, y_pred_class, zero_division=0))
print(confusion_matrix(y_val, y_pred_class))

In [None]:
report = classification_report(y_val, gbt_predictions, output_dict=True)
conf_matrix = confusion_matrix(y_val, gbt_predictions)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'Undersampled-LGBM',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

df = pd.DataFrame(data)
metrics_df = metrics_df.append(df, ignore_index=True)
print(metrics_df)

In [None]:
train_data = lgb.Dataset(X_train_smote, label=y_train_smote)
val_data = lgb.Dataset(X_val, label=y_val)

num_round = 200
bst = lgb.train(params, train_data, num_round, valid_sets=[val_data], early_stopping_rounds=10)

y_pred_proba = bst.predict(X_val)

y_pred_class = [1 if pred > 0.5 else 0 for pred in y_pred_proba]

In [None]:
print(classification_report(y_val, y_pred_class, zero_division=0))
print(confusion_matrix(y_val, y_pred_class))

In [None]:
report = classification_report(y_val, y_pred_class, output_dict=True)
conf_matrix = confusion_matrix(y_val, y_pred_class)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'Smote-LGBM',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

df = pd.DataFrame(data)
metrics_df = metrics_df.append(df, ignore_index=True)
print(metrics_df)

In [None]:
neg_to_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()
params = {
    'objective': 'binary',
    'metric': 'mae',
    'min_child_weight': 60,  # Adjust this value and experiment
    'random_state': 42,
    'max_delta_step': 1,
    'verbose': -1,
    'max_depth': 10, 
}

train_data = lgb.Dataset(X_train_resampled, label=y_train_resampled)
val_data = lgb.Dataset(X_val, label=y_val)

num_round = 200
bst = lgb.train(params, train_data, num_round, valid_sets=[val_data], early_stopping_rounds=10)

y_pred_proba = bst.predict(X_val)

y_pred_class = [1 if pred > 0.5 else 0 for pred in y_pred_proba]

In [None]:
print(classification_report(y_val, y_pred_class, zero_division=0))
print(confusion_matrix(y_val, y_pred_class))

In [None]:
report = classification_report(y_val, y_pred_class, output_dict=True)
conf_matrix = confusion_matrix(y_val, y_pred_class)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'Resampled-LGBM',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

df = pd.DataFrame(data)
metrics_df = metrics_df.append(df, ignore_index=True)
metrics_df.to_csv('../data/samplingResults.csv', header=1, index=0)

In [None]:
metrics_df