### Measures
 - Accuracy
 - Precision
 - Recall
 - F1 Score
 - AUC-ROC

In [27]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import classification_report,accuracy_score, precision_score, recall_score, f1_score, confusion_matrix 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
import re

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.filterwarnings("ignore")

In [28]:
df = pd.read_csv('../data/diabetes_data_encoded.csv').drop(columns=["Unnamed: 0"], axis=1)

In [29]:
df.columns = [re.sub(r'[^\w\s]', '', col) for col in df.columns]
df.columns = [re.sub(r'[_]', ' ', col) for col in df.columns]
df.dtypes.unique()

array([dtype('float64'), dtype('int64')], dtype=object)

In [30]:
print(df.shape)

(101763, 2249)


In [31]:
readmit = df['readmitted'].value_counts()

# Print the counts
print("Occurrences of Readmitted:")
print("Value 0:", readmit[0])
print("Value 1:", readmit[1])
print('%:',readmit[1]/readmit[0])

Occurrences of Readmitted:
Value 0: 90406
Value 1: 11357
%: 0.12562219321726434


In [32]:
X = df.drop('readmitted', axis=1)
y = df['readmitted'].values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

under_sampler = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = under_sampler.fit_resample(X_train, y_train)

In [34]:
pipeline = make_pipeline(SMOTE(random_state=42), RandomUnderSampler(random_state=42))
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

In [10]:
rfc= RandomForestClassifier(random_state=42)
rfc.fit(X_train_smote, y_train_smote)
rfc_predictions = rfc.predict(X_val) 
print(classification_report(y_val, rfc_predictions))
print(confusion_matrix(y_val, rfc_predictions))

              precision    recall  f1-score   support

         0.0       0.89      1.00      0.94     18084
         1.0       0.45      0.02      0.03      2269

    accuracy                           0.89     20353
   macro avg       0.67      0.51      0.49     20353
weighted avg       0.84      0.89      0.84     20353

[[18042    42]
 [ 2234    35]]


In [12]:
report = classification_report(y_val, rfc_predictions, output_dict=True)
conf_matrix = confusion_matrix(y_val, rfc_predictions)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'Smote-RFC',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

metrics_df = pd.DataFrame(data)
print(metrics_df)

  Sampling Technique  Precision    Recall  F1 Score  Accuracy  \
0          Smote-RFC   0.841295  0.888174  0.839126  0.888174   

            Confusion Matrix  
0  [[18042, 42], [2234, 35]]  


In [13]:
rfc= RandomForestClassifier(random_state=42)
rfc.fit(X_train_under, y_train_under)
rfc_predictions = rfc.predict(X_val) 
print(classification_report(y_val, rfc_predictions))
print(confusion_matrix(y_val, rfc_predictions))

              precision    recall  f1-score   support

         0.0       0.92      0.61      0.74     18084
         1.0       0.16      0.59      0.25      2269

    accuracy                           0.61     20353
   macro avg       0.54      0.60      0.50     20353
weighted avg       0.84      0.61      0.68     20353

[[11118  6966]
 [  930  1339]]


In [14]:
report = classification_report(y_val, rfc_predictions, output_dict=True)
conf_matrix = confusion_matrix(y_val, rfc_predictions)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'Undersampled-RFC',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

df = pd.DataFrame(data)
metrics_df = metrics_df.append(df, ignore_index=True)
print(metrics_df)

  Sampling Technique  Precision    Recall  F1 Score  Accuracy  \
0          Smote-RFC   0.841295  0.888174  0.839126  0.888174   
1   Undersampled-RFC   0.837906  0.612047  0.683919  0.612047   

               Confusion Matrix  
0     [[18042, 42], [2234, 35]]  
1  [[11118, 6966], [930, 1339]]  


In [15]:
rfc= RandomForestClassifier(random_state=42)
rfc.fit(X_train_resampled, y_train_resampled)
rfc_predictions = rfc.predict(X_val) 
print(classification_report(y_val, rfc_predictions))
print(confusion_matrix(y_val, rfc_predictions))

              precision    recall  f1-score   support

         0.0       0.89      1.00      0.94     18084
         1.0       0.42      0.01      0.03      2269

    accuracy                           0.89     20353
   macro avg       0.65      0.51      0.48     20353
weighted avg       0.84      0.89      0.84     20353

[[18038    46]
 [ 2236    33]]


In [16]:
report = classification_report(y_val, rfc_predictions, output_dict=True)
conf_matrix = confusion_matrix(y_val, rfc_predictions)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'Resampled-RFC',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

df = pd.DataFrame(data)
metrics_df = metrics_df.append(df, ignore_index=True)
print(metrics_df)

  Sampling Technique  Precision    Recall  F1 Score  Accuracy  \
0          Smote-RFC   0.841295  0.888174  0.839126  0.888174   
1   Undersampled-RFC   0.837906  0.612047  0.683919  0.612047   
2      Resampled-RFC   0.837092  0.887879  0.838791  0.887879   

               Confusion Matrix  
0     [[18042, 42], [2234, 35]]  
1  [[11118, 6966], [930, 1339]]  
2     [[18038, 46], [2236, 33]]  


In [17]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
X_train_under, y_train_under = under_sampler.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

In [18]:
gbt = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100,max_depth=3, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10)
gbt.fit(X_train_smote,y_train_smote)
gbt_predictions = gbt.predict(X_val) 
print(classification_report(y_val, gbt_predictions))
print(confusion_matrix(y_val, gbt_predictions))

              precision    recall  f1-score   support

         0.0       0.89      0.95      0.92     18084
         1.0       0.22      0.10      0.14      2269

    accuracy                           0.86     20353
   macro avg       0.56      0.53      0.53     20353
weighted avg       0.82      0.86      0.84     20353

[[17268   816]
 [ 2041   228]]


In [19]:
report = classification_report(y_val, gbt_predictions, output_dict=True)
conf_matrix = confusion_matrix(y_val, gbt_predictions)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'Smote-GBT',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

df = pd.DataFrame(data)
metrics_df = metrics_df.append(df, ignore_index=True)
print(metrics_df)

  Sampling Technique  Precision    Recall  F1 Score  Accuracy  \
0          Smote-RFC   0.841295  0.888174  0.839126  0.888174   
1   Undersampled-RFC   0.837906  0.612047  0.683919  0.612047   
2      Resampled-RFC   0.837092  0.887879  0.838791  0.887879   
3          Smote-GBT   0.818946  0.859628  0.835975  0.859628   

               Confusion Matrix  
0     [[18042, 42], [2234, 35]]  
1  [[11118, 6966], [930, 1339]]  
2     [[18038, 46], [2236, 33]]  
3   [[17268, 816], [2041, 228]]  


In [20]:
gbt = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100,max_depth=3, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10)
gbt.fit(X_train_under,y_train_under)
gbt_predictions = gbt.predict(X_val) 
print(classification_report(y_val, gbt_predictions))
print(confusion_matrix(y_val, gbt_predictions))

              precision    recall  f1-score   support

         0.0       0.92      0.62      0.74     18084
         1.0       0.17      0.60      0.26      2269

    accuracy                           0.62     20353
   macro avg       0.55      0.61      0.50     20353
weighted avg       0.84      0.62      0.69     20353

[[11274  6810]
 [  916  1353]]


In [21]:
report = classification_report(y_val, gbt_predictions, output_dict=True)
conf_matrix = confusion_matrix(y_val, gbt_predictions)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'Undersampled-GBT',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

df = pd.DataFrame(data)
metrics_df = metrics_df.append(df, ignore_index=True)
print(metrics_df)

  Sampling Technique  Precision    Recall  F1 Score  Accuracy  \
0          Smote-RFC   0.841295  0.888174  0.839126  0.888174   
1   Undersampled-RFC   0.837906  0.612047  0.683919  0.612047   
2      Resampled-RFC   0.837092  0.887879  0.838791  0.887879   
3          Smote-GBT   0.818946  0.859628  0.835975  0.859628   
4   Undersampled-GBT   0.840229  0.620400  0.690684  0.620400   

               Confusion Matrix  
0     [[18042, 42], [2234, 35]]  
1  [[11118, 6966], [930, 1339]]  
2     [[18038, 46], [2236, 33]]  
3   [[17268, 816], [2041, 228]]  
4  [[11274, 6810], [916, 1353]]  


In [22]:
gbt = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100,max_depth=3, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10)
gbt.fit(X_train_resampled,y_train_resampled)
gbt_predictions = gbt.predict(X_val) 

In [23]:
report = classification_report(y_val, gbt_predictions, output_dict=True)
conf_matrix = confusion_matrix(y_val, gbt_predictions)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'Resampled-GBT',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

df = pd.DataFrame(data)
metrics_df = metrics_df.append(df, ignore_index=True)
print(metrics_df)

  Sampling Technique  Precision    Recall  F1 Score  Accuracy  \
0          Smote-RFC   0.841295  0.888174  0.839126  0.888174   
1   Undersampled-RFC   0.837906  0.612047  0.683919  0.612047   
2      Resampled-RFC   0.837092  0.887879  0.838791  0.887879   
3          Smote-GBT   0.818946  0.859628  0.835975  0.859628   
4   Undersampled-GBT   0.840229  0.620400  0.690684  0.620400   
5      Resampled-GBT   0.818946  0.859628  0.835975  0.859628   

               Confusion Matrix  
0     [[18042, 42], [2234, 35]]  
1  [[11118, 6966], [930, 1339]]  
2     [[18038, 46], [2236, 33]]  
3   [[17268, 816], [2041, 228]]  
4  [[11274, 6810], [916, 1353]]  
5   [[17268, 816], [2041, 228]]  


In [24]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
X_train_under, y_train_under = under_sampler.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

In [35]:
neg_to_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()
params = {
    'objective': 'binary',
    'metric': 'mae',
    'min_child_weight': 60,  # Adjust this value and experiment
    'random_state': 42,
    'max_delta_step': 1,
    'verbose': -1,
    'max_depth': 10, 
}

train_data = lgb.Dataset(X_train_under, label=y_train_under)
val_data = lgb.Dataset(X_val, label=y_val)

num_round = 200
bst = lgb.train(params, train_data, num_round, valid_sets=[val_data], early_stopping_rounds=10)

y_pred_proba = bst.predict(X_val)

y_pred_class = [1 if pred > 0.5 else 0 for pred in y_pred_proba]

[1]	valid_0's l1: 0.496393
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l1: 0.493177
[3]	valid_0's l1: 0.490224
[4]	valid_0's l1: 0.4876
[5]	valid_0's l1: 0.485287
[6]	valid_0's l1: 0.483171
[7]	valid_0's l1: 0.481229
[8]	valid_0's l1: 0.479494
[9]	valid_0's l1: 0.478005
[10]	valid_0's l1: 0.476581
[11]	valid_0's l1: 0.475217
[12]	valid_0's l1: 0.473944
[13]	valid_0's l1: 0.472841
[14]	valid_0's l1: 0.47188
[15]	valid_0's l1: 0.470844
[16]	valid_0's l1: 0.470077
[17]	valid_0's l1: 0.46941
[18]	valid_0's l1: 0.468784
[19]	valid_0's l1: 0.468138
[20]	valid_0's l1: 0.467584
[21]	valid_0's l1: 0.467091
[22]	valid_0's l1: 0.466575
[23]	valid_0's l1: 0.466166
[24]	valid_0's l1: 0.465648
[25]	valid_0's l1: 0.465227
[26]	valid_0's l1: 0.464872
[27]	valid_0's l1: 0.464451
[28]	valid_0's l1: 0.464059
[29]	valid_0's l1: 0.463777
[30]	valid_0's l1: 0.463509
[31]	valid_0's l1: 0.4632
[32]	valid_0's l1: 0.463025
[33]	valid_0's l1: 0.462787
[34]	valid_0's l1: 0.462563
[3

In [36]:
print(classification_report(y_val, y_pred_class, zero_division=0))
print(confusion_matrix(y_val, y_pred_class))

              precision    recall  f1-score   support

         0.0       0.93      0.62      0.74     18084
         1.0       0.17      0.61      0.26      2269

    accuracy                           0.62     20353
   macro avg       0.55      0.61      0.50     20353
weighted avg       0.84      0.62      0.69     20353

[[11192  6892]
 [  892  1377]]


In [37]:
report = classification_report(y_val, gbt_predictions, output_dict=True)
conf_matrix = confusion_matrix(y_val, gbt_predictions)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'Undersampled-LGBM',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

df = pd.DataFrame(data)
metrics_df = metrics_df.append(df, ignore_index=True)
print(metrics_df)

  Sampling Technique  Precision    Recall  F1 Score  Accuracy  \
0          Smote-RFC   0.841295  0.888174  0.839126  0.888174   
1   Undersampled-RFC   0.837906  0.612047  0.683919  0.612047   
2      Resampled-RFC   0.837092  0.887879  0.838791  0.887879   
3          Smote-GBT   0.818946  0.859628  0.835975  0.859628   
4   Undersampled-GBT   0.840229  0.620400  0.690684  0.620400   
5      Resampled-GBT   0.818946  0.859628  0.835975  0.859628   
6  Undersampled-LGBM   0.818946  0.859628  0.835975  0.859628   

               Confusion Matrix  
0     [[18042, 42], [2234, 35]]  
1  [[11118, 6966], [930, 1339]]  
2     [[18038, 46], [2236, 33]]  
3   [[17268, 816], [2041, 228]]  
4  [[11274, 6810], [916, 1353]]  
5   [[17268, 816], [2041, 228]]  
6   [[17268, 816], [2041, 228]]  


In [38]:
train_data = lgb.Dataset(X_train_smote, label=y_train_smote)
val_data = lgb.Dataset(X_val, label=y_val)

num_round = 200
bst = lgb.train(params, train_data, num_round, valid_sets=[val_data], early_stopping_rounds=10)

y_pred_proba = bst.predict(X_val)

y_pred_class = [1 if pred > 0.5 else 0 for pred in y_pred_proba]

[1]	valid_0's l1: 0.481435
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l1: 0.463044
[3]	valid_0's l1: 0.445105
[4]	valid_0's l1: 0.427407
[5]	valid_0's l1: 0.410166
[6]	valid_0's l1: 0.394114
[7]	valid_0's l1: 0.378823
[8]	valid_0's l1: 0.364588
[9]	valid_0's l1: 0.351741
[10]	valid_0's l1: 0.339839
[11]	valid_0's l1: 0.328781
[12]	valid_0's l1: 0.319186
[13]	valid_0's l1: 0.310578
[14]	valid_0's l1: 0.302221
[15]	valid_0's l1: 0.295045
[16]	valid_0's l1: 0.287881
[17]	valid_0's l1: 0.281706
[18]	valid_0's l1: 0.275991
[19]	valid_0's l1: 0.270784
[20]	valid_0's l1: 0.265946
[21]	valid_0's l1: 0.260717
[22]	valid_0's l1: 0.256245
[23]	valid_0's l1: 0.25258
[24]	valid_0's l1: 0.24854
[25]	valid_0's l1: 0.244921
[26]	valid_0's l1: 0.241744
[27]	valid_0's l1: 0.23849
[28]	valid_0's l1: 0.235718
[29]	valid_0's l1: 0.23287
[30]	valid_0's l1: 0.230411
[31]	valid_0's l1: 0.228056
[32]	valid_0's l1: 0.226507
[33]	valid_0's l1: 0.224361
[34]	valid_0's l1: 0.222545


In [39]:
print(classification_report(y_val, y_pred_class, zero_division=0))
print(confusion_matrix(y_val, y_pred_class))

              precision    recall  f1-score   support

         0.0       0.89      1.00      0.94     18084
         1.0       0.47      0.02      0.04      2269

    accuracy                           0.89     20353
   macro avg       0.68      0.51      0.49     20353
weighted avg       0.84      0.89      0.84     20353

[[18032    52]
 [ 2223    46]]


In [40]:
report = classification_report(y_val, y_pred_class, output_dict=True)
conf_matrix = confusion_matrix(y_val, y_pred_class)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'Smote-LGBM',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

df = pd.DataFrame(data)
metrics_df = metrics_df.append(df, ignore_index=True)
print(metrics_df)

  Sampling Technique  Precision    Recall  F1 Score  Accuracy  \
0          Smote-RFC   0.841295  0.888174  0.839126  0.888174   
1   Undersampled-RFC   0.837906  0.612047  0.683919  0.612047   
2      Resampled-RFC   0.837092  0.887879  0.838791  0.887879   
3          Smote-GBT   0.818946  0.859628  0.835975  0.859628   
4   Undersampled-GBT   0.840229  0.620400  0.690684  0.620400   
5      Resampled-GBT   0.818946  0.859628  0.835975  0.859628   
6  Undersampled-LGBM   0.818946  0.859628  0.835975  0.859628   
7         Smote-LGBM   0.843331  0.888223  0.840127  0.888223   

               Confusion Matrix  
0     [[18042, 42], [2234, 35]]  
1  [[11118, 6966], [930, 1339]]  
2     [[18038, 46], [2236, 33]]  
3   [[17268, 816], [2041, 228]]  
4  [[11274, 6810], [916, 1353]]  
5   [[17268, 816], [2041, 228]]  
6   [[17268, 816], [2041, 228]]  
7     [[18032, 52], [2223, 46]]  


In [41]:
neg_to_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()
params = {
    'objective': 'binary',
    'metric': 'mae',
    'min_child_weight': 60,  # Adjust this value and experiment
    'random_state': 42,
    'max_delta_step': 1,
    'verbose': -1,
    'max_depth': 10, 
}

train_data = lgb.Dataset(X_train_resampled, label=y_train_resampled)
val_data = lgb.Dataset(X_val, label=y_val)

num_round = 200
bst = lgb.train(params, train_data, num_round, valid_sets=[val_data], early_stopping_rounds=10)

y_pred_proba = bst.predict(X_val)

y_pred_class = [1 if pred > 0.5 else 0 for pred in y_pred_proba]

[1]	valid_0's l1: 0.481435
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l1: 0.463044
[3]	valid_0's l1: 0.445105
[4]	valid_0's l1: 0.427407
[5]	valid_0's l1: 0.410166
[6]	valid_0's l1: 0.394114
[7]	valid_0's l1: 0.378823
[8]	valid_0's l1: 0.364588
[9]	valid_0's l1: 0.351741
[10]	valid_0's l1: 0.339839
[11]	valid_0's l1: 0.328781
[12]	valid_0's l1: 0.319186
[13]	valid_0's l1: 0.310578
[14]	valid_0's l1: 0.302221
[15]	valid_0's l1: 0.295045
[16]	valid_0's l1: 0.287881
[17]	valid_0's l1: 0.281706
[18]	valid_0's l1: 0.275991
[19]	valid_0's l1: 0.270784
[20]	valid_0's l1: 0.265946
[21]	valid_0's l1: 0.260717
[22]	valid_0's l1: 0.256245
[23]	valid_0's l1: 0.25258
[24]	valid_0's l1: 0.24854
[25]	valid_0's l1: 0.244921
[26]	valid_0's l1: 0.241744
[27]	valid_0's l1: 0.23849
[28]	valid_0's l1: 0.235718
[29]	valid_0's l1: 0.23287
[30]	valid_0's l1: 0.230411
[31]	valid_0's l1: 0.228056
[32]	valid_0's l1: 0.226507
[33]	valid_0's l1: 0.224361
[34]	valid_0's l1: 0.222545


In [42]:
print(classification_report(y_val, y_pred_class, zero_division=0))
print(confusion_matrix(y_val, y_pred_class))

              precision    recall  f1-score   support

         0.0       0.89      1.00      0.94     18084
         1.0       0.47      0.02      0.04      2269

    accuracy                           0.89     20353
   macro avg       0.68      0.51      0.49     20353
weighted avg       0.84      0.89      0.84     20353

[[18032    52]
 [ 2223    46]]


In [43]:
report = classification_report(y_val, y_pred_class, output_dict=True)
conf_matrix = confusion_matrix(y_val, y_pred_class)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'Resampled-LGBM',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

df = pd.DataFrame(data)
metrics_df = metrics_df.append(df, ignore_index=True)
metrics_df.to_csv('../data/samplingResults.csv', header=1, index=0)

In [None]:
metrics_df