In [1]:
## read txt file
from google.colab import drive
drive.mount('/content/drive/')
data_path = "/content/drive/MyDrive/Colab Notebooks/"  # this is your drive

Mounted at /content/drive/


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

df = pd.read_csv(data_path + "data1_p.csv")
df.head()

Unnamed: 0,Outcome,Glucose,BMI,Age,Pregnancies,DiabetesPedigreeFunction,BloodPressure,Insulin
0,1,0.92404,0.210285,1.47922,0.657355,0.606516,-0.028115,-0.805266
1,0,-1.177082,-0.848063,-0.183265,-0.86849,-0.36422,-0.515765,-0.805266
2,1,2.09133,-1.346999,-0.095766,1.267694,0.764788,-0.678315,-0.805266
3,0,-1.043678,-0.621274,-1.058257,-0.86849,-1.011378,-0.515765,0.238698
4,0,-0.143197,-0.999256,-0.270764,0.352186,-0.891795,0.134435,-0.805266


In [3]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import numpy as np
from imblearn.combine import SMOTETomek

In [4]:
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

# Use SMOTE to oversample minority classes
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# View the balanced distribution of categories
print(y_res.value_counts())

Outcome
1    461
0    461
Name: count, dtype: int64


In [5]:
from sklearn.model_selection import train_test_split

X_train_res, X_test, y_train_res, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42, stratify=y_res)

print(y_train_res.value_counts())
print(y_test.value_counts())
print(X_train_res)

Outcome
1    369
0    368
Name: count, dtype: int64
Outcome
0    93
1    92
Name: count, dtype: int64
      Glucose       BMI       Age  Pregnancies  DiabetesPedigreeFunction  \
850 -0.259677  0.293554 -0.346273    -0.736816                  1.100709   
536  0.123613 -0.666632 -0.270764    -0.868490                 -1.247028   
171  1.257552  1.087203 -0.358263     0.352186                 -0.209465   
353  0.357071  0.316120 -0.970758    -1.173659                 -0.909380   
521 -0.676815  0.981368  0.779226     1.267694                 -0.930483   
..        ...       ...       ...          ...                       ...   
712  0.799966  0.060346  1.041724     1.479573                  0.669973   
901  1.241552  0.652266  2.179213     1.894299                  0.153903   
199 -1.110380  0.361478 -0.970758    -0.868490                 -1.243511   
379  2.124681  0.724340 -0.183265     0.047017                 -0.670213   
377 -0.176548 -0.213054 -1.058257    -0.563321                

In [6]:
from sklearn.metrics import (
    accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    RocCurveDisplay
)
import matplotlib.pyplot as plt

def evaluate_model(model, X, y):
  # Evaluate the model and return the indicator dictionary
  y_pred = model.predict(X)
  y_proba = model.predict_proba(X)[:, 1] if hasattr(model, "predict_proba") else model.decision_function(X)

  return {
    "Accuracy": accuracy_score(y, y_pred),
    "Precision": precision_score(y, y_pred),
    "Recall": recall_score(y, y_pred),
    "F1": f1_score(y, y_pred),
    "AUC-ROC": roc_auc_score(y, y_proba)
  }

In [7]:
# Model Selection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,StackingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier, log_evaluation
from lightgbm.basic import LightGBMError
from sklearn.metrics import roc_auc_score, make_scorer


In [8]:
lr = LogisticRegression(
    max_iter = 100,
    C = 0.01,
    solver = "lbfgs",
    class_weight = "balanced",
    random_state = 42
)
lr.fit(X_train_res, y_train_res)

train_metrics = evaluate_model(lr, X_train_res, y_train_res)
test_metrics = evaluate_model(lr, X_test, y_test)

print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

Train Metrics: {'Accuracy': 0.7313432835820896, 'Precision': 0.750733137829912, 'Recall': 0.6937669376693767, 'F1': 0.7211267605633803, 'AUC-ROC': np.float64(0.8372363614940498)}
Test Metrics: {'Accuracy': 0.7675675675675676, 'Precision': 0.7951807228915663, 'Recall': 0.717391304347826, 'F1': 0.7542857142857143, 'AUC-ROC': np.float64(0.8818373071528751)}


In [9]:
#RandomForest
rf = RandomForestClassifier(
    n_estimators = 50,
    max_depth = 5,
    min_samples_leaf = 10,
    min_samples_split = 10,
    class_weight = "balanced",
    random_state = 42
)
rf.fit(X_train_res, y_train_res)

train_metrics = evaluate_model(rf, X_train_res, y_train_res)
test_metrics = evaluate_model(rf, X_test, y_test)

print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

Train Metrics: {'Accuracy': 0.8222523744911805, 'Precision': 0.8098958333333334, 'Recall': 0.8428184281842819, 'F1': 0.8260292164674635, 'AUC-ROC': np.float64(0.9113202545068929)}
Test Metrics: {'Accuracy': 0.8378378378378378, 'Precision': 0.8229166666666666, 'Recall': 0.8586956521739131, 'F1': 0.8404255319148937, 'AUC-ROC': np.float64(0.9036933146330062)}


In [10]:
# Hyperparameter tuning (RandomForest)


# Define parameter network
param_grid_rf = {
    "n_estimators" : [20, 50],
    "max_depth" : [5, 10],
    "min_samples_split" : [5, 10],
    "min_samples_leaf" : [5, 10]
}


rf_v = RandomForestClassifier(class_weight = "balanced", random_state = 42)

# Grid Search
grid_search_rf = GridSearchCV(
    estimator = rf_v,
    param_grid = param_grid_rf,
    cv = 10,
    scoring = "accuracy"
)
grid_search_rf.fit(X_train_res, y_train_res)

# Output optimal parameters and scores
print("Best Parameters(lr):",grid_search_rf.best_params_)

rf = grid_search_rf.best_estimator_

train_metrics = evaluate_model(rf, X_train_res, y_train_res)
test_metrics = evaluate_model(rf, X_test, y_test)

print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

Best Parameters(lr): {'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 50}
Train Metrics: {'Accuracy': 0.9090909090909091, 'Precision': 0.9037433155080213, 'Recall': 0.9159891598915989, 'F1': 0.9098250336473755, 'AUC-ROC': np.float64(0.9742252857311182)}
Test Metrics: {'Accuracy': 0.8540540540540541, 'Precision': 0.8350515463917526, 'Recall': 0.8804347826086957, 'F1': 0.8571428571428571, 'AUC-ROC': np.float64(0.9219261337073399)}


In [11]:
#XGBoost
xgb = XGBClassifier(
    colsample_bytree = 0.9,
    min_child_weight = 5,
    learning_rate = 0.1,
    max_depth = 15,
    n_estimators = 25,
    subsample = 0.8,
    eval_metric = "logloss",
)
xgb.fit(X_train_res, y_train_res,)

train_metrics = evaluate_model(xgb, X_train_res, y_train_res)
test_metrics = evaluate_model(xgb, X_test, y_test)

print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

Train Metrics: {'Accuracy': 0.8819538670284939, 'Precision': 0.8671875, 'Recall': 0.9024390243902439, 'F1': 0.8844621513944223, 'AUC-ROC': np.float64(0.9547396017438435)}
Test Metrics: {'Accuracy': 0.827027027027027, 'Precision': 0.7941176470588235, 'Recall': 0.8804347826086957, 'F1': 0.8350515463917526, 'AUC-ROC': np.float64(0.9079008882655446)}


In [12]:
#LightGBM
lgb = LGBMClassifier(
    colsample_bytree=1.0,
    min_child_samples=15,
    reg_alpha=0.2,
    reg_lambda=0.3,
    subsample=0.7,
    num_leaves=18,
    learning_rate=0.05,
    n_estimators=500
)
lgb.fit(X_train_res, y_train_res)

train_metrics = evaluate_model(lgb, X_train_res, y_train_res)
test_metrics = evaluate_model(lgb, X_test, y_test)

print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

[LightGBM] [Info] Number of positive: 369, number of negative: 368
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000476 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1010
[LightGBM] [Info] Number of data points in the train set: 737, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500678 -> initscore=0.002714
[LightGBM] [Info] Start training from score 0.002714
Train Metrics: {'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1': 1.0, 'AUC-ROC': np.float64(1.0)}
Test Metrics: {'Accuracy': 0.8540540540540541, 'Precision': 0.8421052631578947, 'Recall': 0.8695652173913043, 'F1': 0.8556149732620321, 'AUC-ROC': np.float64(0.9125759700794763)}


In [13]:
import joblib
from google.colab import files

joblib.dump(rf, "rf_data1.pkl")
files.download("rf_data1.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
joblib.dump(lgb, "lgb_data1.pkl")
files.download("lgb_data1.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [14]:
# Define a meta model (using LogisticRegression)
meta_model = LogisticRegression(random_state=42)

# Define Stacking Ensemble Model
rflgb_st = StackingClassifier(
    estimators=[('rf', rf), ('lgb', lgb)],
    final_estimator=meta_model,
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1
)

rflgb_st.fit(X_train_res, y_train_res)

train_metrics = evaluate_model(rflgb_st, X_train_res, y_train_res)
test_metrics = evaluate_model(rflgb_st, X_test, y_test)

print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

Train Metrics: {'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1': 1.0, 'AUC-ROC': np.float64(1.0)}
Test Metrics: {'Accuracy': 0.8216216216216217, 'Precision': 0.8172043010752689, 'Recall': 0.8260869565217391, 'F1': 0.8216216216216217, 'AUC-ROC': np.float64(0.9326788218793829)}


In [15]:
lrxgb_st = StackingClassifier(
    estimators=[('lr', lr), ('xgb', xgb)],
    final_estimator=meta_model,
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1
)

lrxgb_st.fit(X_train_res, y_train_res)

train_metrics = evaluate_model(lrxgb_st, X_train_res, y_train_res)
test_metrics = evaluate_model(lrxgb_st, X_test, y_test)

print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

Train Metrics: {'Accuracy': 0.8616010854816825, 'Precision': 0.8467532467532467, 'Recall': 0.8834688346883469, 'F1': 0.8647214854111406, 'AUC-ROC': np.float64(0.9365868386944738)}
Test Metrics: {'Accuracy': 0.8378378378378378, 'Precision': 0.81, 'Recall': 0.8804347826086957, 'F1': 0.84375, 'AUC-ROC': np.float64(0.9080177653108928)}


In [16]:
# Define meta model (using RandomForest)
meta_model_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=5,
    min_samples_split=10,
    random_state=42
)

rflgb_st_rf = StackingClassifier(
    estimators=[('rf', rf), ('lgb', lgb)],
    final_estimator=meta_model_rf,
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1
)

rflgb_st_rf.fit(X_train_res, y_train_res)

train_metrics = evaluate_model(rflgb_st_rf, X_train_res, y_train_res)
test_metrics = evaluate_model(rflgb_st_rf, X_test, y_test)

print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

Train Metrics: {'Accuracy': 0.9823609226594301, 'Precision': 0.9659685863874345, 'Recall': 1.0, 'F1': 0.9826897470039947, 'AUC-ROC': np.float64(1.0)}
Test Metrics: {'Accuracy': 0.8108108108108109, 'Precision': 0.7938144329896907, 'Recall': 0.8369565217391305, 'F1': 0.8148148148148148, 'AUC-ROC': np.float64(0.9025245441795232)}


In [None]:
joblib.dump(rflgb_st,"Stacking_lr_rflgb_data1.pkl")
files.download("Stacking_lr_rflgb_data1.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
joblib.dump(rflgb_st_rf,"Stacking_rf_rflgb_data1.pkl")
files.download("Stacking_rf_rflgb_data1.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [17]:
lrxgb_st_rf = StackingClassifier(
    estimators=[('lr', lr), ('xgb', xgb)],
    final_estimator=meta_model_rf,
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1
)

lrxgb_st_rf.fit(X_train_res, y_train_res)

train_metrics = evaluate_model(lrxgb_st_rf, X_train_res, y_train_res)
test_metrics = evaluate_model(lrxgb_st_rf, X_test, y_test)

print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)


Train Metrics: {'Accuracy': 0.8629579375848032, 'Precision': 0.8471502590673575, 'Recall': 0.8861788617886179, 'F1': 0.866225165562914, 'AUC-ROC': np.float64(0.938019176387416)}
Test Metrics: {'Accuracy': 0.8108108108108109, 'Precision': 0.7878787878787878, 'Recall': 0.8478260869565217, 'F1': 0.8167539267015707, 'AUC-ROC': np.float64(0.9018817204301075)}


In [18]:
# Define meta model (using XGBoost)
meta_model_xgb = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

rflgb_st_xgb = StackingClassifier(
    estimators=[('rf', rf), ('lgb', lgb)],
    final_estimator=meta_model_xgb,
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1
)

rflgb_st_xgb.fit(X_train_res, y_train_res)

train_metrics = evaluate_model(rflgb_st_xgb, X_train_res, y_train_res)
test_metrics = evaluate_model(rflgb_st_xgb, X_test, y_test)

print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

Train Metrics: {'Accuracy': 0.9687924016282226, 'Precision': 0.970108695652174, 'Recall': 0.967479674796748, 'F1': 0.9687924016282226, 'AUC-ROC': np.float64(0.9959717803699777)}
Test Metrics: {'Accuracy': 0.8162162162162162, 'Precision': 0.7959183673469388, 'Recall': 0.8478260869565217, 'F1': 0.8210526315789474, 'AUC-ROC': np.float64(0.890953716690042)}


In [19]:
lrxgb_st_xgb = StackingClassifier(
    estimators=[('lr', lr), ('xgb', xgb)],
    final_estimator=meta_model_xgb,
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1
)

lrxgb_st_xgb.fit(X_train_res, y_train_res)

train_metrics = evaluate_model(lrxgb_st_xgb, X_train_res, y_train_res)
test_metrics = evaluate_model(lrxgb_st_xgb, X_test, y_test)

print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

Train Metrics: {'Accuracy': 0.8398914518317503, 'Precision': 0.8495821727019499, 'Recall': 0.8265582655826558, 'F1': 0.8379120879120879, 'AUC-ROC': np.float64(0.9238614940497231)}
Test Metrics: {'Accuracy': 0.7675675675675676, 'Precision': 0.7525773195876289, 'Recall': 0.7934782608695652, 'F1': 0.7724867724867724, 'AUC-ROC': np.float64(0.8697405329593267)}


In [20]:
from sklearn.ensemble import VotingClassifier

# Voting
en_lr_rf_xgb = VotingClassifier(
    estimators = [
        ("lr", lr),
        ("rf", rf),
        ("xgb", xgb)
    ],
    voting = "soft"
)

en_lr_xgb = VotingClassifier(
    estimators = [
        ("lf", lr),
        ("rf", rf)
    ],
    voting = "soft"
)

en_lr_rf_xgb.fit(X_train_res, y_train_res)
en_lr_xgb.fit(X_train_res, y_train_res)

In [21]:
train_metrics = evaluate_model(en_lr_rf_xgb, X_train_res, y_train_res)
test_metrics = evaluate_model(en_lr_rf_xgb, X_test, y_test)

print("Ensemble 3 Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

train_metrics = evaluate_model(en_lr_xgb, X_train_res, y_train_res)
test_metrics = evaluate_model(en_lr_xgb, X_test, y_test)

print("Ensemble l Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

Ensemble 3 Train Metrics: {'Accuracy': 0.8792401628222524, 'Precision': 0.8703703703703703, 'Recall': 0.8915989159891599, 'F1': 0.8808567603748326, 'AUC-ROC': np.float64(0.9481633675032403)}
Test Metrics: {'Accuracy': 0.8432432432432433, 'Precision': 0.8118811881188119, 'Recall': 0.8913043478260869, 'F1': 0.8497409326424871, 'AUC-ROC': np.float64(0.91643291257597)}
Ensemble l Train Metrics: {'Accuracy': 0.8670284938941656, 'Precision': 0.8613333333333333, 'Recall': 0.8753387533875339, 'F1': 0.8682795698924731, 'AUC-ROC': np.float64(0.9386414516319076)}
Test Metrics: {'Accuracy': 0.8378378378378378, 'Precision': 0.8229166666666666, 'Recall': 0.8586956521739131, 'F1': 0.8404255319148937, 'AUC-ROC': np.float64(0.9150303880317905)}


In [None]:
joblib.dump(en_lr_rf_xgb,"Ensemble_3_data1.pkl")
files.download("Ensemble_3_data1.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [22]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [23]:
# Definition model
nn = Sequential()

# Input layer, two hidden layers, and output layer
nn.add(Dense(units=64, activation='relu', input_shape=(X_train_res.shape[1],)))
nn.add(Dense(units=32, activation='relu'))
nn.add(Dense(units=1, activation='sigmoid'))

optimizer = Adam(learning_rate=0.01)

# Compilation Model
nn.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Print Model Summary
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [24]:
# Training model
history = nn.fit(X_train_res, y_train_res, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.6694 - loss: 0.6182 - val_accuracy: 0.7905 - val_loss: 0.4717
Epoch 2/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7552 - loss: 0.4895 - val_accuracy: 0.7770 - val_loss: 0.4874
Epoch 3/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7496 - loss: 0.4796 - val_accuracy: 0.7568 - val_loss: 0.4899
Epoch 4/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8090 - loss: 0.4135 - val_accuracy: 0.7703 - val_loss: 0.4866
Epoch 5/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8040 - loss: 0.4117 - val_accuracy: 0.7838 - val_loss: 0.4977
Epoch 6/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8276 - loss: 0.4077 - val_accuracy: 0.7770 - val_loss: 0.5228
Epoch 7/50
[1m19/19[0m [32m━━━━━━━━━

In [25]:
# Evaluate the model on the test set
loss, accuracy = nn.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Predictive testing set
y_pred = nn.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8135 - loss: 0.7460 
Test Loss: 0.7426101565361023
Test Accuracy: 0.8216215968132019
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.84      0.83        93
           1       0.83      0.80      0.82        92

    accuracy                           0.82       185
   macro avg       0.82      0.82      0.82       185
weighted avg       0.82      0.82      0.82       185

Confusion Matrix:
[[78 15]
 [18 74]]


In [26]:
# Evaluate all models
models = {
    "Logistic Regression": lr,
    "Random Forest": rf,
    "XGBoost": xgb,
    "LightGBM":lgb,
    #"Neural Network": nn,
    "L_R_X_Voting": en_lr_rf_xgb,
    "L_X_Voting" : en_lr_xgb,
    "Stacking(LR)_RF_LGB": rflgb_st,
    "Stacking(RF)_RF_LGB" : rflgb_st_rf
}

results = {}
for name, model in models.items():
    results[name] = evaluate_model(model, X_test, y_test)

pd.DataFrame(results).T.round(3)

Unnamed: 0,Accuracy,Precision,Recall,F1,AUC-ROC
Logistic Regression,0.768,0.795,0.717,0.754,0.882
Random Forest,0.854,0.835,0.88,0.857,0.922
XGBoost,0.827,0.794,0.88,0.835,0.908
LightGBM,0.854,0.842,0.87,0.856,0.913
L_R_X_Voting,0.843,0.812,0.891,0.85,0.916
L_X_Voting,0.838,0.823,0.859,0.84,0.915
Stacking(LR)_RF_LGB,0.822,0.817,0.826,0.822,0.933
Stacking(RF)_RF_LGB,0.811,0.794,0.837,0.815,0.903
