In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import precision_recall_fscore_support
from sklearn.utils import resample

# Load the dataset
file_path = 'filtered_normalized_data_replace2to1.csv'
df = pd.read_csv(file_path)

# Separate the feature matrix from the target column; both are converted to
# plain NumPy arrays so they can be indexed positionally later on.
target_column = 'Diabetes_012'
X = df.drop(target_column, axis=1).values
y = df[target_column].values

# Split the data into training and testing sets (80% / 20%).
# stratify=y makes both partitions keep approximately the class proportions
# of y, so the held-out test set is not skewed by an unlucky draw of the
# minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# SMOTE oversampler with a fixed seed so the synthetic samples are reproducible.
smote = SMOTE(sampling_strategy='auto', random_state=42)

# Oversampled copy of the training partition only; X_test / y_test are left
# untouched. The resampled arrays contain synthetic rows interpolated from real
# training rows, so scores measured on subsets of these arrays are optimistic
# compared with scores measured on real, unresampled data.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


KeyboardInterrupt: 

In [None]:
# Random Forest with SMOTE
#
# 5-fold cross-validation over the original (un-resampled) training split.
# SMOTE is applied inside every fold, to the training portion only, so the
# validation portion contains real samples exclusively and none of the
# synthetic points were interpolated from validation rows.
# random_state pins the forest's bootstrap/feature sampling so reruns match.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Running per-class sums (keys are the class positions 0 and 1).
avg_precision = {0: 0, 1: 0}
avg_recall = {0: 0, 1: 0}
avg_f1_score = {0: 0, 1: 0}

# k-fold k=5
kf = KFold(n_splits=5, random_state=445, shuffle=True)

for train_index, test_index in kf.split(X_train):
    X_train_kf, X_test_kf = X_train[train_index], X_train[test_index]
    y_train_kf, y_test_kf = y_train[train_index], y_train[test_index]

    # Oversample the training portion of this fold only; `smote` is the
    # SMOTE instance created in the data-preparation cell.
    X_train_kf, y_train_kf = smote.fit_resample(X_train_kf, y_train_kf)

    clf.fit(X_train_kf, y_train_kf)
    y_pred_kf = clf.predict(X_test_kf)

    # Per-fold report, then the raw per-class arrays used for averaging.
    print(classification_report(y_test_kf, y_pred_kf))
    precision, recall, f1_score, _ = precision_recall_fscore_support(y_test_kf, y_pred_kf, average=None)

    for i in [0, 1]:
        avg_precision[i] += precision[i]
        avg_recall[i] += recall[i]
        avg_f1_score[i] += f1_score[i]

# Divide by the actual number of folds so the average stays correct if
# n_splits above is changed.
n_folds = kf.get_n_splits()
avg_precision = {k: v / n_folds for k, v in avg_precision.items()}
avg_recall = {k: v / n_folds for k, v in avg_recall.items()}
avg_f1_score = {k: v / n_folds for k, v in avg_f1_score.items()}

print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average F1-Score:", avg_f1_score)


In [None]:
# Logistic Regression with SMOTE
#
# 5-fold cross-validation over the original (un-resampled) training split.
# SMOTE is applied inside every fold, to the training portion only, so the
# validation portion contains real samples exclusively and none of the
# synthetic points were interpolated from validation rows.
clf = LogisticRegression(max_iter=3000, solver='lbfgs')

# Initialize average metrics dictionary (keys are the class positions 0 and 1)
avg_precision = {0: 0, 1: 0}
avg_recall = {0: 0, 1: 0}
avg_f1_score = {0: 0, 1: 0}

# k-fold k=5
kf = KFold(n_splits=5, random_state=445, shuffle=True)

for train_index, test_index in kf.split(X_train):
    X_train_kf, X_test_kf = X_train[train_index], X_train[test_index]
    y_train_kf, y_test_kf = y_train[train_index], y_train[test_index]

    # Oversample the training portion of this fold only; `smote` is the
    # SMOTE instance created in the data-preparation cell.
    X_train_kf, y_train_kf = smote.fit_resample(X_train_kf, y_train_kf)

    clf.fit(X_train_kf, y_train_kf)
    y_pred_kf = clf.predict(X_test_kf)

    # Per-fold report, then the raw per-class arrays used for averaging.
    print(classification_report(y_test_kf, y_pred_kf))
    precision, recall, f1_score, _ = precision_recall_fscore_support(y_test_kf, y_pred_kf, average=None)

    for i in [0, 1]:
        avg_precision[i] += precision[i]
        avg_recall[i] += recall[i]
        avg_f1_score[i] += f1_score[i]

# Divide by the actual number of folds so the average stays correct if
# n_splits above is changed.
n_folds = kf.get_n_splits()
avg_precision = {k: v / n_folds for k, v in avg_precision.items()}
avg_recall = {k: v / n_folds for k, v in avg_recall.items()}
avg_f1_score = {k: v / n_folds for k, v in avg_f1_score.items()}

print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average F1-Score:", avg_f1_score)


In [None]:
# K-Nearest Neighbors (KNN) with SMOTE
#
# 5-fold cross-validation over the original (un-resampled) training split.
# SMOTE is applied inside every fold, to the training portion only. This
# matters particularly for a neighbour-based model: SMOTE builds synthetic
# points from nearest neighbours, so resampling before splitting would place
# near-copies of validation rows in the training portion.
clf = KNeighborsClassifier(n_neighbors=5, algorithm='auto')

# Running per-class sums (keys are the class positions 0 and 1).
avg_precision = {0: 0, 1: 0}
avg_recall = {0: 0, 1: 0}
avg_f1_score = {0: 0, 1: 0}

# k-fold k=5
kf = KFold(n_splits=5, random_state=445, shuffle=True)

for train_index, test_index in kf.split(X_train):
    X_train_kf, X_test_kf = X_train[train_index], X_train[test_index]
    y_train_kf, y_test_kf = y_train[train_index], y_train[test_index]

    # Oversample the training portion of this fold only; `smote` is the
    # SMOTE instance created in the data-preparation cell.
    X_train_kf, y_train_kf = smote.fit_resample(X_train_kf, y_train_kf)

    clf.fit(X_train_kf, y_train_kf)
    y_pred_kf = clf.predict(X_test_kf)

    # Per-fold report, then the raw per-class arrays used for averaging.
    print(classification_report(y_test_kf, y_pred_kf))
    precision, recall, f1_score, _ = precision_recall_fscore_support(y_test_kf, y_pred_kf, average=None)

    for i in [0, 1]:
        avg_precision[i] += precision[i]
        avg_recall[i] += recall[i]
        avg_f1_score[i] += f1_score[i]

# Divide by the actual number of folds so the average stays correct if
# n_splits above is changed.
n_folds = kf.get_n_splits()
avg_precision = {k: v / n_folds for k, v in avg_precision.items()}
avg_recall = {k: v / n_folds for k, v in avg_recall.items()}
avg_f1_score = {k: v / n_folds for k, v in avg_f1_score.items()}

print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average F1-Score:", avg_f1_score)


In [None]:
# Decision Tree with SMOTE
#
# 5-fold cross-validation over the original (un-resampled) training split.
# SMOTE is applied inside every fold, to the training portion only, so the
# validation portion contains real samples exclusively and none of the
# synthetic points were interpolated from validation rows.
clf = DecisionTreeClassifier(criterion='gini', splitter='best', random_state=42)

# Running per-class sums (keys are the class positions 0 and 1).
avg_precision = {0: 0, 1: 0}
avg_recall = {0: 0, 1: 0}
avg_f1_score = {0: 0, 1: 0}

# k-fold k=5
kf = KFold(n_splits=5, random_state=445, shuffle=True)

for train_index, test_index in kf.split(X_train):
    X_train_kf, X_test_kf = X_train[train_index], X_train[test_index]
    y_train_kf, y_test_kf = y_train[train_index], y_train[test_index]

    # Oversample the training portion of this fold only; `smote` is the
    # SMOTE instance created in the data-preparation cell.
    X_train_kf, y_train_kf = smote.fit_resample(X_train_kf, y_train_kf)

    clf.fit(X_train_kf, y_train_kf)
    y_pred_kf = clf.predict(X_test_kf)

    # Per-fold report, then the raw per-class arrays used for averaging.
    print(classification_report(y_test_kf, y_pred_kf))
    precision, recall, f1_score, _ = precision_recall_fscore_support(y_test_kf, y_pred_kf, average=None)

    for i in [0, 1]:
        avg_precision[i] += precision[i]
        avg_recall[i] += recall[i]
        avg_f1_score[i] += f1_score[i]

# Divide by the actual number of folds so the average stays correct if
# n_splits above is changed.
n_folds = kf.get_n_splits()
avg_precision = {k: v / n_folds for k, v in avg_precision.items()}
avg_recall = {k: v / n_folds for k, v in avg_recall.items()}
avg_f1_score = {k: v / n_folds for k, v in avg_f1_score.items()}

print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average F1-Score:", avg_f1_score)


In [None]:
# Gradient Boosting with SMOTE
#
# 5-fold cross-validation over the original (un-resampled) training split.
# SMOTE is applied inside every fold, to the training portion only, so the
# validation portion contains real samples exclusively and none of the
# synthetic points were interpolated from validation rows.
# random_state pins the estimator's internal randomness so reruns match.
clf = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, random_state=42)

# Running per-class sums (keys are the class positions 0 and 1).
avg_precision = {0: 0, 1: 0}
avg_recall = {0: 0, 1: 0}
avg_f1_score = {0: 0, 1: 0}

# k-fold k=5
kf = KFold(n_splits=5, random_state=445, shuffle=True)

for train_index, test_index in kf.split(X_train):
    X_train_kf, X_test_kf = X_train[train_index], X_train[test_index]
    y_train_kf, y_test_kf = y_train[train_index], y_train[test_index]

    # Oversample the training portion of this fold only; `smote` is the
    # SMOTE instance created in the data-preparation cell.
    X_train_kf, y_train_kf = smote.fit_resample(X_train_kf, y_train_kf)

    clf.fit(X_train_kf, y_train_kf)
    y_pred_kf = clf.predict(X_test_kf)

    # Per-fold report, then the raw per-class arrays used for averaging.
    print(classification_report(y_test_kf, y_pred_kf))
    precision, recall, f1_score, _ = precision_recall_fscore_support(y_test_kf, y_pred_kf, average=None)

    for i in [0, 1]:
        avg_precision[i] += precision[i]
        avg_recall[i] += recall[i]
        avg_f1_score[i] += f1_score[i]

# Divide by the actual number of folds so the average stays correct if
# n_splits above is changed.
n_folds = kf.get_n_splits()
avg_precision = {k: v / n_folds for k, v in avg_precision.items()}
avg_recall = {k: v / n_folds for k, v in avg_recall.items()}
avg_f1_score = {k: v / n_folds for k, v in avg_f1_score.items()}

print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average F1-Score:", avg_f1_score)


In [None]:
# XGBoost with SMOTE
#
# 5-fold cross-validation over the original (un-resampled) training split.
# SMOTE is applied inside every fold, to the training portion only, so the
# validation portion contains real samples exclusively and none of the
# synthetic points were interpolated from validation rows.
# NOTE(review): `use_label_encoder` is kept as in the original; newer xgboost
# releases may warn that it is unused — confirm against the installed version.
clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Running per-class sums (keys are the class positions 0 and 1).
avg_precision = {0: 0, 1: 0}
avg_recall = {0: 0, 1: 0}
avg_f1_score = {0: 0, 1: 0}

# k-fold k=5
kf = KFold(n_splits=5, random_state=445, shuffle=True)

for train_index, test_index in kf.split(X_train):
    X_train_kf, X_test_kf = X_train[train_index], X_train[test_index]
    y_train_kf, y_test_kf = y_train[train_index], y_train[test_index]

    # Oversample the training portion of this fold only; `smote` is the
    # SMOTE instance created in the data-preparation cell.
    X_train_kf, y_train_kf = smote.fit_resample(X_train_kf, y_train_kf)

    clf.fit(X_train_kf, y_train_kf)
    y_pred_kf = clf.predict(X_test_kf)

    # Per-fold report, then the raw per-class arrays used for averaging.
    print(classification_report(y_test_kf, y_pred_kf))
    precision, recall, f1_score, _ = precision_recall_fscore_support(y_test_kf, y_pred_kf, average=None)

    for i in [0, 1]:
        avg_precision[i] += precision[i]
        avg_recall[i] += recall[i]
        avg_f1_score[i] += f1_score[i]

# Divide by the actual number of folds so the average stays correct if
# n_splits above is changed.
n_folds = kf.get_n_splits()
avg_precision = {k: v / n_folds for k, v in avg_precision.items()}
avg_recall = {k: v / n_folds for k, v in avg_recall.items()}
avg_f1_score = {k: v / n_folds for k, v in avg_f1_score.items()}

print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average F1-Score:", avg_f1_score)
