<a href="https://colab.research.google.com/github/Myst-Lee/Advanced-Machine-Learning/blob/main/WOA7015_GroupProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.autograd import Variable
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score


# **Data Preprocessing**

In [2]:
# Read the synthetic diabetes dataset from its CSV file (relative path).
diabetes_df = pd.read_csv(filepath_or_buffer="../dataset/synthetic_diabetes.csv")

# Leave the frame as the cell's last expression so it is rendered as a table.
diabetes_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,3,66,85,9,307,5.588269,0.358663,25,1
1,5,199,62,0,-10,22.837882,0.275430,29,0
2,5,105,60,30,156,42.685438,0.785674,21,0
3,11,110,88,2,15,43.782111,0.668618,63,1
4,1,121,84,32,167,43.361161,0.340283,33,0
...,...,...,...,...,...,...,...,...,...
4995,6,53,99,28,23,59.413541,0.416894,30,0
4996,15,129,73,-5,74,36.345109,0.471418,39,0
4997,8,135,58,18,93,50.829549,0.493583,39,0
4998,6,56,38,15,101,46.415951,0.399238,26,0


# Scaling the Data

In [3]:
# Work on a deep copy so the frame loaded from disk is never modified.
diabetes_data_copy = diabetes_df.copy(deep=True)

# Predictors are every column except the target, "Outcome".
feature_df = diabetes_data_copy.drop(columns=["Outcome"])

# Standardise each predictor with StandardScaler.
# NOTE(review): the scaler is fitted on every row here, before the train/test
# split performed in the following cell, so statistics of the future test rows
# influence the scaling. Consider fitting on the training rows only.
sc_X = StandardScaler()
X = pd.DataFrame(
    sc_X.fit_transform(feature_df),
    # Take the labels from the frame itself instead of a hand-typed list, so a
    # reordered or extended CSV cannot end up with mislabelled columns.
    columns=feature_df.columns,
    # Keep the original row labels so X stays aligned with the target column.
    index=feature_df.index,
)
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,-0.560091,-1.001756,0.78992,-0.617852,2.861756,-3.501002,-0.575183,-0.450755
1,-0.073394,2.619586,-0.375991,-1.177839,-0.753654,-1.755839,-0.721104,-0.055599
2,-0.073394,0.060142,-0.477375,0.688784,1.139589,0.252161,0.173433,-0.845912
3,1.386696,0.196282,0.941996,-1.053397,-0.468527,0.363112,-0.031783,3.303232
4,-1.046787,0.495792,0.739228,0.813225,1.265045,0.320524,-0.607405,0.339558


In [4]:
# Target vector: the "Outcome" column of the working copy.
y = diabetes_data_copy["Outcome"]

# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

# Report the shapes of both partitions as a quick sanity check.
print("Train Set x-y:", X_train.shape, y_train.shape)
print("Test Set x-y:", X_test.shape, y_test.shape)

Train Set x-y: (4250, 8) (4250,)
Test Set x-y: (750, 8) (750,)


# **Model Training**

# Logistic Regression

In [5]:
# Maps a model name to its test-set accuracy; despite the "_lst" suffix this is
# a dict, filled in by the model cells that follow.
performance_lst = dict()

In [6]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with scikit-learn's default settings on
# the training split (fit() returns the estimator itself).
model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and measure the fraction classified correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Store the score under the model's name for the final comparison.
performance_lst["LogisticRegression"] = accuracy

print("Model weights (coefficients):", model.coef_)
print(f"Accuracy on test set: {accuracy * 100:.2f}%")

Model weights (coefficients): [[ 0.03030399 -0.03884096  0.01449475  0.00596308 -0.02596725  0.02848263
   0.05846742 -0.04757149]]
Accuracy on test set: 71.73%


# K-Neighbors Classifier

In [7]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 2 closest training rows.
# NOTE(review): k=2 is even, so a 1-1 vote split is possible if the target has
# two classes (as the displayed Outcome values suggest) - consider an odd k.
model = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)

# Predict the held-out rows and measure the fraction classified correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Store the score under the model's name for the final comparison.
performance_lst["KNeighborsClassifier"] = accuracy

print(f"Accuracy on test set: {accuracy * 100:.2f}%")

Accuracy on test set: 65.33%


# Support Vector Classification

In [8]:
from sklearn.svm import SVC

# Support-vector classifier using the non-linear RBF kernel
# (pass kernel='linear' instead for a linear decision boundary).
model = SVC(kernel='rbf').fit(X_train, y_train)

# Predict the held-out rows and measure the fraction classified correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Store the score under the model's name for the final comparison.
performance_lst["SVC"] = accuracy

print(f"Accuracy on test set: {accuracy * 100:.2f}%")

Accuracy on test set: 71.73%


# Decision Tree Classifier

In [9]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyper-parameters. The seed is fixed (same value as
# the train/test split) because the tree permutes features randomly at each
# split; without it the fitted tree, and the accuracy, can differ between runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict the held-out rows and measure the fraction classified correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Store the score under the model's name for the final comparison.
performance_lst["DecisionTreeClassifier"] = accuracy

print(f"Accuracy on test set: {accuracy * 100:.2f}%")

Accuracy on test set: 56.67%


# Random Forest Classifier

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees. The seed is fixed (same value as the train/test
# split) because bootstrap sampling and per-split feature sampling are random;
# without it every run produces a different forest and accuracy.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict the held-out rows and measure the fraction classified correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Store the score under the model's name for the final comparison.
performance_lst["RandomForestClassifier"] = accuracy

print(f"Accuracy on test set: {accuracy * 100:.2f}%")

Accuracy on test set: 70.53%


# Gaussian Naive Bayes

In [11]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, fitted on the training split
# (fit() returns the estimator itself).
model = GaussianNB().fit(X_train, y_train)

# Predict the held-out rows and measure the fraction classified correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Store the score under the model's name for the final comparison.
performance_lst["GaussianNB"] = accuracy

print(f"Accuracy on test set: {accuracy * 100:.2f}%")

Accuracy on test set: 70.67%


# Gradient Boosting Classifier

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 100 boosting stages. The seed is fixed (same value as
# the train/test split) so the per-tree seeds and the feature permutation at
# each split are repeatable and the reported accuracy can be reproduced.
model = GradientBoostingClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict the held-out rows and measure the fraction classified correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Store the score under the model's name for the final comparison.
performance_lst["GradientBoostingClassifier"] = accuracy

print(f"Accuracy on test set: {accuracy * 100:.2f}%")

Accuracy on test set: 71.47%


# XGBoost

In [13]:
import xgboost as xgb

# Gradient-boosted trees via XGBoost's scikit-learn wrapper.
#   n_estimators     - number of boosting rounds
#   learning_rate    - step size applied to each round's contribution
#   max_depth        - maximum depth allowed for each tree
#   subsample        - fraction of training rows sampled for each tree
#   colsample_bytree - fraction of feature columns sampled for each tree
model = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=100,
    subsample=0.8,
    colsample_bytree=0.8,
).fit(X_train, y_train)

# Predict the held-out rows and measure the fraction classified correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Store the score under the model's name for the final comparison.
performance_lst["xgb"] = accuracy

print(f"Accuracy on test set: {accuracy * 100:.2f}%")

Accuracy on test set: 69.20%


In [14]:
# Show every recorded test-set accuracy, keyed by model name.
performance_lst

{'LogisticRegression': 0.7173333333333334,
 'KNeighborsClassifier': 0.6533333333333333,
 'SVC': 0.7173333333333334,
 'DecisionTreeClassifier': 0.5666666666666667,
 'RandomForestClassifier': 0.7053333333333334,
 'GaussianNB': 0.7066666666666667,
 'GradientBoostingClassifier': 0.7146666666666667,
 'xgb': 0.692}