<a href="https://colab.research.google.com/github/Myst-Lee/Advanced-Machine-Learning/blob/main/WOA7015_GroupProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.autograd import Variable
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score


# **Data Preprocessing**

In [2]:
diabetes_df = pd.read_csv("../dataset/synthetic_diabetes.csv")
diabetes_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


Scaling the Data

In [3]:
diabetes_data_copy = diabetes_df.copy(deep = True)

sc_X = StandardScaler()
X = pd.DataFrame(sc_X.fit_transform(diabetes_data_copy.drop(["Outcome"],axis = 1)),
        columns=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                 'BMI', 'DiabetesPedigreeFunction', 'Age'
                 ])
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.639947,0.848324,0.149641,0.90727,-0.692891,0.204013,0.468492,1.425995
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672
2,1.23388,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549
4,-1.141852,0.504055,-1.504687,0.90727,0.765836,1.409746,5.484909,-0.020496


In [4]:
y = diabetes_data_copy["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

print("Train Set x-y:", X_train.shape, y_train.shape)
print("Test Set x-y:", X_test.shape, y_test.shape)

Train Set x-y: (652, 8) (652,)
Test Set x-y: (116, 8) (116,)


# **Model Training**

# Logistic Regression

In [5]:
performance_lst = {}

In [6]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
performance_lst["LogisticRegression"] = accuracy

print("Model weights (coefficients):", model.coef_)
print(f"Accuracy on test set: {accuracy * 100:.2f}%")

Model weights (coefficients): [[ 0.29534256  1.1040507  -0.25017021  0.06020309 -0.19316567  0.69498142
   0.24330538  0.35721425]]
Accuracy on test set: 75.86%


# K-Neighbors Classifier

In [7]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
performance_lst["KNeighborsClassifier"] = accuracy

print(f"Accuracy on test set: {accuracy * 100:.2f}%")

Accuracy on test set: 66.38%


# Support Vector Classification

In [8]:
from sklearn.svm import SVC

model = SVC(kernel='rbf')  # For linear kernel, or 'rbf' for non-linear
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
performance_lst["SVC"] = accuracy

print(f"Accuracy on test set: {accuracy * 100:.2f}%")

Accuracy on test set: 72.41%


# Decision Tree Classifier

In [9]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
performance_lst["DecisionTreeClassifier"] = accuracy

print(f"Accuracy on test set: {accuracy * 100:.2f}%")

Accuracy on test set: 75.00%


# Random Forest Classifier

In [10]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
performance_lst["RandomForestClassifier"] = accuracy

print(f"Accuracy on test set: {accuracy * 100:.2f}%")

Accuracy on test set: 72.41%


# Gaussian Naive Bayes

In [11]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
performance_lst["GaussianNB"] = accuracy

print(f"Accuracy on test set: {accuracy * 100:.2f}%")

Accuracy on test set: 78.45%


# Gradient Boosting Classifier

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
performance_lst["GradientBoostingClassifier"] = accuracy

print(f"Accuracy on test set: {accuracy * 100:.2f}%")

Accuracy on test set: 71.55%


# XGBoost

In [13]:
import xgboost as xgb

model = xgb.XGBClassifier(
    n_estimators=1000,      # Equivalent to epochs: number of boosting rounds
    learning_rate=0.01,    # Learning rate
    max_depth=100,           # Max depth of trees
    subsample=0.8,         # Proportion of data used for each tree (stochasticity)
    colsample_bytree=0.8   # Proportion of features used for each tree
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
performance_lst["xgb"] = accuracy

print(f"Accuracy on test set: {accuracy * 100:.2f}%")

Accuracy on test set: 72.41%


In [14]:
performance_lst

{'LogisticRegression': 0.7586206896551724,
 'KNeighborsClassifier': 0.6637931034482759,
 'SVC': 0.7241379310344828,
 'DecisionTreeClassifier': 0.75,
 'RandomForestClassifier': 0.7241379310344828,
 'GaussianNB': 0.7844827586206896,
 'GradientBoostingClassifier': 0.7155172413793104,
 'xgb': 0.7241379310344828}