<a href="https://colab.research.google.com/github/LivaIg/Diabetes-classification/blob/main/diabetes_classification_shallow_learners.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Diabetes Prediction Challenge with shallow learners**
The task is to build a machine learning classifier that predicts whether or not a patient is diagnosed with diabetes, based on a set of health indicators.

The dataset includes 21 features describing lifestyle, demographic, and health-related factors (e.g., BMI, smoking status, physical activity, age, blood pressure, cholesterol levels, etc.).

In [None]:
# Import plotting and data-handling libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

# Import datasets, classifiers and performance metrics
from sklearn import datasets, svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix ,accuracy_score,mean_squared_error, r2_score, mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, RocCurveDisplay
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier,
                              VotingClassifier, VotingRegressor, StackingClassifier,
                              StackingRegressor)
from sklearn.svm import SVR



# Loading the datasets

In [None]:
# Read the three challenge CSV files from the current working directory.
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')
X_test = pd.read_csv('X_test.csv')

# A cell only renders its last bare expression, so the earlier .head() calls
# were silently discarded. display() (injected by the IPython kernel) renders
# all three previews.
display(X_test.head(), X_train.head(), y_train.head())

# Prepare the data and perform one-hot encoding

In [None]:
# Turn the label frame into a binary target Series (1 = 'Yes', 0 = 'No').
# Selecting the "Diabetes" column already leaves "ID" behind, so no separate
# drop is needed (and the cell no longer fails when "ID" is absent).
label_map = {'Yes': 1, 'No': 0}
y_train = y_train["Diabetes"].map(label_map)

# .map() converts any value missing from label_map into NaN; fail here with a
# clear message instead of letting NaN targets break model fitting later on.
if y_train.isna().any():
    raise ValueError("Diabetes column contains values other than 'Yes'/'No'")

In [None]:
# Strip the identifier column from both feature frames; errors='ignore'
# turns the drop into a no-op when "ID" is not present.
id_columns = ['ID']
X_train, X_test = (
    X_train.drop(columns=id_columns, errors='ignore'),
    X_test.drop(columns=id_columns, errors='ignore'),
)

In [None]:
# Fit a one-hot encoder on the training "Gender" column. The encoder expects
# a 2-D input, hence the reshape to a single-column array.
enc = OneHotEncoder(handle_unknown='ignore')
gender_column = X_train["Gender"].to_numpy().reshape(-1, 1)
oneHotEncodedColumn = enc.fit_transform(gender_column)

# Preview the first 25 encoded rows as a dense array.
print(oneHotEncodedColumn[:25].toarray())

In [None]:

# Wrap the encoded matrix in a DataFrame: column names come from the fitted
# encoder, and the index is copied from X_train so the concat below aligns
# row-for-row.
gender_encoded_df = pd.DataFrame(
    oneHotEncodedColumn.toarray(),
    index=X_train.index,
    columns=enc.get_feature_names_out(["Gender"]),
)

# Swap the raw Gender column for its one-hot encoded counterparts.
X = pd.concat(
    [X_train.drop(columns="Gender"), gender_encoded_df],
    axis=1,
)


# Split into training and validation sets



In [None]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split identical between runs.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X,
    y_train,
    test_size=0.2,
    random_state=42,
)


# Evaluating baseline models to establish a benchmark

In [None]:


# Untuned baseline estimators keyed by display name; every one is seeded with
# the same random_state.
models = dict([
    ("Logistic Regression", LogisticRegression(random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ("SVC_rbf", SVC(kernel='rbf', random_state=42)),
    ("SVC_poly", SVC(kernel='poly', random_state=42)),
])

# Fit each estimator on the training split and report validation accuracy.
for name in models:
    model = models[name]
    model.fit(X_train_split, y_train_split)
    y_val_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_val_pred)
    print(f"{name} Accuracy: {acc:.4f}")


# Performing parameter tuning on base models

In [None]:
# Hyperparameter search: logistic regression

# Candidate values for regularisation strength, penalty type and solver.
param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
)

model = LogisticRegression(random_state=42)
grid = GridSearchCV(
    model,
    param_grid,
    refit=True,
    verbose=3,
    n_jobs=1,
)

# Cross-validate every combination on the training split; refit=True then
# retrains the best one so `grid` can predict directly.
grid.fit(X_train_split, y_train_split)

# Show the winning combination.
print(grid.best_params_)

# Score the refitted best estimator on the held-out validation split.
grid_predictions = grid.predict(X_val)
print(classification_report(y_val, grid_predictions))

In [None]:
# Hyperparameter search: SVC with an RBF kernel

# Candidate values for the regularisation parameter and kernel coefficient.
param_grid = dict(
    C=[0.1, 1, 10],
    gamma=['scale', 'auto'],
)

model = SVC(kernel='rbf', random_state=42)
grid = GridSearchCV(
    model,
    param_grid,
    refit=True,
    verbose=3,
    n_jobs=1,
)

# Cross-validate every combination on the training split; refit=True then
# retrains the best one so `grid` can predict directly.
grid.fit(X_train_split, y_train_split)

# Show the winning combination.
print(grid.best_params_)

# Score the refitted best estimator on the held-out validation split.
grid_predictions = grid.predict(X_val)
print(classification_report(y_val, grid_predictions))

In [None]:
# Hyperparameter search: SVC with a polynomial kernel

# Candidate values; 'degree' is only meaningful for the 'poly' kernel.
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'degree': [2, 3, 4]
}


model=SVC(random_state=42,kernel='poly')
# RandomizedSearchCV samples a subset of the 18 combinations (n_iter defaults
# to 10). Seeding it makes the sampled candidates, and therefore the reported
# best parameters, reproducible across runs.
grid = RandomizedSearchCV(model, param_grid, refit = True, verbose = 3,n_jobs=1,
                          random_state=42)

# fitting the model for the randomized search
grid.fit(X_train_split, y_train_split)

# print best parameter after tuning
print(grid.best_params_)
grid_predictions = grid.predict(X_val)

# print classification report
print(classification_report(y_val, grid_predictions))

In [None]:
# Hyperparameter search: gradient boosting

param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'subsample': [0.8, 1.0],
    # 'auto' was removed from max_features in scikit-learn 1.3, so on current
    # versions every candidate using it fails to fit. For classifiers it was
    # an alias of 'sqrt', so dropping it keeps the same distinct candidates
    # while halving the number of fits.
    'max_features': ['sqrt']
}

model=GradientBoostingClassifier(random_state=42)
grid = GridSearchCV(model, param_grid, refit = True, verbose = 3,n_jobs=1)

# fitting the model for grid search
grid.fit(X_train_split, y_train_split)

# print best parameter after tuning
print(grid.best_params_)
grid_predictions = grid.predict(X_val)

# print classification report
print(classification_report(y_val, grid_predictions))

In [None]:
# Hyperparameter search: random forest

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 'auto' was removed from max_features in scikit-learn 1.3, so on current
    # versions every candidate using it fails to fit. For classifiers it was
    # an alias of 'sqrt', so dropping it loses no distinct candidate.
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}


model=RandomForestClassifier(random_state=42)
grid = GridSearchCV(model, param_grid, refit = True, verbose = 3,n_jobs=1)

# fitting the model for grid search
grid.fit(X_train_split, y_train_split)

# print best parameter after tuning
print(grid.best_params_)
grid_predictions = grid.predict(X_val)

# print classification report
print(classification_report(y_val, grid_predictions))

In [None]:
# Hyperparameter search: decision tree

# Candidate values for tree size limits, split strategy and class weighting.
param_grid = dict(
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    class_weight=[None, 'balanced'],
)

model = DecisionTreeClassifier(random_state=42)
grid = GridSearchCV(
    model,
    param_grid,
    refit=True,
    verbose=3,
    n_jobs=1,
)

# Cross-validate every combination on the training split; refit=True then
# retrains the best one so `grid` can predict directly.
grid.fit(X_train_split, y_train_split)

# Show the winning combination.
print(grid.best_params_)

# Score the refitted best estimator on the held-out validation split.
grid_predictions = grid.predict(X_val)
print(classification_report(y_val, grid_predictions))

# Testing best parameter combinations with base models for best performance

In [None]:
# The same estimator families as the baseline run, now configured with the
# hyperparameters hard-coded below (SVC_poly is not included in this round).
models = {
    "Logistic Regression": LogisticRegression(
        random_state=42,
        C=0.1,
        penalty='l1',
        solver='liblinear',
    ),
    "Decision Tree": DecisionTreeClassifier(
        random_state=42,
        class_weight='balanced',
        criterion='entropy',
        max_depth=10,
        max_features=None,
        min_samples_leaf=1,
        min_samples_split=2,
        splitter='random',
    ),
    "Random Forest": RandomForestClassifier(
        random_state=42,
        bootstrap=False,
        max_depth=10,
        max_features='sqrt',
        n_estimators=300,
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        random_state=42,
        learning_rate=0.1,
        max_depth=5,
        max_features='sqrt',
        min_samples_leaf=1,
        min_samples_split=5,
        n_estimators=100,
        subsample=0.8,
    ),
    "SVC_rbf": SVC(
        kernel='rbf',
        random_state=42,
        C=10,
        gamma='scale',
    ),
}

# Fit each tuned estimator on the training split and report validation accuracy.
for name in models:
    model = models[name]
    model.fit(X_train_split, y_train_split)
    y_val_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_val_pred)
    print(f"{name} Accuracy: {acc:.4f}")


Logistic Regression Accuracy: 0.7448
Decision Tree Accuracy: 0.7381
Random Forest Accuracy: 0.7513
Gradient Boosting Accuracy: 0.7525
SVC_rbf Accuracy: 0.7507
