### 

In [35]:
# Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

In [36]:
import sqlite3

# Connect to SQLite database
conn = sqlite3.connect('diab_data.sqlite')

# Query to select data from the database
query = 'SELECT * FROM diabdata';

# Read data into a DataFrame
df = pd.read_sql_query(query, conn)

In [37]:
index = 'index'
df = df.drop(index, axis=1)
df

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,54.0,0,0,27.32,6.6,80,0
1,Male,28.0,0,0,27.32,5.7,158,0
2,Female,36.0,0,0,23.45,5.0,155,0
3,Female,20.0,0,0,27.32,6.6,85,0
4,Female,44.0,0,0,19.31,6.5,200,1
...,...,...,...,...,...,...,...,...
74389,Female,40.0,0,0,40.69,3.5,155,0
74390,Female,36.0,0,0,24.60,4.8,145,0
74391,Male,66.0,0,0,27.83,5.7,155,0
74392,Female,24.0,0,0,35.42,4.0,100,0


In [38]:
gender_mapping = {'Male': 1, 'Female': 2}
df['gender'] = df['gender'].map(gender_mapping)

In [39]:
# remove rows with NaN values
df = df.dropna()
df

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
0,2.0,54.0,0,0,27.32,6.6,80,0
1,1.0,28.0,0,0,27.32,5.7,158,0
2,2.0,36.0,0,0,23.45,5.0,155,0
3,2.0,20.0,0,0,27.32,6.6,85,0
4,2.0,44.0,0,0,19.31,6.5,200,1
...,...,...,...,...,...,...,...,...
74389,2.0,40.0,0,0,40.69,3.5,155,0
74390,2.0,36.0,0,0,24.60,4.8,145,0
74391,1.0,66.0,0,0,27.83,5.7,155,0
74392,2.0,24.0,0,0,35.42,4.0,100,0


In [40]:
# split the data into training and testing sets
X = df.drop('diabetes', axis=1)
y = df['diabetes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [41]:
# Undersample the majority class (non-diabetes)
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

#### Logistic Regression 

In [42]:
# Train the model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)

# Make predictions
y_pred_logistic = logistic_model.predict(X_test)

# Evaluate Logistic Regression Model
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
conf_matrix_logistic = confusion_matrix(y_test, y_pred_logistic)
classification_report_logistic = classification_report(y_test, y_pred_logistic)

# Print evaluation metrics for Logistic Regression Model
print("Logistic Regression Model:")
print(f'Accuracy: {accuracy_logistic}')
print(f'Confusion Matrix:\n{conf_matrix_logistic}')
print(f'Classification Report:\n{classification_report_logistic}')

Logistic Regression Model:
Accuracy: 0.8723015001829492
Confusion Matrix:
[[1206  163]
 [ 186 1178]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.88      0.87      1369
           1       0.88      0.86      0.87      1364

    accuracy                           0.87      2733
   macro avg       0.87      0.87      0.87      2733
weighted avg       0.87      0.87      0.87      2733



#### Random Forest 

In [43]:
# Train the Random Forest Model
random_forest_model = RandomForestClassifier()
random_forest_model.fit(X_train, y_train)

# Make predictions 
y_pred_rf = random_forest_model.predict(X_test)

# Evaluate Random Forest Model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
classification_report_rf = classification_report(y_test, y_pred_rf)

# Print evaluation metrics for Random Forest Model
print("Random Forest Model:")
print(f'Accuracy: {accuracy_rf}')
print(f'Confusion Matrix:\n{conf_matrix_rf}')
print(f'Classification Report:\n{classification_report_rf}')

Random Forest Model:
Accuracy: 0.8986461763629711
Confusion Matrix:
[[1228  141]
 [ 136 1228]]
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      1369
           1       0.90      0.90      0.90      1364

    accuracy                           0.90      2733
   macro avg       0.90      0.90      0.90      2733
weighted avg       0.90      0.90      0.90      2733



#### Support Vector Machine 

In [44]:
# Train the Support Vector Machine Model
svm_model = SVC(probability=True)  
svm_model.fit(X_train, y_train)

# Make predictions
y_pred_svm = svm_model.predict(X_test)

# Evaluate SVM Model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
classification_report_svm = classification_report(y_test, y_pred_svm)

# Print evaluation metrics for SVM Model
print("Support Vector Machine Model:")
print(f'Accuracy: {accuracy_svm}')
print(f'Confusion Matrix:\n{conf_matrix_svm}')
print(f'Classification Report:\n{classification_report_svm}')

Support Vector Machine Model:
Accuracy: 0.8510793999268204
Confusion Matrix:
[[1138  231]
 [ 176 1188]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.83      0.85      1369
           1       0.84      0.87      0.85      1364

    accuracy                           0.85      2733
   macro avg       0.85      0.85      0.85      2733
weighted avg       0.85      0.85      0.85      2733



#### Decision trees 

In [45]:
# Decision Tree Model
dt_model = DecisionTreeClassifier(random_state=42)

# Train the model
dt_model.fit(X_train, y_train)

# Make predictions
y_pred = dt_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

# Print evaluation metrics
print("Decision Tree Model:")
print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{classification_report_str}')

Decision Tree Model:
Accuracy: 0.876326381266008
Confusion Matrix:
[[1196  173]
 [ 165 1199]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.87      0.88      1369
           1       0.87      0.88      0.88      1364

    accuracy                           0.88      2733
   macro avg       0.88      0.88      0.88      2733
weighted avg       0.88      0.88      0.88      2733

