In [1]:
# Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
# load dataset
df = pd.read_csv("cleaned_data.csv")

gender_mapping = {'Male': 1, 'Female': 2}
df['gender'] = df['gender'].map(gender_mapping)

df

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
0,2.0,54.0,0,0,27.32,6.6,80,0
1,1.0,28.0,0,0,27.32,5.7,158,0
2,2.0,36.0,0,0,23.45,5.0,155,0
3,2.0,20.0,0,0,27.32,6.6,85,0
4,2.0,44.0,0,0,19.31,6.5,200,1
...,...,...,...,...,...,...,...,...
74389,2.0,40.0,0,0,40.69,3.5,155,0
74390,2.0,36.0,0,0,24.60,4.8,145,0
74391,1.0,66.0,0,0,27.83,5.7,155,0
74392,2.0,24.0,0,0,35.42,4.0,100,0


In [6]:
# remove rows with NaN values
df = df.dropna()

In [9]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
0,2.0,54.0,0,0,27.32,6.6,80,0
1,1.0,28.0,0,0,27.32,5.7,158,0
2,2.0,36.0,0,0,23.45,5.0,155,0
3,2.0,20.0,0,0,27.32,6.6,85,0
4,2.0,44.0,0,0,19.31,6.5,200,1
...,...,...,...,...,...,...,...,...
74389,2.0,40.0,0,0,40.69,3.5,155,0
74390,2.0,36.0,0,0,24.60,4.8,145,0
74391,1.0,66.0,0,0,27.83,5.7,155,0
74392,2.0,24.0,0,0,35.42,4.0,100,0


In [7]:
# split the data into training and testing sets
X = df.drop('diabetes', axis=1)
y = df['diabetes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
# Create and train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

# Print evaluation metrics
print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{classification_report_str}')

Accuracy: 0.9581204624899167
Confusion Matrix:
[[13413   146]
 [  477   840]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     13559
           1       0.85      0.64      0.73      1317

    accuracy                           0.96     14876
   macro avg       0.91      0.81      0.85     14876
weighted avg       0.96      0.96      0.96     14876



In [36]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Use the best model for predictions
y_pred_best = best_model.predict(X_test)