In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score,confusion_matrix,classification_report
import pickle
import joblib

In [7]:
df = pd.read_csv(r'Diabetes_Preprocessed_data.csv')

In [8]:
df.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,4,25.19,6.6,140,0
1,0,54.0,0,0,0,27.32,6.6,80,0
2,1,28.0,0,0,4,27.32,5.7,158,0
3,0,36.0,0,0,1,23.45,5.0,155,0
4,1,76.0,1,1,1,20.14,4.8,155,0
5,0,20.0,0,0,4,27.32,6.6,85,0
6,0,44.0,0,0,4,19.31,6.5,200,1
7,0,79.0,0,0,0,23.86,5.7,85,0
8,1,42.0,0,0,4,33.64,4.8,145,0
9,0,32.0,0,0,4,27.32,5.0,100,0


In [9]:
X = df.drop('diabetes', axis=1)

In [10]:
y = df['diabetes']

In [18]:
X_train,  X_test,y_train , y_test = train_test_split(X,y,test_size=0.3,random_state=45)

In [13]:
# Initialize models with default arguments
models = {
    'Decision Tree': DecisionTreeClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVC': SVC()
}


In [19]:
# Train models and evaluate their performance
metrics = []

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    metrics.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    })

# Create a DataFrame for comparison
metrics_df = pd.DataFrame(metrics)

Parameters: { "use_label_encoder" } are not used.



In [20]:
metrics_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Decision Tree,0.9481,0.681953,0.743249,0.711283
1,AdaBoost,0.970011,0.967052,0.674325,0.794586
2,XGBoost,0.969699,0.946637,0.686417,0.795794
3,Gradient Boosting,0.970947,0.986382,0.671503,0.799041
4,SVC,0.947545,0.998969,0.390568,0.561576


In [21]:
# Identifying the best model based on F1 Score
best_model_row = metrics_df.loc[metrics_df['F1 Score'].idxmax()]
best_model_row

Model        Gradient Boosting
Accuracy              0.970947
Precision             0.986382
Recall                0.671503
F1 Score              0.799041
Name: 3, dtype: object

In [28]:
best_model = metrics_df.loc[metrics_df['F1 Score'].idxmax()]['Model']
best_model_Method = models[best_model]
best_model_Method 

In [29]:
# Save the best model to a .pkl file
pkl_file_path = 'Diabetes_Prediction_Model.pkl'
joblib.dump(best_model_Method, pkl_file_path)

['Diabetes_Prediction_Model.pkl']

In [30]:
# Load the model
model = joblib.load(pkl_file_path)

# Test prediction
sample_data = [[0, 45, 0, 0, 2, 25.0, 6.5, 130]]
print("Prediction:", model.predict(sample_data))

Prediction: [0]


