In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
data=pd.read_csv('diabetes.csv')
data.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
data.shape


(768, 9)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
data['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [6]:
X = data.drop('Outcome', axis=1)
y = data['Outcome']

In [7]:
X.shape

(768, 8)

In [8]:
y.shape

(768,)

In [9]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    

In [11]:
X_train.shape

(614, 8)

In [12]:
X_test.shape


(154, 8)

In [13]:
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),  
    ('classifier', RandomForestClassifier(random_state=42))  
])

In [14]:
pipeline.fit(X_train, y_train)


In [15]:
y_pred = pipeline.predict(X_test)

In [16]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [17]:
print("Initial Model Results")
print("Accuracy:", accuracy)
print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)

Initial Model Results
Accuracy: 0.7597402597402597
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.85      0.82       100
           1       0.68      0.59      0.63        54

    accuracy                           0.76       154
   macro avg       0.74      0.72      0.73       154
weighted avg       0.75      0.76      0.76       154

Confusion Matrix:
 [[85 15]
 [22 32]]


In [18]:
param_grid_small = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2]
}

In [19]:
grid_search_small = GridSearchCV(pipeline, param_grid_small, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)

In [20]:
grid_search_small.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [21]:
best_params_small = grid_search_small.best_params_
best_score_small = grid_search_small.best_score_

In [22]:
best_model_small = grid_search_small.best_estimator_
y_pred_best_small = best_model_small.predict(X_test)
accuracy_best_small = accuracy_score(y_test, y_pred_best_small)
report_best_small = classification_report(y_test, y_pred_best_small)
conf_matrix_best_small = confusion_matrix(y_test, y_pred_best_small)

In [23]:
print("\nTuned Model Results")
print("Best Parameters:", best_params_small)
print("Best Cross-Validation Accuracy:", best_score_small)
print("Test Accuracy:", accuracy_best_small)
print("Classification Report:\n", report_best_small)
print("Confusion Matrix:\n", conf_matrix_best_small)


Tuned Model Results
Best Parameters: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Best Cross-Validation Accuracy: 0.7752670173760561
Test Accuracy: 0.7402597402597403
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.84      0.81       100
           1       0.65      0.56      0.60        54

    accuracy                           0.74       154
   macro avg       0.71      0.70      0.70       154
weighted avg       0.73      0.74      0.73       154

Confusion Matrix:
 [[84 16]
 [24 30]]


In [24]:
sample_data = [[6,148,72,35,0,33.6,0.627,50]]
sample_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
sample_df = pd.DataFrame(sample_data, columns=sample_columns)

In [25]:
sample_prediction = best_model_small.predict(sample_df)
sample_prediction_proba = best_model_small.predict_proba(sample_df)

In [26]:
print("\nNew Sample Prediction")
print("Prediction (0=No Diabetes, 1=Diabetes):", sample_prediction[0])
print("Prediction Probability (No Diabetes, Diabetes):", sample_prediction_proba[0])


New Sample Prediction
Prediction (0=No Diabetes, 1=Diabetes): 1
Prediction Probability (No Diabetes, Diabetes): [0.20398971 0.79601029]
