In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [7]:
data = pd.read_csv('Dataset of Diabetes .csv')

df = data.copy()
df.head()

Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
0,502,17975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
1,735,34221,M,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,N
2,420,47975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
3,680,87656,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
4,504,34223,M,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,N


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         1000 non-null   int64  
 1   No_Pation  1000 non-null   int64  
 2   Gender     1000 non-null   object 
 3   AGE        1000 non-null   int64  
 4   Urea       1000 non-null   float64
 5   Cr         1000 non-null   int64  
 6   HbA1c      1000 non-null   float64
 7   Chol       1000 non-null   float64
 8   TG         1000 non-null   float64
 9   HDL        1000 non-null   float64
 10  LDL        1000 non-null   float64
 11  VLDL       1000 non-null   float64
 12  BMI        1000 non-null   float64
 13  CLASS      1000 non-null   object 
dtypes: float64(8), int64(4), object(2)
memory usage: 109.5+ KB


In [9]:
df['Gender'].unique()

array(['F', 'M', 'f'], dtype=object)

In [10]:
df['Gender'] = df['Gender'].replace('f', 'F')

In [11]:
df['Gender'].unique()

array(['F', 'M'], dtype=object)

In [12]:
df = df.drop(['ID', 'No_Pation'], axis=1)

In [13]:
df.head()

Unnamed: 0,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
0,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
1,M,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,N
2,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
3,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
4,M,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,N


In [14]:
df['Gender'] = df['Gender'].map({'M':1, 'F': 0})

In [15]:
df.head()

Unnamed: 0,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
0,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
1,1,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,N
2,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
3,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
4,1,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,N


In [16]:
df['CLASS'].unique()

array(['N', 'N ', 'P', 'Y', 'Y '], dtype=object)

In [None]:
df.

In [17]:
df['CLASS'] = df['CLASS'].str.strip()

In [18]:
df.CLASS.unique()

array(['N', 'P', 'Y'], dtype=object)

In [25]:
df['CLASS'] = df['CLASS'].map({'N': 0, 'Y': 1, 'P': 2})

In [26]:
df

Unnamed: 0,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
0,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,0
1,1,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,0
2,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,0
3,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,0
4,1,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,71,11.0,97,7.0,7.5,1.7,1.2,1.8,0.6,30.0,1
996,1,31,3.0,60,12.3,4.1,2.2,0.7,2.4,15.4,37.2,1
997,1,30,7.1,81,6.7,4.1,1.1,1.2,2.4,8.1,27.4,1
998,1,38,5.8,59,6.7,5.3,2.0,1.6,2.9,14.0,40.5,1


In [28]:
df.CLASS.unique()

array([0, 2, 1])

In [30]:
X = df.iloc[:, 0:-1]
Y = df.iloc[:, -1]

X.head()

Unnamed: 0,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI
0,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0
1,1,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0
2,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0
3,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0
4,1,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0


In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

model = RandomForestClassifier(n_estimators=150)

params = {
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None],
    'class_weight': ['balanced', 'balanced_subsample']
}

grid_search = GridSearchCV(model, params, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

print(f'Best params: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')
print(f'Best estimators: {grid_search.best_estimator_}')

Best params: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_features': None}
Best score: 0.9871428571428572
Best estimators: RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_features=None, n_estimators=150)


In [38]:
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 35   1   0]
 [  1 253   0]
 [  0   0  10]]
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        36
           1       1.00      1.00      1.00       254
           2       1.00      1.00      1.00        10

    accuracy                           0.99       300
   macro avg       0.99      0.99      0.99       300
weighted avg       0.99      0.99      0.99       300



In [39]:
y_pred2 = best_model.predict(X_train)

print(confusion_matrix(y_train, y_pred2))
print(classification_report(y_train, y_pred2))

[[ 67   0   0]
 [  0 590   0]
 [  0   0  43]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        67
           1       1.00      1.00      1.00       590
           2       1.00      1.00      1.00        43

    accuracy                           1.00       700
   macro avg       1.00      1.00      1.00       700
weighted avg       1.00      1.00      1.00       700



In [None]:
import pickle

file = 'DiabetesModel.pkl'
with open(file, 'wb') as f:
    pick