**Loading the Dataset**

In [72]:
data = pd.read_csv("../input/diabetes-data-set/diabetes.csv")
data.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [73]:
print("Name\t\t\t\tData\n")
print(data.dtypes)   #Checking the data type


Name				Data

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object


**Checking if the data is corrupted**

In [74]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


After looking at the table, we can notice that the data is corrupted, as it shows 0.000000 for few sections.

In [75]:
#Deal with the corrupted data:
corrupted_data = data.loc[:,'Glucose':'BMI']
corrupted_data.replace(0,np.nan,inplace=True)
data.loc[:,'Glucose':'BMI'] = corrupted_data
data.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

**Table after dealing with the corrupted data**

In [76]:
data.describe()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.535641,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,141.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [77]:
#we will drop the rows of the missing values.
data.dropna(subset=['Glucose','BMI'],inplace=True)

In [78]:
from imblearn.over_sampling import SMOTE  #to balance the class distribution
import matplotlib.pyplot as plt  #To make changes to the figure
import seaborn as sns
sns.set()
%matplotlib inline
from sklearn.impute import KNNImputer
data_knn = data.copy()
knn_imputer = KNNImputer(n_neighbors=5)
data_knn.iloc[:,:] = knn_imputer.fit_transform(data_knn)
cleaned_data = data_knn.copy()

In [79]:
data_knn.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,752.0,752.0,752.0,752.0,752.0,752.0,752.0,752.0,752.0
mean,3.851064,121.941489,72.41383,29.102128,154.656649,32.454654,0.473051,33.3125,0.351064
std,3.375189,30.601198,12.223965,9.473877,98.872359,6.928926,0.330108,11.709395,0.477621
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,23.0,90.0,27.5,0.244,24.0,0.0
50%,3.0,117.0,72.0,29.0,135.0,32.3,0.377,29.0,0.0
75%,6.0,141.0,80.0,34.65,191.85,36.6,0.6275,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


**Random Forest Classification**

In [80]:
from sklearn.model_selection import train_test_split,  cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestClassifier #To control over fitting and use averaging to improve predictive accuracy.

In [81]:
X = cleaned_data.drop('Outcome',axis=1)  
y = cleaned_data['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,stratify=y,random_state=42)

In [82]:
sm = SMOTE(random_state=42)
x_res,y_res=sm.fit_resample(X_train,y_train)

In [83]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
estimator = RandomForestClassifier(n_estimators=300)

params = { 'max_depth' : range(5,20),
          'min_samples_split' : np.arange(2,10)
}
grid = GridSearchCV(estimator, params, cv=5,verbose=True,n_jobs=-1,scoring='f1')  #Cross validation method to find the optimal parameter values.

In [84]:
grid.fit(x_res,y_res)


Fitting 5 folds for each of 120 candidates, totalling 600 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(n_estimators=300),
             n_jobs=-1,
             param_grid={'max_depth': range(5, 20),
                         'min_samples_split': array([2, 3, 4, 5, 6, 7, 8, 9])},
             scoring='f1', verbose=True)

In [85]:
y_pred = grid.predict(X_test)

In [86]:
from sklearn.model_selection import GridSearchCV
#grid = GridSearchCV(estimator, params, cv=5,verbose=True,n_jobs=-1,scoring='recall')

grid.best_params_


{'max_depth': 14, 'min_samples_split': 4}

**Finding the accuracy**

In [87]:
grid.best_score_

0.8227936643215823

In [88]:
rfctuned = RandomForestClassifier(max_depth=8,max_features=8,
                                 min_samples_split=2,n_estimators=1000).fit(X_train,y_train)
predicttuned = rfctuned.predict(X_test)

R2CVtuned = cross_val_score(rfctuned,X_test,y_test,cv=10).mean()
print(R2CVtuned)
errortuned = -cross_val_score(rfctuned,X_test,y_test,cv=10,scoring="neg_mean_squared_error").mean()
print(np.sqrt(errortuned))

0.7068333333333332
0.5451299539253613


It shows 70% accuracy

In [89]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.81      0.83      0.82       162
         1.0       0.67      0.63      0.65        87

    accuracy                           0.76       249
   macro avg       0.74      0.73      0.74       249
weighted avg       0.76      0.76      0.76       249

