In [52]:
import pandas as pd
import numpy as np
import seaborn as sns

In [53]:
data = pd.read_csv("C:/Users/yathi/GitRepos/DataScienceLearning/DataSets/pima_india_diabetes.csv")

In [55]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [26]:
# First Dealing with Missing values... represented in 0's 
# Where some are real zeros and others not
# Real zero features -> Pregnancies, DPF
# Non Real Zero features -> Glucose, BloodPressure, SkinThikness, Insulin, Age, BMI
# So replacing it with the median value 

In [27]:
data.corr()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
Outcome,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


In [28]:
data['Glucose'].mean()

np.float64(120.89453125)

In [29]:
(data == 0).sum()

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64

In [30]:
median_G = data['Glucose'].median()
data['Glucose'] = data['Glucose'].replace(0, median_G)

In [31]:
median_B = data['BloodPressure'].median()
data['BloodPressure'] = data['BloodPressure'].replace(0, median_B)

In [32]:
median_S = data['SkinThickness'].median()
data['SkinThickness'] = data['SkinThickness'].replace(0, median_S)

In [33]:
median_B = data['BloodPressure'].median()
data['BloodPressure'] = data['BloodPressure'].replace(0, median_B)

In [34]:
median_I = data['Insulin'].median()
data['Insulin'] = data['Insulin'].replace(0, median_I)

In [35]:
median_BMI = data['BMI'].median()
data['BMI'] = data['BMI'].replace(0, median_BMI)

In [36]:
# checking for wrong data [Outliers]

In [37]:
# values which are rare and biologically impossible so we remove those values
# pregnancies   -    > 10
# Glucose       -    >180
# BloodPressure -    <40, >110
# SkinThickness -    >50
# Insulin       -    >300
# BMI           -    >55
# DPF           -    >2.0
# Age           -    >100
data = data[data['Pregnancies'] <= 10]
data = data[data['Glucose'] <= 180]
data = data[(data['BloodPressure'] >= 40) & (data['BloodPressure'] <= 110)]
data = data[data['SkinThickness'] <= 50]
data = data[data['Insulin'] <= 300]
data = data[data['BMI'] <= 55]
data = data[data['DiabetesPedigreeFunction'] <= 2.0]
data = data[data['Age'] <= 100]
data = data.reset_index(drop=True)

In [38]:
data.shape

(651, 9)

In [39]:
(data['Age'] > 100).sum()

np.int64(0)

In [40]:
# Dealing with duplicated values
data.duplicated().sum()

np.int64(0)

In [41]:
# Do we need to consider mean / median for the replacement of null's by looking at their distibution
# OR what factors do we need to consider
# Naive bayes is applied in case where randomness is there... what are all the datasets which we see daily
# When do we need normalization, is there any specific algorithm which needs normalization or else we need to do it for all

In [42]:
x = data.drop('Outcome', axis = 1)
y = data['Outcome']

In [43]:
from sklearn.model_selection import train_test_split
x_train,x_test, y_train, y_test = train_test_split(x, y, random_state=91, test_size = 0.2)

In [44]:
y_train.value_counts()

Outcome
0    370
1    150
Name: count, dtype: int64

In [45]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
x_train, y_train = smote.fit_resample(x_train, y_train)

In [46]:
y_test.value_counts().sum()

np.int64(131)

## Decision Tree Classifier

In [47]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
modelDT = DecisionTreeClassifier()

modelDT.fit(x_train,y_train)
y_pred = modelDT.predict(x_test)
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.68      0.82      0.74        72
           1       0.70      0.53      0.60        59

    accuracy                           0.69       131
   macro avg       0.69      0.67      0.67       131
weighted avg       0.69      0.69      0.68       131



## Random Forest Classifier

In [48]:
from sklearn.ensemble import RandomForestClassifier
modelRF = RandomForestClassifier(n_estimators=5)

modelRF.fit(x_train,y_train)
y_pred = modelRF.predict(x_test)
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.70      0.81      0.75        75
           1       0.68      0.54      0.60        56

    accuracy                           0.69       131
   macro avg       0.69      0.67      0.68       131
weighted avg       0.69      0.69      0.69       131



## Logistic regression

In [49]:
from sklearn.linear_model import LogisticRegression
modelLR = LogisticRegression()

modelLR.fit(x_train,y_train)
y_pred = modelLR.predict(x_test)
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.67      0.85      0.75        68
           1       0.77      0.54      0.64        63

    accuracy                           0.70       131
   macro avg       0.72      0.70      0.69       131
weighted avg       0.72      0.70      0.69       131



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Naive Bayes 

In [50]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(x_train,y_train)
ypred = model.predict(x_test)
print(classification_report(ypred,y_test))

              precision    recall  f1-score   support

           0       0.68      0.89      0.77        66
           1       0.84      0.57      0.68        65

    accuracy                           0.73       131
   macro avg       0.76      0.73      0.73       131
weighted avg       0.76      0.73      0.73       131



## K - Nearest Neigbours

In [51]:
from sklearn.neighbors import KNeighborsClassifier
modelKnn = KNeighborsClassifier(n_neighbors=5)
modelKnn.fit(x_train,y_train)
ypred = modelKnn.predict(x_test)
print(classification_report(ypred,y_test))

              precision    recall  f1-score   support

           0       0.63      0.83      0.72        66
           1       0.75      0.51      0.61        65

    accuracy                           0.67       131
   macro avg       0.69      0.67      0.66       131
weighted avg       0.69      0.67      0.66       131

