In [874]:
import pandas as pd
import numpy as np
import seaborn as sns

In [875]:
data = pd.read_csv("C:/Users/yathi/GitRepos/DataScienceLearning/DataSets/pima_india_diabetes.csv")

In [876]:
data.head()
data.shape

(768, 9)

In [877]:
# First Dealing with Missing values... represented in 0's 
# Where some are real zeros and others not
# Real zero features -> Pregnancies, DPF
# Non Real Zero features -> Glucose, BloodPressure, SkinThikness, Insulin, Age, BMI

In [878]:
# So replacing it with the median value 

In [879]:
(data == 0).sum()

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64

In [880]:
median_G = data['Glucose'].median()
data['Glucose'] = data['Glucose'].replace(0, median_G)

In [881]:
median_B = data['BloodPressure'].median()
data['BloodPressure'] = data['BloodPressure'].replace(0, median_B)

In [882]:
median_S = data['SkinThickness'].median()
data['SkinThickness'] = data['SkinThickness'].replace(0, median_S)

In [883]:
median_B = data['BloodPressure'].median()
data['BloodPressure'] = data['BloodPressure'].replace(0, median_B)

In [884]:
median_I = data['Insulin'].median()
data['Insulin'] = data['Insulin'].replace(0, median_I)

In [885]:
median_BMI = data['BMI'].median()
data['BMI'] = data['BMI'].replace(0, median_BMI)

In [886]:
# checking for wrong data [Outliers]

In [887]:
# values which are rare and biologically impossible so we remove those values
# pregnancies   -    > 10
# Glucose       -    >180
# BloodPressure -    <40, >110
# SkinThickness -    >50
# Insulin       -    >300
# BMI           -    >55
# DPF           -    >2.0
# Age           -    >100
data = data[data['Pregnancies'] <= 10]
data = data[data['Glucose'] <= 180]
data = data[(data['BloodPressure'] >= 40) & (data['BloodPressure'] <= 110)]
data = data[data['SkinThickness'] <= 50]
data = data[data['Insulin'] <= 300]
data = data[data['BMI'] <= 55]
data = data[data['DiabetesPedigreeFunction'] <= 2.0]
data = data[data['Age'] <= 100]
data = data.reset_index(drop=True)

In [888]:
data.shape

(651, 9)

In [889]:
(data['Age'] > 100).sum()

np.int64(0)

In [890]:
# Dealing with duplicated values
data.duplicated().sum()

np.int64(0)

In [891]:
# When do we need normalization, is there any specific algorithm which needs normalization or else we need to do it for all

In [892]:
x = data.drop('Outcome', axis = 1)
y = data['Outcome']

In [893]:
from sklearn.model_selection import train_test_split
x_train,x_test, y_train, y_test = train_test_split(x, y, random_state=91, test_size = 0.2)

In [894]:
y_train.value_counts()

Outcome
0    370
1    150
Name: count, dtype: int64

In [895]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
x_train, y_train = smote.fit_resample(x_train, y_train)

In [922]:
y_test.value_counts().sum()

np.int64(131)

## Decision Tree Classifier

In [897]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
modelDT = DecisionTreeClassifier()

modelDT.fit(x_train,y_train)
y_pred = modelDT.predict(x_test)
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.67      0.75      0.71        77
           1       0.57      0.46      0.51        54

    accuracy                           0.63       131
   macro avg       0.62      0.61      0.61       131
weighted avg       0.63      0.63      0.63       131



## Random Forest Classifier

In [898]:
from sklearn.ensemble import RandomForestClassifier
modelRF = RandomForestClassifier(n_estimators=5)

modelRF.fit(x_train,y_train)
y_pred = modelRF.predict(x_test)
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.74      0.80      0.77        80
           1       0.64      0.55      0.59        51

    accuracy                           0.70       131
   macro avg       0.69      0.67      0.68       131
weighted avg       0.70      0.70      0.70       131



## Logistic regression

In [912]:
from sklearn.linear_model import LogisticRegression
modelLR = LogisticRegression()

modelLR.fit(x_train,y_train)
y_pred = modelLR.predict(x_test)
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.69      0.81      0.75        74
           1       0.68      0.53      0.59        57

    accuracy                           0.69       131
   macro avg       0.69      0.67      0.67       131
weighted avg       0.69      0.69      0.68       131



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Naive Bayes 

In [900]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(x_train,y_train)
ypred = model.predict(x_test)
print(classification_report(ypred,y_test))

              precision    recall  f1-score   support

           0       0.68      0.88      0.77        67
           1       0.82      0.56      0.67        64

    accuracy                           0.73       131
   macro avg       0.75      0.72      0.72       131
weighted avg       0.75      0.73      0.72       131



## K - Nearest Neigbours

In [901]:
from sklearn.neighbors import KNeighborsClassifier
modelKnn = KNeighborsClassifier(n_neighbors=5)
modelKnn.fit(x_train,y_train)
ypred = modelKnn.predict(x_test)
print(classification_report(ypred,y_test))

              precision    recall  f1-score   support

           0       0.62      0.82      0.71        66
           1       0.73      0.49      0.59        65

    accuracy                           0.66       131
   macro avg       0.67      0.66      0.65       131
weighted avg       0.67      0.66      0.65       131

