## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier

In [2]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

In [3]:
# Load the dataset from 'diabetes.csv' (path is relative to the working directory).
df = pd.read_csv('diabetes.csv')

In [4]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [6]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
# Columns in which a value of 0 is treated as a missing reading.
replace_zero = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

### Replacing zero values (treated as missing) with the mean value of the column

In [8]:
# Treat 0 in each listed column as a missing reading and impute it with the
# column mean computed over the remaining (non-missing) values.
for column in replace_zero:
    # Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises
    # AttributeError there.
    df[column] = df[column].replace(0, np.nan)
    # int() truncates the mean, so the fill value is a whole number even for
    # the float-valued BMI column.
    mean = int(df[column].mean(skipna=True))
    df[column] = df[column].fillna(mean)

In [9]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.682292,72.386719,29.108073,155.28125,32.450911,0.471876,33.240885,0.348958
std,3.369578,30.435999,12.096642,8.791221,85.02155,6.875366,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,25.0,121.5,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,155.0,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,155.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [10]:
# Select features and target by column name rather than by position, so the
# selection stays correct if columns are reordered or new ones are added.
# With the current 9-column frame this yields the same x (first 8 columns)
# and y (the 'Outcome' column) as positional slicing.
x = df.drop(['Outcome'], axis=1)
y = df['Outcome']

In [11]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [12]:
x_test.head() 

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
661,1,199.0,76.0,43.0,155.0,42.9,1.394,22
122,2,107.0,74.0,30.0,100.0,33.6,0.404,23
113,4,76.0,62.0,29.0,155.0,34.0,0.391,25
14,5,166.0,72.0,19.0,175.0,25.8,0.587,51
529,0,111.0,65.0,29.0,155.0,24.6,0.66,31


In [17]:
# Learn the scaling parameters from the training features only, then apply
# that same fitted scaler to both the training and the test features.
sc_x = StandardScaler().fit(x_train)
x_train = sc_x.transform(x_train)
x_test = sc_x.transform(x_test)

In [18]:
import math
math.sqrt(len(y_test))

13.856406460551018

Here we train on the training dataset with k = 13, i.e. sqrt(len(y_test)) ≈ 13.86 rounded down,
because the predictions are made on the test data.

In [19]:
# k-nearest-neighbours classifier: 13 neighbours, Euclidean distance.
classifier = KNeighborsClassifier(
    n_neighbors=13,
    p=2,
    metric='euclidean',
)

In [20]:
classifier.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=None, n_neighbors=13, p=2,
           weights='uniform')

In [21]:
x_test[1]

array([-0.53768687, -0.49971175,  0.10046323,  0.07437116, -0.65687004,
        0.13572466, -0.19434743, -0.88246592])

In [22]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [23]:
df.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,155.0,23.3,0.672,32,1


In [24]:
y_pred = classifier.predict(x_test)  # predicted labels for the standardized test features

In [25]:
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0])

In [26]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[115,  15],
       [ 25,  37]])

In [27]:
f1_score(y_test, y_pred)

0.6491228070175439

In [28]:
accuracy_score(y_test, y_pred)

0.7916666666666666

## Trying to Standardize the dataset values

In [29]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [30]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [31]:
# Feature-only copy of the frame: every column except the 'Outcome' target.
dfd = df.drop(columns=['Outcome'])

In [32]:
# Fit a fresh scaler on every row of dfd (the feature columns of df).
# NOTE(review): this fit runs before the later train_test_split, so rows that
# end up in the test split also contribute to the scaling statistics -
# confirm whether that is intended.
scaler=StandardScaler()
scaler.fit(dfd)

  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [34]:
scaled_data=scaler.transform(dfd)

  """Entry point for launching an IPython kernel.


In [35]:
scaled_data[1]

array([-0.84488505, -1.20601255, -0.5283186 , -0.01230129, -0.00331014,
       -0.85155088, -0.36506078, -0.19067191])

## Using Principal Component Analysis for Dimensionality Reduction

In [36]:
from sklearn.decomposition import PCA

In [37]:
pca=PCA(n_components=3)

In [38]:
# Learn the principal components from the standardized feature matrix.
# NOTE(review): scaled_data holds all rows, and the train/test split happens
# only afterwards, so test rows influence the fitted components - confirm
# whether that is intended.
pca.fit(scaled_data)

PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [39]:
x_pca=pca.transform(scaled_data)

In [40]:
scaled_data.shape

(768, 8)

In [41]:
x_pca.shape

(768, 3)

In [42]:
# Wrap the three PCA columns in a labelled DataFrame for inspection.
dataset = pd.DataFrame(
    {
        'First principle component': x_pca[:, 0],
        'Second principle component': x_pca[:, 1],
        'Third principle component': x_pca[:, 2],
    }
)

In [43]:
dataset.head(2)

Unnamed: 0,First principle component,Second principle component,Third principle component
0,1.505045,-0.587451,0.189931
1,-1.436222,0.08157,-0.10466


In [44]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier

In [45]:
# Rebind the feature matrix to the PCA projection; the target is still 'Outcome'.
x, y = x_pca, df['Outcome']

In [46]:
# Split the PCA features with the same seed and 25% test fraction as the first split.
xx_train, xx_test, yy_train, yy_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [47]:
import math
math.sqrt(len(yy_test))

13.856406460551018

In [48]:
# New k-nearest-neighbours classifier for the PCA features: 13 neighbours,
# Euclidean distance. This rebinds the name `classifier`.
classifier = KNeighborsClassifier(
    n_neighbors=13,
    p=2,
    metric='euclidean',
)

In [49]:
classifier.fit(xx_train, yy_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=None, n_neighbors=13, p=2,
           weights='uniform')

In [50]:
yy_pred = classifier.predict(xx_test)

In [51]:
accuracy_score(yy_test, yy_pred)

0.7604166666666666

## Result: 

       Without Dimensionality Reduction (8 columns): Accuracy 79.17%
       With Dimensionality Reduction using PCA (3 columns): Accuracy 76.04%
            
This result is pretty good: we lose less than 4 percentage points of accuracy compared to the non-PCA method while using 3 features instead of 8.