### Diabetes Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sortedcontainers import SortedList
%matplotlib inline

In [3]:
data = pd.read_csv('diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [6]:
data['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

### First lets try using Scikit Learn method

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score,precision_score

In [9]:
X = data.drop('Outcome',axis=1)
y = data['Outcome']

In [10]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [13]:
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((537, 8), (537,), (231, 8), (231,))

In [39]:
KNN = KNeighborsClassifier(n_neighbors=12)

In [40]:
KNN.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=12, p=2,
                     weights='uniform')

In [41]:
y_pred = KNN.predict(X_test)

In [42]:
print("Recall value is :",recall_score(y_test,y_pred))

Recall value is : 0.475


In [43]:
print("Precision value is :",precision_score(y_test,y_pred))

Precision value is : 0.6785714285714286


### Lets select the value of K using cross validation method

In [20]:
from sklearn.model_selection import cross_val_score

In [34]:
cross_val_values = []
for i in range(5,22):
    score = cross_val_score(KNN,X_train,y_train,cv=i)
    cross_val_values.append(score.mean())

In [38]:
cross_val_values

[0.7298892350294219,
 0.722513524760716,
 0.7375500439410213,
 0.7374067164179106,
 0.7320150659133708,
 0.7338574423480083,
 0.735621521335807,
 0.7394360269360271,
 0.7392120075046905,
 0.737372276845961,
 0.7393121693121693,
 0.7430369875222816,
 0.7338709677419355,
 0.735632183908046,
 0.7371013741249676,
 0.7337606837606837,
 0.7356043956043956]

### Lets try writing our own Algorithm

In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sortedcontainers import SortedList
%matplotlib inline

In [62]:
def get_data(limit=None):
    print("Reading the dataset")
    df = pd.read_csv('diabetes.csv')
    X = df.drop('Outcome',axis=1).values
    Y = df['Outcome'].values
    
    if limit is not None:
        X,Y = X[:limit],Y[:limit]
    return X,Y

In [73]:
class KNN(object):
    
    def __init__(self,k):
        self.k = k
    
    def fit(self,X,Y):
        self.X = X
        self.Y = Y
    
    def score(self,X,Y):
        P = self.predict(X)
        return np.mean(Y ==P)
    
    def predict(self,X):
        y = np.zeros(len(X))
        for i,x in enumerate(X):
            sl = SortedList()
            for j,xt in enumerate(X):
                diff = x-xt
                d = diff.dot(diff)
                
                if len(sl) < self.k:
                    sl.add((d,self.Y[j]))
                else:
                    if d < sl[-1][0]:
                        del sl[-1]
                        sl.add((d,self.Y[j]))
            values = {}
            for _,v in sl:
                values[v] = values.get(v,0)+1
            
            max_count = 0
            max_class =-1
            
            for v,count in values.items():
                if count > max_count:
                    max_count = count
                    max_class = v
            y[i] = max_class
        return y

if __name__ == "__main__":
    X,Y = get_data()
    N =500
    Xtrain,Ytrain = X[:N],Y[:N]
    Xtest,Ytest = X[N:],Y[N:]
    
    for k in range(3,10):
        t0 = datetime.now()
        knn = KNN(k)
        knn.fit(Xtrain,Ytrain)
        print("Time taken to fit the model is :",datetime.now()-t0)
        
        t1 = datetime.now()
        print("Score is",knn.score(Xtrain,Ytrain))
        print("Time for train score is :",datetime.now()-t1)
        
        t1 = datetime.now()
        print("Score is",knn.score(Xtest,Ytest))
        print("Time for test score is :",datetime.now()-t1)
        
        print('----------------------------------------------------------')

Reading the dataset
Time taken to fit the model is : 0:00:00
Score is 0.838
Time for train score is : 0:00:00.859235
Score is 0.5522388059701493
Time for test score is : 0:00:00.235268
----------------------------------------------------------
Time taken to fit the model is : 0:00:00
Score is 0.892
Time for train score is : 0:00:00.756234
Score is 0.5447761194029851
Time for test score is : 0:00:00.226829
----------------------------------------------------------
Time taken to fit the model is : 0:00:00
Score is 0.786
Time for train score is : 0:00:00.741462
Score is 0.6194029850746269
Time for test score is : 0:00:00.255604
----------------------------------------------------------
Time taken to fit the model is : 0:00:00
Score is 0.848
Time for train score is : 0:00:00.940943
Score is 0.5932835820895522
Time for test score is : 0:00:00.232775
----------------------------------------------------------
Time taken to fit the model is : 0:00:00
Score is 0.77
Time for train score is : 0:0

In [68]:
X,Y = get_data()

Reading the dataset


In [69]:
X.shape

(768, 8)

In [70]:
Y.shape

(768,)