In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
### Check is there any null values in the dataset

data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [4]:
data['Outcome'].unique()

## Here only target variable has 2 unique values 0/1, all other values in the dataset are continuous values

array([1, 0], dtype=int64)

### Lets First run this using KNN, using the inbuilt Scikit-Learn Libraries

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [49]:
X = data.drop('Outcome',axis=1)
Y = data['Outcome']

In [50]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=42)

In [51]:
print("Shape of X_train is :",X_train.shape)
print("Shape of X_test is :",X_test.shape)
print("Shape of Y_train is :",Y_train.shape)
print("Shape of Y_test is :",Y_test.shape)

Shape of X_train is : (537, 8)
Shape of X_test is : (231, 8)
Shape of Y_train is : (537,)
Shape of Y_test is : (231,)


Let's initialize the model first:

In [52]:
KNN = KNeighborsClassifier(n_neighbors=7)

In [53]:
## First fit the model using training dataset

KNN.fit(X_train,Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')

In [54]:
### Predict the values using X_test set

Y_pred = KNN.predict(X_test)

In [55]:
### Check the accuracy of the model

print("The accuracy of the model is :",KNN.score(X_test,Y_test))

The accuracy of the model is : 0.6926406926406926


### So,in the above we can see that the model accuracy is 69%. So now check/improve the accuarcy of the model using Cross validation.

#### 1. Cross-Validation Method

In [13]:
from sklearn.model_selection import cross_val_score

In [14]:
### create a new KNN model

KNN_cv = KNeighborsClassifier(n_neighbors=9)

### train model with cross validation of 5

cv_scores = cross_val_score(KNN_cv,X_train,Y_train,cv=5)

print(cv_scores)
print("Mean scores is :",np.mean(cv_scores))

[0.77777778 0.73148148 0.69158879 0.72897196 0.71962617]
Mean scores is : 0.7298892350294219


#### In the above, we will change the value of 'K' and see which value of 'k', will give us the higher value of mean scores.As an example I change of value of k to 3,5,7,9 and k=9 gives us the highest mean score

In [40]:
KNN_new = KNeighborsClassifier(n_neighbors=9)
KNN_new.fit(X_train,Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                     weights='uniform')

In [41]:
print("The accuracy of the model is :",KNN_new.score(X_test,Y_test))

The accuracy of the model is : 0.7012987012987013


#### So,we can see that the accuracy of the model improves to 70%

## Now Lets Try this problem by writing our own Algorithm

In [27]:
import numpy as np
import pandas as pd
from sortedcontainers import SortedList
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [33]:
def get_data(limit=None):
    df = pd.read_csv('diabetes.csv')
    data = df.values
    np.random.shuffle(data)
    
    X,Y = data[:,:-1],data[:,-1]
    
    if limit is not None:
        X,Y = X[:limit],Y[:limit]
    return X,Y

In [41]:
class KNN(object):
    
    def __init__(self,k):
        self.k = k
        
    def fit(self,X,y):
        self.X = X
        self.y = y
    
    def score(self,X,Y):
        P = self.predict(X)
        return np.mean(P==Y)
    
    def predict(self,X):
        y = np.zeros(len(X))
        
        for i,x in enumerate(X):
            SL =SortedList()
            for j , xt in enumerate(X):
                diff = x-xt
                d = diff.dot(diff)
                
                if len(SL) < self.k:
                    SL.add((d,self.y[j]))
                else:
                    if d < SL[-1][0]:
                        del SL[-1]
                        SL.add((d,self.y[j]))
            
            votes = {}
            for l,m in SL:
                votes[m] = votes.get(m,0) +1
            
            max_votes = 0
            max_votes_class=-1
            
            for v,count in votes.items():
                if count > max_votes:
                    max_votes = count
                    max_votes_class = v
            y[i] = max_votes_class
        return y
    
if __name__ == "__main__":
    X,Y = get_data()
    N =500
    
    X_train,Y_train = X[:N],Y[:N]
    X_test , Y_test = X[N:],Y[N:]
    
    for k in (3,5,7,9):
        knn = KNN(k)
        t0 = datetime.now()
        knn.fit(X_train,Y_train)
        print("Time taken for fitting the training set is :",datetime.now()-t0)
        
        t1 = datetime.now()
        print("Training accuracy is :",knn.score(X_train,Y_train))
        print("Time taken to calculate the training accuracy is:",datetime.now()-t1)
        
        t2 = datetime.now()
        print("Testing accuracy is :",knn.score(X_test,Y_test))
        print("Time taken to calculate the testing set accuracy is:",datetime.now()-t2)
        
        print("----------------------------------------------------------------------------------")

Time taken for fitting the training set is : 0:00:00
Training accuracy is : 0.858
Time taken to calculate the training accuracy is: 0:00:02.468713
Testing accuracy is : 0.5447761194029851
Time taken to calculate the testing set accuracy is: 0:00:01.074630
----------------------------------------------------------------------------------
Time taken for fitting the training set is : 0:00:00
Training accuracy is : 0.796
Time taken to calculate the training accuracy is: 0:00:02.719728
Testing accuracy is : 0.5932835820895522
Time taken to calculate the testing set accuracy is: 0:00:00.697053
----------------------------------------------------------------------------------
Time taken for fitting the training set is : 0:00:00
Training accuracy is : 0.78
Time taken to calculate the training accuracy is: 0:00:02.514950
Testing accuracy is : 0.6082089552238806
Time taken to calculate the testing set accuracy is: 0:00:00.700187
-------------------------------------------------------------------

### Above is the simple demonstration of how the algorithmn works.. Now lets see the other metrics like confusion_matrix, recall value and precision

In [42]:
from sklearn.metrics import confusion_matrix,recall_score,precision_score,roc_auc_score

In [56]:
Y_test.shape

(231,)

In [57]:
print("Recall value is :",recall_score(Y_test,Y_pred))

Recall value is : 0.575


In [58]:
print("Precision value is :",precision_score(Y_test,Y_pred))

Precision value is : 0.5542168674698795


In [59]:
print("Area under the curve value is :",roc_auc_score(Y_test,Y_pred))

Area under the curve value is : 0.6649834437086092


In [60]:
print("Confusion matrix is :\n",confusion_matrix(Y_test,Y_pred))

Confusion matrix is :
 [[114  37]
 [ 34  46]]


Now lets understand the confusion matrix and create a pay off matrix,to check which metric would be the best for our model.

1. True Positives = 46, means model predicts that the person will get the diabetes and in actual the person has the diabetes.So, in this case the team will send person the offers/promotions to get the test done. Lets say it would gonna cost them 10 dollars and when the person  will come, they will earn aroud 100 dollars on him. The net financial implication here would be 100-10 - 90 dollars
2. True Negatives = 114, means model perdicts that the person will not get the diabetes and in actaul the person does't have any diabetes. So, the team will not send any promotions and person will also not come for testing.Net financial implications here are 0 dollars.
3. False Positives = 37, means model predicts that the person will have diabetes, but in actual he doesn't have any diabetes. So,the team will send him the promotions that gonna costs them 10 dollars,but the person will not come.So, net financial implications here are -10 dollars.
4. False Negatives = 34, means model predicts that person is fine, but in actual the person has the diabetes.So, the team will not send him the offers and in turn the company would have lose the money that they would have earned over him by testing.So,net financial cost here would be -100 -10(they could have send him the offers) = -110 dollars.

Since the value of False Negatives and True Positives are very HIGH.So,our best metric would be RECALL.