## KNN, Logistic Regression, Random Forest - Diabetes Template
*Source: https://www.youtube.com/watch?v=4HKqjENq9OU&list=PLEiEAq2VkUULYYgj13YHUWmRePqiu8Ddy&index=22*

In [1]:
#Importing the Libraries 
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [2]:
# Load the diabetes dataset from a CSV file in the working directory
dataset=pd.read_csv('diabetes.csv')
# Number of rows (observations) in the dataset
len(dataset)

768

In [3]:
# Dimensions of the DataFrame as (rows, columns)
dataset.shape

(768, 9)

### Analysing and Preparing the Data

In [4]:
# Preview the first rows of the dataset
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
# Count the missing (NaN) values in each column
dataset.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
# Although there are no null values in the dataframe, several columns use 0 as a
# placeholder for a missing measurement: a living patient cannot have zero
# glucose, blood pressure, skin thickness, insulin or BMI.
# (Zeros in Pregnancies and Outcome are legitimate values.)

# Count the number of zeros in each column
dataset.isin([0]).sum()

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64

In [8]:
# Treat physiologically impossible zeros as missing values and impute them
# with the column mean.
# NOTE(review): the mean is computed over the full dataset, i.e. before the
# train/test split, so test rows influence the imputed values. Consider
# imputing after the split if a strict evaluation is required.
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']
for column in zero_not_accepted:
    # Replace zeros with NaN so they are ignored when computing the mean.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    dataset[column] = dataset[column].replace(0, np.nan)
    # Mean of the remaining (non-missing) values, truncated to an integer
    mean = int(dataset[column].mean(skipna=True))
    # Fill the missing entries with that mean
    dataset[column] = dataset[column].fillna(mean)

In [9]:
# Re-count the zeros in each column to check the imputed columns no longer contain any
dataset.isin([0]).sum()

Pregnancies                 111
Glucose                       0
BloodPressure                 0
SkinThickness                 0
Insulin                       0
BMI                           0
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64

### Extracting the Dependent and Independent Variables

In [10]:
# Extracting the independent variables - the first 8 columns
# (Pregnancies through Age) as a NumPy array
X = dataset.iloc[:,0:8].values
X

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])

In [11]:
# Extracting the dependent variable - the Outcome column (column index 8)
y = dataset.iloc[:,8].values
y

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,

### Splitting the Dataset into Train and Test

In [12]:
# Train/test split
# Dividing the data into train (60%) and test (40%) sets; random_state is fixed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=43)

### Scaling the Data

In [13]:
# Feature scaling - standardises each feature to zero mean and unit variance
# (the scaled values are not bounded to the range -1 to 1).
# The scaler is fitted on the training set only and then applied to the test set.
sc_X=StandardScaler()
X_train=sc_X.fit_transform(X_train)
X_test=sc_X.transform(X_test)

## Choosing a Model - KNN

### Fitting the Model to the Training Set

In [14]:
# If you are unsure of what n_neighbors should be, a common rule of thumb is to
# take the square root of the sample count and prefer an odd number over an even one.
# NOTE(review): the value computed here (about 17.5) comes from the test set size,
# while the model is created with n_neighbors=11 - confirm which k is intended.
import math
math.sqrt(len(y_test))

17.549928774784245

In [15]:
# Creating the KNN model object
# Specify the number of neighbours as 11 initially; this parameter can be tweaked as needed.
knn_model=KNeighborsClassifier(n_neighbors=11)

In [16]:
# Fit the KNN model on the scaled training data
knn_model.fit(X_train,y_train)

### Predicting the Test Data

In [17]:
# Predict the class labels for the scaled test data with the KNN model
knn_predictions=knn_model.predict(X_test)

### Evaluating the Model

In [18]:
# Evaluate the KNN predictions against the held-out test labels:
# compute the three metrics first, then print them in the same order.
knn_cm = confusion_matrix(y_test, knn_predictions)
knn_report = classification_report(y_test, knn_predictions)
knn_accuracy = accuracy_score(y_test, knn_predictions)

print('Confusion Matrix \n', knn_cm)
print('\n')
print('Classification Report \n', knn_report)
print('\n')
print('Accuracy Of Our Model ', knn_accuracy)

Confusion Matrix 
 [[178  27]
 [ 42  61]]


Classification Report 
               precision    recall  f1-score   support

           0       0.81      0.87      0.84       205
           1       0.69      0.59      0.64       103

    accuracy                           0.78       308
   macro avg       0.75      0.73      0.74       308
weighted avg       0.77      0.78      0.77       308



Accuracy Of Our Model  0.775974025974026


## Choosing a Model - Logistic Regression  
The relevant library has been included at the top of the Notebook

### Fitting the Model to the Training Set

In [19]:
# Create the Logistic Regression model object and fit it on the scaled training data
classifier=LogisticRegression(random_state=0)
classifier.fit(X_train,y_train)


### Predicting the Test Data

In [20]:
# Predict the class labels for the scaled test data with the Logistic Regression model
y_pred=classifier.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,

### Evaluating the Model

In [21]:
# Evaluate using confusion matrix, classification report and accuracy score
# NOTE: these metrics are already imported in the first cell; this re-import is redundant but harmless
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [22]:
# Evaluate the Logistic Regression predictions against the test labels:
# compute the three metrics first, then print them in the same order.
logreg_cm = confusion_matrix(y_test, y_pred)
logreg_report = classification_report(y_test, y_pred)
logreg_accuracy = accuracy_score(y_test, y_pred)

print('Confusion Matrix \n', logreg_cm)
print('\n')
print('Classification Report\n', logreg_report)
print('\n')
print('Accuracy Of Our Model', logreg_accuracy)

Confusion Matrix 
 [[185  20]
 [ 46  57]]


Classification Report
               precision    recall  f1-score   support

           0       0.80      0.90      0.85       205
           1       0.74      0.55      0.63       103

    accuracy                           0.79       308
   macro avg       0.77      0.73      0.74       308
weighted avg       0.78      0.79      0.78       308



Accuracy Of Our Model 0.7857142857142857


## Choosing a Model - Random Forest

### Fitting the Model to the Training Set

In [23]:
# Create a Random Forest classifier with 100 trees.
# random_state is fixed so the results are reproducible between runs,
# consistent with the seeded train/test split and Logistic Regression model.
clf=RandomForestClassifier(n_estimators=100, random_state=0)

# Fit the model on the scaled training data
clf.fit(X_train,y_train)


### Predicting the Test Data

In [24]:
# Predict the class labels for the scaled test data with the Random Forest model
# NOTE: this overwrites the y_pred produced earlier by the Logistic Regression model
y_pred=clf.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,

### Evaluating the Model

In [25]:
# Classification report (precision, recall, f1-score, support) for the Random Forest predictions
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.80      0.90      0.85       205
           1       0.73      0.56      0.64       103

    accuracy                           0.79       308
   macro avg       0.77      0.73      0.74       308
weighted avg       0.78      0.79      0.78       308



In [26]:
# Confusion matrix for the Random Forest predictions.
# In the recorded run it corresponds to (184 + 58) / 308, i.e. about 78.6% accuracy.
confusion_matrix(y_test,y_pred)

array([[184,  21],
       [ 45,  58]], dtype=int64)

In [27]:
# Accuracy of the Random Forest predictions, expressed as a percentage
print ("Accuracy is ", accuracy_score(y_test,y_pred)*100)

Accuracy is  78.57142857142857


In [28]:
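# You can also make a prediction for a single patient. The model expects the
# same 8 features used for training, scaled with the fitted scaler, for example:
# Pregnancies = 3, Glucose = 120, BloodPressure = 70, SkinThickness = 25,
# Insulin = 100, BMI = 30.0, DiabetesPedigreeFunction = 0.4, Age = 35

#clf.predict(sc_X.transform([[3, 120, 70, 25, 100, 30.0, 0.4, 35]]))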