<a href="https://colab.research.google.com/github/Mjcherono/TrialProjects/blob/main/The_K_Nearest_Neighbours_(KNN).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<font color="green">*To start working on this notebook, or any other notebook that we will use in the Moringa Data Science Course, we will need to save our own copy of it. We can do this by clicking File > Save a Copy in Drive. We will then be able to make edits to our own copy of this notebook.*</font>

# Python Programming: The K-Nearest Neighbours (KNN)

## Examples

### Example 1: Classification

In [32]:
# Example 
# ---
# Question: Predict the class to which these plants belong. 
# There are three classes in the dataset: Iris-setosa, Iris-versicolor and Iris-virginica. 
# ---
# Dataset url = http://bit.ly/DatasetIris
# ---
# 


In [33]:
# Importing our libraries
# ---
# 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [34]:
# Loading our dataset
# ---
# 

# Assign colum names to the dataset
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']

# Read dataset to pandas dataframe
dataset = pd.read_csv("http://bit.ly/DatasetIris", names = names)

In [35]:
# Previewing our datset
# ---
# 
dataset.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [36]:
# Splitting our dataset into its attributes and labels
# ---
# The X variable contains the first four columns of the dataset (i.e. attributes) while y contains the labels.
# ---
# 
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

In [37]:
# Train Test Split
# ---
# To avoid over-fitting, we will divide our dataset into training and test splits, 
# which gives us a better idea as to how our algorithm performed during the testing phase. 
# This way our algorithm is tested on un-seen data
# ---
# 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [38]:
# Feature Scaling
# ---
# Before making any actual predictions, it is always a good practice to scale the features 
# so that all of them can be uniformly evaluated.
# ---
# 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [39]:
# Training and Predictions
# ---
# The first step is to import the KNeighborsClassifier class from the sklearn.neighbors library. 
# In the second line, this class is initialized with one parameter, i.e. n_neigbours. 
# This is basically the value for the K. There is no ideal value for K and it is selected after testing and evaluation, 
# however to start out, 5 seems to be the most commonly used value for KNN algorithm.
# ---
# 
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [40]:
# The final step is to make predictions on our test data
# ---
# 
y_pred = classifier.predict(X_test)

In [41]:
# Evaluating the Algorithm
# ---
# For evaluating an algorithm, confusion matrix, precision, recall and f1 score are the most commonly used metrics. 
# The confusion_matrix and classification_report methods of the sklearn.metrics can be used to calculate these metrics. 
# ---
# 
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[10  0  0]
 [ 0 11  0]
 [ 0  1  8]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       0.92      1.00      0.96        11
 Iris-virginica       1.00      0.89      0.94         9

       accuracy                           0.97        30
      macro avg       0.97      0.96      0.97        30
   weighted avg       0.97      0.97      0.97        30



### Example 2: Regression

In [42]:
# Example 2
# ---
# Question: Predict the age of a voter through the use of other variables in the dataset.
# ---
# 


In [43]:
# First installing pydataset
# ---
#pip install pydataset 

In [44]:
# Then loading our libraries
# 
#from pydataset import data
#import pandas as pd
#from sklearn.model_selection import train_test_split
#from sklearn.neighbors import KNeighborsRegressor
#from sklearn.metrics import mean_squared_error

In [45]:
# Previewing our turnout dataset
# ---
# 
#df = data("turnout")
#df.head()

In [46]:
# Determining the size of the dataset
# 
#df.shape

In [47]:
# Splitting our data
# ---
# 
#X = df[['age','income','vote']]
#y = df['educate']
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)

In [48]:
# Training our algorithm
# ---
# 
#clf = KNeighborsRegressor(11)
#clf.fit(X_train, y_train)

In [49]:
# Making our prediction
# ---
# 
#y_pred = clf.predict(X_test)
#print(mean_squared_error(y_test, y_pred))

## <font color="green">Challenge 1</font>

In [70]:
# Challenge 1
# ---
# Question: Predict the income level based on the individual’s personal information in the given dataset.
# ---
# Dataset url = http://bit.ly/DatasetAdult
# ---
# 
adult = pd.read_csv('http://bit.ly/DatasetAdult')
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [None]:
#geting the value counts 
cols = ['workclass','education','marital-status','occupation','relationship','race','gender','native-country']
for i in cols:
  print(adult[i].value_counts(),'\n')

In [51]:
#label encoding
from sklearn.preprocessing import LabelEncoder

label_object = {}
categorical_columns = cols
for col in categorical_columns:
  labelencoder = LabelEncoder()
  labelencoder.fit(adult[col])
  adult[col] = labelencoder.fit_transform(adult[col])
  label_object[col] = labelencoder

In [52]:
adult.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,4,226802,1,7,4,7,3,2,1,0,0,40,39,<=50K
1,38,4,89814,11,9,2,5,0,4,1,0,0,50,39,<=50K
2,28,2,336951,7,12,2,11,0,4,1,0,0,40,39,>50K


In [53]:
X = adult.drop('income',axis=1).values
y = adult['income'].values

In [54]:
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [55]:
#standard scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [31]:
#finding number of neighbors

from sklearn.model_selection import GridSearchCV
n = np.arange(1,50)
grid = GridSearchCV(estimator=KNeighborsClassifier(), param_grid={'n_neighbors':n})
grid.fit(X,y)

GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [61]:
#best parameters
print('The best parameters ',grid.best_params_)


The best parameters  {'n_neighbors': 29}


In [62]:
#trainmodel
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=29)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=29, p=2,
                     weights='uniform')

In [63]:
#Making predictions
y_pred = classifier.predict(X_test)

In [64]:
#Evaluating the algorithm
from sklearn.metrics import classification_report, confusion_matrix
confusion_matrix(y_test, y_pred)


array([[6806,  614],
       [ 995, 1354]])

In [65]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       <=50K       0.87      0.92      0.89      7420
        >50K       0.69      0.58      0.63      2349

    accuracy                           0.84      9769
   macro avg       0.78      0.75      0.76      9769
weighted avg       0.83      0.84      0.83      9769



## <font color="green">Challenge 2</font>

In [91]:
# Challenge 2
# ---
# Question: Using KNN, predict if the client will subscribe a term deposit (variable y).
# ---
# Dataset url = http://bit.ly/DatasetBank
# ---
# Dasest info = http://bit.ly/DatasetBankInfo
# ---
# 
subscription = pd.read_csv('http://bit.ly/DatasetBank',sep=';')
subscription.head(3)


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [92]:
subscription.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [93]:
cols = ['job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week','poutcome', 'cons.price.idx','y'],

In [98]:
#Checking for workclass in the dataset
exists = 'workclass' in subscription
print(exists)

False


## <font color="green">Challenge 3</font>

In [76]:
# Challenge 3
# ---
# Question: Predict if a person will have diabetes or not using the KNN algorithm.
# ---
# Dataset url = http://bit.ly/DatasetDiabetes
# ---
# 
diabetes = pd.read_csv('http://bit.ly/DatasetDiabetes')
diabetes.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [78]:
#split features and labels
X = diabetes.iloc[:, :-1].values
y = diabetes.iloc[:, 8].values

#split to train and test
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [81]:
#standard scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [89]:
#performing LDA
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
#lda = LDA(n_components=1)
#X_train = lda.fit_transform(X_train, y_train)
#X_test = lda.transform(X_test)

In [86]:
#finding number of neighbors

from sklearn.model_selection import GridSearchCV
n = np.arange(1,50)
grid = GridSearchCV(estimator=KNeighborsClassifier(), param_grid={'n_neighbors':n})
grid.fit(X,y)

GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [87]:
#best parameters
print('The best parameters ',grid.best_params_)

The best parameters  {'n_neighbors': 14}


In [88]:
#trainmodel
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=14)
classifier.fit(X_train, y_train)

#Making predictions
y_pred = classifier.predict(X_test)

#Evaluating the algorithm
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[93 14]
 [21 26]]
              precision    recall  f1-score   support

           0       0.82      0.87      0.84       107
           1       0.65      0.55      0.60        47

    accuracy                           0.77       154
   macro avg       0.73      0.71      0.72       154
weighted avg       0.77      0.77      0.77       154



## <font color="green">Challenge 4</font>

In [None]:
# Challenge 4
# ---
# Question: Predict the miles per gallon (mpg) of a car, given its displacement and horsepower.
# ---
# Dataset Train url = http://bit.ly/AutoMPGTrainDataset
# Dataset Test url = http://bit.ly/AutoMPGTestDataset 
# ---
# 
OUR CODE GOES HERE

## <font color="green">Challenge 5</font>

In [99]:
# Challenge 6
# ---
# Question: Predict the target class given the following dataset.
# ---
# Dataset url = http://bit.ly/ClassifiedDataset
# ---
# 
target = pd.read_csv('http://bit.ly/ClassifiedDataset')
target.head(3)

Unnamed: 0.1,Unnamed: 0,WTT,PTI,EQW,SBI,LQE,QWG,FDJ,PJF,HQE,NXJ,TARGET CLASS
0,0,0.913917,1.162073,0.567946,0.755464,0.780862,0.352608,0.759697,0.643798,0.879422,1.231409,1
1,1,0.635632,1.003722,0.535342,0.825645,0.924109,0.64845,0.675334,1.013546,0.621552,1.492702,0
2,2,0.72136,1.201493,0.92199,0.855595,1.526629,0.720781,1.626351,1.154483,0.957877,1.285597,0


In [100]:
target.drop('Unnamed: 0',axis=1, inplace=True)
target.head(3)

Unnamed: 0,WTT,PTI,EQW,SBI,LQE,QWG,FDJ,PJF,HQE,NXJ,TARGET CLASS
0,0.913917,1.162073,0.567946,0.755464,0.780862,0.352608,0.759697,0.643798,0.879422,1.231409,1
1,0.635632,1.003722,0.535342,0.825645,0.924109,0.64845,0.675334,1.013546,0.621552,1.492702,0
2,0.72136,1.201493,0.92199,0.855595,1.526629,0.720781,1.626351,1.154483,0.957877,1.285597,0


In [101]:
#split features and labels
X = target.iloc[:, :-1].values
y = target.iloc[:, 10].values

#split to train and test
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#standard scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#trainmodel
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

#Making predictions
y_pred = classifier.predict(X_test)


#Evaluating the algorithm
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 79   7]
 [  7 107]]
              precision    recall  f1-score   support

           0       0.92      0.92      0.92        86
           1       0.94      0.94      0.94       114

    accuracy                           0.93       200
   macro avg       0.93      0.93      0.93       200
weighted avg       0.93      0.93      0.93       200

