<a href="https://colab.research.google.com/github/Kendi-Nceene/Supervised-Learning-The-K-Nearest-Neighbours-KNN-/blob/main/The_K_Nearest_Neighbours_(KNN).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<font color="green">*To start working on this notebook, or any other notebook that we will use in the Moringa Data Science Course, we will need to save our own copy of it. We can do this by clicking File > Save a Copy in Drive. We will then be able to make edits to our own copy of this notebook.*</font>

# Python Programming: The K-Nearest Neighbours (KNN)

## Examples

### Example 1: Classification

In [None]:
# Example 
# ---
# Question: Predict the class to which these plants belong. 
# There are three classes in the dataset: Iris-setosa, Iris-versicolor and Iris-virginica. 
# ---
# Dataset url = http://bit.ly/DatasetIris
# ---
# 


In [None]:
# Importing our libraries
# ---
# 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Loading our dataset
# ---
# 

# Assign colum names to the dataset
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']

# Read dataset to pandas dataframe
df = pd.read_csv("http://bit.ly/DatasetIris", names = names)

In [None]:
# Previewing our dataset
# ---
# 
df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [None]:
# Splitting our dataset into its attributes and labels
# ---
# The X variable contains the first four columns of the dataset (i.e. attributes) while y contains the labels.
# ---
# 
X = df.iloc[:, :-1].values
y = df.iloc[:, 4].values

In [None]:
# Train Test Split
# ---
# To avoid over-fitting, we will divide our dataset into training and test splits, 
# which gives us a better idea as to how our algorithm performed during the testing phase. 
# This way our algorithm is tested on un-seen data
# ---
# 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [None]:
# Feature Scaling
# ---
# Before making any actual predictions, it is always a good practice to scale the features 
# so that all of them can be uniformly evaluated.
# ---
# 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Training and Predictions
# ---
# The first step is to import the KNeighborsClassifier class from the sklearn.neighbors library. 
# In the second line, this class is initialized with one parameter, i.e. n_neigbours. 
# This is basically the value for the K. There is no ideal value for K and it is selected after testing and evaluation, 
# however to start out, 5 seems to be the most commonly used value for KNN algorithm.
# ---
# 
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

In [None]:
# The final step is to make predictions on our test data
# ---
# 
y_pred = classifier.predict(X_test)

In [None]:
# Evaluating the Algorithm
# ---
# For evaluating an algorithm, confusion matrix, precision, recall and f1 score are the most commonly used metrics. 
# The confusion_matrix and classification_report methods of the sklearn.metrics can be used to calculate these metrics. 
# ---
# 
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[12  0  0]
 [ 0  9  1]
 [ 0  1  7]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        12
Iris-versicolor       0.90      0.90      0.90        10
 Iris-virginica       0.88      0.88      0.88         8

       accuracy                           0.93        30
      macro avg       0.92      0.92      0.92        30
   weighted avg       0.93      0.93      0.93        30



### Example 2: Regression

In [1]:
# Example 2
# ---
# Question: Predict the age of a voter through the use of other variables in the dataset.
# ---
# 

In [2]:
# First installing pydataset
# ---
!pip install pydataset

Collecting pydataset
  Downloading pydataset-0.2.0.tar.gz (15.9 MB)
[K     |████████████████████████████████| 15.9 MB 5.2 MB/s 
Building wheels for collected packages: pydataset
  Building wheel for pydataset (setup.py) ... [?25l[?25hdone
  Created wheel for pydataset: filename=pydataset-0.2.0-py3-none-any.whl size=15939432 sha256=ba265155e8296e6fe4d3566894fa97a9b80e1eb0b763d87b93626c9cfe057385
  Stored in directory: /root/.cache/pip/wheels/32/26/30/d71562a19eed948eaada9a61b4d722fa358657a3bfb5d151e2
Successfully built pydataset
Installing collected packages: pydataset
Successfully installed pydataset-0.2.0


In [3]:
# Then loading our libraries
# 
from pydataset import data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

initiated datasets repo at: /root/.pydataset/


In [4]:
# Previewing our turnout dataset
# ---
# 
df = data("turnout")
df.head()

Unnamed: 0,race,age,educate,income,vote
1,white,60,14.0,3.3458,1
2,white,51,10.0,1.8561,0
3,white,24,12.0,0.6304,0
4,white,38,8.0,3.4183,1
5,white,25,12.0,2.7852,1


In [5]:
# Determining the size of the dataset
# 
df.shape

(2000, 5)

In [6]:
# Splitting our data
# ---
# 
X = df[['age','income','vote']]
y = df['educate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)

In [7]:
# Training our algorithm
# ---
# 
clf = KNeighborsRegressor(5)
clf.fit(X_train, y_train)

KNeighborsRegressor()

In [8]:
# Making our prediction
# ---
# 
y_pred = clf.predict(X_test)
print(mean_squared_error(y_test, y_pred))

10.189533333333332


## <font color="green">Challenge 1</font>

In [None]:
# Challenge 1
# ---
# Question: Predict the income level based on the individual’s personal information in the given dataset.
# ---
# Dataset url = http://bit.ly/DatasetAdult
# ---
# 
import pandas as pd
data = pd.read_csv('http://bit.ly/DatasetAdult')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [None]:
data.shape

(48842, 15)

In [None]:
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [None]:
#Preprocessing o the data
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
objects = [col for col in data.columns if data[col].dtype == 'object'] 
objects

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'native-country',
 'income']

In [None]:
#Label encoding the data
for col in objects:
  data[col] = le.fit_transform(data[col])

In [None]:
X = data.iloc[:, :-1].values
y = data.iloc[:, 14].values

In [None]:
X = data[['age','workclass', 'education', 'educational-num',
'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',]]
y = data['income']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
#feature scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Training our algorithm
# ---
# 
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)

KNeighborsClassifier()

In [None]:
# Making our prediction
# ---
# 

y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[6701  719]
 [ 929 1420]]
              precision    recall  f1-score   support

           0       0.88      0.90      0.89      7420
           1       0.66      0.60      0.63      2349

    accuracy                           0.83      9769
   macro avg       0.77      0.75      0.76      9769
weighted avg       0.83      0.83      0.83      9769



## <font color="green">Challenge 2</font>

In [None]:
# Challenge 2
# ---
# Question: Using KNN, predict if the client will subscribe a term deposit (variable y).
# ---
# Dataset url = http://bit.ly/DatasetBank
# ---
# Dasest info = http://bit.ly/DatasetBankInfo
# ---
# 

In [None]:
bank = pd.read_csv('http://bit.ly/DatasetBank', sep = ';')
bank.head()


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


## <font color="green">Challenge 3</font>

In [None]:
# Challenge 3
# ---
# Question: Predict if a person will have diabetes or not using the KNN algorithm.
# ---
# Dataset url = http://bit.ly/DatasetDiabetes
# ---
# 
diabetes = pd.read_csv('http://bit.ly/DatasetDiabetes')
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
diabetes.shape

(768, 9)

In [None]:
diabetes.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [None]:
X = data.iloc[:, :-1].values
y = data.iloc[:, 8].values

In [None]:
# Train Test Split
# ---
# To avoid over-fitting, we will divide our dataset into training and test splits, 
# which gives us a better idea as to how our algorithm performed during the testing phase. 
# This way our algorithm is tested on un-seen data
# ---
# 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [None]:
# Feature Scaling
# ---
# Before making any actual predictions, it is always a good practice to scale the features 
# so that all of them can be uniformly evaluated.
# ---
# 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Training and Predictions
# ---
# The first step is to import the KNeighborsClassifier class from the sklearn.neighbors library. 
# In the second line, this class is initialized with one parameter, i.e. n_neigbours. 
# This is basically the value for the K. There is no ideal value for K and it is selected after testing and evaluation, 
# however to start out, 5 seems to be the most commonly used value for KNN algorithm.
# ---
# 
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

In [None]:
# The final step is to make predictions on our test data
# ---
# 
y_pred = classifier.predict(X_test)

In [None]:
# Evaluating the Algorithm
# ---
# For evaluating an algorithm, confusion matrix, precision, recall and f1 score are the most commonly used metrics. 
# The confusion_matrix and classification_report methods of the sklearn.metrics can be used to calculate these metrics. 
# ---
# 
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[6701  719]
 [ 929 1420]]
              precision    recall  f1-score   support

           0       0.88      0.90      0.89      7420
           1       0.66      0.60      0.63      2349

    accuracy                           0.83      9769
   macro avg       0.77      0.75      0.76      9769
weighted avg       0.83      0.83      0.83      9769



## <font color="green">Challenge 4</font>

In [None]:
# Challenge 4
# ---
# Question: Predict the miles per gallon (mpg) of a car, given its displacement and horsepower.
# ---
# Dataset Train url = http://bit.ly/AutoMPGTrainDataset
# Dataset Test url = http://bit.ly/AutoMPGTestDataset 
# ---
# 


In [None]:
train= pd.read_csv('auto_train.csv')
train.head()

Unnamed: 0,displacement,horsepower,mpg
0,307.0,130,18.0
1,350.0,165,15.0
2,318.0,150,18.0
3,304.0,150,16.0
4,302.0,140,17.0


In [None]:
test = pd.read_csv('auto_test.csv')
test.head()


Unnamed: 0,displacement,horsepower,mpg
0,89,71,31.9
1,86,65,34.1
2,98,80,35.7
3,121,80,27.4
4,183,77,25.4


In [None]:
#Splitting of the dataset into x and y
X_train = train[['displacement', 'horsepower']].values
y_train = train['mpg'].values

X_test = test[['displacement', 'horsepower']].values
y_test = test['mpg'].values

In [None]:
#Training of the dataset
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(5)
trained = regressor.fit(X_train, y_train)

In [None]:
#Making predictions
y_pred = regressor.predict(X_test)

In [None]:
#Evaluation of the model
from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_test, y_pred))

54.41354455445545


In [None]:
a = X_train.mean()
b = y_train.mean()
c = ( a + b ) / 2
c

92.63530927835052

In [None]:
a = (68.61 + 92.64)/2
a

80.625

## <font color="green">Challenge 5</font>

In [9]:
# Challenge 5
# ---
# Question: Predict the target class given the following dataset.
# ---
# Dataset url = http://bit.ly/ClassifiedDataset
# ---
# 


In [11]:
classdata  = pd.read_csv('classified.csv')
classdata.head()

Unnamed: 0.1,Unnamed: 0,WTT,PTI,EQW,SBI,LQE,QWG,FDJ,PJF,HQE,NXJ,TARGET CLASS
0,0,0.913917,1.162073,0.567946,0.755464,0.780862,0.352608,0.759697,0.643798,0.879422,1.231409,1
1,1,0.635632,1.003722,0.535342,0.825645,0.924109,0.64845,0.675334,1.013546,0.621552,1.492702,0
2,2,0.72136,1.201493,0.92199,0.855595,1.526629,0.720781,1.626351,1.154483,0.957877,1.285597,0
3,3,1.234204,1.386726,0.653046,0.825624,1.142504,0.875128,1.409708,1.380003,1.522692,1.153093,1
4,4,1.279491,0.94975,0.62728,0.668976,1.232537,0.703727,1.115596,0.646691,1.463812,1.419167,1


In [12]:
classdata.shape

(1000, 12)

In [18]:
classdata.columns

Index(['Unnamed: 0', 'WTT', 'PTI', 'EQW', 'SBI', 'LQE', 'QWG', 'FDJ', 'PJF',
       'HQE', 'NXJ', 'TARGET CLASS'],
      dtype='object')

In [28]:
# Splitting our data
# ---
# 
X =classdata[[ 'Unnamed: 0','WTT', 'PTI', 'EQW', 'SBI', 'LQE', 'QWG', 'FDJ', 'PJF', 'HQE', 'NXJ']]
y = classdata['TARGET CLASS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=11, random_state=0)


In [29]:
# Training our algorithm
# ---
# 
clf = KNeighborsRegressor(5)
clf.fit(X_train, y_train)

KNeighborsRegressor()

In [30]:
# Making our prediction
# ---
# 
y_pred = clf.predict(X_test)
print(mean_squared_error(y_test, y_pred))

0.18909090909090914
