#K Nearest Neighbours

In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
import sklearn.neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
from sklearn.preprocessing import scale
from collections import Counter

In [21]:
#Import dataset
df=pd.read_csv('/content/drive/MyDrive/Pembelajaran Mesin/Praktikum/Praktikum 6 dan 7/iris.csv')
df.head

<bound method NDFrame.head of      sepal.length  sepal.width  petal.length  petal.width    variety
0             5.1          3.5           1.4          0.2     Setosa
1             4.9          3.0           1.4          0.2     Setosa
2             4.7          3.2           1.3          0.2     Setosa
3             4.6          3.1           1.5          0.2     Setosa
4             5.0          3.6           1.4          0.2     Setosa
..            ...          ...           ...          ...        ...
145           6.7          3.0           5.2          2.3  Virginica
146           6.3          2.5           5.0          1.9  Virginica
147           6.5          3.0           5.2          2.0  Virginica
148           6.2          3.4           5.4          2.3  Virginica
149           5.9          3.0           5.1          1.8  Virginica

[150 rows x 5 columns]>

In [22]:
#Splitting data into 70:30 train:test ratio
df_X=df.iloc[:,:4]
df_Y=df.iloc[:,4]
X_train,X_test,Y_train,Y_test=train_test_split(df_X,df_Y,test_size=0.3,random_state=33)

In [23]:
#Changing the index of the records to sequential
X_train.index=range(len(X_train))
Y_train.index=range(len(X_train))
X_test.index=range(len(X_test))
Y_test.index=range(len(Y_test))

In [24]:
#Function to return the list of distances of the test records from train records
def distNeighbours(X_train,Y_train,X_test,K):
    distance=[]
    for i in range(len(X_train)):
      eDistance=0
      for j in range(len(X_train.columns)):
        eDistance+=round(np.sqrt(pow((X_train.iloc[i,j]-X_test[j]),2)),2)
      distance.append((eDistance,i,Y_train.iloc[i]))
      distance=sorted(distance, key=lambda x: x[0])[0:K]
    return distance

In [25]:
#Predict the output of the categorical variables based on K nearest neighbours 
#Output is the most frequent class among K nearest neighbours
def predictOutputCategorical(X_train,Y_train,X_test,K):
  neighbours=[]
  responses=[]
  for i in range(len(X_test)):
    neighbours.append(distNeighbours(X_train,Y_train,X_test.iloc[i,:],K))
  for i in neighbours:
    votes={}
    for j in i:
      if j[-1] in votes.keys():
        votes[j[-1]]=votes[j[-1]]+1
      else:
        votes[j[-1]]=1
    responses.append(sorted(votes,key=votes.get,reverse=True)[0])
  return responses

In [26]:
#Predict the output of the numeric variables based on K nearest neighbours
#Output is the mean of the K nearest neighbours
def predictOutputNumeric(X_train,Y_train,X_test,K):
  neighbours=[]
  responses=[]
  for i in range(len(X_test)):
    neighbours.append(distNeighbours(X_train,Y_train,X_test.iloc[i,:],K))
  for i in neighbours:
    mean=0
    for j in i:
      mean+=j[-1]
    mean=mean/K
    responses.append(mean)
  return responses

In [27]:
#Accuarcy of the categorical predictions 
def getAccuracyCategorical(actual,predicted):
  correct=0
  for i in range(len(predicted)):
    if predicted[i]==actual[i]:
      correct+=1
  return round((correct/len(actual))*100,2)

In [28]:
#Accuarcy of the numerical predictions
def getAccuracyNumeric(actual,predicted):
  error=0
  for i in range(len(predicted)):
    error+=pow((actual[i]-predicted[i]),2)
  error=error/len(predicted)-1
  return 100-error

In [29]:
#Predict species
output=predictOutputCategorical(X_train,Y_train,X_test,3)
getAccuracyCategorical(Y_test,output)

97.78

In [30]:
#Fit model using in built sklearn function
model=KNeighborsClassifier(n_neighbors=3,p=2,metric='minkowski')
model.fit(X_train,Y_train)

In [31]:
#Accuracy of the model
print('Accuracy : {:^0.2f}'.format(metrics.accuracy_score(Y_test,model.predict(X_test))*100))

Accuracy : 97.78


In [32]:
#Check whether the both outputs are same or not
#They are same as displayed below
output==model.predict(X_test)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True])

In [33]:
#Import dataset
#This is for trying out regression using KNN
df=pd.read_csv("/content/drive/MyDrive/Pembelajaran Mesin/Praktikum/Praktikum 6 dan 7/Freshmen.csv")
df.head()

Unnamed: 0,GPA,Miles from Home,College,Accommodations,Years Off,Part-Time Work Hours,Attends Office Hours,High School GPA
0,0.73,253,Social Sciences,Dorm,4,35,Sometimes,3.23
1,1.6,143,Social Sciences,Dorm,5,30,Never,2.35
2,2.17,171,Social Sciences,Dorm,0,25,Never,3.95
3,1.02,332,Sciences,Off-campus,5,30,Sometimes,3.44
4,3.14,112,Business,Dorm,0,25,Sometimes,3.2


In [34]:
#Change the data types of the categorical variables accordingly
df.College=df.College.astype('category')
df.Accommodations=df.Accommodations.astype('category')
df['Attends Office Hours']=df['Attends Office Hours'].astype('category')

In [35]:
#Generate dummy values of the categorical variables and drop one (i.e. n-1 dummies for n categories)
df_dummies=pd.get_dummies(df,drop_first=True)
#Display top 5 records
df_dummies.head()

Unnamed: 0,GPA,Miles from Home,Years Off,Part-Time Work Hours,High School GPA,College_Engineering,College_Liberal Arts,College_Sciences,College_Social Sciences,Accommodations_Off-campus,Accommodations_Other,Attends Office Hours_Regularly,Attends Office Hours_Sometimes
0,0.73,253,4,35,3.23,0,0,0,1,0,0,0,1
1,1.6,143,5,30,2.35,0,0,0,1,0,0,0,0
2,2.17,171,0,25,3.95,0,0,0,1,0,0,0,0
3,1.02,332,5,30,3.44,0,0,1,0,1,0,0,1
4,3.14,112,0,25,3.2,0,0,0,0,0,0,0,1


In [36]:
#Specifying the X and Y
X_train=df_dummies.iloc[:,1:]
Y_train=df_dummies.GPA

In [37]:
#Splitting data into 70:30 train:test ratio
X_train,X_test,Y_train,Y_test=train_test_split(X_train,Y_train,test_size=0.3,random_state=33)

In [38]:
#Changing the index of the records to sequential
X_train.index=range(len(X_train))
Y_train.index=range(len(X_train))
X_test.index=range(len(X_test))
Y_test.index=range(len(Y_test))

In [39]:
#Predict GPA
output=predictOutputNumeric(X_train,Y_train,X_test,3)
print('Accuracy from the code : {:^0.2f}'.format(getAccuracyNumeric(Y_test,output),2))

Accuracy from the code : 99.94


In [40]:
model=KNeighborsRegressor(n_neighbors=3,p=2)
model.fit(X_train,Y_train)

In [41]:
print('Accuracy from the model {:0.2f}'.
      format(metrics.mean_squared_error(Y_test,model.predict(X_test))*100))

Accuracy from the model 99.88


In [42]:
#Check whether both the outputs are same or not
#They are not same - Need to find why?
output==model.predict(X_test)

array([ True,  True,  True,  True,  True,  True, False,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
       False,  True,  True, False,  True, False,  True,  True,  True,
        True, False, False])