In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
import sklearn.neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
from sklearn.preprocessing import scale
from collections import Counter

In [2]:
# Predict the output of the numeric variables based on K nearest neighbours
# Output is the mean of the K nearest neighbours
#loc gets rows (or columns) with particular labels from the index.
#iloc gets rows (or columns) at particular positions in the index (so it only takes integers).
def predictOutputNumeric(X_train,Y_train,X_test,K):
    neighbours=[]
    responses=[]
    for i in range(len(X_test)):
        neighbours.append(distNeighbours(X_train,Y_train,X_test.iloc[i,:],K))
    for i in neighbours:
        mean=0
        for j in i:
            mean+=j[-1]
        mean=mean/K
        responses.append(mean)
    return responses

In [3]:
# Function to return the list of distances of the test records from train records
def distNeighbours(X_train,Y_train,X_test,K):
    distance=[]
    for i in range(len(X_train)):
        eDistance=0
        for j in range(len(X_train.columns)):   
                eDistance+=round(np.sqrt(pow((X_train.iloc[i,j]-X_test[j]),2)),2)
        distance.append((eDistance,i,Y_train.iloc[i]))
        distance=sorted(distance, key=lambda x: x[0])[0:K]
    return distance

In [4]:
# Accuarcy of the numerical predictions
def getAccuracyNumeric(actual,predicted):
    error=0
    for i in range(len(predicted)):
        error+=pow((actual[i]-predicted[i]),2)
    error=error/len(predicted)-1
    return 100-error

##  K Nearest Neighbour Regression


In [5]:
# This is for trying out regression using KNN
df=pd.read_csv('/home/johan/repos/GitHub/Introduction-to-Machine-Learning/Datasets/Freshmen.csv')
df.head()

Unnamed: 0,GPA,Miles from Home,College,Accommodations,Years Off,Part-Time Work Hours,Attends Office Hours,High School GPA
0,0.73,253,Social Sciences,Dorm,4,35,Sometimes,3.23
1,1.6,143,Social Sciences,Dorm,5,30,Never,2.35
2,2.17,171,Social Sciences,Dorm,0,25,Never,3.95
3,1.02,332,Sciences,Off-campus,5,30,Sometimes,3.44
4,3.14,112,Business,Dorm,0,25,Sometimes,3.2


In [6]:
print(df.College[0:3],type(df.College))
print('--------------------')
print(df['Attends Office Hours'][0:3],type(df['Attends Office Hours'][0:3]))

0    Social Sciences
1    Social Sciences
2    Social Sciences
Name: College, dtype: object <class 'pandas.core.series.Series'>
--------------------
0    Sometimes
1        Never
2        Never
Name: Attends Office Hours, dtype: object <class 'pandas.core.series.Series'>


In [7]:
# Change the data types of the categorical variables accordingly
df.Accommodations = df.Accommodations.astype('category')
df['Attends Office Hours'] = df['Attends Office Hours'].astype('category')
df.College = df.College.astype('category')

print(df.College[0:3],type(df.College))
print('--------------------')
print(df['Attends Office Hours'][0:3],type(df['Attends Office Hours'][0:3]))

0    Social Sciences
1    Social Sciences
2    Social Sciences
Name: College, dtype: category
Categories (5, object): [Business, Engineering, Liberal Arts, Sciences, Social Sciences] <class 'pandas.core.series.Series'>
--------------------
0    Sometimes
1        Never
2        Never
Name: Attends Office Hours, dtype: category
Categories (3, object): [Never, Regularly, Sometimes] <class 'pandas.core.series.Series'>


In [8]:
# Generate dummy values of the categorical variables and drop one (i.e. n-1 dummies for n categories)
# Some machine learning techniques require you to drop one dimension from the representation so as
#to avoid dependency among the variables. Use "drop_first=True" to achieve that.
df_dummies = pd.get_dummies(df,drop_first=True)
# Display top 5 records
df_dummies.head()

Unnamed: 0,GPA,Miles from Home,Years Off,Part-Time Work Hours,High School GPA,College_Engineering,College_Liberal Arts,College_Sciences,College_Social Sciences,Accommodations_Off-campus,Accommodations_Other,Attends Office Hours_Regularly,Attends Office Hours_Sometimes
0,0.73,253,4,35,3.23,0,0,0,1,0,0,0,1
1,1.6,143,5,30,2.35,0,0,0,1,0,0,0,0
2,2.17,171,0,25,3.95,0,0,0,1,0,0,0,0
3,1.02,332,5,30,3.44,0,0,1,0,1,0,0,1
4,3.14,112,0,25,3.2,0,0,0,0,0,0,0,1


In [9]:
# Specifying the X and Y
X_train = df_dummies.iloc[:,1:]
print('X_train',X_train.shape)
Y_train = df_dummies.GPA
print('Y_train',Y_train.shape)
# Splitting data into 70:30 train:test ratio
X_train,X_test,Y_train,Y_test = train_test_split(X_train,Y_train,test_size=0.3,random_state=33)
print('X_train_divided',X_train.shape)
print('Y_train_divided',Y_train.shape)
print('X_train.index',X_train.index)
print('X_test.index',X_test.index)

X_train (100, 12)
Y_train (100,)
X_train_divided (70, 12)
Y_train_divided (70,)
X_train.index Int64Index([27, 17,  9, 69, 37, 76, 13, 64,  0,  8, 21, 87, 24, 75, 40, 28, 58,
            63, 54, 47, 65, 93, 85, 36, 15, 49,  1, 80, 55, 68, 53, 30, 50, 89,
            25, 52, 48, 32, 26, 79,  3, 74, 92, 10, 98, 94, 23, 19, 16, 22, 11,
            12, 44, 31, 77, 33, 35, 45, 42, 14, 83, 61, 67, 57, 73, 18, 66, 88,
             7, 20],
           dtype='int64')
X_test.index Int64Index([56, 90, 95, 82, 60, 71,  6, 81,  2, 70, 96, 62, 59, 86, 41, 39,  4,
            34, 99, 72, 78, 97, 29, 84, 38, 43, 46,  5, 91, 51],
           dtype='int64')


In [10]:
# Changing the index of the records to sequential
X_train.index = range(len(X_train))
Y_train.index = range(len(X_train))
X_test.index = range(len(X_test))
Y_test.index = range(len(Y_test))
print('X_train.index',X_train.index)
print('X_test.index',X_test.index)

X_train.index RangeIndex(start=0, stop=70, step=1)
X_test.index RangeIndex(start=0, stop=30, step=1)


In [11]:
# Predict GPA
output=predictOutputNumeric(X_train,Y_train,X_test,3)
print('Accuracy from the code: {:^0.2f}'.format(getAccuracyNumeric(Y_test,output),2))

model=KNeighborsRegressor(n_neighbors=3,p=2)
model.fit(X_train,Y_train)
print('Accuracy from the model {:^0.2f}'.format(metrics.mean_squared_error(Y_test,model.predict(X_test))*100))


Accuracy from the code: 99.94
Accuracy from the model 99.88


In [12]:
# Check whether both the outputs are same or not
# They are not same - Need to find why?
output==model.predict(X_test)

array([ True,  True,  True,  True,  True,  True, False,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
       False,  True,  True, False,  True, False,  True,  True,  True,
        True, False, False])