In [1]:
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
# Fixed seed so the train/test split -- and every score derived from it --
# is identical on each Restart & Run All.
# NOTE: outputs recorded before this seed was introduced came from an
# unseeded split; re-run the notebook to refresh them.
RANDOM_STATE = 42

# The last CSV column is used as the regression target, all preceding columns as features.
data = np.loadtxt('dummy_regression_data.csv', delimiter=',')
X_train, X_test, Y_train, Y_test = train_test_split(
    data[:, 0:-1], data[:, -1], test_size=0.3, random_state=RANDOM_STATE
)
X_train.shape

(70, 1)

### Mean Squared Error

In [3]:
def mean_squared_error(Y_true, Y_pred):
    """Return the Mean Squared Error (MSE) between targets and predictions.

    Parameters
    ----------
    Y_true : array-like
        Ground-truth target values, one entry per sample along the first axis.
    Y_pred : array-like
        Predicted values; must have exactly the same shape as ``Y_true``.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Squared error averaged over the samples (first axis): a scalar for
        1-D inputs, one value per output column for 2-D inputs.

    Raises
    ------
    ValueError
        If the two inputs differ in shape, or if they contain no samples.
    """
    y_true = np.asarray(Y_true, dtype=float)
    y_pred = np.asarray(Y_pred, dtype=float)

    # Without this check a longer Y_pred would be silently truncated and
    # mismatched ranks would broadcast into a meaningless result.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            'Y_true and Y_pred must have the same shape, got {} and {}'.format(
                y_true.shape, y_pred.shape
            )
        )

    # An empty input has no defined mean; fail with a clear message instead
    # of a division by zero.
    if y_true.size == 0:
        raise ValueError('mean_squared_error requires at least one sample')

    return np.mean((y_true - y_pred) ** 2, axis=0)

### Cross-Validation: Finding the Optimal Value of k for k-NN

In [4]:
def get_maxscore_k(X_train, Y_train, max_k = 25):
    """Return the k in 1..max_k with the highest mean cross-validation score.

    For every candidate k a fresh ``KNeighborsRegressor`` is scored with
    ``cross_val_score`` using that function's default settings, and the mean
    of the fold scores is printed as ``k : mean_score``.

    Parameters
    ----------
    X_train : feature matrix passed to ``cross_val_score``
    Y_train : target values passed to ``cross_val_score``
    max_k : largest number of neighbours to try (inclusive)

    Returns
    -------
    The candidate k with the highest mean score; on a tie the smallest
    such k is returned.
    """
    mean_score_by_k = {}

    # Evaluate every neighbour count from 1 up to and including max_k
    for k in range(1, max_k + 1):
        model = KNeighborsRegressor(n_neighbors=k)
        mean_score = cross_val_score(model, X_train, Y_train).mean()
        print(k, ':', mean_score)
        mean_score_by_k[k] = mean_score

    # Keys iterate in insertion order (ascending k) and max() keeps the first
    # maximum it meets, so ties resolve to the smallest k
    return max(mean_score_by_k, key=mean_score_by_k.get)

In [5]:
# Upper bound (inclusive) of the neighbour counts tried during cross-validation
MAX_K = 25

optimal_k = get_maxscore_k(X_train, Y_train, max_k=MAX_K)
print('k with max cross-val score =', optimal_k)

1 : 0.31369826982749255
2 : 0.4550533949342082
3 : 0.5652967094841576
4 : 0.5559900716656698
5 : 0.5653084200044588
6 : 0.5704240707081839
7 : 0.5822398481231406
8 : 0.5806411278083397
9 : 0.5902185044495432
10 : 0.5844684596693305
11 : 0.5813430257745006
12 : 0.5914018369848913
13 : 0.5731041419919971
14 : 0.5693409715344805
15 : 0.5613918950302237
16 : 0.5529696854480171
17 : 0.5469633111068385
18 : 0.5498581092470963
19 : 0.5343192235120209
20 : 0.5282526514310143
21 : 0.5221018525085777
22 : 0.507496493518457
23 : 0.50109380915633
24 : 0.48796270288450677
25 : 0.47536814971845637
k with max cross-val score = 12


### Built-in k-NN (scikit-learn)

In [6]:
# Reuse the neighbour count selected by the cross-validation search
k_neighbours = optimal_k
rgr = KNeighborsRegressor(n_neighbors=k_neighbours)
# Fit on the training split only; left as the cell's last bare expression so
# the fitted estimator's repr is displayed as the cell output
rgr.fit(X_train, Y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=12, p=2,
          weights='uniform')

In [7]:
# Score of the fitted scikit-learn regressor on the held-out test split
builtin_score = rgr.score(X_test, Y_test)
print('In-Built KNN score - k =', k_neighbours, '--', builtin_score)

# MSE of the same model, computed with the notebook's own mean_squared_error
Y_pred = rgr.predict(X_test)
builtin_mse = mean_squared_error(Y_test, Y_pred)
print('Mean Squared Error - k =', k_neighbours, '--', builtin_mse)

In-Built KNN score - k = 12 -- 0.4515594279376095
Mean Squared Error - k = 12 -- 147.77216617609025


### K-NN Regressor Implementation 

In [8]:
def train(x, y):
    """Placeholder training step: does nothing and returns None.

    k-NN has no explicit training phase, so both arguments are accepted
    and ignored.
    """
    return None

In [9]:
def predict_one(x_train, y_train, x_test_point, k):
    """Predict the target of one sample with k-nearest-neighbours regression.

    Parameters
    ----------
    x_train : array-like
        Training features, one row per training sample.
    y_train : array-like
        Training targets, one entry per row of ``x_train``.
    x_test_point : array-like
        Feature values of the sample to predict; must broadcast against a
        single row of ``x_train``.
    k : int
        Number of nearest neighbours to average; ``1 <= k <= len(x_train)``.

    Returns
    -------
    The mean target value of the ``k`` training samples closest to
    ``x_test_point``.

    Raises
    ------
    ValueError
        If ``x_train`` and ``y_train`` differ in length, or if ``k`` is not
        between 1 and the number of training samples.
    """
    x_train = np.asarray(x_train, dtype=float)
    y_train = np.asarray(y_train)
    n_samples = len(x_train)

    if len(y_train) != n_samples:
        raise ValueError(
            'x_train and y_train must have the same number of samples, '
            'got {} and {}'.format(n_samples, len(y_train))
        )
    # k == 0 would average nothing and k > n_samples would run past the
    # available neighbours, so reject both up front with a clear message.
    if not 1 <= k <= n_samples:
        raise ValueError(
            'k must be between 1 and the number of training samples ({}), '
            'got {}'.format(n_samples, k)
        )

    # Squared Euclidean distance to every training row in one vectorised step.
    # The square root is skipped: it does not change the neighbour ordering.
    sq_distances = ((x_train - x_test_point) ** 2).reshape(n_samples, -1).sum(axis=1)

    # A stable sort keeps equidistant points in index order, so ties go to
    # the training sample with the lower index.
    nearest = np.argsort(sq_distances, kind='stable')[:k]

    return y_train[nearest].mean(axis=0)

In [10]:
def predict(x_train, y_train, x_test_data, k):
    """Predict regression values for every sample in ``x_test_data``.

    Each sample is passed, together with the training data and ``k``, to
    ``predict_one``; the individual results are collected in input order.

    Parameters
    ----------
    x_train : training features
    y_train : training targets
    x_test_data : iterable of samples to predict on
    k : number of nearest neighbours to use

    Returns
    -------
    list
        One prediction per sample in ``x_test_data``.
    """
    return [predict_one(x_train, y_train, sample, k) for sample in x_test_data]

In [11]:
# Run the hand-written k-NN with the same k so its MSE can be compared with
# the scikit-learn result
k_neighbours = optimal_k
y_pred = predict(X_train, Y_train, X_test, k_neighbours)

custom_mse = mean_squared_error(Y_test, y_pred)
print('Mean Squared Error - k =', k_neighbours, '--', custom_mse)

Mean Squared Error - k = 12 -- 147.77216617609025
