In [1]:
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
data = np.loadtxt('dummy_regression_data.csv', delimiter=',')
X_train, X_test, Y_train, Y_test = train_test_split(data[:, 0:-1], data[:, -1], test_size=0.3)
X_train.shape

(70, 1)

### Mean Squared Error

In [3]:
# This fumction calculates and returns the Mean Squared Error (MSE) for a model
def mean_squared_error(Y_true, Y_pred):
    # number of test samples
    M = len(Y_true)
    total_error = 0
    for i in range(M):
        total_error += (Y_true[i] - Y_pred[i]) ** 2
    mse = total_error / M
    return mse

### Cross - Validation - Finding Optimal Value of k for kNN¶

In [4]:
# Finds the cross validation score for different values of k and returns the value of k with the highest cross validation score
def get_maxscore_k(X_train, Y_train, max_k = 25):
    x_axis = []
    y_axis = []

    # going thorught different values of k from 1 to max_k
    for i in range(1, max_k + 1, 1):
        rgr = KNeighborsRegressor(n_neighbors=i, weights='distance')
        score = cross_val_score(rgr, X_train, Y_train)
        print(i, ':', score.mean())
        x_axis.append(i)
        y_axis.append(score.mean())
        
    optimal_k = x_axis[y_axis.index(max(y_axis))]
    return optimal_k

In [5]:
optimal_k = get_maxscore_k(X_train, Y_train, max_k = 25)
print('k with max cross-val score =', optimal_k)

1 : 0.27083791881841957
2 : 0.3815313079370868
3 : 0.426585410049301
4 : 0.46470228706194483
5 : 0.47916677317594747
6 : 0.49478749369453334
7 : 0.49319960720206163
8 : 0.5117423792752347
9 : 0.5151580544188833
10 : 0.5216121563152555
11 : 0.5238682018806501
12 : 0.5288912269021179
13 : 0.5301084130558612
14 : 0.5298488498813985
15 : 0.5314532126745558
16 : 0.5339524845000441
17 : 0.5353415973620991
18 : 0.533677870778804
19 : 0.5345512435887398
20 : 0.5374984325568862
21 : 0.5398583039489493
22 : 0.5402038560161329
23 : 0.5405412721039474
24 : 0.5405746320450836
25 : 0.5395429678024141
k with max cross-val score = 24


### In-built KNN (weights propotional to distance)
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html#sklearn.neighbors.KNeighborsRegressor

In [6]:
k_neighbours = optimal_k
rgr = KNeighborsRegressor(n_neighbors=k_neighbours, weights='distance')
rgr.fit(X_train, Y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=24, p=2,
          weights='distance')

In [7]:
print('In-Built KNN score - k =', k_neighbours, '--', rgr.score(X_test, Y_test))
Y_pred = rgr.predict(X_test)
print('Mean Squared Error - k =', k_neighbours, '--', mean_squared_error(Y_test, Y_pred))

In-Built KNN score - k = 24 -- 0.17644115326827603
Mean Squared Error - k = 24 -- 225.14252821817107


### K-NN Regressor Implementation (weights propotional to distance) - Need to Fix This
http://www.data-machine.com/nmtutorial/distanceweightedknnalgorithm.htm

In [8]:
# This function was meant to train the data - but no explicit training in K-NN
def train(x, y):
    return

In [9]:
# This function takes in the training data and the feature values to the data sample to be predicted on. 
# Using the specified number of nearest neighbours, it returns the predicted value of the specificed data sample
def predict_one(x_train, y_train, x_test_point, k):
    distances = []
    for i in range(len(x_train)):
        # sum of square of distance of each feature - minkowski Distance with p = 2
        distance = ((x_train[i, :] - x_test_point)**2).sum()
        # apending the list of the distance for the point and its index to the list
        distances.append([distance, i])

    # Sorting using the distance from the sample point
    distances = sorted(distances)
    
    targets = []
    total_inverse_distance = 0
    
    for i in range(k):
        total_inverse_distance += (1 / distances[i][0])
    print('total_inverse_distance =', total_inverse_distance)
    
    # Finding the values of the elements using the first 'k' elemets in the distances list i.e 'k' nearest neighbours
    for i in range(k):
        # list of the indices of the 'k' nearest neighbours
        index_of_training_data = distances[i][1]

        # Calculating the scaling weight for the neighbour
        scale = (1 / distances[i][0]) / total_inverse_distance
        
        # adding the nearest neighbours and their weighted scales to the targets list
        targets.append([scale, y_train[index_of_training_data]])
    
    predicted_value = 0
    
    # Calculates the predicted value from the targets
    for i in range(k):
        predicted_value += targets[i][0] * targets[i][1]
    
    # returns the predicted value
    return predicted_value

In [10]:
# This function takes in the training data, the data samples to be predicted upon and the value of k
# It return the predicted regression values of the data samples given to it for prediction
def predict(x_train, y_train, x_test_data, k):
    predictions = []
    
    # making prediction for the testing data samples
    for x_test in x_test_data:
        predictions.append(predict_one(x_train, y_train, x_test, k))
    return predictions

In [11]:
k_neighbours = optimal_k
y_pred = predict(X_train, Y_train, X_test, k_neighbours)
print('Mean Squared Error - k =', k_neighbours, '--', mean_squared_error(Y_test, y_pred))

total_inverse_distance = 1272.4427484392027
total_inverse_distance = 4.717268922594665
total_inverse_distance = 69.08129707791743
total_inverse_distance = 17.184829480290176
total_inverse_distance = 83.61349918206507
total_inverse_distance = 271.94273554412075
total_inverse_distance = 64733.86254695193
total_inverse_distance = 15.470656109859044
total_inverse_distance = 137692547.19431245
total_inverse_distance = 372.27632145252886
total_inverse_distance = 506.4859144627577
total_inverse_distance = 19.229381912679308
total_inverse_distance = 8.674917250974227
total_inverse_distance = 29.167978309515302
total_inverse_distance = 16.9405988550088
total_inverse_distance = 32253.11687588141
total_inverse_distance = 26.16930934135885
total_inverse_distance = 136194901.92059496
total_inverse_distance = 6740.1221233843835
total_inverse_distance = 1540.2110400784663
total_inverse_distance = 84.12776346778037
total_inverse_distance = 450.89095449609346
total_inverse_distance = 69.77728071884398
