In [None]:
import nbimporter
import helper_methods as hm
import preprocessing as pp
import numpy as np
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from collections import Counter
import matplotlib.pyplot as plt

### K-NN Classifier Implementation 

In [None]:
# Placeholder "training" step: k-NN is a lazy learner, so there is nothing to fit.
def train(x, y):
    """Do nothing; kept so the notebook has a train/predict pair.

    Both arguments are ignored and the function always returns None.
    """
    return None

In [None]:
# Classifies a single sample by majority vote among its 'k' nearest training samples.
def predict_one(x_train, y_train, x_test_point, k):
    """Predict the class of one sample with a k-nearest-neighbours vote.

    Parameters
    ----------
    x_train : array-like, 2-D
        Training features, one row per sample.
    y_train : sequence
        Training labels, indexable by the row position in ``x_train``.
    x_test_point : array-like
        Feature values of the sample to classify (broadcast against each row).
    k : int
        Number of neighbours that vote. Values larger than the number of
        training samples are clamped to the size of the training set.

    Returns
    -------
    The label that occurs most often among the ``k`` nearest neighbours. When
    several labels are tied, the one met first while walking the neighbours
    from nearest to farthest wins.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or there are no training samples.
    """
    x_train = np.asarray(x_train)
    n_samples = len(x_train)
    if n_samples == 0:
        raise ValueError('predict_one needs at least one training sample')
    if k < 1:
        raise ValueError('k must be at least 1, got {}'.format(k))
    # Never ask for more neighbours than there are training samples.
    k = min(k, n_samples)

    # Squared Euclidean distance to every training row in one vectorised step.
    # The square root is skipped because it does not change the ordering.
    squared_distances = ((x_train - np.asarray(x_test_point)) ** 2).sum(axis=1)

    # A stable sort keeps equally distant samples in index order.
    nearest = np.argsort(squared_distances, kind='stable')[:k]

    # Majority vote over the labels of the nearest neighbours.
    votes = Counter(y_train[index] for index in nearest)
    return votes.most_common(1)[0][0]

In [None]:
# Classifies every sample in 'x_test_data' against a fixed training set.
def predict(x_train, y_train, x_test_data, k):
    """Return a list with the k-NN class prediction for each test sample.

    Each element of ``x_test_data`` is passed to ``predict_one`` together with
    the unchanged training data and ``k``; the results are collected in the
    same order as the test samples.
    """
    return [predict_one(x_train, y_train, sample, k) for sample in x_test_data]

In [None]:
# Classifies the test samples one at a time, growing the training set with each
# sample and its predicted label before moving on to the next one.
def predict_and_update(x_train, y_train, x_test_data, k):
    """Return k-NN class predictions while feeding each prediction back in.

    After a sample is classified with ``predict_one``, the sample and its
    *predicted* label are appended to local copies of the training data, so
    later samples can pick it as a neighbour. The arrays passed in by the
    caller are not modified.
    """
    predictions = []
    known_x, known_y = x_train, y_train

    for sample in x_test_data:
        label = predict_one(known_x, known_y, sample, k)
        predictions.append(label)

        # Extend the working training set with the point just classified.
        known_x = np.concatenate((known_x, [sample]), axis=0)
        known_y = np.concatenate((known_y, [label]), axis=0)

    return predictions

### Using CV to find Optimal parameters

Sorting list: https://stackoverflow.com/questions/17555218/python-how-to-sort-a-list-of-lists-by-the-fourth-element-in-each-list  

In [None]:
def print_parameters_accuracy(accuracies):
    """Print a tab-separated table of the evaluated parameter combinations.

    ``accuracies`` is a sequence of rows whose first three entries are the
    number of features, the number of neighbours and the score. A header line
    is printed first and a blank line last.
    """
    print('#Features \t #Neighbours \t Accuracy')
    for row in accuracies:
        print(row[0], '\t\t', row[1], '\t\t', row[2])
    print()

In [None]:
def find_optimal_values(max_features, max_neighbours, num_splits = 10, symbol_name = 'AAPL', use_implementation = True):
    """Grid-search the feature count and neighbour count with cross-validation.

    Feature counts 2, 4, ... up to ``max_features`` and neighbour counts
    5, 10, ... up to ``max_neighbours`` are evaluated. For every feature count
    the data is rebuilt with ``hm.prepare_data`` (binary output) and only the
    training part is used for scoring.

    With ``use_implementation`` true the score comes from ``hm.timeSeriesCV``
    driven by ``predict_and_update``; otherwise it comes from
    ``hm.rolling_cross_validation`` driven by ``predict``.
    NOTE(review): both branches use this notebook's own predictors, so the flag
    name may be misleading; confirm the intended meaning against the helpers.

    The results are printed twice: in evaluation order, then sorted by score in
    descending order (ties keep their evaluation order).

    Returns
    -------
    (num_features, k_neighbours) of the best-scoring combination.

    Raises
    ------
    IndexError
        If no combination was evaluated (``max_features`` < 2 or
        ``max_neighbours`` < 5).
    """
    results = []

    for num_features in range(2, max_features + 1, 2):
        print('Features:', num_features)

        # Only the training split is needed for cross-validation.
        X_train, _, Y_train, _ = [
            part.values
            for part in hm.prepare_data(num_features, symbol_name, is_binary_ouput=True)
        ]

        for k_neighbours in range(5, max_neighbours + 1, 5):
            if use_implementation:
                cross_validate, predictor = hm.timeSeriesCV, predict_and_update
            else:
                cross_validate, predictor = hm.rolling_cross_validation, predict

            score = cross_validate(X_train, Y_train, num_splits, predictor, [k_neighbours], is_classification=True)
            results.append([num_features, k_neighbours, score])

    # Table in evaluation order.
    print_parameters_accuracy(results)

    # Table ranked from best to worst score.
    ranked = sorted(results, key=lambda row: row[2], reverse=True)
    print_parameters_accuracy(ranked)

    best = ranked[0]
    return best[0], best[1]

In [None]:
# Run the grid search for MSFT and report how long it took (wall-clock seconds).
start = time.time()
num_features, k_neighbours = find_optimal_values(
    max_features=20,
    max_neighbours=50,
    num_splits=10,
    symbol_name='MSFT',
)
print('time taken:', time.time() - start)

In [None]:
# Rebuild the train/test split with the tuned number of features.
# The symbol and the binary-output flag are passed explicitly so the final
# evaluation uses the same stock ('MSFT') and target encoding as the grid
# search above, instead of whatever hm.prepare_data defaults to.
X_train, X_test, Y_train, Y_test = hm.prepare_data(num_features, 'MSFT', is_binary_ouput=True)
# Convert to plain NumPy arrays, which the k-NN implementation indexes positionally.
X_train, X_test, Y_train, Y_test = X_train.values, X_test.values, Y_train.values, Y_test.values

### In-built KNN Classifier

In [None]:
# Reference model: scikit-learn's k-NN classifier with the neighbour count
# selected by the grid search, fitted on the training split.
clf = KNeighborsClassifier(n_neighbors=k_neighbours)
clf.fit(X_train, Y_train)

### Training and Predicting

In [None]:
# Score of the scikit-learn reference model on the held-out test split,
# printed for comparison with the notebook's own implementation.
print('In-Built KNN (Accuracy) score - k =', k_neighbours, '--', clf.score(X_test, Y_test))

In [None]:
# Evaluate the notebook's own k-NN on the test split, first against a fixed
# training set and then with the training set growing after every prediction.
for label, predictor in (
    ('Implementaion Accuracy Score --', predict),
    ('Implementaion Accuracy Score with Training updates --', predict_and_update),
):
    Y_pred = predictor(X_train, Y_train, X_test, k_neighbours)
    print(label, accuracy_score(Y_test, Y_pred))