In [1]:
import nbimporter
import helper_methods as hm
import preprocessing as pp
import numpy as np
import time
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter
import matplotlib.pyplot as plt

Importing Jupyter notebook from helper_methods.ipynb
Importing Jupyter notebook from preprocessing.ipynb


### K-NN Classifier Implementation 

In [2]:
class K_Nearest_Neighbours_Classifier:
    """Brute-force k-nearest-neighbours classifier.

    Training samples are simply stored by ``fit``. Each prediction is a
    majority vote among the labels of the ``k_neighbours`` stored samples with
    the smallest squared Euclidean distance to the query point.
    """

    def __init__(self, k_neighbours):
        """Store the number of neighbours that take part in each vote."""
        self.k_neighbours = k_neighbours

    def fit(self, X_train, Y_train):
        """Remember the training data; K-NN has no explicit training step.

        Parameters
        ----------
        X_train : 2-D array, one row per training sample.
        Y_train : sequence of labels, indexed by the same row positions.
        """
        self.X_train = X_train
        self.Y_train = Y_train
        return

    def predict_one(self, x_test_point):
        """Return the predicted label for a single sample.

        Parameters
        ----------
        x_test_point : feature values of the sample to classify.

        Returns
        -------
        The most common label among the nearest neighbours. When several
        labels share the highest count, the one met first while walking the
        neighbours from nearest to farthest is returned.

        Raises
        ------
        ValueError
            If the training set is empty or ``k_neighbours`` is smaller than 1.
        """
        n_samples = len(self.X_train)
        if n_samples == 0:
            raise ValueError('Cannot predict: the training set is empty')
        if self.k_neighbours < 1:
            raise ValueError('k_neighbours must be at least 1, got %r' % (self.k_neighbours,))

        # Squared Euclidean distance (Minkowski with p = 2, without the root)
        # from every training row to the query point, computed in one shot.
        # Skipping the square root does not change the neighbour ranking.
        deltas = np.asarray(self.X_train) - x_test_point
        distances = (deltas ** 2).sum(axis=1)

        # Never ask for more neighbours than there are training samples;
        # with fewer samples than k, every stored sample takes part in the vote.
        k = min(self.k_neighbours, n_samples)

        # A stable sort keeps equally distant samples in index order, i.e. the
        # same ordering as sorting [distance, index] pairs.
        nearest = np.argsort(distances, kind='stable')[:k].tolist()

        # Labels of the k nearest neighbours, nearest first
        targets = [self.Y_train[index] for index in nearest]

        # Most common entry among the neighbour labels
        return Counter(targets).most_common(1)[0][0]

    def predict(self, x_test_data):
        """Return a list with the predicted label of every sample in ``x_test_data``.

        The stored training data is left unchanged.
        """
        return [self.predict_one(x_test) for x_test in x_test_data]

    def predict_and_update(self, x_test_data, y_test_data):
        """Predict every sample in order, growing the training set as it goes.

        After each prediction the sample and its true label from
        ``y_test_data`` are appended to the stored training data, so later
        predictions can use earlier test samples as neighbours. The appended
        rows stay in ``self.X_train`` / ``self.Y_train`` after the call.

        Returns
        -------
        list of predicted labels, one per row of ``x_test_data``.
        """
        predictions = []

        for i in range(len(x_test_data)):
            x_test = x_test_data[i]
            y_test = y_test_data[i]

            # Predict first, using only what was known before this sample
            predictions.append(self.predict_one(x_test))

            # Then add the sample just predicted upon to the training data
            self.X_train = np.append(self.X_train, [x_test], axis=0)
            self.Y_train = np.append(self.Y_train, [y_test], axis=0)

        return predictions

### Using CV to find Optimal parameters

Sorting list: https://stackoverflow.com/questions/17555218/python-how-to-sort-a-list-of-lists-by-the-fourth-element-in-each-list  

In [3]:
def print_parameters_accuracy(accuracies):
    """Print a tab-separated table of the cross-validation results.

    Each entry of ``accuracies`` is read as
    ``[number of features, number of neighbours, accuracy]``.
    A blank line is printed after the table.
    """
    print('#Features \t #Neighbours \t Accuracy')
    for row in accuracies:
        feature_count, neighbour_count, score = row[0], row[1], row[2]
        print(feature_count, '\t\t', neighbour_count, '\t\t', score)
    print()

In [4]:
def find_optimal_values(max_features, max_neighbours, num_splits=10, symbol_name='AAPL', use_implementation=True):
    """Grid-search the number of features and neighbours by cross-validation.

    Every feature count from 1 to ``max_features`` is combined with every odd
    neighbour count from 3 to ``max_neighbours``. Each combination is scored on
    the training split only, and the results are printed twice: in evaluation
    order and sorted by accuracy (best first).

    Parameters
    ----------
    max_features : largest number of features to try (must be >= 1).
    max_neighbours : largest number of neighbours to try (must be >= 3).
    num_splits : number of splits handed to the cross-validation helper.
    symbol_name : symbol passed to ``hm.prepare_data``.
    use_implementation : when True the score comes from ``hm.timeSeriesCV``,
        otherwise from ``hm.rolling_cross_validation``.

    Returns
    -------
    (num_features, k_neighbours) of the combination with the highest accuracy.
    On ties the combination evaluated first wins.

    Raises
    ------
    ValueError
        If the limits describe an empty search grid.
    """
    # Fail early with a clear message instead of an IndexError on an empty
    # result list after the loops have silently done nothing.
    if max_features < 1 or max_neighbours < 3:
        raise ValueError(
            'Empty search grid: need max_features >= 1 and max_neighbours >= 3 '
            '(got max_features=%r, max_neighbours=%r)' % (max_features, max_neighbours)
        )

    accuracies = []
    for num_features in range(1, max_features + 1):
        print('Features:', num_features)

        # Only the training split is used here; the test split stays held out.
        X_train, _, Y_train, _ = hm.prepare_data(num_features, symbol_name, is_binary_ouput=True)
        X_train, Y_train = X_train.values, Y_train.values

        # Odd neighbour counts only: 3, 5, 7, ...
        for k_neighbours in range(3, max_neighbours + 1, 2):
            print('Neighbours ------------------------>', k_neighbours)
            knn_tscv = K_Nearest_Neighbours_Classifier(k_neighbours=k_neighbours)
            if use_implementation:
                neighbour_accuracy = hm.timeSeriesCV(X_train, Y_train, num_splits, knn_tscv, is_classification=True)
            else:
                neighbour_accuracy = hm.rolling_cross_validation(X_train, Y_train, num_splits, knn_tscv, is_classification=True)
            accuracies.append([num_features, k_neighbours, neighbour_accuracy])

    print_parameters_accuracy(accuracies)

    # Best accuracy first; the sort is stable, so ties keep evaluation order.
    accuracies.sort(reverse=True, key=lambda entry: entry[2])
    print_parameters_accuracy(accuracies)

    best_features, best_neighbours = accuracies[0][0], accuracies[0][1]
    return best_features, best_neighbours

In [11]:
def get_data_ready(symbol_name, max_features=5, max_neighbours=11):
    """Tune the K-NN hyper-parameters for a symbol and return its train/test split.

    Runs ``find_optimal_values`` with 10 splits for ``symbol_name``, prints how
    long the cross-validation took, then prepares the data with the selected
    number of features.

    Parameters
    ----------
    symbol_name : symbol whose data is tuned on and returned.
    max_features : largest number of features tried during tuning.
    max_neighbours : largest number of neighbours tried during tuning.

    Returns
    -------
    (X_train, X_test, Y_train, Y_test, k_neighbours), where the four data
    items are the ``.values`` of what ``hm.prepare_data`` returns and
    ``k_neighbours`` is the selected neighbour count.
    """
    start_time = time.time()
    num_features, k_neighbours = find_optimal_values(
        max_features=max_features,
        max_neighbours=max_neighbours,
        num_splits=10,
        symbol_name=symbol_name,
    )
    end_time = time.time()
    print('Time taken for Cross Validation:', end_time - start_time)

    # Prepare the final split with the same symbol and binary-output setting
    # that the cross-validation used; otherwise the tuned parameters would be
    # applied to the helper's default data set instead of `symbol_name`.
    X_train, X_test, Y_train, Y_test = hm.prepare_data(num_features, symbol_name, is_binary_ouput=True)
    X_train, X_test, Y_train, Y_test = X_train.values, X_test.values, Y_train.values, Y_test.values
    return X_train, X_test, Y_train, Y_test, k_neighbours

### 1. SKLearn KNN Classifier

In [6]:
def sklearn_KNN_forecast(X_train, X_test, Y_train, Y_test, k_neighbours):
    """Fit scikit-learn's KNeighborsClassifier and print its test-set score.

    The classifier is built with ``n_neighbors=k_neighbours``, fitted on the
    training split and scored on the test split. Nothing is returned.
    """
    print('SKLEARN INBUILT')
    # fit() returns the estimator itself, so construction and fitting chain
    sk_model = KNeighborsClassifier(n_neighbors=k_neighbours).fit(X_train, Y_train)
    test_score = sk_model.score(X_test, Y_test)
    print('Accuracy Score --', test_score)

### 2. Predicting using Implementation

In [7]:
def implemented_KNN_forecast(X_train, X_test, Y_train, Y_test, k_neighbours):
    """Evaluate the hand-written K-NN classifier on the test split, two ways.

    First with a fixed training set (``predict``), then with the training set
    growing after every prediction (``predict_and_update``). Each run prints a
    banner and hands its predictions to ``hm.accuracy_metrics``.
    """
    model = K_Nearest_Neighbours_Classifier(k_neighbours)
    model.fit(X_train, Y_train)

    # The order matters: the second stage permanently extends the model's
    # training data, so the plain prediction has to run first.
    stages = (
        ('IMPLEMENTATION', lambda: model.predict(X_test)),
        ('IMPLEMENTATION WITH TRAINING UPDATES', lambda: model.predict_and_update(X_test, Y_test)),
    )
    for banner, make_predictions in stages:
        print(banner)
        hm.accuracy_metrics(Y_test, make_predictions())

### Running KNN

In [8]:
def forecast(X_train, X_test, Y_train, Y_test, k_neighbours):
    """Print the neighbour count, then run both K-NN evaluations on the same data.

    The scikit-learn classifier is evaluated first, followed by the
    hand-written implementation.
    """
    print('Number of Neighbours --', k_neighbours)
    shared_args = (X_train, X_test, Y_train, Y_test, k_neighbours)
    sklearn_KNN_forecast(*shared_args)
    implemented_KNN_forecast(*shared_args)

# X_train, X_test, Y_train, Y_test = hm.prepare_data(2)
# X_train, X_test, Y_train, Y_test = X_train.values, X_test.values, Y_train.values, Y_test.values
# forecast(X_train, X_test, Y_train, Y_test, 9)

In [12]:
def run_KNN(symbol_name, max_features=5, max_neighbours=11):
    """Tune and evaluate the K-NN classifiers for one symbol.

    Parameters
    ----------
    symbol_name : symbol to run the whole pipeline on.
    max_features : largest number of features tried during tuning (default 5).
    max_neighbours : largest number of neighbours tried during tuning (default 11).

    The tuned split and neighbour count from ``get_data_ready`` are passed
    straight to ``forecast``. Nothing is returned.
    """
    X_train, X_test, Y_train, Y_test, k_neighbours = get_data_ready(symbol_name, max_features, max_neighbours)
    forecast(X_train, X_test, Y_train, Y_test, k_neighbours)

In [10]:
# Tune and evaluate the classifiers on the 'INX' symbol.
run_KNN(symbol_name='INX')

Features: 1
Neighbours ------------------------> 3
Implemented Rolling Cross Validation
Accuracy: 0.5153797865662272 

Neighbours ------------------------> 5
Implemented Rolling Cross Validation
Accuracy: 0.5156936597614563 

Neighbours ------------------------> 7
Implemented Rolling Cross Validation
Accuracy: 0.5204017576898933 

Neighbours ------------------------> 9
Implemented Rolling Cross Validation
Accuracy: 0.5269930947897049 

Features: 2
Neighbours ------------------------> 3
Implemented Rolling Cross Validation
Accuracy: 0.5182046453232894 

Neighbours ------------------------> 5
Implemented Rolling Cross Validation
Accuracy: 0.5188323917137476 

Neighbours ------------------------> 7
Implemented Rolling Cross Validation
Accuracy: 0.5200878844946641 

Neighbours ------------------------> 9
Implemented Rolling Cross Validation
Accuracy: 0.5279347143753923 

#Features 	 #Neighbours 	 Accuracy
1 		 3 		 0.5153797865662272
1 		 5 		 0.5156936597614563
1 		 7 		 0.520401757689893