In [30]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

In [31]:
df = pd.read_csv("data/cf_train_no_noise.csv")
df_noise = pd.read_csv("data/cf_train.csv")

In [32]:
def encode(value, encoding):
    for key, val in encoding.items():
        if value == key:
            return val

class_values_era = list(df.era.unique())
class_values_era.sort()
class_values_target = list(df.target_10_val.unique())
class_values_target.sort()
era_encoding = {val: i for i, val in enumerate(class_values_era)}
target_encoding = {val: i for i, val in enumerate(class_values_target)}
df["era"] = df["era"].apply(encode, args=(era_encoding,))
df["target_5_val"] = df["target_5_val"].apply(encode, args=(target_encoding,))
df["target_10_val"] = df["target_10_val"].apply(encode, args=(target_encoding,))

df_noise["era"] = df_noise["era"].apply(encode, args=(era_encoding,))
df_noise["target_5_val"] = df_noise["target_5_val"].apply(encode, args=(target_encoding,))
df_noise["target_10_val"] = df_noise["target_10_val"].apply(encode, args=(target_encoding,))

dataset = df
target_column = "target_10_val"
output_classes = 5
shuffle = False

In [33]:
dataset

Unnamed: 0,Open_n_val,High_n_val,Low_n_val,Close_n_val,Volume_n_val,SMA_10_val,SMA_20_val,CMO_14_val,High_n-Low_n_val,Open_n-Close_n_val,...,Close_n_slope_3_changelen_val,Close_n_slope_5_changelen_val,Close_n_slope_10_changelen_val,row_num,day,era,target_10_val,target_5_val,sigma,day_no
0,0.50,0.50,0.50,0.50,0.0,0.25,0.25,0.50,0.25,1.00,...,0.50,0.50,0.50,75,540,6,1,0,_0_0_,0
1,0.50,0.50,0.50,0.50,0.0,0.25,0.25,0.50,0.50,0.75,...,0.50,0.50,0.50,76,540,6,1,0,_0_0_,0
2,0.50,0.50,0.50,0.50,0.0,0.50,0.25,0.50,0.50,0.75,...,0.25,0.25,0.25,77,540,6,1,1,_0_0_,0
3,0.50,0.50,0.25,0.25,0.0,0.50,0.25,0.50,0.50,0.75,...,0.25,0.25,0.25,78,540,6,1,1,_0_0_,0
4,0.25,0.25,0.25,0.25,0.0,0.50,0.25,0.50,0.50,0.75,...,0.50,0.25,0.25,79,540,6,3,1,_0_0_,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62395,1.00,1.00,1.00,1.00,0.0,1.00,1.00,1.00,0.50,0.75,...,0.00,0.00,0.00,135,537,5,0,0,_0_0_,959
62396,1.00,1.00,1.00,1.00,0.0,1.00,1.00,0.75,0.50,0.75,...,0.00,0.00,0.00,136,537,5,0,0,_0_0_,959
62397,1.00,1.00,1.00,1.00,0.0,1.00,1.00,0.75,0.50,0.75,...,0.00,0.00,0.00,137,537,5,0,0,_0_0_,959
62398,1.00,1.00,1.00,1.00,0.0,1.00,1.00,0.75,0.25,1.00,...,0.00,0.00,0.00,138,537,5,0,0,_0_0_,959


In [34]:
# remove target_5_val, sigma from the dataset
new_dataset = dataset.drop(columns=["target_5_val", "sigma"])
era = new_dataset.pop("era")
target = new_dataset.pop(target_column)
new_dataset[target_column] = target
new_dataset["era"] = era
new_dataset

Unnamed: 0,Open_n_val,High_n_val,Low_n_val,Close_n_val,Volume_n_val,SMA_10_val,SMA_20_val,CMO_14_val,High_n-Low_n_val,Open_n-Close_n_val,...,Open_n-Close_n_changelen_val,SMA_20-SMA_10_changelen_val,Close_n_slope_3_changelen_val,Close_n_slope_5_changelen_val,Close_n_slope_10_changelen_val,row_num,day,day_no,target_10_val,era
0,0.50,0.50,0.50,0.50,0.0,0.25,0.25,0.50,0.25,1.00,...,0.50,0.25,0.50,0.50,0.50,75,540,0,1,6
1,0.50,0.50,0.50,0.50,0.0,0.25,0.25,0.50,0.50,0.75,...,0.50,0.00,0.50,0.50,0.50,76,540,0,1,6
2,0.50,0.50,0.50,0.50,0.0,0.50,0.25,0.50,0.50,0.75,...,0.50,0.00,0.25,0.25,0.25,77,540,0,1,6
3,0.50,0.50,0.25,0.25,0.0,0.50,0.25,0.50,0.50,0.75,...,0.25,0.00,0.25,0.25,0.25,78,540,0,1,6
4,0.25,0.25,0.25,0.25,0.0,0.50,0.25,0.50,0.50,0.75,...,0.25,0.00,0.50,0.25,0.25,79,540,0,3,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62395,1.00,1.00,1.00,1.00,0.0,1.00,1.00,1.00,0.50,0.75,...,1.00,1.00,0.00,0.00,0.00,135,537,959,0,5
62396,1.00,1.00,1.00,1.00,0.0,1.00,1.00,0.75,0.50,0.75,...,1.00,1.00,0.00,0.00,0.00,136,537,959,0,5
62397,1.00,1.00,1.00,1.00,0.0,1.00,1.00,0.75,0.50,0.75,...,1.00,1.00,0.00,0.00,0.00,137,537,959,0,5
62398,1.00,1.00,1.00,1.00,0.0,1.00,1.00,0.75,0.25,1.00,...,1.00,1.00,0.00,0.00,0.00,138,537,959,0,5


In [35]:
knn_dataset = new_dataset.drop(columns=["target_10_val", "era","row_num", "day", "day_no"])
knn_dataset_labels = new_dataset["target_10_val"]
# normalize only row_num,day,day_no
# knn_dataset[["row_num", "day", "day_no"]] = (knn_dataset[["row_num", "day", "day_no"]] - knn_dataset[["row_num", "day", "day_no"]].mean()) / knn_dataset[["row_num", "day", "day_no"]].std()
# knn_dataset = (knn_dataset - knn_dataset.mean()) / knn_dataset.std()


# get stratified dataset
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
for train_index, test_index in sss.split(knn_dataset, knn_dataset_labels):
    knn_dataset = knn_dataset.iloc[train_index]
    knn_dataset_labels = knn_dataset_labels.iloc[train_index]

knn_dataset = knn_dataset.to_numpy()
knn_dataset_labels = knn_dataset_labels.to_numpy()

# # # use only 30% of the dataset for training
# train_size = int(0.1 * len(knn_dataset))
# # knn_dataset = knn_dataset[:train_size]
# # knn_dataset_labels = knn_dataset_labels[:train_size]


In [36]:
knn_dataset_labels_list = knn_dataset_labels.tolist()
len(knn_dataset_labels)

56160

In [37]:
knn_dataset_labels

array([3, 3, 3, ..., 3, 3, 1], dtype=int64)

In [38]:
import nmslib
from sklearn.model_selection import train_test_split
(data_matrix) = (knn_dataset)
# data_matrix,query_matrix,labels,query_labels = train_test_split(knn_dataset,test_size=0.2,random_state=42)

In [39]:
# print((query_matrix.shape, data_matrix.shape) )

In [40]:
M = 100
efC = 512

num_threads = 4
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 100, 'indexThreadQty': 4, 'efConstruction': 512, 'post': 0}


In [41]:
K=128

In [42]:
space_name='l2'

In [43]:
# Intitialize the library, specify the space, the type of the vector and add data points 
index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) 
index.addDataPointBatch(data_matrix) 

56160

In [44]:
len(index)

56160

In [45]:
# Create an index
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}
index.createIndex(index_time_params) 
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 100, 'indexThreadQty': 4, 'efConstruction': 512}


In [46]:
# Setting query-time parameters
efS = 50
query_time_params = {'efSearch': efS}
print('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 50}


In [47]:
# get test-data and query for NN
test_dataset = pd.read_csv("data/cf_test_no_noise.csv")
# test_dataset

test_class_values_era = list(test_dataset.era.unique())
test_class_values_era.sort()
test_class_values_target = list(test_dataset.target_10_val.unique())
test_class_values_target.sort()
test_era_encoding = {val: i for i, val in enumerate(test_class_values_era)}
test_target_encoding = {val: i for i, val in enumerate(test_class_values_target)}
test_dataset["era"] = test_dataset["era"].apply(encode, args=(test_era_encoding,))
test_dataset["target_5_val"] = test_dataset["target_5_val"].apply(encode, args=(test_target_encoding,))
test_dataset["target_10_val"] = test_dataset["target_10_val"].apply(encode, args=(test_target_encoding,))


queryable_test_dataset = test_dataset.drop(columns=["target_5_val", "sigma"])
era = queryable_test_dataset.pop("era")
target = queryable_test_dataset.pop(target_column)
queryable_test_dataset[target_column] = target
queryable_test_dataset["era"] = era
actual_queryable_test_dataset= queryable_test_dataset.drop(columns=["target_10_val", "era","row_num", "day", "day_no"])


actual_queryable_test_dataset.to_numpy()

# normalize the dataset
# actual_queryable_test_dataset = (actual_queryable_test_dataset - actual_queryable_test_dataset.mean()) / actual_queryable_test_dataset.std()

actual_queryable_test_dataset_labels = queryable_test_dataset["target_10_val"]

In [48]:
predictions= []
for i in range(0, len(actual_queryable_test_dataset)):
    query = actual_queryable_test_dataset.iloc[[i]].to_numpy()
    # Query for the nearest neighbours of the first datapoint
    nbrs = index.knnQuery(query, k=K)
    # get the indices of the neighbors
    nbrs_np = np.array(nbrs)
    # print(nbrs_np.shape)
    indexes = nbrs_np[1]
    # change dtype to int
    indexes = indexes.astype(int)
    # print(indexes.shape)
    # print(indexes)
    # print(indexes_1.shape)
    # break
    # get labels corresponding to the indexes from a knn_dataset_labels_list
    # neigbour_labels = knn_dataset_labels[indexes]
    # print(indexes)
    neigbour_labels = [ knn_dataset_labels_list[idx] for idx in indexes] 
    neigbour_labels = pd.Series(neigbour_labels)
    # predict the label
    prediction = neigbour_labels.mode().values[0]
    predictions.append(prediction)

    # print(f"Prediction: {prediction}, Actual: {actual_queryable_test_dataset_labels.iloc[i]}")
    # break

    if i > 10:
        # add i-10th row to knn_dataset and labels and the nmslib index
        eligible_query = actual_queryable_test_dataset.iloc[[i-10]]
        index.addDataPointBatch(eligible_query.to_numpy())
        # print("Length of index: ", len(index))
        # knn_dataset = pd.concat([knn_dataset, actual_queryable_test_dataset.iloc[i]])
        
        knn_dataset_labels_list.append(actual_queryable_test_dataset_labels.iloc[i-10])
        # knn_dataset_labels = pd.concat([knn_dataset_labels, pd.Series([actual_queryable_test_dataset_labels.iloc[i-10]])])

    # print(len(knn_dataset_labels_list),len(index))        

In [49]:
# compute accuracy on test data
correct = 0
for i in range(0, len(predictions)):
    if predictions[i] == actual_queryable_test_dataset_labels.iloc[i]:
        correct += 1
print(f"Accuracy: {correct/len(predictions)}")

Accuracy: 0.25099358974358976
