In [128]:
# from numpy import genfromtxt
import numpy as np
import os
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

In [129]:
def read_data_pandas(data_path, cols_to_drop):
    data_frame = pd.read_csv(data_path)
    #data_frame = data_frame.set_index(pd.DatetimeIndex(data_frame['TIME']))
    data_frame = data_frame.drop(columns=cols_to_drop)
    return data_frame

def readFileFromCSVtoDictionary(path_to_file, file_name, cols_to_drop=[]): 
    file_uri = os.path.join(path_to_file, file_name)
    df = read_data_pandas(file_uri, cols_to_drop)
    dictionary = dict()
    for key in df.keys():
        dictionary[key] = list(df[key])
    return dictionary

def createListsPairs(path_to_file, file_name, col_list1, col_list2, prediction_index=0):
    out_list = list()
    dictionary = readFileFromCSVtoDictionary(path_to_file, file_name)
    # print(len(dictionary.items()))
    # print(len(dictionary[col1]))
    for index in range(len(dictionary[col_list1[0]])-prediction_index):
        list1 = list()
        list2 = list()
        for col1 in col_list1:
            list1.append(dictionary[col1][index])
        for col2 in col_list2:
            list2.append(dictionary[col2][index+prediction_index])
        out_list.append((list1, list2))
    return out_list

In [130]:
list_of_pairs = createListsPairs("", "2_5_2020_random_actions_1h_every_60s.csv", ["OUT_H[%]","OUT_T[*C]"], ["T6[*C]"],8)
# print(list_of_pairs[:10])
print()
print(list_of_pairs)


[([53.0, 23.0], [25.0]), ([57.0, 21.0], [25.0]), ([57.0, 21.0], [25.0]), ([57.0, 21.0], [26.0]), ([57.0, 21.0], [26.0]), ([57.0, 21.0], [26.0]), ([57.0, 21.0], [26.0]), ([58.0, 22.0], [26.0]), ([57.0, 22.0], [27.0]), ([55.0, 23.0], [27.0]), ([52.0, 24.0], [27.0]), ([51.0, 24.0], [28.0]), ([48.0, 25.0], [27.0]), ([48.0, 25.0], [28.0]), ([47.0, 26.0], [28.0]), ([46.0, 26.0], [28.0]), ([44.0, 27.0], [28.0]), ([42.0, 27.0], [28.0]), ([40.0, 28.0], [28.0]), ([40.0, 28.0], [28.0]), ([39.0, 29.0], [28.0]), ([38.0, 29.0], [28.0]), ([37.0, 30.0], [28.0]), ([35.0, 30.0], [28.0]), ([35.0, 31.0], [28.0]), ([34.0, 31.0], [28.0]), ([32.0, 32.0], [28.0]), ([32.0, 32.0], [28.0]), ([32.0, 32.0], [27.0]), ([33.0, 31.0], [27.0]), ([33.0, 31.0], [27.0]), ([34.0, 31.0], [27.0]), ([33.0, 30.0], [27.0]), ([35.0, 30.0], [27.0]), ([35.0, 30.0], [28.0]), ([36.0, 29.0], [28.0]), ([37.0, 28.0], [28.0]), ([38.0, 27.0], [28.0]), ([41.0, 26.0], [28.0]), ([42.0, 25.0], [28.0]), ([48.0, 24.0], [28.0]), ([49.0, 24.0],

In [132]:
x = list()
y = list()
for pair in list_of_pairs:
    x.append(pair[0])
    y.append(pair[1])

In [134]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.33, random_state=42)

print(X_train[0], y_train[0])

[34.0, 30.0] [29.0]


In [136]:
KNR = KNeighborsRegressor(180, metric="manhattan", weights="uniform", leaf_size=30)

In [138]:
KNR.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='manhattan',
                    metric_params=None, n_jobs=None, n_neighbors=180, p=2,
                    weights='uniform')

In [140]:
test = np.array(X_test[0])
test1 = test.reshape(1,-1)

true = y_test[1]
print(test, true)

[33. 31.] [29.0]


In [142]:
KNR.predict(test1)

array([[28.50555556]])

In [161]:
def calcolateMSE(y_true, y_pred):
    #     print((y_true, y_pred.tolist()))
    return mean_squared_error(y_true, y_pred)

def predictWithKnnInput(knn, train_x, train_y, to_be_predicted):
    return knn.predict(to_be_predicted)

def instantiateKnn(metricgiven, n_neighbors, weights="uniform", algorithm="auto"):
    knn = KNeighborsRegressor(n_neighbors, metric=metricgiven, weights=weights, algorithm=algorithm)
    return knn

def predictWithBestParameters(train_x, train_y, test_x, test_y, metrics_list, weights_list):
    best = (None, {"prediction": None, "sse": None, "n_neighbors": None, "metric": None, "weights": None})
    for weights in weights_list:
        for n_neighbors in range(1, train_x.shape[0]+1):
            for metric in metrics_list:
                #                 print(f"{metric} {n_neighbors} {weights}")
                knn_i = instantiateKnn(metric, n_neighbors, weights)
                #                 train_x_1 = train_x.copy()
                #                 train_y_1 = train_y.copy()
                #                 print(train_x.shape, train_y.shape)
                knn_i.fit(train_x, train_y)
                sse = 0
                for index in range(len(test_x)):
                    prediction = knn_i.predict(np.array(test_x[index]).reshape(1,-1))
                    mse = calcolateMSE([test_y[index]],prediction)
                    sse += mse
                    #                     print(best)
                    if (best[1]["sse"] == None or sse < best[1]["sse"]):
                        best = (knn_i, {"prediction": prediction, "sse": sse, "n_neighbors": n_neighbors, "metric": metric, "weights": weights}) 
    return best

In [163]:
# metrics which need parameters cannot be run with auto -> ball_tree seems problematic
metrics_list = ["chebyshev", "minkowski", "manhattan", "euclidean"]
weights_list = ["uniform", "distance"]

In [164]:
best_parameters = predictWithBestParameters(np.array(X_train), y_train, X_test, y_test, metrics_list, weights_list)
print(best_parameters)

(KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='chebyshev',
                    metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                    weights='uniform'), {'prediction': array([[27.]]), 'sse': 0.0, 'n_neighbors': 1, 'metric': 'chebyshev', 'weights': 'uniform'})


In [104]:
test2 = np.array(X_test[0]).reshape(1,-1)
true2 = y_test[0]
print(test2, true2)

[[33. 31.]] [27.0]


In [106]:
best_parameters[0].predict(test2)

array([[29.]])