In [1]:
import numpy as np
from pprint import pprint

import random, pickle 
import pandas as pd

from sklearn.datasets import make_regression
# from autosklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

import autosklearn
from autosklearn.regression import AutoSklearnRegressor

In [2]:
# Load the cleaned dataset and keep only the columns used in this notebook.
# NOTE(review): read_pickle unpickles arbitrary Python objects - only point it
# at files from a trusted source.
point_type_columns = [
    'pointType_0', 'pointType_1', 'pointType_2', 'pointType_3',
    'pointType_4', 'pointType_5', 'pointType_6', 'pointType_7',
    'pointType_8', 'pointType_9', 'pointType_10', 'pointType_11',
    'pointType_12', 'pointType_13', 'pointType_14', 'pointType_15',
    'pointType_16', 'pointType_17', 'pointType_18', 'pointType_19',
    'pointType_20', 'pointType_21',
]
sample_info_columns = ['participant_number', 'left_or_right', 'sample_number']

df = pd.read_pickle("data_cleaned_updated.pkl")
# Column order: input features first, then the point columns, then the
# per-sample bookkeeping columns.
df = df[['input_features'] + point_type_columns + sample_info_columns]

In [3]:
# Display the column index to confirm the selection made in the previous cell.
df.columns

Index(['input_features', 'pointType_0', 'pointType_1', 'pointType_2',
       'pointType_3', 'pointType_4', 'pointType_5', 'pointType_6',
       'pointType_7', 'pointType_8', 'pointType_9', 'pointType_10',
       'pointType_11', 'pointType_12', 'pointType_13', 'pointType_14',
       'pointType_15', 'pointType_16', 'pointType_17', 'pointType_18',
       'pointType_19', 'pointType_20', 'pointType_21', 'participant_number',
       'left_or_right', 'sample_number'],
      dtype='object')

In [4]:
# Collect every participant id once, in order of first appearance.
participant_list = df["participant_number"].drop_duplicates().tolist()

# Shuffle in place with a dedicated fixed-seed generator so the order is repeatable.
random.Random(420).shuffle(participant_list)
print(participant_list)

# Split at participant level (not row level): 85% of the participants go to
# the training list, the rest to the test list.
participant_list_train, participant_list_test = train_test_split(
    participant_list,
    train_size=0.85,
    random_state=115,
)
print(participant_list_train, participant_list_test)

['120', '59', '124', '51', '66', '29', '37', '67', '33', '121', '45', '105', '60', '97', '12', '41', '43', '106', '74', '15', '88', '58', '38', '53', '109', '94', '122', '46', '84', '79', '21', '64', '63', '100', '68', '19', '34', '112', '23', '128', '96', '24', '81', '108', '13', '127', '18', '80', '39', '129', '77', '125', '76', '82', '61', '93', '73', '44', '78', '30', '54', '107', '32', '90', '57', '118', '22', '71', '126', '50', '56', '62', '104', '85', '111', '35', '101', '49', '14', '89', '47', '123', '92', '16', '114', '119', '75', '28', '116', '95', '40', '27', '55', '117', '65', '102', '69', '48', '25', '52', '99', '115', '91', '98', '70', '83', '113', '36', '17', '31', '87', '72', '103']
['89', '50', '33', '57', '126', '115', '97', '40', '112', '44', '122', '119', '47', '69', '127', '94', '80', '59', '105', '120', '95', '38', '71', '21', '109', '52', '77', '99', '84', '87', '55', '66', '102', '28', '61', '45', '82', '54', '16', '78', '46', '85', '100', '13', '53', '34', '62'

In [5]:
# Build the train/test frames from the participant-level split, so every row of
# a given participant ends up in only one of the two frames.
#
# The rows are then shuffled and re-indexed. The shuffle is seeded: without a
# random_state the row order changed on every run, which also changed the row
# order of the predictions file saved at the end of the notebook.
df_test = (
    df[df['participant_number'].isin(participant_list_test)]   # select test participants
    .sample(frac=1, random_state=115)                          # reproducible shuffle
    .reset_index(drop=True)
)
df_train = (
    df[df['participant_number'].isin(participant_list_train)]  # select train participants
    .sample(frac=1, random_state=115)                          # reproducible shuffle
    .reset_index(drop=True)
)

In [6]:
# Targets: every column whose name contains 'pointType'.
train_point_columns = [name for name in df_train.columns if 'pointType' in name]
train_points = np.asarray(df_train[train_point_columns].to_numpy().tolist())

# The reshape needs a 3-D array here; the last two axes are merged so that each
# row becomes one flat target vector.
n_train_rows = train_points.shape[0]
train_target_width = train_points.shape[1] * train_points.shape[2]
y_train = train_points.reshape(n_train_rows, train_target_width)

# Inputs: the per-row 'input_features' entries stacked into one array.
x_train = np.asarray(df_train.input_features.tolist())

In [7]:
# Targets: every column whose name contains 'pointType'.
test_point_columns = [name for name in df_test.columns if 'pointType' in name]
test_points = np.asarray(df_test[test_point_columns].to_numpy().tolist())

# The reshape needs a 3-D array here; the last two axes are merged so that each
# row becomes one flat target vector.
n_test_rows = test_points.shape[0]
test_target_width = test_points.shape[1] * test_points.shape[2]
y_test = test_points.reshape(n_test_rows, test_target_width)

# Inputs: the per-row 'input_features' entries stacked into one array.
x_test = np.asarray(df_test.input_features.tolist())

In [8]:
# Sanity check: shapes of the train inputs/targets followed by the test inputs/targets.
tuple(split.shape for split in (x_train, y_train, x_test, y_test))

((2004, 2048), (2004, 44), (358, 2048), (358, 44))

In [9]:
# Restore the pickled model object from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising -
# only load model files that come from a trusted source.
with open('data_averaged_time5hours_perRun300_model.pkl', mode='rb') as model_file:
    automl = pickle.load(model_file)

# Plain-text overview of the models the loaded object reports in its leaderboard.
print(automl.leaderboard())

          rank  ensemble_weight                 type      cost    duration
model_id                                                                  
106          1             0.60  k_nearest_neighbors  0.889486  121.600821
82           2             0.16  k_nearest_neighbors  1.018268  275.414093
87           3             0.02          extra_trees  1.389995   93.927037
65           4             0.06        decision_tree  2.112673   45.150692
47           5             0.04        decision_tree  2.136166   32.638068
101          6             0.06        decision_tree  2.225635   32.276055
94           7             0.04        decision_tree  2.266275   47.853082
20           8             0.02  k_nearest_neighbors  2.777961   71.169368


In [10]:
# Pretty-print whatever show_models() returns for the loaded model, using a
# 4-space indent for the nested structure.
pprint(automl.show_models(), indent=4)

{   20: {   'cost': 2.7779610260679615,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f53cacd3850>,
            'ensemble_weight': 0.02,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f53cacae070>,
            'model_id': 20,
            'rank': 1,
            'regressor': <autosklearn.pipeline.components.regression.RegressorChoice object at 0x7f53c6d4c190>,
            'sklearn_regressor': KNeighborsRegressor(n_neighbors=10, weights='distance')},
    47: {   'cost': 2.136165975998706,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f53c5c788b0>,
            'ensemble_weight': 0.04,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f53c5c0e3a0>,
            'model_id': 47,
       

In [11]:
# Predict targets for the test inputs, then compare them with the true test
# targets using the mean absolute error.
predictions = automl.predict(x_test)
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print("Mean absolute error score:", test_mae)

Mean absolute error score: 1.1575682750483642


In [12]:
# Store, for every test row, its predicted target vector and its true target
# vector (each as a plain Python list) next to the original columns.
df_test = df_test.assign(
    predictions=predictions.tolist(),
    y_test=y_test.tolist(),
)

In [13]:
# Persist the test frame (which by now also carries the 'predictions' and
# 'y_test' columns) in two formats: a pickle and a CSV.
df_test.to_pickle("predicted_output.pkl")
# NOTE(review): several columns hold Python lists; presumably these are written
# as text in the CSV - confirm downstream readers can parse them.
df_test.to_csv("predicted_output.csv")