In [None]:
import numpy as np
from pprint import pprint

import random, pickle 
import pandas as pd

from sklearn.datasets import make_regression
# from autosklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

import autosklearn
from autosklearn.regression import AutoSklearnRegressor

In [None]:
# Load the preprocessed dataset. The data could equally be read from a CSV;
# here DATA_PATH points at the pickle produced by the data preprocessing
# notebook. NOTE: unpickling executes code embedded in the file, so only load
# pickles you created yourself.
DATA_PATH = "data_latest_average_non_zero_raw.pkl"

# Columns kept for modelling, in this order: the input features, the 22
# pointType_* target columns, then the per-sample bookkeeping columns.
POINT_TYPE_COLUMNS = [f"pointType_{idx}" for idx in range(22)]
BOOKKEEPING_COLUMNS = ["participant_number", "left_or_right", "sample_number"]

df = pd.read_pickle(DATA_PATH)
df = df[["input_features", *POINT_TYPE_COLUMNS, *BOOKKEEPING_COLUMNS]]

In [None]:
# Display the retained column labels as a quick sanity check of the loaded frame.
df.columns

In [None]:
# Split by participant (not by row) so that every sample of a given participant
# ends up on exactly one side of the train/test boundary.
PARTICIPANT_SHUFFLE_SEED = 420
PARTICIPANT_SPLIT_SEED = 115
TRAIN_PARTICIPANT_FRACTION = 0.85

# Unique participant ids in order of first appearance, then a seeded shuffle.
participant_list = df["participant_number"].drop_duplicates().to_list()
random.Random(PARTICIPANT_SHUFFLE_SEED).shuffle(participant_list)
print(participant_list)

participant_list_train, participant_list_test = train_test_split(
    participant_list,
    train_size=TRAIN_PARTICIPANT_FRACTION,
    random_state=PARTICIPANT_SPLIT_SEED,
)
print(participant_list_train, participant_list_test)

In [None]:
data_augmentation_flag = True  # enable this flag if you want to do data augmentation

# Seed for the row shuffles below. Without it `sample(frac=1)` draws a new
# permutation on every run, so the train/test row order (and anything that
# depends on it downstream) would differ between otherwise identical runs.
ROW_SHUFFLE_SEED = 420

# Rows belonging to the held-out participants, shuffled with a fresh 0..n-1 index.
test_mask = df['participant_number'].isin(participant_list_test)
df_test = (
    df[test_mask]
    .sample(frac=1, random_state=ROW_SHUFFLE_SEED)
    .reset_index(drop=True)
)

# Rows belonging to the training participants, shuffled with a fresh 0..n-1 index.
train_mask = df['participant_number'].isin(participant_list_train)
df_train = (
    df[train_mask]
    .sample(frac=1, random_state=ROW_SHUFFLE_SEED)
    .reset_index(drop=True)
)


In [None]:
# Data Augmentation

def roll_images(input_features_inner, pixels, axis_, point=None):
    """Shift a flattened 64x32 image, or shift a single point by the same offset.

    The function has two modes, selected by ``point``:

    * ``point is None``: ``input_features_inner`` is reshaped to ``(64, 32)``,
      rolled by ``pixels`` along ``axis_`` with ``np.roll`` and returned
      flattened again.
    * ``point`` given: only the point is shifted. The image argument is not
      touched at all in this mode, so no array work is wasted when the caller
      just needs the translated coordinates.

    Parameters
    ----------
    input_features_inner : array-like
        Flattened image with 64 * 32 elements. Ignored when ``point`` is given.
    pixels : int
        Shift amount; may be negative.
    axis_ : int or tuple of int
        Axis (or axes) handed to ``np.roll``. For points, ``0`` shifts the
        second coordinate, ``1`` shifts the first coordinate, and any other
        value (e.g. ``(0, 1)``) shifts both.
    point : sequence or str, optional
        Two coordinates, either as an indexable sequence or as a string such
        as ``"(x, y)"`` (first and last characters are stripped, the rest is
        split on commas and converted to floats).

    Returns
    -------
    numpy.ndarray or list
        The flattened rolled image when ``point`` is None, otherwise a
        two-element list with the shifted coordinates.

    Notes
    -----
    ``np.roll`` wraps pixels around the image border, whereas the point
    coordinates are shifted without wrapping or clipping.
    """
    if point is not None:
        # Convert point from string to tuple if needed
        if isinstance(point, str):
            point = tuple(map(float, point[1:-1].split(',')))

        if axis_ == 0:
            return [point[0], point[1] + pixels]
        if axis_ == 1:
            return [point[0] + pixels, point[1]]
        return [point[0] + pixels, point[1] + pixels]

    # Image mode: roll on the 2-D grid, then flatten back to the input layout.
    image_2d = np.asarray(input_features_inner).reshape(64, 32)
    return np.roll(image_2d, pixels, axis=axis_).reshape(-1)


# Augment the training set with shifted copies of every sample: each
# combination of shift amount and axis yields one extra copy of the original
# training frame whose image and pointType_* targets are moved by the same
# offset.
# NOTE: this cell reads and then overwrites `df_train`; running it twice
# without re-running the train/test split first augments the already-augmented
# frame again.
if data_augmentation_flag:

    x_or_y_shift_list = [-3, -2, -1, 1, 2, 3]
    axis_to_shift_list = [0, 1, (0, 1)]
    point_type_cols = [f'pointType_{i}' for i in range(0, 22)]

    # Pristine snapshot that every augmented copy is derived from.
    df_train_temp_fixed = df_train.copy()

    # Collect all frames and concatenate once at the end; concatenating inside
    # the loop would re-copy the ever-growing frame on every iteration.
    augmented_frames = [df_train]

    for axis_to_shift in axis_to_shift_list:
        for x_or_y_shift in x_or_y_shift_list:

            print(axis_to_shift,x_or_y_shift)

            df_train_augmented = df_train_temp_fixed.copy()

            # Shift the image of every sample.
            df_train_augmented['input_features'] = df_train_augmented['input_features'].apply(
                lambda image: roll_images(image, x_or_y_shift, axis_to_shift)
            )

            # Shift every target point by the same offset. Pairing the two
            # columns with zip avoids building a Series for each row, which is
            # what a row-wise DataFrame.apply would do.
            for pointType_col in point_type_cols:
                shifted_points = [
                    roll_images(image, x_or_y_shift, axis_to_shift, point=point)
                    for image, point in zip(
                        df_train_augmented['input_features'],
                        df_train_augmented[pointType_col],
                    )
                ]
                df_train_augmented[pointType_col] = pd.Series(
                    shifted_points, index=df_train_augmented.index, dtype=object
                )

            augmented_frames.append(df_train_augmented)

    # Original rows first, then the augmented copies in loop order, re-indexed 0..n-1.
    df_train = pd.concat(augmented_frames, ignore_index=True)

In [None]:
#Extract train and test features and labels

def _features_and_labels(frame):
    """Return ``(features, labels)`` arrays built from one data frame.

    Labels come from every column whose name contains 'pointType'; the nested
    per-point values are stacked into one array and the last two axes are
    merged so that each sample has a single flat target row. Features are the
    stacked ``input_features`` entries.
    """
    label_columns = [name for name in frame.columns if 'pointType' in name]
    stacked = np.asarray(frame[label_columns].values.tolist())
    labels = stacked.reshape(stacked.shape[0], stacked.shape[1] * stacked.shape[2])
    features = np.asarray(frame.input_features.tolist())
    return features, labels


x_train, y_train = _features_and_labels(df_train)
x_test, y_test = _features_and_labels(df_test)

#print the size of train and test data for verification
x_train.shape, y_train.shape , x_test.shape, y_test.shape

In [None]:
'''
train the model using autosklearn automatic hyperparameter tuning using bayesian optimization for regressor problems, 
instead of the traditional grid search approaches, the only parameters you need to change are the time limit per each model (to train) 
and the overall time limit. You could also specify the memory limit (as some models take a lot of memory for big data) and a random seed.
'''

# Search budgets in seconds: five hours overall, 300 s per candidate model.
TOTAL_TIME_BUDGET_S = 5 * 3600
PER_RUN_TIME_BUDGET_S = 2 * 150

automl_settings = dict(
    time_left_for_this_task=TOTAL_TIME_BUDGET_S,
    per_run_time_limit=PER_RUN_TIME_BUDGET_S,
    # NOTE(review): auto-sklearn documents memory_limit in MB, which would make
    # this roughly 488 GB - confirm that is the intended ceiling.
    memory_limit=500000,
    seed=14141,
    metric=autosklearn.metrics.mean_squared_error,
    # resampling_strategy='cv',
    resampling_strategy_arguments={
        "shuffle": True,  # Whether to shuffle before splitting data
        # "folds": 3      # Used in 'cv' based resampling strategies
    },
)

automl = AutoSklearnRegressor(**automl_settings)
automl.fit(x_train, y_train)

In [None]:
# save model
# Persist the fitted AutoSklearnRegressor so it can be reloaded without
# repeating the search, then print the leaderboard of evaluated models.
MODEL_PATH = 'data_averaged_time5hours_perRun300_model.pkl'

with open(MODEL_PATH, 'wb') as model_file:
    pickle.dump(automl, model_file)

print(automl.leaderboard())

In [None]:
# Pretty-print whatever show_models() reports for the fitted estimator.
model_descriptions = automl.show_models()
pprint(model_descriptions, indent=4)

In [None]:
# Predict on the held-out participants and report the mean absolute error
# against the flattened test targets.
predictions = automl.predict(x_test)
test_mae = mean_absolute_error(y_test, predictions)
print("Mean absolute error score:", test_mae)

In [None]:
# Store, per test row, the predicted and the true target vectors as plain
# Python lists in two new columns of df_test.
for column_name, values in (('predictions', predictions), ('y_test', y_test)):
    df_test[column_name] = values.tolist()

In [None]:
# Write the test frame to disk. Only the test data is saved; it is meant for
# visual verification against the original data.
TEST_RESULTS_PATH = "data_averaged_time5hours_perRun300.pkl"

df_test.to_pickle(TEST_RESULTS_PATH)