## Import packages

In [1]:
import os
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
import warnings


warnings.filterwarnings("ignore")

## Check the dataset files

In [2]:
# locate the directory
DIRECTORY = "hopkins_export/"
arr = os.listdir(DIRECTORY)

# see example dataset
subject = pd.read_csv(DIRECTORY + arr[0])
subject.head()

Unnamed: 0,expected_time,flip_time,stim_pos,user_pos,lambda_val,change_rate_x,ecg_rri,ecg_task_ts,ecg_heart_rate,ecg_rri_mean_5,ecg_rri_mean_10,ecg_rri_std_5,ecg_rri_std_10,eda_filtered,eda_peaks,resp_filtered,resp_resp_rate,abs_motion_filtered,emg1_filtered,emg2_filtered
0,6.10841,6.118486,-0.005,3e-05,0.15828,-3.8e-05,0.876029,6.107453,68.000221,0.852864,0.850007,0.017685,0.017734,6.960428,370703.0,-0.067922,0.311401,0.11606,-0.001082,0.003431
1,6.143708,6.151872,-0.00511,-0.035327,0.211694,-0.000111,0.878219,6.140838,67.914834,0.853381,0.850177,0.018323,0.018035,6.957642,370703.254359,-0.054131,0.311246,0.115802,0.000133,0.000611
2,6.177325,6.185258,-0.005972,-0.03347,0.210798,-0.000861,0.880394,6.174223,67.83099,0.853916,0.850359,0.018956,0.018348,6.954874,370703.517973,-0.040055,0.311095,0.131065,-0.001946,0.00045
3,6.21031,6.218644,-0.006795,-0.033076,0.211674,-0.000832,0.882552,6.207608,67.748688,0.854468,0.850551,0.019584,0.018672,6.95212,370703.79084,-0.025745,0.310949,0.169582,-0.00183,0.003452
4,6.243899,6.252031,-0.007646,-0.03261,0.212244,-0.000844,0.884694,6.240992,67.667928,0.855037,0.850755,0.020207,0.019008,6.949382,370704.072962,-0.011254,0.310808,0.292291,0.006188,-0.005109


## Preprocess the dataset

In [3]:
# the list of columns to be dropped
drop_ls = [
    "expected_time",
    "flip_time",
    "stim_pos",
    "user_pos",
    "lambda_val",
    "change_rate_x",
]

# the feature matrix in array form
X = subject.drop(columns=drop_ls).to_numpy()
print("The shape of X:", X.shape)

The shape of X: (17790, 14)


In [4]:
print(X[:3])  # 3 entries as example

[[ 8.76029453e-01  6.10745281e+00  6.80002205e+01  8.52863608e-01
   8.50007183e-01  1.76847055e-02  1.77337879e-02  6.96042791e+00
   3.70703000e+05 -6.79220341e-02  3.11401276e-01  1.16059790e-01
  -1.08246307e-03  3.43132671e-03]
 [ 8.78219475e-01  6.14083771e+00  6.79148341e+01  8.53381336e-01
   8.50177312e-01  1.83225372e-02  1.80349152e-02  6.95764227e+00
   3.70703254e+05 -5.41308891e-02  3.11245944e-01  1.15802469e-01
   1.33204485e-04  6.11424174e-04]
 [ 8.80393509e-01  6.17422261e+00  6.78309899e+01  8.53916205e-01
   8.50358736e-01  1.89555576e-02  1.83475976e-02  6.95487357e+00
   3.70703518e+05 -4.00547234e-02  3.11095287e-01  1.31065021e-01
  -1.94580695e-03  4.49765341e-04]]


In [5]:
# calculate the 3 labels we are interested in
y_pos_dif = (
    subject["user_pos"].to_numpy() - subject["stim_pos"].to_numpy()
)  # position difference
y_pos = subject["stim_pos"].to_numpy()  # position
y_speed = subject["change_rate_x"].to_numpy()  # speed of change

In [6]:
y =   y_speed# choose the interested y

## Divide the dataset into training and test

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X[:100], y[:100], test_size=0.2, random_state=23
)  # only 100 samples are used here

## Fit and evaluate the ridge regression model

In [8]:
# modify hyperparameters as seen fit
# list on sklearn website
rr = Ridge(random_state=23)


rr.fit(X_train, y_train)

In [9]:
r2_score(rr.predict(X_test), y_test)

-0.05041833450506106

## Score is too low? How to improve it?