In [1]:
import json
import numpy as np
from scipy import stats
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
# import glmnet_python
# from glmnet_python import glmnet
# from glmnet_python import glmnetPredict
# from glmnet_python import glmnetPlot
# from glmnet_python import cvglmnet
# from glmnet_python import cvglmnetPredict
# from glmnet_python import cvglmnetPlot
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from sklearn.neural_network import MLPClassifier
import codecs
import seaborn as sns
import matplotlib.pyplot as plt
import pingouin as pg

In [2]:
# VARIABLES TO ADJUST
# Tunable parameters of the analysis; edit this cell and re-run everything below it.

# Object family whose rows are kept when the dataset is filtered.
# Valid values: 'Mugs', 'Plates', 'Geometric', 'Cutlery', 'Ball'
family_to_select = 'Plates'
# Name of one of the kinematic channels (it matches an entry of kin_labels).
# NOTE(review): presumably the variable to be binned/predicted - confirm against the cells that use it.
target = 'Middle Abd'
# NOTE(review): presumably the number of bins used to discretise `target` - not used in the cells shown here.
num_bins = 5
# Number of folds passed to StratifiedKFold(n_splits=cv).
cv = 3
# Candidate values tried for C_par: [0.2, 0.5, 1, 1.25, 1.5]
# NOTE(review): presumably the classifier's regularisation strength - confirm where it is consumed.
C_par = 0.2
# NOTE(review): looks like an L1-vs-L2 penalty selector/ratio - confirm where it is consumed.
l1vsl2 = 1

In [3]:
# CONSTANTS

# Object name -> family name. Entries are grouped by family, three objects each.
obj_fam = {
    # Mugs
    'CeramicMug': 'Mugs',
    'Glass': 'Mugs',
    'MetalMug': 'Mugs',
    # Plates
    'CeramicPlate': 'Plates',
    'MetalPlate': 'Plates',
    'PlasticPlate': 'Plates',
    # Geometric
    'Cube': 'Geometric',
    'Cylinder': 'Geometric',
    'Triangle': 'Geometric',
    # Cutlery
    'Fork': 'Cutlery',
    'Knife': 'Cutlery',
    'Spoon': 'Cutlery',
    # Ball
    'PingPongBall': 'Ball',
    'SquashBall': 'Ball',
    'TennisBall': 'Ball',
}

# Column names for the kinematic channels, in the order they are used as
# DataFrame columns.
kin_labels = [
    "Thumb Rotate",
    "Thumb MPJ",
    "Thumb IJ",
    "Index MPJ",
    "Index PIJ",
    "Middle MPJ",
    "Middle PIJ",
    "Ring MPJ",
    "Ring PIJ",
    "Pinkie MPJ",
    "Pinkie PIJ",
    "Palm Arch",
    "Wrist Pitch",
    "Wrist Yaw",
    "Index Abd",
    "Pinkie Abd",
    "Ring Abd",
    "Middle Abd",
    "Thumb Abd",
]

# Column names for the 64 EMG channels: 'emg0' ... 'emg63'.
emg_labels = [f'emg{channel}' for channel in range(64)]

In [4]:
# READING FILES AND CREATING DATASET
file_kin = "./PyData/filtered_data.json"
file_eps = "./PyData/ep_labels.json"
file_task = "./PyData/task_labels.json"
file_emg = "./PyData/emg_data.json"


def _read_json(path):
    """Open ``path`` in text mode and return its parsed JSON content."""
    with open(path, "r") as handle:
        return json.load(handle)


# Nested lists, indexed as noted on each line.
kin_data = _read_json(file_kin)  # kin_data[subjects][trials][joints]
eps = _read_json(file_eps)  # eps[subjects][trials]
task = _read_json(file_task)  # task[subjects][trials]
emg_data = _read_json(file_emg)  # emg_data[subjects][trials][sensors]
        

In [5]:
# VECTORIZING DATA
# Drop the outer (per-subject) nesting level so that every structure becomes
# one flat sequence over all subjects, in subject order.

vectorized_task = [label for subject in task for label in subject]  # Flattened task labels
vectorized_eps = [label for subject in eps for label in subject]  # Flattened EP labels

# Converting to a float array turns every None entry into NaN.
vectorized_kin = np.array(
    [row for subject in kin_data for row in subject],
    dtype=float,
)
vectorized_emg = np.array(
    [row for subject in emg_data for row in subject],
    dtype=float,
)

# The given object is the text before the first "_" of a task label and the
# asked object is the text after it (second "_"-separated field).
task_parts = [label.split("_") for label in vectorized_task]
given_object = [parts[0] for parts in task_parts]  # Given object of every row
ask_object = [parts[1] for parts in task_parts]  # Asked object of every row


In [6]:
# CREATE PANDAS DATAFRAME AND GENERATE TRIAL AND EP INDEX

# Signal columns: the kin_labels channels followed by the emg_labels channels.
aux = np.concatenate([vectorized_kin, vectorized_emg], axis=1)
labs = np.concatenate([kin_labels, emg_labels])

new_df = pd.DataFrame(aux, columns=labs)
new_df['EPs'] = vectorized_eps
new_df['Task'] = vectorized_task
new_df['Given'] = given_object
new_df['Asked'] = ask_object
new_df['Family'] = [obj_fam[x] for x in given_object]

n_rows = len(new_df)

# 'Task number': 0-based counter that increases by one every time the task
# label of a row differs from the label of the previous row. Two consecutive
# runs of rows carrying the same task label are therefore counted as one.
tr = 0
trial = np.zeros((n_rows,), dtype=int)

for i in range(1, n_rows):
    if vectorized_task[i] != vectorized_task[i-1]:
        tr += 1
    trial[i] = tr

new_df['Task number'] = trial

# 'EP number': 0-based counter that restarts at 0 whenever the task label
# changes and otherwise increases by one every time the EP label changes.
# NOTE(review): `eps` is rebound here from the JSON-loaded EP labels to this
# index array (kept for backward compatibility with later cells). Re-running
# the vectorizing cell after this one requires reloading the JSON files first.
ep = 0
eps = np.zeros((n_rows,), dtype=int)

for j in range(1, n_rows):
    if vectorized_task[j] != vectorized_task[j-1]:  # EP back to 0 when the task changes
        ep = 0
    elif vectorized_eps[j] != vectorized_eps[j-1]:
        ep += 1
    eps[j] = ep

new_df['EP number'] = eps

In [7]:
# SELECT DATAPOINTS BY FAMILY

# Boolean row mask: True where the row's 'Family' equals the configured family.
in_family = new_df['Family'] == family_to_select
sel_df = new_df.loc[in_family]

In [8]:
# K-FOLD, Z-SCORE, BIN CREATION, CLASSIFICATION AND STATISTICAL TEST ¿¿¿???

# STRATIFIED K-FOLD CROSS VALIDATION

# Label / bookkeeping columns that must keep their original values (never z-scored).
meta_cols = ['EPs', 'Task', 'Given', 'Asked', 'Family', 'Task number', 'EP number']


def zscore_with_reference(frame, reference, columns):
    """Return a copy of ``frame`` with ``columns`` z-scored using ``reference`` statistics.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to normalise. It is not modified.
    reference : pandas.DataFrame
        Rows from which the per-column mean and standard deviation are taken.
    columns : list
        Names of the columns to normalise; they must exist in both frames.

    Returns
    -------
    pandas.DataFrame
        Same rows, index and column order as ``frame``. Only ``columns`` are
        replaced by ``(value - reference mean) / reference std``.
    """
    ref_mean = reference[columns].mean()
    # ddof=0 (population std) matches the default of scipy.stats.zscore.
    # pandas skips NaN when computing mean/std, so a missing sample stays NaN
    # on its own row instead of turning the whole column into NaN.
    ref_std = reference[columns].std(ddof=0)
    scaled = frame.copy()
    scaled[columns] = (frame[columns] - ref_mean) / ref_std
    return scaled


# NOTE(review): StratifiedKFold only stratifies on 'Given'; the first argument
# is not used to group rows, so rows sharing a 'Task number' can land in both
# train and test. If whole trials must stay together, consider a group-aware
# splitter (e.g. StratifiedGroupKFold with groups=sel_df['Task number']).
skf = StratifiedKFold(n_splits=cv)
for train, test in skf.split(sel_df['Task number'], sel_df['Given']):

    # TRAIN/TEST SPLIT
    train_df = sel_df.iloc[train]
    test_df = sel_df.iloc[test]

    # Z-SCORE NORMALIZATION
    # Only the numeric signal columns are normalised; the columns in meta_cols
    # keep their original values and the original column order is preserved.
    numeric_cols = train_df.select_dtypes(include=[np.number]).columns
    feature_cols = [col for col in numeric_cols if col not in meta_cols]

    # Both folds are scaled with the TRAINING fold's mean/std, so the test rows
    # are expressed on the same scale the model is fitted on and no statistic
    # computed from the test fold is used.
    train_z = zscore_with_reference(train_df, train_df, feature_cols)
    test_z = zscore_with_reference(test_df, train_df, feature_cols)
    
    
    
    
    