In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import sklearn.metrics as skmetrics
from scipy import stats
from math import sqrt

def tidy_corr_matrix(corr_mat) -> pd.DataFrame:
    """
    Convert a pandas correlation matrix into tidy (long) format.

    Parameters
    ----------
    corr_mat : pd.DataFrame
        Square correlation matrix, e.g. the result of ``DataFrame.corr()``.

    Returns
    -------
    pd.DataFrame
        One row per ordered pair of distinct variables, with columns
        ``A``, ``B``, ``Corr`` and ``|Corr|``. Rows are sorted by ``|Corr|``
        in descending order and the index is reset to 0..n-1. Both (A, B)
        and (B, A) are kept, so a symmetric matrix yields each pair twice.
    """
    tidy = corr_mat.stack().reset_index()
    tidy.columns = ['A', 'B', 'Corr']

    # Drop the diagonal (a variable paired with itself). The explicit copy
    # ensures the column added below is written to an independent frame and
    # not to a slice of the stacked one (avoids SettingWithCopyWarning).
    tidy = tidy.loc[tidy['A'] != tidy['B'], :].copy()
    tidy['|Corr|'] = tidy['Corr'].abs()

    return tidy.sort_values('|Corr|', ascending=False, ignore_index=True)

def show_metrics(y_test, predictions):
    """
    Print regression metrics comparing ``predictions`` against ``y_test``.

    Computes R2, mean absolute error (MAE), mean squared error (MSE) and
    root mean squared error (RMSE, the square root of MSE) and prints one
    line per metric. Nothing is returned.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    predictions : array-like
        Predicted values, passed to the metric functions alongside ``y_test``.
    """
    r2s = skmetrics.r2_score(y_test, predictions)
    mae = skmetrics.mean_absolute_error(y_test, predictions)
    mse = skmetrics.mean_squared_error(y_test, predictions)
    rmse = sqrt(mse)

    print(f"R2 Score = {r2s}")
    # The label previously read "MAR"; the value is the mean absolute error.
    print(f"     MAE = {mae}")
    print(f"     MSE = {mse}")
    print(f"    RMSE = {rmse}")

# Side labels and coordinate axes used to build the landmark column names.
sides = ['Left', 'Right']
attrs = ['x', 'y']

# Flat list of column names "<side>_<landmark index>_<axis>", ordered by
# side, then landmark index (0-20), then axis.
keys = [
    f"{hand}_{landmark}_{axis}"
    for hand in sides
    for landmark in range(21)
    for axis in attrs
]


def extract_row(df: pd.DataFrame, index: int) -> pd.DataFrame:
    """
    Expand one recording stored in ``df['data']`` into a per-frame table.

    The cell at position ``index`` of ``df['data']`` is turned into a
    DataFrame; it must provide a ``'timestamp'`` column and a ``'data'``
    column whose entries support ``.get(key, default)`` (presumably dicts
    mapping landmark keys to coordinates; confirm against the JSON schema).

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``'data'`` column holding one recording per row.
    index : int
        Positional (``iloc``) index of the recording to expand.

    Returns
    -------
    pd.DataFrame
        One row per frame, with one column per entry of the module-level
        ``keys`` list followed by a ``'timestamp'`` column. Keys missing in
        a frame are filled from the previous frame, and leading gaps are
        filled from the next available frame. A column that is missing in
        every frame stays NaN.
    """
    frames = pd.DataFrame(df['data'].iloc[index])

    # Build all landmark columns in one constructor call rather than
    # inserting them one by one into an empty frame. Keys that a frame does
    # not contain become NaN.
    row_data = pd.DataFrame(
        [{key: frame.get(key, np.nan) for key in keys} for frame in frames['data']],
        columns=keys,
        index=frames.index,
    )
    row_data['timestamp'] = frames['timestamp']

    # ``fillna(method=...)`` is deprecated in recent pandas; the dedicated
    # ffill/bfill methods perform the same forward-then-backward fill.
    return row_data.ffill().bfill()

def _high_corr_fraction(frame: pd.DataFrame, threshold: float = 0.7) -> float:
    """Return the share of distinct column pairs of ``frame`` with |correlation| above ``threshold``."""
    pairs = tidy_corr_matrix(frame.corr())
    if pairs.shape[0] == 0:
        # No column pairs to compare; avoid dividing by zero.
        return float('nan')
    return int((pairs['|Corr|'] > threshold).sum()) / pairs.shape[0]


def normalize(df: pd.DataFrame, index, WIDTH, HEIGHT, verbose: bool = True) -> pd.DataFrame:
    """
    Extract one recording and re-express its landmarks relative to landmark 0.

    Steps, applied independently to each side in ``sides``:

    1. Multiply every ``*_x`` column by ``WIDTH`` and every ``*_y`` column by
       ``HEIGHT``.
    2. Compute a scale as the Euclidean distance between landmark 0 and
       landmark 12 in the FIRST frame only (the original code names these
       ``wrist`` and ``middle``; presumably hand-tracking landmarks, TODO
       confirm against the data source).
    3. Replace landmarks 1..20 with ``(value - landmark 0) / scale``, using
       the per-frame landmark 0 position.
    4. Divide landmark 0 itself by ``WIDTH`` / ``HEIGHT`` again, undoing
       step 1 for that landmark only.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``'data'`` column, as accepted by ``extract_row``.
    index : int
        Positional index of the recording to normalize.
    WIDTH, HEIGHT : number
        Factors applied to the x and y coordinates respectively.
    verbose : bool, default True
        When True, print the fraction of column pairs whose absolute
        correlation exceeds 0.7 before and after normalization.

    Returns
    -------
    pd.DataFrame
        The per-frame table from ``extract_row`` with normalized columns.
        The ``'timestamp'`` column is left unchanged.
    """
    row = extract_row(df, index)

    before = _high_corr_fraction(row)

    # Suffix test rather than a substring test, so the choice of factor does
    # not depend on whether the side label happens to contain an 'x'.
    for key in keys:
        row[key] *= WIDTH if key.endswith('_x') else HEIGHT

    for side in sides:
        # Reference positions are taken from the first frame only.
        wrist = {attr: row[f"{side}_0_{attr}"].iloc[0] for attr in attrs}
        middle = {attr: row[f"{side}_12_{attr}"].iloc[0] for attr in attrs}
        # NOTE: a zero or NaN scale is not guarded against; it propagates
        # inf/NaN into that side's columns.
        scale = sqrt((middle['x'] - wrist['x'])**2 + (middle['y'] - wrist['y'])**2)

        for attr in attrs:
            # Landmark 0 is untouched inside this loop (i starts at 1), so
            # its column can be looked up once per axis.
            origin = row[f"{side}_0_{attr}"]
            for i in range(1, 21):
                key = f"{side}_{i}_{attr}"
                row[key] = (row[key] - origin) / scale

        # Landmark 0 keeps its absolute position, scaled back by WIDTH/HEIGHT.
        row[f"{side}_0_x"] = row[f"{side}_0_x"] / WIDTH
        row[f"{side}_0_y"] = row[f"{side}_0_y"] / HEIGHT

    after = _high_corr_fraction(row)

    if verbose:
        print(f"Columns with high correlation >.7 before: {before}")
        print(f"Columns with high correlation >.7 after normalization: {after}")

    return row

In [3]:
# Load the recordings and derive the label from the second '/'-separated
# component of each file path; the path and score columns are then dropped.
df = (
    pd.read_json('useful.json')
    .assign(target=lambda frame: frame['filename'].str.split('/').str[1])
    .drop(columns=['filename', 'score'])
)

print(df.shape)
df.head()

(116, 2)


Unnamed: 0,data,target
75,"[{'timestamp': 0, 'data': {'Left_0_x': 0.66563...",Zapato
113,"[{'timestamp': 0, 'data': {'Right_0_x': 0.4155...",Niño
123,"[{'timestamp': 0, 'data': {'Left_0_x': 0.68489...",Niño
124,"[{'timestamp': 0, 'data': {'Left_0_x': 0.68459...",Niño
126,"[{'timestamp': 0, 'data': {'Left_0_x': 0.69289...",Niño


In [203]:
# Number of recordings per target label, most frequent first.
label_counts = df['target'].value_counts()
label_counts

target
Borrar      52
Encender    27
Avión       22
Mal          7
Niño         5
Abrir        2
Zapato       1
Name: count, dtype: int64

In [195]:
# Values passed to normalize() as WIDTH and HEIGHT.
FRAME_WIDTH = 512
FRAME_HEIGHT = 512

# Normalize every recording; each cell of 'norm' holds that recording's
# per-frame DataFrame.
df['norm'] = [
    normalize(df, position, FRAME_WIDTH, FRAME_HEIGHT)
    for position in range(len(df))
]

Columns with high correlation >.7 before: 0.611764705882353
Columns with high correlation >.7 after normalization: 0.31736694677871147
Columns with high correlation >.7 before: 0.49411764705882355
Columns with high correlation >.7 after normalization: 0.3557422969187675
Columns with high correlation >.7 before: 0.4050420168067227
Columns with high correlation >.7 after normalization: 0.2064425770308123
Columns with high correlation >.7 before: 0.43221288515406164
Columns with high correlation >.7 after normalization: 0.2280112044817927
Columns with high correlation >.7 before: 0.388515406162465
Columns with high correlation >.7 after normalization: 0.2571428571428571
Columns with high correlation >.7 before: 0.3641456582633053
Columns with high correlation >.7 after normalization: 0.26666666666666666
Columns with high correlation >.7 before: 0.3619047619047619
Columns with high correlation >.7 after normalization: 0.3420168067226891
Columns with high correlation >.7 before: 0.358823529

In [196]:
# Keep only the label and the normalized per-frame table of each recording.
df = df.loc[:, ['target', 'norm']]
df.head()

Unnamed: 0,target,norm
75,Zapato,Left_0_x Left_0_y Left_1_x Left_1_y L...
113,Niño,Left_0_x Left_0_y Left_1_x Left_1_y L...
123,Niño,Left_0_x Left_0_y Left_1_x Left_1_y L...
124,Niño,Left_0_x Left_0_y Left_1_x Left_1_y L...
126,Niño,Left_0_x Left_0_y Left_1_x Left_1_y L...


In [202]:
# Show the normalized per-frame table of the first recording.
df['norm'].iat[0]

Unnamed: 0,Left_0_x,Left_0_y,Left_1_x,Left_1_y,Left_2_x,Left_2_y,Left_3_x,Left_3_y,Left_4_x,Left_4_y,...,Right_16_y,Right_17_x,Right_17_y,Right_18_x,Right_18_y,Right_19_x,Right_19_y,Right_20_x,Right_20_y,timestamp
0,0.665638,0.853088,-0.222893,0.042188,-0.347041,0.225318,-0.416905,0.402914,-0.485756,0.535345,...,0.845043,0.100688,0.466895,0.198364,0.620349,0.275103,0.703793,0.336523,0.765314,0
1,0.667796,0.852059,-0.227135,0.030394,-0.349200,0.224994,-0.420385,0.412852,-0.491470,0.556443,...,0.845043,0.100688,0.466895,0.198364,0.620349,0.275103,0.703793,0.336523,0.765314,33
2,0.667141,0.851356,-0.229280,0.040806,-0.348483,0.233142,-0.417725,0.417338,-0.487692,0.561047,...,0.814437,0.128446,0.442308,0.222959,0.582519,0.298043,0.657826,0.359503,0.711222,66
3,0.667667,0.851346,-0.230219,0.040259,-0.348810,0.232987,-0.416740,0.417714,-0.488075,0.561663,...,0.810490,0.123870,0.441990,0.220938,0.587561,0.297804,0.662779,0.361614,0.714817,100
4,0.667420,0.851474,-0.228258,0.043191,-0.345513,0.236078,-0.413121,0.420167,-0.484514,0.561090,...,0.806894,0.135059,0.436411,0.229593,0.574431,0.305181,0.648928,0.369734,0.700475,133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,0.653788,0.855341,-0.227815,0.021638,-0.366020,0.207181,-0.442780,0.394750,-0.520580,0.541165,...,0.974026,-0.175893,0.492514,-0.232614,0.637602,-0.285568,0.725169,-0.338406,0.789663,4833
144,0.653998,0.856029,-0.225916,0.021956,-0.363705,0.204376,-0.441380,0.388837,-0.516648,0.534558,...,0.974026,-0.175893,0.492514,-0.232614,0.637602,-0.285568,0.725169,-0.338406,0.789663,4866
145,0.654706,0.855676,-0.225586,0.020039,-0.364245,0.206424,-0.446537,0.395251,-0.523746,0.543084,...,0.974026,-0.175893,0.492514,-0.232614,0.637602,-0.285568,0.725169,-0.338406,0.789663,4900
146,0.654105,0.856181,-0.220298,0.027976,-0.353580,0.210758,-0.434509,0.398268,-0.513328,0.544591,...,0.974026,-0.175893,0.492514,-0.232614,0.637602,-0.285568,0.725169,-0.338406,0.789663,4933


In [200]:
# Write the label / normalized-table frame to disk as JSON.
df.to_json(path_or_buf="clean.json")