In [66]:
#Librerías para la carga, análisis y preprocesamiento de datos
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

In [67]:
# Train/test split by subject: subjects 1 and 2 are used for training,
# subject 3 is held out for testing. Every subject has three ADL runs and one Drill run.
_dataset_dir = 'OpportunityUCIDataset/dataset'
_runs = ('ADL1', 'ADL2', 'ADL3', 'Drill')

training_files = [
    f'{_dataset_dir}/S{subject}-{run}.dat'
    for subject in (1, 2)
    for run in _runs
]
test_files = [f'{_dataset_dir}/S3-{run}.dat' for run in _runs]

In [68]:
# Load the dataset's column names, which are stored one per line in a separate file
col_names = []
with open('col_names.txt', 'r') as names_file:
    col_names.extend(names_file.read().splitlines())

In [69]:
def load_dat_files(files, column_names):
    """Read whitespace-delimited data files and stack them into one DataFrame.

    Parameters
    ----------
    files : iterable of str
        Paths of the files to read, in the order they should be stacked.
    column_names : list of str
        Column labels assigned to every file; pandas raises ``ValueError``
        if a file's column count does not match.

    Returns
    -------
    pandas.DataFrame
        All rows of all files with a fresh 0..n-1 index, or an empty frame
        when ``files`` is empty.
    """
    frames = []
    for file in files:
        print(file,"se está leyendo...")
        # Raw string: '\s' is not a valid escape sequence in a normal string literal.
        file_data = pd.read_table(file, header=None, sep=r'\s+')
        file_data.columns = column_names
        frames.append(file_data)
    if not frames:
        return pd.DataFrame()
    # Concatenate once at the end instead of re-copying the growing frame per file.
    return pd.concat(frames, ignore_index=True)


# Read training data
df_training = load_dat_files(training_files, col_names)
print("Lectura hecha!")

# Read test data
df_test = load_dat_files(test_files, col_names)
print("Lectura hecha!")

OpportunityUCIDataset/dataset/S1-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S1-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S1-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S1-Drill.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-Drill.dat se está leyendo...
Lectura hecha!
OpportunityUCIDataset/dataset/S3-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S3-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S3-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S3-Drill.dat se está leyendo...
Lectura hecha!


In [70]:
def segmentation_to_simple_activities(df):
    """Reduce the dataset to the simple locomotion activities.

    Drops the other annotation tracks (high-level activity, per-arm
    low-level labels and the both-arms mid-level label) and removes every
    row whose 'Locomotion' label is 0, i.e. rows that belong to none of the
    four activities (standing, walking, sitting, lying).

    Returns a new DataFrame; the original row index is preserved.
    """
    other_label_tracks = [
        'HL_Activity',
        'LL_Left_Arm',
        'LL_Left_Arm_Object',
        'LL_Right_Arm',
        'LL_Right_Arm_Object',
        'ML_Both_Arms',
    ]
    locomotion_only = df.drop(columns=other_label_tracks)
    has_activity = locomotion_only['Locomotion'] != 0
    return locomotion_only.loc[has_activity]

In [71]:
# Keep only the locomotion label track and the rows that carry a locomotion label.
df_training = df_training.pipe(segmentation_to_simple_activities)
df_test = df_test.pipe(segmentation_to_simple_activities)

In [72]:
def mapping_new_labels(df):
    """Re-encode the 'Locomotion' labels as the consecutive codes 1..4.

    The original codes 1, 2, 4 and 5 become 1, 2, 3 and 4 respectively.
    Any other value maps to NaN (``Series.map`` behaviour for missing keys).

    The caller's frame is left untouched: the mapping is applied to a copy.
    This matters because the input is typically a filtered slice of another
    frame, where an in-place column assignment is ambiguous and can trigger
    pandas' chained-assignment warning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Locomotion' column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the re-encoded 'Locomotion' column.
    """
    mapping = {1: 1, 2: 2, 4: 3, 5: 4}
    relabeled = df.copy()
    relabeled['Locomotion'] = relabeled['Locomotion'].map(mapping)
    return relabeled

In [73]:
# Re-encode the locomotion labels of both splits.
df_training = df_training.pipe(mapping_new_labels)
df_test = df_test.pipe(mapping_new_labels)

In [74]:
def cut_no_body_sensors(df):
    """Keep only the features that come from sensors worn on the body.

    Three groups of columns are removed, in this order:
    1. the columns at positions 134..242 (the non-body sensor channels),
    2. the two shoe compass columns,
    3. every remaining column whose name contains 'Quaternion'.

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    # Step 1: positional block of non-body channels.
    off_body_columns = df.columns[134:243]
    body_only = df.drop(columns=off_body_columns)

    # Step 2: compass readings of both shoes.
    body_only = body_only.drop(columns=['IMU-L-SHOE-Compass', 'IMU-R-SHOE-Compass'])

    # Step 3: quaternion channels.
    quaternion_columns = [name for name in body_only.columns if 'Quaternion' in name]
    return body_only.drop(columns=quaternion_columns)

In [75]:
# Restrict both splits to the on-body sensor features.
df_training = df_training.pipe(cut_no_body_sensors)
df_test = df_test.pipe(cut_no_body_sensors)

In [76]:
def handle_missing_values(df):
    """Drop sparse rows and impute the remaining gaps by forward fill.

    Steps:
    1. Drop every row that has fewer non-null values than 90% of the
       number of columns.
    2. Replace the nulls of the first remaining row with 0, so that the
       forward fill has a starting value in every column.
    3. Forward-fill all remaining nulls.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame without nulls. If no row survives step 1 the empty
        frame is returned as is (indexing its first row would raise
        ``IndexError``).
    """
    min_non_null = df.shape[1]*0.9
    # Explicit copy: the row assignment below must not target a view of `df`.
    cleaned = df.dropna(axis='rows', thresh=min_non_null).copy()
    if len(cleaned) == 0:
        return cleaned
    cleaned.iloc[0] = cleaned.iloc[0].fillna(0)
    return cleaned.ffill()

In [77]:
# Remove sparse rows and impute the remaining gaps in both splits.
df_training = df_training.pipe(handle_missing_values)
df_test = df_test.pipe(handle_missing_values)

In [78]:
# Class distribution
def class_distribution(df, label_col='Locomotion'):
    """Print how many rows of ``df`` belong to each locomotion class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the re-encoded labels (1..4).
    label_col : str, default 'Locomotion'
        Name of the label column to count. The default keeps the original
        behaviour; pass another name to count a different label column.

    The counts are printed as a Series indexed by 'Stan', 'Walk', 'Sit' and
    'Lie' (codes 1, 2, 3 and 4). Values outside 1..4 are ignored.
    Nothing is returned.
    """
    labels = df.loc[:, label_col]
    class_codes = {'Stan': 1, 'Walk': 2, 'Sit': 3, 'Lie': 4}
    # One vectorized comparison per class instead of a Python loop over all rows.
    locomotion = pd.Series(
        [int((labels == code).sum()) for code in class_codes.values()],
        index=list(class_codes),
    )
    print(locomotion)

In [142]:
# Working frame for feature extraction: columns 1..112 of the cleaned training data.
# Column 0 is left out (presumably the timestamp; TODO confirm against col_names.txt).
# The windowing cell reads position 111 of this frame as the label and the
# leading positions as consecutive x/y/z sensor triplets.
df = df_training.iloc[:,1:113]
# Display (rows, columns) as the cell output.
df.shape

(265684, 112)

In [143]:
window_size = 90       # rows per window
total_sensors = 37     # number of consecutive x/y/z column triplets in `df`
label_column = 111     # position of the locomotion label column in `df`
total_samples = df.shape[0]//window_size


def extract_window_features(data, window_size, total_sensors, label_column):
    """Compute per-window magnitude features for every three-axis sensor.

    The rows of ``data`` are cut into consecutive, non-overlapping windows
    of ``window_size`` rows; trailing rows that do not fill a whole window
    are ignored. The stride always equals ``window_size``.

    For each sensor ``s`` the columns at positions ``3*s``, ``3*s + 1`` and
    ``3*s + 2`` are treated as its x, y and z axes. The per-row magnitude
    ``sqrt(x**2 + y**2 + z**2)`` is computed once for the whole column
    block and then reduced per window to three features:

    * ``'MM <sensor>'``   - mean of the magnitude,
    * ``'StdM <sensor>'`` - population standard deviation (ddof=0),
    * ``'AUCM <sensor>'`` - sum of the magnitude.

    ``<sensor>`` is the name of the x-axis column without its last
    character. NaN magnitudes are skipped by all three reductions.

    Parameters
    ----------
    data : pandas.DataFrame
        Sensor columns followed by the label column.
    window_size : int
        Number of rows per window.
    total_sensors : int
        Number of x/y/z triplets, starting at column position 0.
    label_column : int
        Position of the label column; its per-window mode (smallest value
        on ties) is stored under ``'Mode Locomotion'``.

    Returns
    -------
    dict of str -> list
        One list per feature with one entry per window, starting with
        ``'Mode Locomotion'``.
    """
    n_windows = data.shape[0] // window_size
    used_rows = n_windows * window_size
    window_starts = range(0, used_rows, window_size)

    labels = data.iloc[:used_rows, label_column]
    features = {
        'Mode Locomotion': [
            labels.iloc[start:start + window_size].mode()[0]
            for start in window_starts
        ],
    }

    for sensor in range(total_sensors):
        first_col = 3 * sensor
        sensor_name = data.columns[first_col][:-1]

        axes = data.iloc[:used_rows, first_col:first_col + 3].to_numpy(dtype=float)
        magnitude = np.sqrt(axes[:, 0]**2 + axes[:, 1]**2 + axes[:, 2]**2)
        # One row per window, so every reduction runs along axis 1.
        windows = magnitude.reshape(n_windows, window_size)

        per_window = {
            'MM ' + sensor_name: np.nanmean(windows, axis=1),
            'StdM ' + sensor_name: np.nanstd(windows, axis=1),
            'AUCM ' + sensor_name: np.nansum(windows, axis=1),
        }
        for feature_name, values in per_window.items():
            features.setdefault(feature_name, []).extend(values.tolist())

    return features


resultados_caracteristicas = extract_window_features(
    df, window_size, total_sensors, label_column
)
resultados_df = pd.DataFrame(resultados_caracteristicas)


In [144]:
# Show the per-window feature table (pandas truncates the rendered rows and columns).
resultados_df

Unnamed: 0,Mode Locomotion,MM Acc-RKN^-acc,StdM Acc-RKN^-acc,AUCM Acc-RKN^-acc,MM Acc-HIP-acc,StdM Acc-HIP-acc,AUCM Acc-HIP-acc,MM Acc-LUA^-acc,StdM Acc-LUA^-acc,AUCM Acc-LUA^-acc,...,AUCM IMU-R-SHOE-Nav_A,MM IMU-R-SHOE-Body_A,StdM IMU-R-SHOE-Body_A,AUCM IMU-R-SHOE-Body_A,MM IMU-R-SHOE-AngVelBodyFrame,StdM IMU-R-SHOE-AngVelBodyFrame,AUCM IMU-R-SHOE-AngVelBodyFrame,MM IMU-R-SHOE-AngVelNavFrame,StdM IMU-R-SHOE-AngVelNavFrame,AUCM IMU-R-SHOE-AngVelNavFrame
0,1,1027.437893,12.862027,92469.410363,1023.749907,17.292761,92137.491606,1019.523162,28.178790,91757.084588,...,6349.772371,930.370494,5.675392,83733.344483,68.062319,78.679157,6125.608676,68.062319,78.679157,6125.608676
1,1,1030.797048,32.340220,92771.734338,1026.686019,32.108779,92401.741720,1016.368618,32.833230,91473.175630,...,8078.425547,936.689989,27.494214,84302.098980,139.340285,299.155973,12540.625666,139.340285,299.155973,12540.625666
2,1,1026.393735,10.819752,92375.436167,1024.393975,11.098821,92195.457757,1016.316225,15.974573,91468.460222,...,6273.085764,930.564861,2.769887,83750.837516,35.550837,16.257108,3199.575299,35.550837,16.257108,3199.575299
3,1,1030.919520,22.513172,92782.756816,1025.214879,19.960983,92269.339107,1017.368089,20.407085,91563.128017,...,6271.373822,931.352408,3.636682,83821.716720,54.513498,40.875566,4906.214825,54.513498,40.875566,4906.214825
4,1,1030.558733,18.705833,92750.286003,1025.579803,19.310668,92302.182231,1019.392551,22.860554,91745.329581,...,6849.355505,930.320984,10.297281,83728.888531,141.807470,159.031120,12762.672255,141.807470,159.031120,12762.672255
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2947,1,1044.679218,10.354198,94021.129613,1021.242083,10.162436,91911.787446,1019.453861,10.359746,91750.847532,...,7062.607634,922.002863,7.091342,82980.257677,45.915926,30.272191,4132.433306,45.915926,30.272191,4132.433306
2948,1,1053.664813,103.959847,94829.833209,1005.747558,76.738763,90517.280254,999.170945,83.076944,89925.385034,...,18625.005566,945.027619,199.403443,85052.485687,641.420892,1254.828419,57727.880276,641.420892,1254.828419,57727.880276
2949,3,1024.479356,58.182606,92203.142049,1038.161503,73.022475,93434.535304,1034.771639,83.627878,93129.447503,...,7709.971676,924.857712,11.098631,83237.194061,72.711103,109.147688,6543.999227,72.711103,109.147688,6543.999227
2950,3,1042.547212,10.474248,93829.249118,1023.691061,9.927004,92132.195470,1017.525332,11.781785,91577.279853,...,7145.681063,920.836826,3.163177,82875.314382,40.217115,20.153999,3619.540319,40.217115,20.153999,3619.540319


In [147]:

# Class distribution of the windowed data: count the windows whose
# label (first column of the feature table) equals each class code.
mode_labels = resultados_df.iloc[:, 0]
locomotion = pd.Series(
    [int((mode_labels == code).sum()) for code in (1, 2, 3, 4)],
    index=['Stan', 'Walk', 'Sit', 'Lie'],
)
print(locomotion)

Stan    1381
Walk     890
Sit      610
Lie       71
dtype: int64
