In [1]:
#Librerías para la carga, análisis y preprocesamiento de datos
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

In [2]:
#Lista de archivos con los datos de los 4 sujetos en las primeras 5 sesiones
list_of_files = ['OpportunityUCIDataset/dataset/S1-ADL1.dat',
                 'OpportunityUCIDataset/dataset/S1-ADL2.dat',
                 'OpportunityUCIDataset/dataset/S1-ADL3.dat',
                 'OpportunityUCIDataset/dataset/S1-Drill.dat',
                 'OpportunityUCIDataset/dataset/S2-ADL1.dat',
                 'OpportunityUCIDataset/dataset/S2-ADL2.dat',
                 'OpportunityUCIDataset/dataset/S2-ADL3.dat',
                 'OpportunityUCIDataset/dataset/S2-Drill.dat',
                 'OpportunityUCIDataset/dataset/S3-ADL1.dat',
                 'OpportunityUCIDataset/dataset/S3-ADL2.dat',
                 'OpportunityUCIDataset/dataset/S3-ADL3.dat',
                 'OpportunityUCIDataset/dataset/S3-Drill.dat']

In [3]:
# Leer columnas del dataset que se ecuentran en otro archivo
col_names = []
with open('col_names.txt','r') as f:
    lines = f.read().splitlines()
    for line in lines:
        col_names.append(line)

In [4]:
# Leer base de datos
df = pd.DataFrame()
for i, file in enumerate(list_of_files):
    print(file,"se está leyendo...")
    file_data = pd.read_table(file, header=None, sep='\s+')
    file_data.columns = col_names      
    df = df._append(file_data, ignore_index=True)
df.reset_index(drop=True, inplace=True)
print("Lectura hecha!")

OpportunityUCIDataset/dataset/S1-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S1-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S1-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S1-Drill.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-Drill.dat se está leyendo...
OpportunityUCIDataset/dataset/S3-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S3-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S3-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S3-Drill.dat se está leyendo...
Lectura hecha!


In [5]:
# Mostrar los primero 5 ejemplos y las dimensiones del dataframe
display("Primeras 5 filas del dataframe", df.head(), "Las dimensiones del dataframe son", df.shape)

'Primeras 5 filas del dataframe'

Unnamed: 0,MILLISEC,Acc-RKN^-accX,Acc-RKN^-accY,Acc-RKN^-accZ,Acc-HIP-accX,Acc-HIP-accY,Acc-HIP-accZ,Acc-LUA^-accX,Acc-LUA^-accY,Acc-LUA^-accZ,...,LOCATION-TAG4-X,LOCATION-TAG4-Y,LOCATION-TAG4-Z,Locomotion,HL_Activity,LL_Left_Arm,LL_Left_Arm_Object,LL_Right_Arm,LL_Right_Arm_Object,ML_Both_Arms
0,0,87.0,975.0,-287.0,11.0,1001.0,163.0,95.0,975.0,152.0,...,5789.0,2907.0,1447.0,0,0,0,0,0,0,0
1,33,124.0,978.0,-389.0,-7.0,1014.0,199.0,124.0,968.0,123.0,...,5789.0,2908.0,1443.0,0,0,0,0,0,0,0
2,67,102.0,996.0,-440.0,-49.0,1024.0,193.0,127.0,1001.0,113.0,...,5789.0,2910.0,1440.0,0,0,0,0,0,0,0
3,100,59.0,861.0,-384.0,-9.0,1023.0,202.0,110.0,1007.0,106.0,...,5789.0,2912.0,1440.0,0,0,0,0,0,0,0
4,133,119.0,946.0,-426.0,-22.0,1026.0,188.0,98.0,1001.0,92.0,...,5791.0,2915.0,1442.0,0,0,0,0,0,0,0


'Las dimensiones del dataframe son'

(494881, 250)

In [6]:
# Segmentación de la base de datos a solo las simples (Parado, caminando, sentado, acostado)
df = df.drop(['HL_Activity','LL_Left_Arm','LL_Left_Arm_Object','LL_Right_Arm',
              'LL_Right_Arm_Object', 'ML_Both_Arms'], axis = 1)
# Se eliminan los ejemplos que no entran en alguna de las 4 actividades
df = df[df['Locomotion'] != 0]

In [7]:
mapping = {1:1, 2:2, 4:3, 5:4}
df['Locomotion'] = df['Locomotion'].map(mapping)

In [8]:
df = df.drop(df.iloc[:,134:243], axis=1)

In [9]:
display("Primeras 5 filas del dataframe", df.head(), "Las dimensiones del dataframe son", df.shape)

'Primeras 5 filas del dataframe'

Unnamed: 0,MILLISEC,Acc-RKN^-accX,Acc-RKN^-accY,Acc-RKN^-accZ,Acc-HIP-accX,Acc-HIP-accY,Acc-HIP-accZ,Acc-LUA^-accX,Acc-LUA^-accY,Acc-LUA^-accZ,...,IMU-R-SHOE-Body_Ay,IMU-R-SHOE-Body_Az,IMU-R-SHOE-AngVelBodyFrameX,IMU-R-SHOE-AngVelBodyFrameY,IMU-R-SHOE-AngVelBodyFrameZ,IMU-R-SHOE-AngVelNavFrameX,IMU-R-SHOE-AngVelNavFrameY,IMU-R-SHOE-AngVelNavFrameZ,IMU-R-SHOE-Compass,Locomotion
2954,98466,-43.0,971.0,-339.0,27.0,988.0,285.0,124.0,1012.0,72.0,...,345.0,-831.0,-29.0,9.0,-24.0,9.0,29.0,-24.0,165.0,1
2955,98499,-33.0,957.0,-347.0,29.0,981.0,268.0,124.0,1002.0,86.0,...,339.0,-835.0,-5.0,15.0,-33.0,15.0,5.0,-33.0,165.0,1
2956,98532,-35.0,966.0,-363.0,17.0,990.0,282.0,124.0,1008.0,62.0,...,345.0,-833.0,-32.0,25.0,47.0,25.0,32.0,47.0,165.0,1
2957,98566,-41.0,951.0,-341.0,24.0,989.0,291.0,124.0,1005.0,61.0,...,349.0,-834.0,-44.0,4.0,46.0,4.0,44.0,46.0,165.0,1
2958,98599,-60.0,969.0,-349.0,33.0,971.0,281.0,123.0,1036.0,67.0,...,340.0,-834.0,-5.0,3.0,-3.0,3.0,5.0,-3.0,165.0,1


'Las dimensiones del dataframe son'

(418377, 135)

In [10]:
# Contar y mostrar la cantidad de datos ausentes en las primeras 20 columnas
num_of_na = df.isna().sum()

print(num_of_na.iloc[:20],'\nTotal de valores faltantes:',num_of_na.sum(),'\nTotal de datos: ', df.shape[0]*df.shape[1])


MILLISEC             0
Acc-RKN^-accX    12445
Acc-RKN^-accY    12445
Acc-RKN^-accZ    12445
Acc-HIP-accX      7214
Acc-HIP-accY      7214
Acc-HIP-accZ      7214
Acc-LUA^-accX    18731
Acc-LUA^-accY    18731
Acc-LUA^-accZ    18731
Acc-RUA_-accX    13522
Acc-RUA_-accY    13522
Acc-RUA_-accZ    13522
Acc-LH-accX      36348
Acc-LH-accY      36348
Acc-LH-accZ      36348
Acc-BACK-accX    18256
Acc-BACK-accY    18256
Acc-BACK-accZ    18256
Acc-RKN_-accX    26045
dtype: int64 
Total de valores faltantes: 1717241 
Total de datos:  56480895


In [11]:
# Manejo de valores nulos por medio de imputación hacia delante
limit = df.shape[1]*0.9
df = df.dropna(axis='rows',thresh = limit)
df.iloc[0] = df.iloc[0].fillna(0)
df = df.ffill()

display(df.isna().sum().sum())

0

In [12]:
df.dtypes

MILLISEC                        int64
Acc-RKN^-accX                 float64
Acc-RKN^-accY                 float64
Acc-RKN^-accZ                 float64
Acc-HIP-accX                  float64
                               ...   
IMU-R-SHOE-AngVelNavFrameX    float64
IMU-R-SHOE-AngVelNavFrameY    float64
IMU-R-SHOE-AngVelNavFrameZ    float64
IMU-R-SHOE-Compass            float64
Locomotion                      int64
Length: 135, dtype: object

In [13]:
# Distribución de clases

locomotion = pd.Series([0,0,0,0],index=['Stan', 'Walk', 'Sit', 'Lie'])
for value in df.loc[:,'Locomotion']:
    if 1 <= value <= 4:
        if value == 1:
            locomotion['Stan'] += 1
        elif value == 2:
            locomotion['Walk'] += 1
        elif value == 3:
            locomotion['Sit'] += 1
        elif value == 4:
            locomotion['Lie'] += 1

display(locomotion)

Stan    208858
Walk    116416
Sit      74803
Lie      11647
dtype: int64

In [14]:
df.describe()

Unnamed: 0,MILLISEC,Acc-RKN^-accX,Acc-RKN^-accY,Acc-RKN^-accZ,Acc-HIP-accX,Acc-HIP-accY,Acc-HIP-accZ,Acc-LUA^-accX,Acc-LUA^-accY,Acc-LUA^-accZ,...,IMU-R-SHOE-Body_Ay,IMU-R-SHOE-Body_Az,IMU-R-SHOE-AngVelBodyFrameX,IMU-R-SHOE-AngVelBodyFrameY,IMU-R-SHOE-AngVelBodyFrameZ,IMU-R-SHOE-AngVelNavFrameX,IMU-R-SHOE-AngVelNavFrameY,IMU-R-SHOE-AngVelNavFrameZ,IMU-R-SHOE-Compass,Locomotion
count,411724.0,411724.0,411724.0,411724.0,411724.0,411724.0,411724.0,411724.0,411724.0,411724.0,...,411724.0,411724.0,411724.0,411724.0,411724.0,411724.0,411724.0,411724.0,411724.0,411724.0
mean,783613.7,-17.697049,828.049572,407.147961,-243.700151,893.647018,87.690489,115.637565,865.234985,270.70386,...,547.411113,-690.77586,7.132873,0.857147,-20.630757,0.857142,-7.13288,-20.630757,15.030588,1.730982
std,492969.8,279.650002,364.201578,428.814465,268.068635,206.68725,289.010541,297.415977,214.933992,303.51904,...,400.271954,335.562333,1291.884964,861.466746,1096.454161,861.46678,1291.884975,1096.454161,70.425637,0.85425
min,27266.0,-2872.0,-1306.0,-6519.0,-2273.0,-756.0,-2465.0,-2231.0,-1829.0,-3866.0,...,-5866.0,-7515.0,-12092.0,-16592.0,-10009.0,-16592.0,-11063.0,-10009.0,-342.0,1.0
25%,402296.0,-135.0,666.0,141.0,-417.0,862.0,-48.0,-53.0,796.0,70.0,...,381.0,-807.0,-47.0,-51.0,-36.0,-51.0,-37.0,-36.0,-8.0,1.0
50%,718826.0,-8.0,951.0,376.0,-258.0,946.0,90.0,117.0,920.0,284.0,...,528.0,-741.0,-2.0,0.0,1.0,0.0,2.0,1.0,3.0,1.0
75%,1055589.0,101.0,1025.0,689.0,-64.0,994.0,224.0,277.0,985.0,446.0,...,584.0,-655.0,37.0,48.0,48.0,48.0,47.0,48.0,31.0,2.0
max,2309877.0,2796.0,3866.0,4582.0,1928.0,2205.0,2306.0,4038.0,3986.0,1933.0,...,7356.0,6043.0,11063.0,11567.0,14771.0,11567.0,12092.0,14771.0,261.0,4.0


In [15]:
# Correlación entre atributos
df.corr()

Unnamed: 0,MILLISEC,Acc-RKN^-accX,Acc-RKN^-accY,Acc-RKN^-accZ,Acc-HIP-accX,Acc-HIP-accY,Acc-HIP-accZ,Acc-LUA^-accX,Acc-LUA^-accY,Acc-LUA^-accZ,...,IMU-R-SHOE-Body_Ay,IMU-R-SHOE-Body_Az,IMU-R-SHOE-AngVelBodyFrameX,IMU-R-SHOE-AngVelBodyFrameY,IMU-R-SHOE-AngVelBodyFrameZ,IMU-R-SHOE-AngVelNavFrameX,IMU-R-SHOE-AngVelNavFrameY,IMU-R-SHOE-AngVelNavFrameZ,IMU-R-SHOE-Compass,Locomotion
MILLISEC,1.000000,-0.039563,0.039927,0.083009,-0.038809,0.063052,-0.153263,-0.021984,0.084537,-0.030588,...,0.041414,-0.002479,0.000404,0.012432,-0.005704,0.012432,-0.000404,-0.005704,-0.004782,-0.057110
Acc-RKN^-accX,-0.039563,1.000000,-0.002409,-0.088756,-0.041148,-0.025912,-0.327030,0.217001,-0.087421,0.241808,...,0.120709,0.213405,-0.018128,0.023561,-0.015550,0.023561,0.018128,-0.015550,-0.029983,0.019465
Acc-RKN^-accY,0.039927,-0.002409,1.000000,-0.495392,-0.134007,0.209282,0.006381,0.218066,0.448214,-0.348197,...,0.161576,-0.174824,0.052957,-0.011097,-0.058270,-0.011097,-0.052958,-0.058270,0.119432,-0.622924
Acc-RKN^-accZ,0.083009,-0.088756,-0.495392,1.000000,-0.087145,0.026407,-0.018228,-0.067198,-0.126454,0.211339,...,-0.023987,0.045663,0.024799,0.004352,0.003567,0.004352,-0.024799,0.003567,-0.132518,0.346413
Acc-HIP-accX,-0.038809,-0.041148,-0.134007,-0.087145,1.000000,0.187508,0.297712,-0.278385,-0.102145,0.292577,...,0.004598,0.049688,-0.022671,-0.021552,0.001927,-0.021552,0.022671,0.001927,0.037523,0.276010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
IMU-R-SHOE-AngVelNavFrameX,0.012432,0.023561,-0.011097,0.004352,-0.021552,-0.007607,-0.013688,0.020980,0.002640,-0.013825,...,-0.047348,0.072568,-0.181447,1.000000,-0.071764,1.000000,0.181446,-0.071764,-0.009432,0.014546
IMU-R-SHOE-AngVelNavFrameY,-0.000404,0.018128,-0.052958,-0.024799,0.022671,-0.029776,-0.013433,-0.012591,-0.052949,0.026577,...,0.074579,-0.134056,-1.000000,0.181446,0.405200,0.181446,1.000000,0.405200,-0.009639,0.003606
IMU-R-SHOE-AngVelNavFrameZ,-0.005704,-0.015550,-0.058270,0.003567,0.001927,-0.000871,-0.020008,-0.014990,-0.022360,0.018734,...,-0.085967,-0.175635,-0.405200,-0.071764,1.000000,-0.071764,0.405200,1.000000,-0.005269,-0.019448
IMU-R-SHOE-Compass,-0.004782,-0.029983,0.119432,-0.132518,0.037523,0.000808,0.022591,0.019236,0.049711,-0.120548,...,-0.007251,-0.061635,0.009639,-0.009432,-0.005269,-0.009432,-0.009639,-0.005269,1.000000,-0.107120


In [16]:
#Histograma de los primeros 20 atributos

df.iloc[:,:30].hist(bins=5,figsize=(20,10))
plt.tight_layout()
#plt.show()
plt.savefig('charts/Histograma.png')
plt.close()

In [17]:
# Gráfica de densidad para la distribución de los atributos
df.iloc[:,1:30].plot(kind='density', subplots=True, layout=(6,5), sharex=False, figsize=(15,15))
#plt.show()
plt.savefig('charts/Gráfica de densidad.png')
plt.close()

In [18]:
# Gráfica de cajas y bigotes para la distribución de los atributos
df.iloc[:,1:30].plot(kind='box',subplots=True,layout=(5,6), sharex=False,figsize=(15,15))
#plt.show()
plt.savefig('charts/Gráfica de cajas y bigotes.png')
plt.close()

In [20]:
# Matriz de correlación 
correlations = df.iloc[:,1:30].corr()
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
#plt.show()
plt.savefig('charts/Matriz de correlacion.png')
plt.close()

In [21]:
# Matriz de dispersión de los atributos
sns.pairplot(df.iloc[:,1:5])
#plt.show()
plt.savefig('charts/Matriz de dispersion.png')
plt.close()

In [22]:
# Escalamiento con MinMaxScaler
scaler = MinMaxScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

In [34]:
df.drop('MILLISEC', axis=1)

Unnamed: 0,Acc-RKN^-accX,Acc-RKN^-accY,Acc-RKN^-accZ,Acc-HIP-accX,Acc-HIP-accY,Acc-HIP-accZ,Acc-LUA^-accX,Acc-LUA^-accY,Acc-LUA^-accZ,Acc-RUA_-accX,...,IMU-R-SHOE-Body_Ay,IMU-R-SHOE-Body_Az,IMU-R-SHOE-AngVelBodyFrameX,IMU-R-SHOE-AngVelBodyFrameY,IMU-R-SHOE-AngVelBodyFrameZ,IMU-R-SHOE-AngVelNavFrameX,IMU-R-SHOE-AngVelNavFrameY,IMU-R-SHOE-AngVelNavFrameZ,IMU-R-SHOE-Compass,Locomotion
0,0.499118,0.440255,0.556707,0.547489,0.588990,0.576399,0.375658,0.488564,0.679083,0.563620,...,0.469747,0.492993,0.520967,0.589545,0.402946,0.589545,0.479033,0.402946,0.840796,0.000000
1,0.500882,0.437548,0.555986,0.547965,0.586626,0.572836,0.375658,0.486844,0.681497,0.561792,...,0.469294,0.492698,0.522004,0.589758,0.402583,0.589758,0.477996,0.402583,0.840796,0.000000
2,0.500529,0.439288,0.554545,0.545108,0.589666,0.575770,0.375658,0.487876,0.677358,0.563985,...,0.469747,0.492846,0.520838,0.590113,0.405811,0.590113,0.479162,0.405811,0.840796,0.000000
3,0.499471,0.436388,0.556526,0.546775,0.589328,0.577657,0.375658,0.487360,0.677186,0.564899,...,0.470050,0.492772,0.520320,0.589368,0.405771,0.589368,0.479680,0.405771,0.840796,0.000000
4,0.496119,0.439869,0.555806,0.548917,0.583249,0.575561,0.375498,0.492691,0.678220,0.563803,...,0.469369,0.492772,0.522004,0.589332,0.403793,0.589332,0.477996,0.403793,0.840796,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411719,0.440896,0.442769,0.656157,0.460129,0.582911,0.491092,0.302281,0.298194,0.845663,0.425777,...,0.486462,0.505532,0.521615,0.589758,0.403349,0.589758,0.478385,0.403349,0.550580,0.333333
411720,0.483945,0.459783,0.637600,0.456796,0.566700,0.484804,0.307226,0.296819,0.842559,0.426691,...,0.486462,0.505606,0.521615,0.589723,0.403390,0.589723,0.478385,0.403390,0.550580,0.333333
411721,0.460656,0.445862,0.591028,0.463223,0.561635,0.495284,0.305790,0.294927,0.836868,0.425960,...,0.486538,0.506564,0.522393,0.588231,0.402986,0.588231,0.477607,0.402986,0.550580,0.333333
411722,0.466831,0.481052,0.583641,0.461319,0.572104,0.493607,0.296219,0.296647,0.842042,0.427788,...,0.486613,0.506417,0.522911,0.587059,0.403107,0.587059,0.477089,0.403107,0.550580,0.333333


In [36]:
section_size = 90
div = df.shape[0] // section_size

columns_sample = pd.MultiIndex.from_product([df.columns, ['mean', 'std', 'max', 'min']], names=['column', 'description'])
df_features = pd.DataFrame(index=range(div), columns=columns_sample)

for i in range(df.shape[1]):
    for j in range(div):
        start_index = j * section_size
        end_index = (j + 1) * section_size
        data_section = df.iloc[start_index:end_index, i]

        df_features.loc[j, (df.columns[i], 'mean')] = data_section.mean()
        df_features.loc[j, (df.columns[i], 'std')] = data_section.std()
        df_features.loc[j, (df.columns[i], 'max')] = data_section.max()
        df_features.loc[j, (df.columns[i], 'min')] = data_section.min()

In [37]:
df_features

column,MILLISEC,MILLISEC,MILLISEC,MILLISEC,Acc-RKN^-accX,Acc-RKN^-accX,Acc-RKN^-accX,Acc-RKN^-accX,Acc-RKN^-accY,Acc-RKN^-accY,...,IMU-R-SHOE-AngVelNavFrameZ,IMU-R-SHOE-AngVelNavFrameZ,IMU-R-SHOE-Compass,IMU-R-SHOE-Compass,IMU-R-SHOE-Compass,IMU-R-SHOE-Compass,Locomotion,Locomotion,Locomotion,Locomotion
description,mean,std,max,min,mean,std,max,min,mean,std,...,max,min,mean,std,max,min,mean,std,max,min
0,0.031842,0.000381,0.032492,0.031192,0.500186,0.006964,0.522054,0.485356,0.437426,0.002542,...,0.410533,0.395561,0.840796,0.0,0.840796,0.840796,0.0,0.0,0.0,0.0
1,0.033156,0.000381,0.033806,0.032507,0.503597,0.015383,0.542343,0.446366,0.438418,0.007741,...,0.407708,0.334665,0.831564,0.007598,0.840796,0.824212,0.0,0.0,0.0,0.0
2,0.034471,0.000381,0.03512,0.033821,0.507247,0.00427,0.517819,0.49753,0.436863,0.002303,...,0.405367,0.401977,0.824526,0.000653,0.825871,0.824212,0.0,0.0,0.0,0.0
3,0.035785,0.000381,0.036435,0.035135,0.511662,0.009954,0.53705,0.459245,0.438107,0.00589,...,0.406618,0.40109,0.82471,0.000764,0.825871,0.824212,0.0,0.0,0.0,0.0
4,0.037099,0.000381,0.037749,0.036449,0.507179,0.010828,0.548871,0.469654,0.438236,0.005181,...,0.419734,0.394512,0.825557,0.00157,0.827529,0.822554,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4569,0.993158,0.000382,0.993808,0.992509,0.51521,0.026954,0.614679,0.413903,0.41171,0.045081,...,0.539992,0.246529,0.538272,0.017802,0.565506,0.512438,0.0,0.0,0.0,0.0
4570,0.994473,0.000382,0.995123,0.993823,0.516516,0.013231,0.538814,0.494001,0.287651,0.012808,...,0.433495,0.392978,0.548959,0.000246,0.55058,0.548922,0.355556,0.334455,0.666667,0.0
4571,0.995787,0.000382,0.996437,0.995137,0.52433,0.002411,0.530346,0.51729,0.283196,0.001859,...,0.405408,0.401735,0.548922,0.0,0.548922,0.548922,0.666667,0.0,0.666667,0.666667
4572,0.997101,0.000382,0.997751,0.996451,0.522216,0.009158,0.58204,0.502117,0.284392,0.003807,...,0.406215,0.343624,0.548296,0.001306,0.548922,0.54063,0.666667,0.0,0.666667,0.666667
