In [1]:
#Librerías para la carga, análisis y preprocesamiento de datos
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [2]:
# División de los datos de entrenamiento y prueba
training_files_1 = ['OpportunityUCIDataset/dataset/S1-ADL1.dat',
                 'OpportunityUCIDataset/dataset/S1-ADL2.dat',
                 'OpportunityUCIDataset/dataset/S1-ADL3.dat',
                 'OpportunityUCIDataset/dataset/S1-Drill.dat']
training_files_2 = ['OpportunityUCIDataset/dataset/S2-ADL1.dat',
                 'OpportunityUCIDataset/dataset/S2-ADL2.dat',
                 'OpportunityUCIDataset/dataset/S2-ADL3.dat',
                 'OpportunityUCIDataset/dataset/S2-Drill.dat']
training_files_3 = ['OpportunityUCIDataset/dataset/S3-ADL1.dat',
                'OpportunityUCIDataset/dataset/S3-ADL2.dat',
                'OpportunityUCIDataset/dataset/S3-ADL3.dat',
                'OpportunityUCIDataset/dataset/S3-Drill.dat']
test_files = ['OpportunityUCIDataset/dataset/S4-ADL1.dat',
                'OpportunityUCIDataset/dataset/S4-ADL2.dat',
                'OpportunityUCIDataset/dataset/S4-ADL3.dat',
                'OpportunityUCIDataset/dataset/S4-Drill.dat']

In [3]:
# Leer columnas del dataset que se ecuentran en otro archivo
col_names = []
with open('col_names.txt','r') as f:
    lines = f.read().splitlines()
    for line in lines:
        col_names.append(line)

In [4]:
# Leer datos de entrenamiento
df_training_1 = pd.DataFrame()
for i, file in enumerate(training_files_1):
    print(file,"se está leyendo...")
    file_data = pd.read_table(file, header=None, sep='\s+')
    file_data.columns = col_names      
    df_training_1 = df_training_1._append(file_data, ignore_index=True)
df_training_1.reset_index(drop=True, inplace=True)
print("Lectura hecha!")

df_training_2 = pd.DataFrame()
for i, file in enumerate(training_files_2):
    print(file,"se está leyendo...")
    file_data = pd.read_table(file, header=None, sep='\s+')
    file_data.columns = col_names      
    df_training_2 = df_training_2._append(file_data, ignore_index=True)
df_training_2.reset_index(drop=True, inplace=True)
print("Lectura hecha!")

df_training_3 = pd.DataFrame()
for i, file in enumerate(training_files_3):
    print(file,"se está leyendo...")
    file_data = pd.read_table(file, header=None, sep='\s+')
    file_data.columns = col_names      
    df_training_3 = df_training_3._append(file_data, ignore_index=True)
df_training_3.reset_index(drop=True, inplace=True)
print("Lectura hecha!")

# Leer datos de prueba
df_test = pd.DataFrame()
for i, file in enumerate(test_files):
    print(file,"se está leyendo...")
    file_data = pd.read_table(file, header=None, sep='\s+')
    file_data.columns = col_names      
    df_test = df_test._append(file_data, ignore_index=True)
df_test.reset_index(drop=True, inplace=True)
print("Lectura hecha!")

OpportunityUCIDataset/dataset/S1-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S1-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S1-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S1-Drill.dat se está leyendo...
Lectura hecha!
OpportunityUCIDataset/dataset/S2-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-Drill.dat se está leyendo...
Lectura hecha!
OpportunityUCIDataset/dataset/S3-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S3-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S3-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S3-Drill.dat se está leyendo...
Lectura hecha!
OpportunityUCIDataset/dataset/S4-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S4-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S4-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S4-Dril

In [5]:
def segmentation_to_simple_activities(df):
    # Segmentación de la base de datos a solo las simples (Parado, caminando, sentado, acostado)
    df = df.drop(['HL_Activity','LL_Left_Arm','LL_Left_Arm_Object','LL_Right_Arm',
                  'LL_Right_Arm_Object', 'ML_Both_Arms'], axis = 1)
    # Se eliminan los ejemplos que no entran en alguna de las 4 actividades
    df = df[df['Locomotion'] != 0]
    return df

In [6]:
df_training_1 = segmentation_to_simple_activities(df_training_1)
df_training_2 = segmentation_to_simple_activities(df_training_2)
df_training_3 = segmentation_to_simple_activities(df_training_3)
df_test = segmentation_to_simple_activities(df_test)

In [7]:
def mapping_new_labels(df):
    # Se mapean los nuevos labels
    mapping = {1:1, 2:2, 4:3, 5:4}
    df['Locomotion'] = df['Locomotion'].map(mapping)
    return df

In [8]:
df_training_1 = mapping_new_labels(df_training_1)
df_training_2 = mapping_new_labels(df_training_2)
df_training_3 = mapping_new_labels(df_training_3)
df_test = mapping_new_labels(df_test)

In [9]:
def cut_no_body_sensors(df):
    # Se dejan solamente los features relacionados a sensores en el cuerpo humano
    df = df.drop(df.iloc[:,134:243], axis=1)
    df = df.drop(['IMU-L-SHOE-Compass', 'IMU-R-SHOE-Compass'], axis=1)

    columns_to_drop = [col for col in df.columns if 'Quaternion' in col]
    df = df.drop(columns=columns_to_drop)
    return df

In [10]:
df_training_1 = cut_no_body_sensors(df_training_1)
df_training_2 = cut_no_body_sensors(df_training_2)
df_training_3 = cut_no_body_sensors(df_training_3)
df_test = cut_no_body_sensors(df_test)

In [11]:
def handle_missing_values(df):
    # Manejo de valores nulos por medio de imputación hacia delante
    limit = df.shape[1]*0.9
    df = df.dropna(axis='rows',thresh = limit)
    df.iloc[0] = df.iloc[0].fillna(0)
    df = df.ffill()
    return df

In [12]:
df_training_1 = handle_missing_values(df_training_1)
df_training_2 = handle_missing_values(df_training_2)
df_training_3 = handle_missing_values(df_training_3)
df_test = handle_missing_values(df_test)

In [13]:
def feature_extraction(df):
    window_size = 90
    total_sensors = 37
    total_samples = df.shape[0]//window_size
    
    first_column_from_sensor = 0
    
    resultados_caracteristicas = {
        'Mode Locomotion': [],
    }
    
    for i in range(total_sensors):
        start_rows = 0
        end_rows = window_size
            
        for j in range(total_samples):
            col_x = df.iloc[start_rows:end_rows, first_column_from_sensor]
            col_y = df.iloc[start_rows:end_rows, first_column_from_sensor+1]
            col_z = df.iloc[start_rows:end_rows, first_column_from_sensor+2]

            # Estadísticos juntando los 3 ejes
            correlation_xy = np.corrcoef(col_x, col_y)[0, 1]
            correlation_xz = np.corrcoef(col_x, col_z)[0, 1]
            correlation_yz = np.corrcoef(col_y, col_z)[0, 1]
    
            magnitud = np.sqrt(col_x**2 + col_y**2 + col_z**2)
            media_magnitud = np.mean(magnitud)
            std_magnitud = np.std(magnitud)
            auc_magnitud = np.sum(magnitud)

            # Estadísticos de cada eje
            mean_axis_x = col_x.mean()
            std_axis_x = col_x.std()
            max_axis_x = col_x.max()

            mean_axis_y = col_y.mean()
            std_axis_y = col_y.std()
            max_axis_y = col_y.max()

            mean_axis_z = col_z.mean()
            std_axis_z = col_z.std()
            max_axis_z = col_z.max()

            # Nombre de cada sensor
            sensor_name = df.columns[first_column_from_sensor][:-1]

            # Nombres estadísticos eje X
            mean_axis_x_name = 'mean ' + df.columns[first_column_from_sensor]
            std_axis_x_name = 'std ' + df.columns[first_column_from_sensor]
            max_axis_x_name = 'max ' + df.columns[first_column_from_sensor]

            # Nombres estadísticos eje Y
            mean_axis_y_name = 'mean ' + df.columns[first_column_from_sensor+1]
            std_axis_y_name = 'std ' + df.columns[first_column_from_sensor+1]
            max_axis_y_name = 'max ' + df.columns[first_column_from_sensor+1]

            # Nombres estadísticos eje Z
            mean_axis_z_name = 'mean ' + df.columns[first_column_from_sensor+2]
            std_axis_z_name = 'std ' + df.columns[first_column_from_sensor+2]
            max_axis_z_name = 'max ' + df.columns[first_column_from_sensor+2]
            
            # Nombres estadísticos de los 3 ejes 
            mean_magnitude_name = 'MM ' + sensor_name
            std_magnitude_name = 'StdM ' + sensor_name
            AUC_magnitude_name = 'AUCM ' + sensor_name
            corr_xy_name = 'corrXY ' + sensor_name
            corr_xz_name = 'corrXZ ' + sensor_name
            corr_yz_name = 'corrYZ ' + sensor_name

            if mean_magnitude_name not in resultados_caracteristicas:
                resultados_caracteristicas[mean_magnitude_name] = []
            if std_magnitude_name not in resultados_caracteristicas:
                resultados_caracteristicas[std_magnitude_name] = []
            if AUC_magnitude_name not in resultados_caracteristicas:
                resultados_caracteristicas[AUC_magnitude_name] = []
            if corr_xy_name not in resultados_caracteristicas:
                resultados_caracteristicas[corr_xy_name] = []
            if corr_xz_name not in resultados_caracteristicas:
                resultados_caracteristicas[corr_xz_name] = []
            if corr_yz_name not in resultados_caracteristicas:
                resultados_caracteristicas[corr_yz_name] = []
            if mean_axis_x_name not in resultados_caracteristicas:
                resultados_caracteristicas[mean_axis_x_name] = []
            if std_axis_x_name not in resultados_caracteristicas:
                resultados_caracteristicas[std_axis_x_name] = []
            if max_axis_x_name not in resultados_caracteristicas:
                resultados_caracteristicas[max_axis_x_name] = []
            if mean_axis_y_name not in resultados_caracteristicas:
                resultados_caracteristicas[mean_axis_y_name] = []
            if std_axis_y_name not in resultados_caracteristicas:
                resultados_caracteristicas[std_axis_y_name] = []
            if max_axis_y_name not in resultados_caracteristicas:
                resultados_caracteristicas[max_axis_y_name] = []
            if mean_axis_z_name not in resultados_caracteristicas:
                resultados_caracteristicas[mean_axis_z_name] = []
            if std_axis_z_name not in resultados_caracteristicas:
                resultados_caracteristicas[std_axis_z_name] = []
            if max_axis_z_name not in resultados_caracteristicas:
                resultados_caracteristicas[max_axis_z_name] = []
    
            resultados_caracteristicas[mean_magnitude_name].append(media_magnitud)
            resultados_caracteristicas[std_magnitude_name].append(std_magnitud)
            resultados_caracteristicas[AUC_magnitude_name].append(auc_magnitud)
            resultados_caracteristicas[corr_xy_name].append(correlation_xy)
            resultados_caracteristicas[corr_xz_name].append(correlation_xz)
            resultados_caracteristicas[corr_yz_name].append(correlation_yz)
            resultados_caracteristicas[mean_axis_x_name].append(mean_axis_x)
            resultados_caracteristicas[std_axis_x_name].append(mean_axis_x)
            resultados_caracteristicas[max_axis_x_name].append(mean_axis_x)
            resultados_caracteristicas[mean_axis_y_name].append(mean_axis_y)
            resultados_caracteristicas[std_axis_y_name].append(mean_axis_y)
            resultados_caracteristicas[max_axis_y_name].append(mean_axis_y)
            resultados_caracteristicas[mean_axis_z_name].append(mean_axis_z)
            resultados_caracteristicas[std_axis_z_name].append(mean_axis_z)
            resultados_caracteristicas[max_axis_z_name].append(mean_axis_z)
            
            start_rows = end_rows
            end_rows += 90
        
        first_column_from_sensor += 3
        
    start_rows = 0
    end_rows = window_size
    for k in range(total_samples):
        mode = df.iloc[start_rows:end_rows, 111].mode()[0]
        resultados_caracteristicas['Mode Locomotion'].append(mode)
        
        start_rows = end_rows
        end_rows += 90
    
    df = pd.DataFrame(resultados_caracteristicas)

    return df


In [14]:
df_training_1 = feature_extraction(df_training_1.iloc[:,1:])
df_training_2 = feature_extraction(df_training_2.iloc[:,1:])
#df_training_3 = feature_extraction(df_training_3.iloc[:,1:])
df_test = feature_extraction(df_test.iloc[:,1:])

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]


Unnamed: 0,Mode Locomotion,MM Acc-RKN^-acc,StdM Acc-RKN^-acc,AUCM Acc-RKN^-acc,corrXY Acc-RKN^-acc,corrXZ Acc-RKN^-acc,corrYZ Acc-RKN^-acc,mean Acc-RKN^-accX,std Acc-RKN^-accX,max Acc-RKN^-accX,...,corrYZ IMU-R-SHOE-AngVelNavFrame,mean IMU-R-SHOE-AngVelNavFrameX,std IMU-R-SHOE-AngVelNavFrameX,max IMU-R-SHOE-AngVelNavFrameX,mean IMU-R-SHOE-AngVelNavFrameY,std IMU-R-SHOE-AngVelNavFrameY,max IMU-R-SHOE-AngVelNavFrameY,mean IMU-R-SHOE-AngVelNavFrameZ,std IMU-R-SHOE-AngVelNavFrameZ,max IMU-R-SHOE-AngVelNavFrameZ
0,1,1027.437893,12.862027,92469.410363,0.071737,-0.190243,0.373577,-36.944444,-36.944444,-36.944444,...,-0.283995,-17.955556,-17.955556,-17.955556,8.766667,8.766667,8.766667,-3.566667,-3.566667,-3.566667
1,1,1030.797048,32.340220,92771.734338,0.354865,0.161109,0.389895,-17.611111,-17.611111,-17.611111,...,-0.781102,30.444444,30.444444,30.444444,7.177778,7.177778,7.177778,-53.811111,-53.811111,-53.811111
2,1,1026.393735,10.819752,92375.436167,0.102291,0.354481,0.345732,3.077778,3.077778,3.077778,...,-0.141450,1.822222,1.822222,1.822222,-6.644444,-6.644444,-6.644444,-0.755556,-0.755556,-0.755556
3,1,1030.919520,22.513172,92782.756816,0.440146,0.776741,0.658216,28.100000,28.100000,28.100000,...,-0.432515,-10.800000,-10.800000,-10.800000,0.988889,0.988889,0.988889,-6.277778,-6.277778,-6.277778
4,1,1030.558733,18.705833,92750.286003,-0.200334,-0.036541,0.728735,2.688889,2.688889,2.688889,...,-0.049682,10.955556,10.955556,10.955556,-11.811111,-11.811111,-11.811111,12.111111,12.111111,12.111111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1540,3,1056.177797,140.726351,95056.001716,0.247010,-0.098628,-0.206791,-228.466667,-228.466667,-228.466667,...,-0.751157,154.400000,154.400000,154.400000,125.800000,125.800000,125.800000,-218.988889,-218.988889,-218.988889
1541,2,1169.585881,174.299840,105262.729268,0.085016,-0.154512,-0.117530,-30.033333,-30.033333,-30.033333,...,0.597528,238.422222,238.422222,238.422222,-27.433333,-27.433333,-27.433333,25.322222,25.322222,25.322222
1542,2,1065.716554,155.592142,95914.489895,-0.305236,-0.358235,-0.254739,-79.700000,-79.700000,-79.700000,...,0.790246,27.544444,27.544444,27.544444,-27.066667,-27.066667,-27.066667,-26.666667,-26.666667,-26.666667
1543,2,1106.360071,151.558146,99572.406368,-0.279387,-0.566188,0.160069,-66.688889,-66.688889,-66.688889,...,0.690811,-86.533333,-86.533333,-86.533333,2.833333,2.833333,2.833333,96.377778,96.377778,96.377778


In [15]:
def scale_training_data(df_1, df_2):
    df = pd.concat([df_1, df_2], ignore_index=True)
    
    labels = df.iloc[:, 0]
    data = df.iloc[:, 1:]
    
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)
    
    scaled_df = pd.DataFrame(scaled_data, columns=data.columns)
    scaled_df.insert(0, df.columns[0], labels)
    
    return scaled_df, scaler

In [16]:
def scale_test_data(df, scaler):
    labels = df.iloc[:, 0]
    data = df.iloc[:, 1:]
    
    scaled_data = scaler.transform(data)
    
    scaled_df = pd.DataFrame(scaled_data, columns=data.columns)
    scaled_df.insert(0, df.columns[0], labels)

    return scaled_df

In [17]:
df_training, scaler = scale_training_data(df_training_1, df_training_2)
df_test = scale_test_data(df_test, scaler)

In [18]:
df_reduced = df_training

In [19]:
labels = df_reduced.iloc[:, 0]
features = df_reduced.iloc[:, 1:]

lda = LinearDiscriminantAnalysis(n_components=3)
features_lda = lda.fit_transform(features, labels)

columns_lda = ['Componente_1', 'Componente_2', 'Componente_3']
df_lda = pd.DataFrame(data=features_lda, columns=columns_lda)

df_lda['Etiqueta'] = labels

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')

colores = ['r', 'g', 'b', 'y', 'c', 'm']
labels_unicas = df_lda['Etiqueta'].unique()

for etiqueta, color in zip(labels_unicas, colores):
    indices = df_lda['Etiqueta'] == etiqueta
    ax.scatter(df_lda.loc[indices, 'Componente_1'],
               df_lda.loc[indices, 'Componente_2'],
               df_lda.loc[indices, 'Componente_3'],
               c=color,
               label=etiqueta,
               s=50,
               alpha=0.7)

ax.set_xlabel('Componente 1')
ax.set_ylabel('Componente 2')
ax.set_zlabel('Componente 3')
ax.set_title('Visualización de Clases en 3D usando LDA')
ax.legend()
ax.view_init(elev=80, azim=100)

display(plt.show())


ValueError: Input X contains NaN.
LinearDiscriminantAnalysis does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
locomotion = pd.Series([0,0,0,0],index=['Stan', 'Walk', 'Sit', 'Lie'])
for value in df_reduced.loc[:,'Mode Locomotion']:
    if 1 <= value <= 4:
        if value == 1:
            locomotion['Stan'] += 1
        elif value == 2:
            locomotion['Walk'] += 1
        elif value == 3:
            locomotion['Sit'] += 1
        elif value == 4:
            locomotion['Lie'] += 1

display(locomotion)

In [None]:
X_train = df_training.iloc[:, 1:]  
y_train = df_training.iloc[:, 0]   

X_test = df_test.iloc[:, 1:]  
y_test = df_test.iloc[:, 0]   

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

rf_classifier = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_classifier.fit(X_train, y_train)

In [None]:
y_pred = rf_classifier.predict(X_test)
print("Reporte de Clasificación:\n", classification_report(y_test, y_pred))

conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix_df = pd.DataFrame(conf_matrix, index=rf_classifier.classes_, columns=rf_classifier.classes_)
print("Matriz de Confusión:\n", conf_matrix_df)

In [None]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, confusion_matrix

perceptron_model = Perceptron(class_weight='balanced', max_iter=1000, eta0=0.1)
perceptron_model.fit(X_train, y_train)

y_pred = perceptron_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
print(classification_report(y_test, y_pred))
