In [81]:
#Librerías para la carga, análisis y preprocesamiento de datos
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [82]:
# Train/test split by subject: recordings of S1-S3 are used for training, S4 for testing
def _recording_paths(subject):
    """Return the four recording paths (ADL1-ADL3 and Drill) of one subject."""
    return [f'OpportunityUCIDataset/dataset/S{subject}-{run}.dat'
            for run in ('ADL1', 'ADL2', 'ADL3', 'Drill')]


training_files_1 = _recording_paths(1)
training_files_2 = _recording_paths(2)
training_files_3 = _recording_paths(3)
test_files = _recording_paths(4)

In [83]:
# Load the dataset column names, which are stored one per line in a separate file
with open('col_names.txt','r') as names_file:
    col_names = names_file.read().splitlines()

In [84]:
def _read_dat_files(files, column_names):
    """Read whitespace-separated data files and stack them into one DataFrame.

    Parameters
    ----------
    files : list of str
        Paths of the files to read; rows are stacked in this order.
    column_names : list of str
        Names assigned to the columns of every file.

    Returns
    -------
    pandas.DataFrame
        The rows of all files with a fresh 0..n-1 index (an empty frame when
        `files` is empty).
    """
    frames = []
    for file in files:
        print(file,"se está leyendo...")
        # Raw string: '\s' inside a plain string literal is an invalid escape sequence.
        file_data = pd.read_table(file, header=None, sep=r'\s+')
        file_data.columns = column_names
        frames.append(file_data)
    # Concatenate once at the end instead of growing a frame inside the loop
    # (the previous code relied on the private DataFrame._append method).
    combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    print("Lectura hecha!")
    return combined


# Read the training data (one frame per training file list)
df_training_1 = _read_dat_files(training_files_1, col_names)
df_training_2 = _read_dat_files(training_files_2, col_names)
df_training_3 = _read_dat_files(training_files_3, col_names)

# Read the test data
df_test = _read_dat_files(test_files, col_names)

OpportunityUCIDataset/dataset/S1-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S1-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S1-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S1-Drill.dat se está leyendo...
Lectura hecha!
OpportunityUCIDataset/dataset/S2-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-Drill.dat se está leyendo...
Lectura hecha!
OpportunityUCIDataset/dataset/S3-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S3-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S3-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S3-Drill.dat se está leyendo...
Lectura hecha!
OpportunityUCIDataset/dataset/S4-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S4-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S4-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S4-Dril

In [85]:
def segmentation_to_simple_activities(df):
    """Reduce the frame to samples annotated with a locomotion activity.

    The six other annotation columns are removed, and rows whose 'Locomotion'
    value is 0 are discarded (per the original note, these are the samples that
    belong to none of the four simple activities: standing, walking, sitting,
    lying). The input frame is not modified.
    """
    other_annotations = ['HL_Activity', 'LL_Left_Arm', 'LL_Left_Arm_Object',
                         'LL_Right_Arm', 'LL_Right_Arm_Object', 'ML_Both_Arms']
    trimmed = df.drop(columns=other_annotations)
    has_activity = trimmed['Locomotion'] != 0
    return trimmed[has_activity]

In [86]:
# Apply the activity segmentation to every frame (three training frames and the test frame)
df_training_1, df_training_2, df_training_3, df_test = [
    segmentation_to_simple_activities(frame)
    for frame in (df_training_1, df_training_2, df_training_3, df_test)
]

In [87]:
def mapping_new_labels(df):
    """Re-encode the 'Locomotion' codes as the consecutive labels 1-4.

    Codes are mapped 1 -> 1, 2 -> 2, 4 -> 3 and 5 -> 4; any other code becomes
    NaN because `Series.map` yields NaN for keys missing from the mapping.

    The input frame is left untouched and a new frame is returned. Assigning
    the column in place would modify the caller's object and raise
    SettingWithCopyWarning when `df` is a filtered slice.

    Note: the mapping is not idempotent (a second pass turns label 3 into NaN
    and label 4 into 3), so apply it exactly once per frame.
    """
    mapping = {1:1, 2:2, 4:3, 5:4}
    return df.assign(Locomotion=df['Locomotion'].map(mapping))

In [88]:
# Re-encode the locomotion labels of every frame
df_training_1, df_training_2, df_training_3, df_test = [
    mapping_new_labels(frame)
    for frame in (df_training_1, df_training_2, df_training_3, df_test)
]

In [89]:
def cut_no_body_sensors(df, drop_start=134, drop_stop=243):
    """Keep only the accelerometer columns plus the activity label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Locomotion' column and string column names.
    drop_start, drop_stop : int, optional
        Positional range ``[drop_start, drop_stop)`` of columns removed before
        the name filter. The defaults are the positions used by this notebook
        (presumably the non-body sensor columns - confirm against col_names.txt).

    Returns
    -------
    pandas.DataFrame
        The remaining columns whose name contains 'acc', followed by a 'label'
        column holding the 'Locomotion' values.
    """
    df_labels = df['Locomotion']
    # Drop by column label rather than passing a DataFrame slice as the label list.
    remaining = df.drop(columns=df.columns[drop_start:drop_stop])

    acc_columns = [col for col in remaining.columns if 'acc' in col]
    return remaining[acc_columns].assign(label=df_labels)

In [90]:
# Keep only the accelerometer columns and the label in every frame
df_training_1, df_training_2, df_training_3, df_test = [
    cut_no_body_sensors(frame)
    for frame in (df_training_1, df_training_2, df_training_3, df_test)
]

In [91]:
def handle_missing_values(df):
    """Drop sparse rows and impute the remaining gaps by forward fill.

    Rows with fewer than 90% non-null values are removed. Remaining NaNs take
    the last valid value above them in the same column; NaNs with no valid
    value above them (at the top of a column) become 0.

    An empty result (no rows, or every row dropped) is returned as an empty
    frame instead of failing on the first-row access.
    """
    # `count >= n * 0.9` on integer counts is the same test as `count >= ceil(n * 0.9)`,
    # and dropna documents `thresh` as an integer.
    min_non_null = int(np.ceil(df.shape[1] * 0.9))
    df = df.dropna(axis='rows', thresh=min_non_null)
    # Forward fill first, then zero the leading gaps. This gives the same values as
    # zero-filling the first row and forward filling, without a row assignment.
    return df.ffill().fillna(0)

In [92]:
# Remove sparse rows and impute missing values in every frame
df_training_1, df_training_2, df_training_3, df_test = [
    handle_missing_values(frame)
    for frame in (df_training_1, df_training_2, df_training_3, df_test)
]

In [93]:
def feature_extraction(df, window_size=90, label_col='label'):
    """Summarise tri-axial sensor signals over consecutive, non-overlapping windows.

    Every column except `label_col` is treated as a signal. Signal columns are
    grouped by position into (x, y, z) triples, one triple per sensor; leftover
    columns that do not complete a triple are ignored. Rows are cut by position
    into windows of `window_size` rows, and a trailing incomplete window is
    discarded.

    For each sensor and window the following features are produced, where
    `<sensor>` is the x-column name without its last character:

    * 'MM <sensor>', 'StdM <sensor>', 'AUCM <sensor>': mean, standard deviation
      and sum of the magnitude sqrt(x^2 + y^2 + z^2).
    * 'mean <axis>', 'std <axis>', 'max <axis>' for each of the three axis columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Signal columns with string names, plus the label column.
    window_size : int, optional
        Number of rows per window (default 90).
    label_col : str, optional
        Name of the label column (default 'label').

    Returns
    -------
    pandas.DataFrame
        One row per window. The first column, 'mode label', is the most frequent
        label of the window; the feature columns follow, sensor by sensor.

    Raises
    ------
    ValueError
        If `window_size` is not positive.
    KeyError
        If `label_col` is not a column of `df`.

    Notes
    -----
    The magnitude standard deviation uses ddof=0 (what ``np.std`` computed in the
    previous implementation), whereas the per-axis standard deviations use the
    pandas default ddof=1. This is kept as-is so the features do not change.

    NOTE(review): windows are cut purely by row position, so after the upstream
    concatenation and row filtering a window may mix samples that were not
    contiguous in time - confirm this is acceptable.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")

    # Locate the label by name instead of a hard-coded column position.
    signals = df.drop(columns=label_col)
    labels = df[label_col]

    total_sensors = signals.shape[1] // 3
    total_samples = df.shape[0] // window_size

    # 'mode label' is created first so that it is the first output column.
    resultados_caracteristicas = {'mode label': []}

    for sensor in range(total_sensors):
        first_column_from_sensor = 3 * sensor
        axis_names = list(signals.columns[first_column_from_sensor:first_column_from_sensor + 3])
        # Sensor name = x-axis column name without the trailing axis letter.
        sensor_name = axis_names[0][:-1]

        for window in range(total_samples):
            # The step is window_size (it used to be a literal 90 that ignored window_size).
            rows = slice(window * window_size, (window + 1) * window_size)
            col_x, col_y, col_z = (
                signals.iloc[rows, first_column_from_sensor + offset] for offset in range(3)
            )

            # Statistics combining the three axes
            magnitud = np.sqrt(col_x**2 + col_y**2 + col_z**2)
            window_features = {
                'MM ' + sensor_name: magnitud.mean(),
                'StdM ' + sensor_name: magnitud.std(ddof=0),
                'AUCM ' + sensor_name: magnitud.sum(),
            }

            # Statistics of each individual axis
            for axis_name, axis_values in zip(axis_names, (col_x, col_y, col_z)):
                window_features['mean ' + axis_name] = axis_values.mean()
                window_features['std ' + axis_name] = axis_values.std()
                window_features['max ' + axis_name] = axis_values.max()

            for feature_name, value in window_features.items():
                resultados_caracteristicas.setdefault(feature_name, []).append(value)

    # Label of each window: the most frequent label among its rows.
    for window in range(total_samples):
        rows = slice(window * window_size, (window + 1) * window_size)
        resultados_caracteristicas['mode label'].append(labels.iloc[rows].mode()[0])

    return pd.DataFrame(resultados_caracteristicas)


In [94]:
# Turn every frame of raw samples into one row of window features per window
df_training_1, df_training_2, df_training_3, df_test = [
    feature_extraction(frame)
    for frame in (df_training_1, df_training_2, df_training_3, df_test)
]

In [95]:
def scale_training_data(df_1, df_2, df_3):
    """Stack the three training frames and standardise their feature columns.

    The first column is treated as the label and is carried over unscaled; all
    other columns are passed through a newly fitted StandardScaler.

    Returns
    -------
    tuple
        (scaled frame with the label as first column, fitted scaler). The scaler
        is returned so the same transformation can be applied to other data.
    """
    combined = pd.concat((df_1, df_2, df_3), ignore_index=True)
    label_name = combined.columns[0]
    feature_block = combined.iloc[:, 1:]

    scaler = StandardScaler()
    scaled_df = pd.DataFrame(scaler.fit_transform(feature_block),
                             columns=feature_block.columns)
    scaled_df.insert(0, label_name, combined.iloc[:, 0])

    return scaled_df, scaler

In [96]:
def scale_test_data(df, scaler):
    """Scale the feature columns of `df` with an already fitted scaler.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column is the label and whose remaining columns are
        the features.
    scaler : object
        Fitted transformer exposing ``transform(data)``.

    Returns
    -------
    pandas.DataFrame
        Frame with a fresh 0..n-1 index: the unscaled label as first column,
        followed by the scaled features in their original order.
    """
    label_name = df.columns[0]
    data = df.iloc[:, 1:]

    scaled_df = pd.DataFrame(scaler.transform(data), columns=data.columns)
    # Insert the labels by position. Inserting the Series itself aligns on the index,
    # which produces NaN labels whenever `df` does not carry a default 0..n-1 index.
    scaled_df.insert(0, label_name, df.iloc[:, 0].to_numpy())

    return scaled_df

In [97]:
# Fit the scaler on the three training frames only, then reuse that fitted scaler
# on the test frame so the test data does not influence the scaling.
df_training, scaler = scale_training_data(df_training_1, df_training_2, df_training_3)
df_test = scale_test_data(df_test, scaler)

In [98]:
def select_best_features(df, k_best_features=100):
    """Keep the `k_best_features` features with the highest ANOVA F-score.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column is the class label and whose remaining columns
        are the candidate features.
    k_best_features : int, optional
        Number of features to keep (default 100). It is capped at the number of
        available features.

    Returns
    -------
    pandas.DataFrame
        Frame with a fresh 0..n-1 index: the label as first column, followed by
        the selected features in their original relative order.
    """
    from sklearn.feature_selection import SelectKBest, f_classif

    X = df.iloc[:, 1:]
    y = df.iloc[:, 0]

    # Asking for more features than exist is rejected (or warned about, depending on
    # the scikit-learn version), so cap the request.
    k = min(k_best_features, X.shape[1])
    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(X, y)

    # Select by boolean mask so the column names are kept as they are.
    X_best = X.loc[:, selector.get_support()].reset_index(drop=True)

    # Insert the labels by position: index alignment would give NaN labels for any
    # `df` without a default 0..n-1 index.
    X_best.insert(0, df.columns[0], y.to_numpy())

    return X_best

In [99]:
# Feature selection is fitted on the training data only.
df_training = select_best_features(df_training)

In [101]:
# Keep in the test frame exactly the columns that survived feature selection on the
# training frame, in the same order. Selecting by the training column list raises a
# KeyError if the test frame lacks one of them, instead of silently continuing with
# mismatched feature sets.
columns_training = df_training.columns.tolist()
df_test = df_test.loc[:, columns_training]

In [102]:
# Sanity check: number of columns left in the test frame (the recorded run shows 101).
df_test.shape[1]

101

In [103]:
# Alias, not a copy: df_reduced and df_training refer to the same DataFrame object.
df_reduced = df_training

In [104]:
labels = df_reduced.iloc[:, 0]
features = df_reduced.iloc[:, 1:]

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Supongamos que ya tienes df_reduced, labels y features definidos

lda = LinearDiscriminantAnalysis(n_components=3)
features_lda = lda.fit_transform(features, labels)

columns_lda = ['Componente_1', 'Componente_2', 'Componente_3']
df_lda = pd.DataFrame(data=features_lda, columns=columns_lda)
df_lda['Etiqueta'] = labels

%matplotlib notebook

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')

colores = ['r', 'g', 'b', 'y', 'c', 'm']
labels_unicas = df_lda['Etiqueta'].unique()

for etiqueta, color in zip(labels_unicas, colores):
    indices = df_lda['Etiqueta'] == etiqueta
    ax.scatter(df_lda.loc[indices, 'Componente_1'],
               df_lda.loc[indices, 'Componente_2'],
               df_lda.loc[indices, 'Componente_3'],
               c=color,
               label=etiqueta,
               s=50,
               alpha=0.7)

ax.set_xlabel('Componente 1')
ax.set_ylabel('Componente 2')
ax.set_zlabel('Componente 3')
ax.set_title('Visualización de Clases en 3D usando LDA')
ax.legend()

plt.show()


<IPython.core.display.Javascript object>

In [105]:
# Count how many training windows carry each activity label (1-4)
label_names = {1: 'Stan', 2: 'Walk', 3: 'Sit', 4: 'Lie'}
label_counts = df_reduced['mode label'].value_counts()
# A label that never occurs is reported as 0; labels outside 1-4 are ignored.
locomotion = pd.Series(
    [int(label_counts.get(code, 0)) for code in label_names],
    index=list(label_names.values()),
)

display(locomotion)

Stan    1819
Walk     872
Sit      611
Lie      115
dtype: int64

In [106]:
# Split each frame into features (every column after the first) and labels (first column)
X_train, y_train = df_training.iloc[:, 1:], df_training.iloc[:, 0]
X_test, y_test = df_test.iloc[:, 1:], df_test.iloc[:, 0]

In [107]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Random forest trained on the selected training features.
# class_weight='balanced' is used because the class counts are uneven (see the
# per-class tally computed earlier); random_state is fixed so reruns are reproducible.
rf_classifier = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_classifier.fit(X_train, y_train)

In [108]:
# Evaluate the random forest on the held-out test frame
y_pred = rf_classifier.predict(X_test)
print("Reporte de Clasificación:\n", classification_report(y_test, y_pred))

# Pass the class list explicitly so the matrix always has one row and column per
# trained class. Without it, a class absent from both y_test and y_pred shrinks the
# matrix and the DataFrame construction below fails on a shape mismatch.
conf_matrix = confusion_matrix(y_test, y_pred, labels=rf_classifier.classes_)
conf_matrix_df = pd.DataFrame(conf_matrix, index=rf_classifier.classes_, columns=rf_classifier.classes_)
print("Matriz de Confusión:\n", conf_matrix_df)

Reporte de Clasificación:
               precision    recall  f1-score   support

           1       0.87      0.89      0.88       579
           2       0.72      0.73      0.73       229
           3       0.97      0.89      0.93       169
           4       1.00      1.00      1.00        23

    accuracy                           0.86      1000
   macro avg       0.89      0.88      0.88      1000
weighted avg       0.86      0.86      0.86      1000

Matriz de Confusión:
      1    2    3   4
1  515   60    4   0
2   61  168    0   0
3   15    4  150   0
4    0    0    0  23


In [109]:
from sklearn.linear_model import Perceptron
# classification_report is imported here as well, so this cell does not depend on the
# import made in the random-forest cell.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Linear baseline trained on the same features as the random forest
perceptron_model = Perceptron(class_weight='balanced', max_iter=1000, eta0=0.1)
perceptron_model.fit(X_train, y_train)

# Note: y_pred and conf_matrix are reassigned here and no longer hold the random-forest results.
y_pred = perceptron_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
print(classification_report(y_test, y_pred))


Accuracy: 0.805
Confusion Matrix:
[[432 129  15   3]
 [ 22 201   6   0]
 [ 17   2 150   0]
 [  0   0   1  22]]
              precision    recall  f1-score   support

           1       0.92      0.75      0.82       579
           2       0.61      0.88      0.72       229
           3       0.87      0.89      0.88       169
           4       0.88      0.96      0.92        23

    accuracy                           0.81      1000
   macro avg       0.82      0.87      0.83      1000
weighted avg       0.84      0.81      0.81      1000

