In [1]:
#Librerías para la carga, análisis y preprocesamiento de datos
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

In [2]:
# División de los datos de entrenamiento y prueba
subject_1 = ['OpportunityUCIDataset/dataset/S1-ADL1.dat',
                 'OpportunityUCIDataset/dataset/S1-ADL2.dat',
                 'OpportunityUCIDataset/dataset/S1-ADL3.dat',
                 'OpportunityUCIDataset/dataset/S1-Drill.dat']
subject_2 = ['OpportunityUCIDataset/dataset/S2-ADL1.dat',
                 'OpportunityUCIDataset/dataset/S2-ADL2.dat',
                 'OpportunityUCIDataset/dataset/S2-ADL3.dat',
                 'OpportunityUCIDataset/dataset/S2-Drill.dat']
subject_3 = ['OpportunityUCIDataset/dataset/S3-ADL1.dat',
                'OpportunityUCIDataset/dataset/S3-ADL2.dat',
                'OpportunityUCIDataset/dataset/S3-ADL3.dat',
                'OpportunityUCIDataset/dataset/S3-Drill.dat']
subject_4 = ['OpportunityUCIDataset/dataset/S4-ADL1.dat',
                'OpportunityUCIDataset/dataset/S4-ADL2.dat',
                'OpportunityUCIDataset/dataset/S4-ADL3.dat',
                'OpportunityUCIDataset/dataset/S4-Drill.dat']

In [3]:
# Leer columnas del dataset que se ecuentran en otro archivo
col_names = []
with open('col_names.txt','r') as f:
    lines = f.read().splitlines()
    for line in lines:
        col_names.append(line)

In [4]:
############################################################
df_subject_1 = pd.DataFrame()
for i, file in enumerate(subject_1):
    print(file,"se está leyendo...")
    file_data = pd.read_table(file, header=None, sep='\s+')
    file_data.columns = col_names      
    df_subject_1 = df_subject_1._append(file_data, ignore_index=True)
df_subject_1.reset_index(drop=True, inplace=True)
print("Lectura hecha!")
############################################################
df_subject_2 = pd.DataFrame()
for i, file in enumerate(subject_2):
    print(file,"se está leyendo...")
    file_data = pd.read_table(file, header=None, sep='\s+')
    file_data.columns = col_names      
    df_subject_2= df_subject_2._append(file_data, ignore_index=True)
df_subject_2.reset_index(drop=True, inplace=True)
print("Lectura hecha!")
############################################################
df_subject_3 = pd.DataFrame()
for i, file in enumerate(subject_3):
    print(file,"se está leyendo...")
    file_data = pd.read_table(file, header=None, sep='\s+')
    file_data.columns = col_names      
    df_subject_3 = df_subject_3._append(file_data, ignore_index=True)
df_subject_3.reset_index(drop=True, inplace=True)
print("Lectura hecha!")
############################################################
df_subject_4 = pd.DataFrame()
for i, file in enumerate(subject_4):
    print(file,"se está leyendo...")
    file_data = pd.read_table(file, header=None, sep='\s+')
    file_data.columns = col_names      
    df_subject_4 = df_subject_4._append(file_data, ignore_index=True)
df_subject_4.reset_index(drop=True, inplace=True)
print("Lectura hecha!")


OpportunityUCIDataset/dataset/S1-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S1-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S1-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S1-Drill.dat se está leyendo...
Lectura hecha!
OpportunityUCIDataset/dataset/S2-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-Drill.dat se está leyendo...
Lectura hecha!
OpportunityUCIDataset/dataset/S3-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S3-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S3-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S3-Drill.dat se está leyendo...
Lectura hecha!
OpportunityUCIDataset/dataset/S4-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S4-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S4-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S4-Dril

In [5]:
def segmentation_to_simple_activities(df):
    # Segmentación de la base de datos a solo las simples (Parado, caminando, sentado, acostado)
    df = df.drop(['HL_Activity','LL_Left_Arm','LL_Left_Arm_Object','LL_Right_Arm',
                  'LL_Right_Arm_Object', 'ML_Both_Arms'], axis = 1)
    # Se eliminan los ejemplos que no entran en alguna de las 4 actividades
    df = df[df['Locomotion'] != 0]
    return df

In [6]:
def mapping_new_labels(df):
    # Se mapean los nuevos labels
    mapping = {1:0, 2:1, 4:2, 5:3}
    df['Locomotion'] = df['Locomotion'].map(mapping)
    return df

In [7]:
def cut_no_body_sensors(df):
    df_labels = df['Locomotion']
    df = df.drop(df.iloc[:,134:243], axis=1)
    
    three_axis_columns = [col for col in df.columns if 'X' in col or 'Y' in col or 'Z' in col]
    df = df[three_axis_columns]
    df = df.assign(label=df_labels)

    return df

In [8]:
def handle_missing_values(df):
    # Manejo de valores nulos por medio de imputación hacia delante
    limit = df.shape[1]*0.9
    df = df.dropna(axis='rows',thresh = limit)
    df.iloc[0] = df.iloc[0].fillna(0)
    df = df.ffill()
    return df

In [9]:
def feature_extraction(df):
    window_size = 90
    total_sensors = int(df.shape[1]/3)
    total_samples = df.shape[0]//window_size
    
    first_column_from_sensor = 0
    
    resultados_caracteristicas = {
        'mode label': [],
    }
    
    for i in range(total_sensors):
        start_rows = 0
        end_rows = window_size
            
        for j in range(total_samples):
            col_x = df.iloc[start_rows:end_rows, first_column_from_sensor]
            col_y = df.iloc[start_rows:end_rows, first_column_from_sensor+1]
            col_z = df.iloc[start_rows:end_rows, first_column_from_sensor+2]

            # Estadísticos juntando los 3 ejes
            magnitud = np.sqrt(col_x**2 + col_y**2 + col_z**2)
            media_magnitud = np.mean(magnitud)
            std_magnitud = np.std(magnitud)
            auc_magnitud = np.sum(magnitud)

            # Estadísticos de cada eje
            mean_axis_x = col_x.mean()
            std_axis_x = col_x.std()
            max_axis_x = col_x.max()

            mean_axis_y = col_y.mean()
            std_axis_y = col_y.std()
            max_axis_y = col_y.max()

            mean_axis_z = col_z.mean()
            std_axis_z = col_z.std()
            max_axis_z = col_z.max()

            # Nombre de cada sensor
            sensor_name = df.columns[first_column_from_sensor][:-1]

            # Nombres estadísticos eje X
            mean_axis_x_name = 'mean ' + df.columns[first_column_from_sensor]
            std_axis_x_name = 'std ' + df.columns[first_column_from_sensor]
            max_axis_x_name = 'max ' + df.columns[first_column_from_sensor]

            # Nombres estadísticos eje Y
            mean_axis_y_name = 'mean ' + df.columns[first_column_from_sensor+1]
            std_axis_y_name = 'std ' + df.columns[first_column_from_sensor+1]
            max_axis_y_name = 'max ' + df.columns[first_column_from_sensor+1]

            # Nombres estadísticos eje Z
            mean_axis_z_name = 'mean ' + df.columns[first_column_from_sensor+2]
            std_axis_z_name = 'std ' + df.columns[first_column_from_sensor+2]
            max_axis_z_name = 'max ' + df.columns[first_column_from_sensor+2]
            
            # Nombres estadísticos de los 3 ejes 
            mean_magnitude_name = 'MM ' + sensor_name
            std_magnitude_name = 'StdM ' + sensor_name
            AUC_magnitude_name = 'AUCM ' + sensor_name


            if mean_magnitude_name not in resultados_caracteristicas:
                resultados_caracteristicas[mean_magnitude_name] = []
            if std_magnitude_name not in resultados_caracteristicas:
                resultados_caracteristicas[std_magnitude_name] = []
            if AUC_magnitude_name not in resultados_caracteristicas:
                resultados_caracteristicas[AUC_magnitude_name] = []
            if mean_axis_x_name not in resultados_caracteristicas:
                resultados_caracteristicas[mean_axis_x_name] = []
            if std_axis_x_name not in resultados_caracteristicas:
                resultados_caracteristicas[std_axis_x_name] = []
            if max_axis_x_name not in resultados_caracteristicas:
                resultados_caracteristicas[max_axis_x_name] = []
            if mean_axis_y_name not in resultados_caracteristicas:
                resultados_caracteristicas[mean_axis_y_name] = []
            if std_axis_y_name not in resultados_caracteristicas:
                resultados_caracteristicas[std_axis_y_name] = []
            if max_axis_y_name not in resultados_caracteristicas:
                resultados_caracteristicas[max_axis_y_name] = []
            if mean_axis_z_name not in resultados_caracteristicas:
                resultados_caracteristicas[mean_axis_z_name] = []
            if std_axis_z_name not in resultados_caracteristicas:
                resultados_caracteristicas[std_axis_z_name] = []
            if max_axis_z_name not in resultados_caracteristicas:
                resultados_caracteristicas[max_axis_z_name] = []
    
            resultados_caracteristicas[mean_magnitude_name].append(media_magnitud)
            resultados_caracteristicas[std_magnitude_name].append(std_magnitud)
            resultados_caracteristicas[AUC_magnitude_name].append(auc_magnitud)
            resultados_caracteristicas[mean_axis_x_name].append(mean_axis_x)
            resultados_caracteristicas[std_axis_x_name].append(std_axis_x)
            resultados_caracteristicas[max_axis_x_name].append(max_axis_x)
            resultados_caracteristicas[mean_axis_y_name].append(mean_axis_y)
            resultados_caracteristicas[std_axis_y_name].append(std_axis_y)
            resultados_caracteristicas[max_axis_y_name].append(max_axis_y)
            resultados_caracteristicas[mean_axis_z_name].append(mean_axis_z)
            resultados_caracteristicas[std_axis_z_name].append(std_axis_z)
            resultados_caracteristicas[max_axis_z_name].append(max_axis_z)
            
            start_rows = end_rows
            end_rows += 90
        
        first_column_from_sensor += 3
        
    start_rows = 0
    end_rows = window_size
    for k in range(total_samples):
        mode = df.iloc[start_rows:end_rows, 111].mode()[0]
        resultados_caracteristicas['mode label'].append(mode)
        
        start_rows = end_rows
        end_rows += 90
    
    df = pd.DataFrame(resultados_caracteristicas)

    return df


In [10]:
def scale_training_data(df_training):
    
    labels = df_training.iloc[:, 0]
    data = df_training.iloc[:, 1:]

    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)

    scaled_df_training = pd.DataFrame(scaled_data, columns=data.columns)
    scaled_df_training.insert(0, df_training.columns[0], labels)

    return scaled_df_training, scaler

In [11]:
def scale_test_data(df, scaler):
    labels = df.iloc[:, 0]
    data = df.iloc[:, 1:]
    
    scaled_data = scaler.transform(data)
    
    scaled_df = pd.DataFrame(scaled_data, columns=data.columns)
    scaled_df.insert(0, df.columns[0], labels)

    return scaled_df

In [12]:
def select_best_features(df):
    from sklearn.feature_selection import SelectKBest, f_classif

    X = df.iloc[:, 1:]  
    y = df.iloc[:, 0]   

    k_best_features = 300  
    selector = SelectKBest(score_func=f_classif, k=k_best_features)

    X_best = selector.fit_transform(X, y)

    selected_columns = X.columns[selector.get_support()]
    X_best = pd.DataFrame(X_best, columns=selected_columns)

    X_best.insert(0, df.columns[0], y)

    return X_best

In [13]:
def select_best_features_test(df_training, df_test):
    columns_training = df_training.columns.tolist()
    for column in df_test.columns:
        if column not in columns_training:
            df_test = df_test.drop(column, axis = 1)
    return df_test

In [14]:
def transform_using_lda(df):
    labels = df.iloc[:, 0]
    features = df.iloc[:, 1:]

    lda = LinearDiscriminantAnalysis(n_components=3)
    features_lda = lda.fit_transform(features, labels)

    print("Varianza explicada por cada componente:", lda.explained_variance_ratio_)
    
    columns_lda = ['Component_1', 'Component_2', 'Component_3']
    df_lda = pd.DataFrame(data=features_lda, columns=columns_lda)
    df_lda['labels'] = labels
    return df_lda

In [15]:
def print_3d_chart(df):
    %matplotlib notebook

    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111, projection='3d')

    colors = ['r', 'g', 'b', 'y', 'c', 'm']
    unic_labels = df['labels'].unique()

    for label, color in zip(unic_labels, colors):
        index = df['labels'] == label
        ax.scatter(df.loc[index, 'Component_1'],
                   df.loc[index, 'Component_2'],
                   df.loc[index, 'Component_3'],
                   c=color,
                   label=label,
                   s=50,
                   alpha=0.7)

    ax.set_xlabel('Component 1')
    ax.set_ylabel('Component 2')
    ax.set_zlabel('Component 3')
    ax.set_title('3D Display with LDA')
    ax.legend()

    plt.show()


In [16]:
def print_classes(df):
    locomotion = pd.Series([0,0,0,0],index=['Stan', 'Walk', 'Sit', 'Lie'])
    for value in df.loc[:,'labels']:
        if 0 <= value <= 3:
            if value == 0:
                locomotion['Stan'] += 1
            elif value == 1:
                locomotion['Walk'] += 1
            elif value == 2:
                locomotion['Sit'] += 1
            elif value == 3:
                locomotion['Lie'] += 1
    print(locomotion)

In [17]:
def divide_training_and_test(df_training, df_test):
    X_train = df_training.iloc[:, :3]  
    y_train = df_training.iloc[:, 3]   

    X_test = df_test.iloc[:, :3]  
    y_test = df_test.iloc[:, 3]
    
    return X_train, y_train, X_test, y_test

In [18]:
def preprocess_subject_data(df):
    df = segmentation_to_simple_activities(df)
    df = mapping_new_labels(df)
    df = cut_no_body_sensors(df)
    df = handle_missing_values(df)
    df = feature_extraction(df)
    return df

In [19]:
def preprocess_training_data(df_training_1, df_training_2, df_training_3):

    df_training_1 = preprocess_subject_data(df_training_1)
    df_training_2 = preprocess_subject_data(df_training_2)
    df_training_3 = preprocess_subject_data(df_training_3)
    
    df_training = pd.concat([df_training_1, df_training_2, df_training_3], ignore_index=True)
    df_training = select_best_features(df_training)
    df_training, scaler = scale_training_data(df_training)
    
    df_training_before_lda = df_training
    
    df_training = transform_using_lda(df_training)
    
    return df_training, df_training_before_lda, scaler

In [20]:
def preprocess_test_data(df_test, df_training_before_lda, scaler):
    df_test = preprocess_subject_data(df_test)
    df_test = select_best_features_test(df_training_before_lda, df_test)
    df_test = scale_test_data(df_test, scaler)
    df_test = transform_using_lda(df_test)
    
    return df_test

In [23]:
def preprocess_all_data(df_training_1, df_training_2, df_training_3, df_test):
    df_training, df_training_before_lda, scaler = preprocess_training_data(df_training_1, df_training_2, df_training_3)
    df_test = preprocess_test_data(df_test, df_training_before_lda, scaler)
    
    return df_training, df_test

In [24]:
df_training, df_test = preprocess_all_data(df_subject_1, df_subject_2, df_subject_3, df_subject_4)

Varianza explicada por cada componente: [0.60286357 0.31560239 0.08153403]
Varianza explicada por cada componente: [0.73912645 0.21218441 0.04868914]


In [25]:
X_train, y_train, X_test, y_test = divide_training_and_test(df_training, df_test)

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

rf_classifier = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)
print("Reporte de Clasificación:\n", classification_report(y_test, y_pred))

conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix_df = pd.DataFrame(conf_matrix, index=rf_classifier.classes_, columns=rf_classifier.classes_)
print("Matriz de Confusión:\n", conf_matrix_df)

Reporte de Clasificación:
               precision    recall  f1-score   support

           0       0.97      0.95      0.96       667
           1       0.90      0.97      0.93       282
           2       0.99      0.95      0.97       196
           3       1.00      0.97      0.99        35

    accuracy                           0.96      1180
   macro avg       0.97      0.96      0.96      1180
weighted avg       0.96      0.96      0.96      1180

Matriz de Confusión:
      0    1    2   3
0  636   31    0   0
1    9  273    0   0
2    8    1  187   0
3    0    0    1  34


In [27]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, confusion_matrix

perceptron_model = Perceptron(class_weight='balanced', max_iter=1000, eta0=0.1)
perceptron_model.fit(X_train, y_train)

y_pred = perceptron_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
print(classification_report(y_test, y_pred))

Accuracy: 0.9483050847457627
Confusion Matrix:
[[625  41   1   0]
 [ 18 264   0   0]
 [  0   1 195   0]
 [  0   0   0  35]]
              precision    recall  f1-score   support

           0       0.97      0.94      0.95       667
           1       0.86      0.94      0.90       282
           2       0.99      0.99      0.99       196
           3       1.00      1.00      1.00        35

    accuracy                           0.95      1180
   macro avg       0.96      0.97      0.96      1180
weighted avg       0.95      0.95      0.95      1180



In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


logistic_reg = LogisticRegression(class_weight='balanced', max_iter=1000)
logistic_reg.fit(X_train, y_train)

y_pred = logistic_reg.predict(X_test)

print(classification_report(y_test, y_pred))

accuracy = logistic_reg.score(X_test, y_test)
print("Precisión global del modelo:", accuracy)

              precision    recall  f1-score   support

           0       0.99      0.92      0.95       667
           1       0.84      0.98      0.91       282
           2       1.00      0.99      0.99       196
           3       1.00      1.00      1.00        35

    accuracy                           0.95      1180
   macro avg       0.96      0.97      0.96      1180
weighted avg       0.96      0.95      0.95      1180

Precisión global del modelo: 0.9491525423728814
