In [2]:
#Librerías para la carga, análisis y preprocesamiento de datos
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

In [3]:
# Recording files of each subject (three ADL runs plus one Drill run per subject).
# The per-subject lists are what later gets combined into training and test sets.
_dataset_dir = 'OpportunityUCIDataset/dataset'
_run_names = ('ADL1', 'ADL2', 'ADL3', 'Drill')

subject_1, subject_2, subject_3, subject_4 = (
    [f'{_dataset_dir}/S{subject_id}-{run_name}.dat' for run_name in _run_names]
    for subject_id in (1, 2, 3, 4)
)

In [4]:
# Read the dataset's column names, which are stored one per line in a separate file
with open('col_names.txt', 'r') as names_file:
    col_names = names_file.read().splitlines()

In [None]:
############################################################
def load_subject_recordings(files, column_names):
    """Read every recording file of one subject into a single DataFrame.

    Parameters
    ----------
    files : list of str
        Paths of the whitespace-separated, header-less recording files.
    column_names : list of str
        Column names assigned to every file that is read.

    Returns
    -------
    pandas.DataFrame
        All recordings stacked in the order given, with a fresh 0..n-1 index.
        An empty DataFrame is returned when ``files`` is empty.
    """
    recordings = []
    for file in files:
        print(file,"se está leyendo...")
        # Raw string: '\s' is not a valid escape sequence in a normal string literal.
        file_data = pd.read_table(file, header=None, sep=r'\s+')
        file_data.columns = column_names
        recordings.append(file_data)
    # Concatenate once at the end instead of growing the frame file by file
    # (avoids the private DataFrame._append and repeated copying).
    if recordings:
        subject_data = pd.concat(recordings, ignore_index=True)
    else:
        subject_data = pd.DataFrame()
    print("Lectura hecha!")
    return subject_data

############################################################
df_subject_1 = load_subject_recordings(subject_1, col_names)
df_subject_2 = load_subject_recordings(subject_2, col_names)
df_subject_3 = load_subject_recordings(subject_3, col_names)
df_subject_4 = load_subject_recordings(subject_4, col_names)


OpportunityUCIDataset/dataset/S1-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S1-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S1-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S1-Drill.dat se está leyendo...
Lectura hecha!
OpportunityUCIDataset/dataset/S2-ADL1.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-ADL2.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-ADL3.dat se está leyendo...
OpportunityUCIDataset/dataset/S2-Drill.dat se está leyendo...
Lectura hecha!
OpportunityUCIDataset/dataset/S3-ADL1.dat se está leyendo...


In [None]:
def segmentation_to_simple_activities(df):
    """Restrict the data to the simple locomotion activities.

    Removes the six other annotation columns and every row whose
    ``Locomotion`` value is 0, i.e. rows that belong to none of the four
    simple activities (standing, walking, sitting, lying).
    Returns the filtered frame; the original row index is kept.
    """
    other_annotations = [
        'HL_Activity',
        'LL_Left_Arm',
        'LL_Left_Arm_Object',
        'LL_Right_Arm',
        'LL_Right_Arm_Object',
        'ML_Both_Arms',
    ]
    locomotion_only = df.drop(columns=other_annotations)
    has_activity = locomotion_only['Locomotion'] != 0
    return locomotion_only[has_activity]

In [None]:
def mapping_new_labels(df):
    """Re-code the ``Locomotion`` labels to the consecutive integers 0..3.

    The codes 1, 2, 4 and 5 become 0, 1, 2 and 3 respectively; any other
    code becomes NaN because it has no entry in the mapping.

    A new DataFrame is returned and the input is left untouched. The previous
    implementation wrote into the caller's frame, which in this pipeline is a
    filtered slice and therefore triggered pandas' SettingWithCopyWarning.
    """
    mapping = {1:0, 2:1, 4:2, 5:3}
    return df.assign(Locomotion=df['Locomotion'].map(mapping))

In [None]:
def cut_no_body_sensors(df):
    """Keep only the three-axis sensor columns and re-attach the label.

    Columns at positions 134..242 are discarded (presumably the sensors that
    are not worn on the body - confirm against col_names.txt). Of the
    remaining columns only those whose name contains 'X', 'Y' or 'Z' are kept,
    and the ``Locomotion`` values are appended as a final column named
    ``label``.
    """
    locomotion = df['Locomotion']
    remaining = df.drop(columns=df.columns[134:243])

    axis_columns = [
        name for name in remaining.columns
        if any(axis in name for axis in ('X', 'Y', 'Z'))
    ]
    return remaining[axis_columns].assign(label=locomotion)

In [None]:
def handle_missing_values(df):
    """Drop sparse rows and impute the remaining gaps by forward fill.

    Rows in which fewer than 90% of the columns hold a value are removed.
    Every remaining gap takes the last preceding value of its column; gaps at
    the very start of a column, which have no preceding value, become 0.

    Returns a new DataFrame. Unlike the previous implementation this also
    works when no row survives the filter (``df.iloc[0]`` used to raise
    IndexError) and it never assigns into the filtered frame.
    """
    # dropna expects an integer threshold; rounding up keeps the same rows as
    # comparing the (integer) non-null count against the fractional limit.
    min_non_null = int(np.ceil(df.shape[1] * 0.9))
    dense_rows = df.dropna(axis='rows', thresh=min_non_null)
    # Forward fill first, then zero whatever is still missing (leading gaps).
    # This equals zero-filling the first row and forward filling afterwards.
    return dense_rows.ffill().fillna(0)

In [None]:
def feature_extraction(df, window_size=90, label_column='label'):
    """Summarise the sensor signals over consecutive, non-overlapping windows.

    The columns other than ``label_column`` are read as consecutive
    (X, Y, Z) triples, one triple per sensor; columns left over after the last
    complete triple are ignored. Rows are cut into windows of ``window_size``
    rows and a trailing incomplete window is discarded.

    For every sensor and window the following features are computed:

    * ``MM``/``StdM``/``AUCM <sensor>``: mean, standard deviation and sum of
      the magnitude sqrt(x^2 + y^2 + z^2). ``<sensor>`` is the X column name
      without its last character.
    * ``mean``/``std``/``max <axis column>`` for each of the three axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Sensor triples plus the label column.
    window_size : int, default 90
        Number of rows per window.
    label_column : str, default 'label'
        Name of the column holding the class label.

    Returns
    -------
    pandas.DataFrame
        One row per window. The first column, ``'mode label'``, is the most
        frequent label of the window; the feature columns follow, sensor by
        sensor. If there are fewer rows than ``window_size`` the result has
        all columns but no rows.

    Raises
    ------
    ValueError
        If ``window_size`` is not positive.
    KeyError
        If ``label_column`` is not a column of ``df``.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")

    labels = df[label_column]
    sensors = df.drop(columns=label_column)

    total_sensors = sensors.shape[1] // 3
    total_samples = df.shape[0] // window_size
    # Row slices of the complete windows; every step uses window_size
    # (the previous code advanced by a hard-coded 90).
    windows = [slice(start, start + window_size)
               for start in range(0, total_samples * window_size, window_size)]

    # The label column is inserted first so that it ends up as column 0.
    resultados_caracteristicas = {
        'mode label': [labels.iloc[window].mode()[0] for window in windows],
    }

    for sensor in range(total_sensors):
        first_column_from_sensor = 3 * sensor
        axis_names = list(
            sensors.columns[first_column_from_sensor:first_column_from_sensor + 3]
        )
        # Sensor name = name of its X column without the trailing axis letter
        sensor_name = axis_names[0][:-1]

        # Output order per sensor: magnitude statistics, then X, Y and Z statistics
        feature_names = ['MM ' + sensor_name,
                         'StdM ' + sensor_name,
                         'AUCM ' + sensor_name]
        for axis_name in axis_names:
            feature_names += ['mean ' + axis_name,
                              'std ' + axis_name,
                              'max ' + axis_name]
        for feature_name in feature_names:
            resultados_caracteristicas.setdefault(feature_name, [])

        for window in windows:
            col_x = sensors.iloc[window, first_column_from_sensor]
            col_y = sensors.iloc[window, first_column_from_sensor + 1]
            col_z = sensors.iloc[window, first_column_from_sensor + 2]

            # Statistics of the three axes combined
            magnitud = np.sqrt(col_x**2 + col_y**2 + col_z**2)
            # ddof=0 reproduces np.std(magnitud) of the previous code, whereas
            # the per-axis std below keeps pandas' default (ddof=1).
            values = [magnitud.mean(), magnitud.std(ddof=0), magnitud.sum()]

            # Statistics of each axis on its own
            for axis in (col_x, col_y, col_z):
                values += [axis.mean(), axis.std(), axis.max()]

            for feature_name, value in zip(feature_names, values):
                resultados_caracteristicas[feature_name].append(value)

    return pd.DataFrame(resultados_caracteristicas)


In [None]:
def scale_training_data(df_train):
    """Standardise the training features and return the fitted scaler.

    The first column of ``df_train`` is treated as the label and is passed
    through unchanged; all other columns are scaled with a newly fitted
    ``StandardScaler``.

    Returns
    -------
    (pandas.DataFrame, StandardScaler)
        The scaled frame (label first, same column names, 0..n-1 index) and
        the fitted scaler, to be reused on the test data.
    """
    labels = df_train.iloc[:, 0]
    data = df_train.iloc[:, 1:]

    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)

    scaled_df_train = pd.DataFrame(scaled_data, columns=data.columns)
    # Attach the labels by position. Inserting the Series itself aligns on the
    # index and yields NaN labels whenever df_train is not indexed 0..n-1.
    scaled_df_train.insert(0, df_train.columns[0], labels.to_numpy())

    return scaled_df_train, scaler

In [None]:
def scale_test_data(df, scaler):
    """Scale the features of ``df`` with an already fitted scaler.

    The first column of ``df`` is treated as the label and is passed through
    unchanged; all other columns go through ``scaler.transform``.

    Returns the scaled frame with the label first, the same column names and
    a 0..n-1 index.
    """
    labels = df.iloc[:, 0]
    data = df.iloc[:, 1:]

    scaled_data = scaler.transform(data)

    scaled_df = pd.DataFrame(scaled_data, columns=data.columns)
    # Attach the labels by position. Inserting the Series itself aligns on the
    # index and yields NaN labels whenever df is not indexed 0..n-1.
    scaled_df.insert(0, df.columns[0], labels.to_numpy())

    return scaled_df

In [None]:
def select_best_features(df, k_best_features=300):
    """Keep the ``k_best_features`` features ranked best by ``f_classif``.

    The first column of ``df`` is treated as the label; the remaining columns
    are the candidate features.

    Parameters
    ----------
    df : pandas.DataFrame
        Label column followed by the feature columns.
    k_best_features : int, default 300
        Number of features to keep. Values larger than the number of available
        features are reduced to that number.

    Returns
    -------
    pandas.DataFrame
        The label column followed by the selected feature columns, with a
        0..n-1 index.
    """
    from sklearn.feature_selection import SelectKBest, f_classif

    X = df.iloc[:, 1:]
    y = df.iloc[:, 0]

    # Asking SelectKBest for more features than exist is an error (or a
    # warning, depending on the scikit-learn version), so cap the request.
    k = min(k_best_features, X.shape[1])
    selector = SelectKBest(score_func=f_classif, k=k)

    X_best = selector.fit_transform(X, y)

    selected_columns = X.columns[selector.get_support()]
    X_best = pd.DataFrame(X_best, columns=selected_columns)

    # Attach the labels by position. Inserting the Series itself aligns on the
    # index and yields NaN labels whenever df is not indexed 0..n-1.
    X_best.insert(0, df.columns[0], y.to_numpy())

    return X_best

In [None]:
def select_best_features_test(df_train, df_test):
    """Reduce ``df_test`` to the columns that also exist in ``df_train``.

    The kept columns stay in the order they have in ``df_test``. When there is
    nothing to remove, ``df_test`` itself is returned.
    """
    training_columns = df_train.columns.tolist()
    not_in_training = [name for name in df_test.columns
                       if name not in training_columns]
    if not not_in_training:
        return df_test
    return df_test.drop(columns=not_in_training)

In [None]:
def transform_using_lda(df):
    """Fit a 3-component LDA on ``df`` and return the projected data.

    The first column of ``df`` is the label, the remaining columns are the
    features. The explained variance ratio of the fitted model is printed.

    Note that a new model is fitted on whatever frame is passed in, using its
    labels, so this is only appropriate for training data.

    Returns
    -------
    pandas.DataFrame
        Columns ``Component_1``, ``Component_2``, ``Component_3`` and
        ``labels``, with a 0..n-1 index.
    """
    labels = df.iloc[:, 0]
    features = df.iloc[:, 1:]

    lda = LinearDiscriminantAnalysis(n_components=3)
    features_lda = lda.fit_transform(features, labels)

    print("Varianza explicada por cada componente:", lda.explained_variance_ratio_)

    columns_lda = ['Component_1', 'Component_2', 'Component_3']
    df_lda = pd.DataFrame(data=features_lda, columns=columns_lda)
    # Attach the labels by position. Assigning the Series itself aligns on the
    # index and yields NaN labels whenever df is not indexed 0..n-1.
    df_lda['labels'] = labels.to_numpy()
    return df_lda

In [None]:
def print_3d_chart(df):
    """Show a 3D scatter plot of the three components, coloured by label.

    ``df`` must provide the columns ``Component_1``, ``Component_2``,
    ``Component_3`` and ``labels``. Labels are paired with six fixed colours
    in order of first appearance; a seventh or later label is not drawn.
    """
    palette = ['r', 'g', 'b', 'y', 'c', 'm']
    component_columns = ['Component_1', 'Component_2', 'Component_3']

    figure = plt.figure(figsize=(8, 6))
    axes = figure.add_subplot(111, projection='3d')

    for class_label, class_color in zip(df['labels'].unique(), palette):
        # Points that belong to the current label
        members = df.loc[df['labels'] == class_label, component_columns]
        axes.scatter(members['Component_1'],
                     members['Component_2'],
                     members['Component_3'],
                     c=class_color,
                     label=class_label,
                     s=50,
                     alpha=0.7)

    axes.set_title('3D Display with LDA')
    axes.set_xlabel('Component 1')
    axes.set_ylabel('Component 2')
    axes.set_zlabel('Component 3')
    axes.legend()

    plt.show()


In [None]:
def print_classes(df):
    """Print how many rows of ``df['labels']`` fall into each of the classes 0..3.

    The counts are printed as a Series indexed 'Stan', 'Walk', 'Sit', 'Lie'
    (labels 0, 1, 2 and 3 respectively); any other label value is ignored.
    """
    class_names = ['Stan', 'Walk', 'Sit', 'Lie']
    labels = df.loc[:, 'labels']
    counts = [int((labels == code).sum()) for code in range(len(class_names))]
    print(pd.Series(counts, index=class_names))

In [None]:
def divide_training_and_test(df_train, df_test):
    """Split both frames into features (first three columns) and target (fourth column).

    Returns ``X_train, y_train, X_test, y_test``.
    """
    def _features_and_target(frame):
        # Columns 0..2 are the features, column 3 is the target
        return frame.iloc[:, 0:3], frame.iloc[:, 3]

    X_train, y_train = _features_and_target(df_train)
    X_test, y_test = _features_and_target(df_test)
    return X_train, y_train, X_test, y_test

In [None]:
def preprocess_subject_data(df):
    """Run the per-subject preprocessing steps on ``df`` and return the result.

    The steps are applied in this order: segmentation to the simple
    activities, label re-mapping, sensor column selection, missing value
    handling and feature extraction.
    """
    steps = (
        segmentation_to_simple_activities,
        mapping_new_labels,
        cut_no_body_sensors,
        handle_missing_values,
        feature_extraction,
    )
    for step in steps:
        df = step(df)
    return df

In [None]:
def preprocess_training_data(df_train_1, df_train_2, df_train_3):
    """Build the training set from the raw data of three subjects.

    Each subject is preprocessed on its own; the results are stacked, reduced
    to the best features, scaled and finally projected with LDA.

    Returns
    -------
    (df_train, df_train_before_lda, scaler)
        The LDA-projected training frame, the scaled frame it was computed
        from, and the fitted scaler.
    """
    per_subject = [preprocess_subject_data(raw_subject)
                   for raw_subject in (df_train_1, df_train_2, df_train_3)]

    pooled = pd.concat(per_subject, ignore_index=True)
    selected = select_best_features(pooled)
    df_train_before_lda, scaler = scale_training_data(selected)

    df_train = transform_using_lda(df_train_before_lda)

    return df_train, df_train_before_lda, scaler

In [None]:
def preprocess_test_data(df_test, df_train_before_lda, scaler):
    """Preprocess the held-out subject with the transformations learned on the training set.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Raw data of the test subject.
    df_train_before_lda : pandas.DataFrame
        Scaled training frame (label in the first column). It defines which
        feature columns are kept and is the data the LDA projection is fitted on.
    scaler : fitted scaler
        Scaler fitted on the training features.

    Returns
    -------
    pandas.DataFrame
        Columns ``Component_1``, ``Component_2``, ``Component_3`` and
        ``labels``, with a 0..n-1 index.
    """
    df_test = preprocess_subject_data(df_test)
    df_test = select_best_features_test(df_train_before_lda, df_test)
    df_test = scale_test_data(df_test, scaler)

    # The projection must be learned from the training data only. Fitting a
    # separate LDA on the test subject (as was done before) uses the test
    # labels and puts the test points in a different space than the one the
    # classifiers are trained in. Refitting on the same training frame
    # reproduces the projection applied to the training set.
    lda = LinearDiscriminantAnalysis(n_components=3)
    lda.fit(df_train_before_lda.iloc[:, 1:], df_train_before_lda.iloc[:, 0])
    test_components = lda.transform(df_test.iloc[:, 1:])

    columns_lda = ['Component_1', 'Component_2', 'Component_3']
    df_lda = pd.DataFrame(data=test_components, columns=columns_lda)
    # Labels are attached by position to avoid index alignment
    df_lda['labels'] = df_test.iloc[:, 0].to_numpy()

    return df_lda

In [None]:
def preprocess_all_data(df_train_1, df_train_2, df_train_3, df_test):
    """Preprocess three training subjects and one test subject.

    The scaled training frame and the scaler produced for the training
    subjects are handed on to the preprocessing of the test subject.
    Returns ``(df_train, df_test)``.
    """
    train_projected, train_scaled, fitted_scaler = preprocess_training_data(
        df_train_1, df_train_2, df_train_3
    )
    test_projected = preprocess_test_data(df_test, train_scaled, fitted_scaler)
    return train_projected, test_projected

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Leave-one-subject-out folds: the first three frames of every entry are the
# training subjects, the last one is the held-out test subject.
total_combinations = [
[df_subject_1, df_subject_2, df_subject_3, df_subject_4],
[df_subject_1, df_subject_2, df_subject_4, df_subject_3],
[df_subject_1, df_subject_3, df_subject_4, df_subject_2],
[df_subject_2, df_subject_3, df_subject_4, df_subject_1]
]

# Classifiers under comparison. The random forest is seeded so that repeated
# runs of the notebook produce the same scores.
classifiers = {
    "Logistic Regression": LogisticRegression(class_weight='balanced'),
    "Random Forest": RandomForestClassifier(class_weight='balanced', random_state=42),
    "Perceptron": Perceptron(class_weight='balanced')
}

# Metric name -> scoring function
metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1-score": f1_score
}

# One list per metric; a value is appended for every (fold, classifier) pair
cross_validation_results = {
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1-score': []
}

def train_and_evaluate(classifier, X_train, y_train, X_test, y_test):
    """Fit ``classifier`` on the training data and score it on the test data.

    Returns the tuple ``(accuracy, precision, recall, f1)``; precision, recall
    and F1 are computed with ``average='weighted'``.
    """
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    precision, recall, f1 = (
        scorer(y_test, predictions, average='weighted')
        for scorer in (precision_score, recall_score, f1_score)
    )
    return accuracy, precision, recall, f1

# Evaluate every classifier on every fold; the scores are appended in
# fold-major order (all classifiers of the first fold, then the second, ...).
for combination in total_combinations:
    print("Procesando combinación...")
    df_train, df_test = preprocess_all_data(*combination)
    X_train, y_train, X_test, y_test = divide_training_and_test(df_train, df_test)
    for clf_name, clf in classifiers.items():
        accuracy, precision, recall, f1 = train_and_evaluate(clf, X_train, y_train, X_test, y_test)

        fold_scores = zip(('Accuracy', 'Precision', 'Recall', 'F1-score'),
                          (accuracy, precision, recall, f1))
        for metric_name, score in fold_scores:
            cross_validation_results[metric_name].append(score)


# One row per (fold, classifier) pair, one column per metric
df_cross_validation_results = pd.DataFrame(cross_validation_results)