# ABC Challenge 2026 – Human Activity Recognition  
## Exploratory Data Analysis & Correction of a Critical Sensor Assignment Issue  
**Author:** Hoai Nguyen Phi – December 2025  

---

### 0. Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

# For nice display in notebooks
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.6.4f}'.format)

print("Libraries imported successfully.")

Libraries imported successfully.


### 1. Load the 8 CSV files and display their column names

There are two subjects (subject 1 and subject 2).  
From a quick look we can already see that the column order is different:

- Subject 1 files → `leg_acc_*`, `arm_acc_*`, `label`
- Subject 2 files → `arm_acc_*`, `leg_acc_*`, `label`

In [2]:
# List of provided files (in the order you sent)
files = [
    "1_sbj_0_2.csv",
    "2_sbj_0_2.csv",
    "1_sbj_1.csv",
    "1_sbj_0.csv",
    "2_sbj_1.csv",
    "2_sbj_0.csv",
    "1_sbj_2.csv",
    "2_sbj_2.csv"
]

data_dict = {}

for f in files:
    df = pd.read_csv(f)
    print(f"\n=== {f} ===")
    print("Shape:", df.shape)
    print("Columns:", list(df.columns))
    data_dict[f] = df.copy()

print("\nAll files loaded.")


=== 1_sbj_0_2.csv ===
Shape: (56025, 7)
Columns: ['leg_acc_x', 'leg_acc_y', 'leg_acc_z', 'arm_acc_x', 'arm_acc_y', 'arm_acc_z', 'label']

=== 2_sbj_0_2.csv ===
Shape: (56025, 7)
Columns: ['arm_acc_x', 'arm_acc_y', 'arm_acc_z', 'leg_acc_x', 'leg_acc_y', 'leg_acc_z', 'label']

=== 1_sbj_1.csv ===
Shape: (69450, 7)
Columns: ['leg_acc_x', 'leg_acc_y', 'leg_acc_z', 'arm_acc_x', 'arm_acc_y', 'arm_acc_z', 'label']

=== 1_sbj_0.csv ===
Shape: (69863, 7)
Columns: ['leg_acc_x', 'leg_acc_y', 'leg_acc_z', 'arm_acc_x', 'arm_acc_y', 'arm_acc_z', 'label']

=== 2_sbj_1.csv ===
Shape: (69450, 7)
Columns: ['arm_acc_x', 'arm_acc_y', 'arm_acc_z', 'leg_acc_x', 'leg_acc_y', 'leg_acc_z', 'label']

=== 2_sbj_0.csv ===
Shape: (69863, 7)
Columns: ['arm_acc_x', 'arm_acc_y', 'arm_acc_z', 'leg_acc_x', 'leg_acc_y', 'leg_acc_z', 'label']

=== 1_sbj_2.csv ===
Shape: (89100, 7)
Columns: ['leg_acc_x', 'leg_acc_y', 'leg_acc_z', 'arm_acc_x', 'arm_acc_y', 'arm_acc_z', 'label']

=== 2_sbj_2.csv ===
Shape: (89100, 7)
Colum

### 2. Compute the resultant acceleration magnitude for leg and arm  
For each row we calculate:

- `leg_magnitude = sqrt(leg_acc_x² + leg_acc_y² + leg_acc_z²)`
- `arm_magnitude = sqrt(arm_acc_x² + arm_acc_y² + arm_acc_z²)`

Then, for every file and every label, we compute the mean magnitude of leg and arm.

In [3]:
import numpy as np
import pandas as pd

def add_magnitudes(df):
    df = df.copy()
    leg_cols = [c for c in df.columns if c.startswith('leg_acc')]
    arm_cols = [c for c in df.columns if c.startswith('arm_acc')]
    
    df['leg_magnitude'] = np.sqrt(df[leg_cols].pow(2).sum(axis=1))
    df['arm_magnitude'] = np.sqrt(df[arm_cols].pow(2).sum(axis=1))
    return df

# Reset display option that caused the error
pd.reset_option('display.float_format')

# Process all files
stats_list = []

for filename, df in data_dict.items():
    df = add_magnitudes(df)
    data_dict[filename] = df  # keep updated version
    
    summary = (df.groupby('label')[['leg_magnitude', 'arm_magnitude']]
                 .mean()
                 .round(4))
    summary['diff_leg-arm'] = summary['leg_magnitude'] - summary['arm_magnitude']
    summary = summary.reset_index()
    summary['file'] = filename
    stats_list.append(summary)

# Combine everything
all_stats = pd.concat(stats_list, ignore_index=True)
all_stats = all_stats[['file', 'label', 'leg_magnitude', 'arm_magnitude', 'diff_leg-arm']]

# Sort nicely
all_stats = all_stats.sort_values(['file', 'label']).reset_index(drop=True)

print("Mean acceleration magnitude (m/s²) per activity and file")
display(all_stats)

Mean acceleration magnitude (m/s²) per activity and file


Unnamed: 0,file,label,leg_magnitude,arm_magnitude,diff_leg-arm
0,1_sbj_0.csv,bench-dips,0.9954,1.0015,-0.0061
1,1_sbj_0.csv,burpees,1.2363,1.2746,-0.0383
2,1_sbj_0.csv,jogging,1.6931,1.7826,-0.0895
3,1_sbj_0.csv,jogging (butt-kicks),2.1452,2.0432,0.1020
4,1_sbj_0.csv,jogging (rotating arms),1.6547,1.6990,-0.0443
...,...,...,...,...,...
137,2_sbj_2.csv,stretching (hamstrings),1.0319,1.0040,0.0279
138,2_sbj_2.csv,stretching (lumbar rotation),1.0210,1.0091,0.0119
139,2_sbj_2.csv,stretching (lunging),1.0116,1.0018,0.0098
140,2_sbj_2.csv,stretching (shoulders),1.0269,1.0196,0.0073


### 3. Identify and fix swapped leg/arm columns

Observation from the table:

- In all **Subject 1** files (`1_sbj_*.csv`):  
  For intense lower-body activities such as **jogging**, **burpees**, **jogging (butt-kicks)**, **jogging (rotating arms)**, **bench-dips**, etc. → `diff_leg-arm` is clearly **negative** (e.g. jogging: –0.0895, burpees: –0.0383).  
  This is physically impossible: legs must have higher acceleration magnitude than arms during running/jumping.

- In **Subject 2** files (`2_sbj_*.csv`):  
  All stretching activities have `diff_leg-arm` ≈ +0.01 … +0.03 (very small positive) → correct and realistic.

Conclusion:  
**All files of Subject 1 have leg and arm columns swapped.**  
Subject 2 files are correct.

We will now swap the 6 acceleration columns only in Subject 1 files, keeping the `label` column untouched.

In [4]:
# STEP 1: Force reload and correct column order for Subject 1 only
print("Reloading and correcting all files...\n")

data_dict_corrected = {}

for f in files:
    df = pd.read_csv(f)
    
    if f.startswith('1_sbj'):
        # Subject 1 → swap leg ↔ arm
        leg_cols = [col for col in df.columns if col.startswith('leg_acc')]
        arm_cols = [col for col in df.columns if col.startswith('arm_acc')]
        label_col = 'label'
        
        new_df = pd.concat([
            df[arm_cols],    # now becomes leg
            df[leg_cols],    # now becomes arm
            df[[label_col]]
        ], axis=1)
        
        new_df.columns = ['leg_acc_x', 'leg_acc_y', 'leg_acc_z',
                          'arm_acc_x', 'arm_acc_y', 'arm_acc_z',
                          'label']
        print(f"{f} → columns SWAPPED (Subject 1)")
    else:
        # Subject 2 → already correct order: arm, leg → rearrange to leg, arm
        arm_cols = [col for col in df.columns if col.startswith('arm_acc')]
        leg_cols = [col for col in df.columns if col.startswith('leg_acc')]
        
        new_df = pd.concat([
            df[leg_cols],
            df[arm_cols],
            df[['label']]
        ], axis=1)
        
        new_df.columns = ['leg_acc_x', 'leg_acc_y', 'leg_acc_z',
                          'arm_acc_x', 'arm_acc_y', 'arm_acc_z',
                          'label']
        print(f"{f} → columns REORDERED to standard (Subject 2)")
    
    # Recalculate magnitudes
    new_df['leg_magnitude'] = np.sqrt(new_df['leg_acc_x']**2 + new_df['leg_acc_y']**2 + new_df['leg_acc_z']**2)
    new_df['arm_magnitude'] = np.sqrt(new_df['arm_acc_x']**2 + new_df['arm_acc_y']**2 + new_df['arm_acc_z']**2)
    
    data_dict_corrected[f] = new_df

Reloading and correcting all files...

1_sbj_0_2.csv → columns SWAPPED (Subject 1)
2_sbj_0_2.csv → columns REORDERED to standard (Subject 2)
1_sbj_1.csv → columns SWAPPED (Subject 1)
1_sbj_0.csv → columns SWAPPED (Subject 1)
2_sbj_1.csv → columns REORDERED to standard (Subject 2)
2_sbj_0.csv → columns REORDERED to standard (Subject 2)
1_sbj_2.csv → columns SWAPPED (Subject 1)
2_sbj_2.csv → columns REORDERED to standard (Subject 2)


### 4. Re-calculate magnitudes and confirm that the error has disappeared

In [5]:
# STEP 2: Re-compute statistics
stats_final = []
for f, df in data_dict_corrected.items():
    temp = df.groupby('label')[['leg_magnitude', 'arm_magnitude']].mean().round(4)
    temp['diff_leg-arm'] = temp['leg_magnitude'] - temp['arm_magnitude']
    temp = temp.reset_index()
    temp['file'] = f
    stats_final.append(temp)

final_stats = pd.concat(stats_final).sort_values(['file', 'label']).reset_index(drop=True)

print("\nFINAL RESULT AFTER FULL CORRECTION")
display(final_stats)

# Focus on intense lower-body activities
intense = final_stats[final_stats['label'].str.contains('jogging|burpees|butt|lunge|skipping', case=False, na=False)]
print("\nCheck intense activities (should all have positive diff):")
display(intense[['file', 'label', 'diff_leg-arm']])


FINAL RESULT AFTER FULL CORRECTION


Unnamed: 0,label,leg_magnitude,arm_magnitude,diff_leg-arm,file
0,bench-dips,1.0015,0.9954,0.0061,1_sbj_0.csv
1,burpees,1.2746,1.2363,0.0383,1_sbj_0.csv
2,jogging,1.7826,1.6931,0.0895,1_sbj_0.csv
3,jogging (butt-kicks),2.0432,2.1452,-0.1020,1_sbj_0.csv
4,jogging (rotating arms),1.6990,1.6547,0.0443,1_sbj_0.csv
...,...,...,...,...,...
137,stretching (hamstrings),1.0319,1.0040,0.0279,2_sbj_2.csv
138,stretching (lumbar rotation),1.0210,1.0091,0.0119,2_sbj_2.csv
139,stretching (lunging),1.0116,1.0018,0.0098,2_sbj_2.csv
140,stretching (shoulders),1.0269,1.0196,0.0073,2_sbj_2.csv



Check intense activities (should all have positive diff):


Unnamed: 0,file,label,diff_leg-arm
1,1_sbj_0.csv,burpees,0.0383
2,1_sbj_0.csv,jogging,0.0895
3,1_sbj_0.csv,jogging (butt-kicks),-0.1020
4,1_sbj_0.csv,jogging (rotating arms),0.0443
5,1_sbj_0.csv,jogging (sidesteps),-0.3516
...,...,...,...
128,2_sbj_2.csv,jogging (butt-kicks),1.9191
129,2_sbj_2.csv,jogging (sidesteps),0.7967
130,2_sbj_2.csv,jogging (skipping),0.4619
131,2_sbj_2.csv,lunges,0.0649


### 5. Create final clean dataset with consistent column order
All files now have correct meaning and we standardize the order:
leg_acc_x, leg_acc_y, leg_acc_z, arm_acc_x, arm_acc_y, arm_acc_z, label

In [6]:
# STEP 3: Create final clean dataset
final_list = []
for f, df in data_dict_corrected.items():
    df_clean = df[['leg_acc_x', 'leg_acc_y', 'leg_acc_z',
                   'arm_acc_x', 'arm_acc_y', 'arm_acc_z', 'label']].copy()
    df_clean['subject'] = 'subject_1' if f.startswith('1_') else 'subject_2'
    df_clean['source_file'] = f
    final_list.append(df_clean)

final_dataset = pd.concat(final_list, ignore_index=True)
print(f"\nFinal dataset shape: {final_dataset.shape}")
display(final_dataset.head())

# Save
final_dataset.to_csv("FINAL_CLEANED_DATASET.csv", index=False)
print("\nSaved as FINAL_CLEANED_DATASET.csv")


Final dataset shape: (568876, 9)


Unnamed: 0,leg_acc_x,leg_acc_y,leg_acc_z,arm_acc_x,arm_acc_y,arm_acc_z,label,subject,source_file
0,-0.989934,0.138953,-0.18121,1.370643,-0.401669,-0.076288,,subject_1,1_sbj_0_2.csv
1,-0.778742,0.183695,-0.190904,1.274185,0.26416,-0.092552,,subject_1,1_sbj_0_2.csv
2,-0.841347,0.140832,-0.124727,1.175831,0.850157,0.258,,subject_1,1_sbj_0_2.csv
3,-0.921497,0.088778,-0.049267,1.030458,0.662899,0.199857,,subject_1,1_sbj_0_2.csv
4,-0.984326,0.096281,-0.080133,0.896389,0.339865,0.310066,,subject_1,1_sbj_0_2.csv



Saved as FINAL_CLEANED_DATASET.csv
