# Import Libraries

In [1]:
import pandas as pd
import glob

# Load EEG Files for Subject 2

In [2]:
column_names = [
    "Sample Index", "EXG Channel 0", "EXG Channel 1", "EXG Channel 2", "EXG Channel 3",
    "EXG Channel 4", "EXG Channel 5", "EXG Channel 6", "EXG Channel 7", "EXG Channel 8",
    "EXG Channel 9", "EXG Channel 10", "EXG Channel 11", "EXG Channel 12", "EXG Channel 13",
    "EXG Channel 14", "EXG Channel 15", "Accel Channel 0", "Accel Channel 1", "Accel Channel 2",
    "Not Used 1", "Digital Channel 0 (D11)", "Digital Channel 1 (D12)", "Digital Channel 2 (D13)",
    "Digital Channel 3 (D17)", "Not Used 2", "Digital Channel 4 (D18)", "Analog Channel 0",
    "Analog Channel 1", "Analog Channel 2", "Timestamp", "Marker Channel", "Timestamp (Formatted)"
]

eeg_files = sorted(glob.glob("/data0/HAR-datasets/PLHI-HAR_EEG-2025/OpenBCISession_s2-*/BrainFlow-RAW_s2-*.csv"))

def load_eeg(file):
    df = pd.read_csv(file, sep="\t", skiprows=3, names=column_names, engine="c")
    df["Activity"] = file.split("/")[-2]
    return df

eeg_df_list = [load_eeg(file) for file in eeg_files]
eeg_df = pd.concat(eeg_df_list, ignore_index=True)

print(f"Successfully loaded {len(eeg_files)} EEG files for Subject 2 across all activities.")
print("Unique activities in EEG data:", eeg_df["Activity"].unique())
display(eeg_df.head())

Successfully loaded 50 EEG files for Subject 2 across all activities.
Unique activities in EEG data: ['OpenBCISession_s2-chair squats'
 'OpenBCISession_s2-light stationary cycling'
 'OpenBCISession_s2-marching in place'
 'OpenBCISession_s2-seated boxing hooks'
 'OpenBCISession_s2-seated leg extensions'
 'OpenBCISession_s2-seated medicine ball twists'
 'OpenBCISession_s2-seated side bends' 'OpenBCISession_s2-side-stepping'
 'OpenBCISession_s2-standing heel to toe walk'
 'OpenBCISession_s2-wall push-ups']


Unnamed: 0,Sample Index,EXG Channel 0,EXG Channel 1,EXG Channel 2,EXG Channel 3,EXG Channel 4,EXG Channel 5,EXG Channel 6,EXG Channel 7,EXG Channel 8,...,Digital Channel 3 (D17),Not Used 2,Digital Channel 4 (D18),Analog Channel 0,Analog Channel 1,Analog Channel 2,Timestamp,Marker Channel,Timestamp (Formatted),Activity
0,6.0,187500.0,-187500.022352,-187500.022352,-187500.022352,-16554.372496,-147240.239053,2162.866552,-2391.323732,2869.516953,...,0.0,0.0,0.0,0.0,0.0,0.0,1739034000.0,0.0,,OpenBCISession_s2-chair squats
1,8.0,187500.0,-187500.022352,-187500.022352,-187500.022352,-16578.311214,-147003.578783,2180.479727,-2336.696069,2786.234353,...,104.0,113.5,80.0,0.0,0.0,0.0,1739034000.0,0.0,,OpenBCISession_s2-chair squats
2,10.0,187500.0,-187500.022352,-187500.022352,-187500.022352,-16569.705793,-147620.017543,2159.916122,-2367.832049,2887.331294,...,0.0,0.0,0.0,0.0,0.0,0.0,1739034000.0,0.0,,OpenBCISession_s2-chair squats
3,12.0,187500.0,-187500.022352,-187500.022352,-187500.022352,-16550.036258,-147247.190445,2205.178404,-2358.086688,2770.766946,...,0.0,0.0,0.0,0.0,0.0,0.0,1739034000.0,0.0,,OpenBCISession_s2-chair squats
4,14.0,187500.0,-187500.022352,-187500.022352,-187500.022352,-16582.178066,-148076.820144,2188.302837,-2357.952578,2896.584916,...,0.0,0.0,0.0,0.0,0.0,0.0,1739034000.0,0.0,,OpenBCISession_s2-chair squats


# Display Last Rows of EEG Data

In [3]:
display(eeg_df.tail()) 

Unnamed: 0,Sample Index,EXG Channel 0,EXG Channel 1,EXG Channel 2,EXG Channel 3,EXG Channel 4,EXG Channel 5,EXG Channel 6,EXG Channel 7,EXG Channel 8,...,Digital Channel 3 (D17),Not Used 2,Digital Channel 4 (D18),Analog Channel 0,Analog Channel 1,Analog Channel 2,Timestamp,Marker Channel,Timestamp (Formatted),Activity
555734,234.0,187500.0,-187500.022352,-140259.275169,-187500.022352,-17494.956254,-72377.988384,7690.564715,-1263.924094,7785.626684,...,0.0,0.0,0.0,0.0,0.0,0.0,1739032000.0,0.0,,OpenBCISession_s2-wall push-ups
555735,236.0,187500.0,-187500.022352,-140114.905252,-187500.022352,-17410.377253,-73523.224714,7659.249921,-1338.601272,7698.097253,...,0.0,0.0,0.0,0.0,0.0,0.0,1739032000.0,0.0,,OpenBCISession_s2-wall push-ups
555736,238.0,187500.0,-187500.022352,-141747.454315,-187500.022352,-17423.385969,-75539.754395,7697.449052,-1337.416629,7699.058378,...,0.0,0.0,0.0,0.0,0.0,0.0,1739032000.0,0.0,,OpenBCISession_s2-wall push-ups
555737,240.0,187500.0,-187500.022352,-145106.96621,-187500.022352,-17467.284795,-79326.810459,7742.957204,-1319.244661,7681.02052,...,0.0,0.0,0.0,0.0,0.0,0.0,1739032000.0,0.0,,OpenBCISession_s2-wall push-ups
555738,242.0,187500.0,-187500.022352,-150922.487786,-187500.022352,-17558.077581,-85545.669263,7704.646314,-1274.340007,7746.1088,...,0.0,0.0,0.0,0.0,0.0,0.0,1739032000.0,0.0,,OpenBCISession_s2-wall push-ups


# Extract EEG Start and End Timestamps

In [4]:
eeg_timestamps = eeg_df[['Timestamp', 'Marker Channel']]
start_time = eeg_timestamps['Timestamp'].min()
end_time = eeg_timestamps['Timestamp'].max()
print(f" Extracted EEG activity start and end timestamps:")
print(f" Start Timestamp: {start_time}")
print(f" End Timestamp: {end_time}")

 Extracted EEG activity start and end timestamps:
 Start Timestamp: 1739030600.718708
 End Timestamp: 1739034879.018659


# Load and Filter HAR Data for Subject 2

In [5]:
har_file_path = "/data0/HAR-datasets/PLHI-HAR_EEG-2025/Combined_Gyro_Acg_Data_Subjects_1_to_6.csv"
har_df = pd.read_csv(har_file_path)

har_df['Timestamp_Accel'] = pd.to_numeric(har_df['Timestamp_Accel'], errors='coerce')
har_df['Timestamp_Gyro'] = pd.to_numeric(har_df['Timestamp_Gyro'], errors='coerce')

if har_df['Timestamp_Accel'].max() > 1e12:  
    har_df['Timestamp_Accel'] = har_df['Timestamp_Accel'] / 1000
    har_df['Timestamp_Gyro'] = har_df['Timestamp_Gyro'] / 1000

print("Before filtering:", har_df.shape)
har_df = har_df[har_df['Subject_ID_x'] == "Subject 2"]
print("After filtering:", har_df.shape)

valid_har = har_df[
    (har_df['Timestamp_Accel'].between(start_time, end_time)) |
    (har_df['Timestamp_Gyro'].between(start_time, end_time))
]

filtered_har = valid_har

print("HAR data is clipped to the EEG activity time range. All activities for Subject 2 are included.")
display(filtered_har.head())

Before filtering: (944460, 12)
After filtering: (173913, 12)
HAR data is clipped to the EEG activity time range. All activities for Subject 2 are included.


  har_df = pd.read_csv(har_file_path)


Unnamed: 0,Timestamp_Gyro,Gyro X (°/s),Gyro Y (°/s),Gyro Z (°/s),Activity_Label_x,Subject_ID_x,Timestamp_Accel,Accel X (g),Accel Y (g),Accel Z (g),Activity_Label_y,Subject_ID_y
187649,1739031000.0,-0.024435,-0.001222,-0.013439,Seated Leg Extensions,Subject 2,1739031000.0,-0.792481,-2.0638018,9.464279,Seated Leg Extensions,Subject 2
187650,1739031000.0,-0.024435,-0.001222,-0.013439,Seated Leg Extensions,Subject 2,1739031000.0,-0.792481,-2.0638018,9.464279,Seated Leg Extensions,Subject 2
187651,1739031000.0,-0.024435,-0.001222,-0.013439,Seated Leg Extensions,Subject 2,1739031000.0,-0.830788,-2.0374656,9.425972,Seated Leg Extensions,Subject 2
187652,1739031000.0,-0.024435,-0.001222,-0.013439,Seated Leg Extensions,Subject 2,1739031000.0,-0.830788,-2.0374656,9.425972,Seated Leg Extensions,Subject 2
187653,1739031000.0,-0.024435,-0.001222,-0.013439,Seated Leg Extensions,Subject 2,1739031000.0,-0.830788,-2.0374656,9.425972,Seated Leg Extensions,Subject 2


# Display Shape and Info of Filtered HAR Data

In [6]:
print(filtered_har.shape) 
print(filtered_har.info())  

(152747, 12)
<class 'pandas.core.frame.DataFrame'>
Index: 152747 entries, 187649 to 361305
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Timestamp_Gyro    152747 non-null  float64
 1   Gyro X (°/s)      152747 non-null  float64
 2   Gyro Y (°/s)      152747 non-null  float64
 3   Gyro Z (°/s)      152718 non-null  float64
 4   Activity_Label_x  152747 non-null  object 
 5   Subject_ID_x      152747 non-null  object 
 6   Timestamp_Accel   152747 non-null  float64
 7   Accel X (g)       152747 non-null  float64
 8   Accel Y (g)       152734 non-null  object 
 9   Accel Z (g)       152715 non-null  float64
 10  Activity_Label_y  152747 non-null  object 
 11  Subject_ID_y      152747 non-null  object 
dtypes: float64(7), object(5)
memory usage: 15.1+ MB
None


# Merge EEG and HAR Data by Timestamp

In [7]:
eeg_df_sorted = eeg_df.dropna(subset=['Timestamp'])
eeg_df_sorted = eeg_df_sorted.sort_values("Timestamp")
har_df_sorted = filtered_har.sort_values("Timestamp_Accel")

merged_df = pd.merge_asof(
    eeg_df_sorted, 
    har_df_sorted, 
    left_on="Timestamp", 
    right_on="Timestamp_Accel", 
    direction="nearest"
)

print("EEG & HAR merged.")
display(merged_df.head())

EEG & HAR merged.


Unnamed: 0,Sample Index,EXG Channel 0,EXG Channel 1,EXG Channel 2,EXG Channel 3,EXG Channel 4,EXG Channel 5,EXG Channel 6,EXG Channel 7,EXG Channel 8,...,Gyro Y (°/s),Gyro Z (°/s),Activity_Label_x,Subject_ID_x,Timestamp_Accel,Accel X (g),Accel Y (g),Accel Z (g),Activity_Label_y,Subject_ID_y
0,6.0,187500.0,-187500.022352,-187500.022352,-187500.022352,-15542.084341,-120418.19041,10376.23857,273.80887,4420.683315,...,-0.001222,-0.013439,Seated Leg Extensions,Subject 2,1739031000.0,-0.792481,-2.0638018,9.464279,Seated Leg Extensions,Subject 2
1,8.0,187500.0,-187500.022352,-187500.022352,-187500.022352,-15533.501271,-120356.723113,10384.665177,286.012922,4463.442202,...,-0.001222,-0.013439,Seated Leg Extensions,Subject 2,1739031000.0,-0.792481,-2.0638018,9.464279,Seated Leg Extensions,Subject 2
2,10.0,187500.0,-187500.022352,-187500.022352,-187500.022352,-15519.553783,-119781.009231,10389.627265,257.648558,4434.563748,...,-0.001222,-0.013439,Seated Leg Extensions,Subject 2,1739031000.0,-0.792481,-2.0638018,9.464279,Seated Leg Extensions,Subject 2
3,12.0,187500.0,-187500.022352,-187500.022352,-187500.022352,-15544.028943,-119762.256117,10369.689509,240.325956,4464.872714,...,-0.001222,-0.013439,Seated Leg Extensions,Subject 2,1739031000.0,-0.830788,-2.0374656,9.425972,Seated Leg Extensions,Subject 2
4,14.0,187500.0,-187500.022352,-187500.022352,-187500.022352,-15539.849167,-119185.983442,10381.334768,214.48734,4424.326649,...,-0.001222,-0.013439,Seated Leg Extensions,Subject 2,1739031000.0,-0.830788,-2.0374656,9.425972,Seated Leg Extensions,Subject 2


# Check Timestamp Alignment

In [8]:
print("Timestamp alignment (EEG vs HAR):")
print(merged_df[['Timestamp', 'Timestamp_Accel']])

Timestamp alignment (EEG vs HAR):
           Timestamp  Timestamp_Accel
0       1.739031e+09     1.739031e+09
1       1.739031e+09     1.739031e+09
2       1.739031e+09     1.739031e+09
3       1.739031e+09     1.739031e+09
4       1.739031e+09     1.739031e+09
...              ...              ...
555733  1.739035e+09     1.739035e+09
555734  1.739035e+09     1.739035e+09
555735  1.739035e+09     1.739035e+09
555736  1.739035e+09     1.739035e+09
555737  1.739035e+09     1.739035e+09

[555738 rows x 2 columns]


# Show Merged Data Columns

In [9]:
print("Unique Columns in Merged Data:")
print(merged_df.columns)

Unique Columns in Merged Data:
Index(['Sample Index', 'EXG Channel 0', 'EXG Channel 1', 'EXG Channel 2',
       'EXG Channel 3', 'EXG Channel 4', 'EXG Channel 5', 'EXG Channel 6',
       'EXG Channel 7', 'EXG Channel 8', 'EXG Channel 9', 'EXG Channel 10',
       'EXG Channel 11', 'EXG Channel 12', 'EXG Channel 13', 'EXG Channel 14',
       'EXG Channel 15', 'Accel Channel 0', 'Accel Channel 1',
       'Accel Channel 2', 'Not Used 1', 'Digital Channel 0 (D11)',
       'Digital Channel 1 (D12)', 'Digital Channel 2 (D13)',
       'Digital Channel 3 (D17)', 'Not Used 2', 'Digital Channel 4 (D18)',
       'Analog Channel 0', 'Analog Channel 1', 'Analog Channel 2', 'Timestamp',
       'Marker Channel', 'Timestamp (Formatted)', 'Activity', 'Timestamp_Gyro',
       'Gyro X (°/s)', 'Gyro Y (°/s)', 'Gyro Z (°/s)', 'Activity_Label_x',
       'Subject_ID_x', 'Timestamp_Accel', 'Accel X (g)', 'Accel Y (g)',
       'Accel Z (g)', 'Activity_Label_y', 'Subject_ID_y'],
      dtype='object')


# Display Final Merged Data

In [10]:
display(merged_df)

Unnamed: 0,Sample Index,EXG Channel 0,EXG Channel 1,EXG Channel 2,EXG Channel 3,EXG Channel 4,EXG Channel 5,EXG Channel 6,EXG Channel 7,EXG Channel 8,...,Gyro Y (°/s),Gyro Z (°/s),Activity_Label_x,Subject_ID_x,Timestamp_Accel,Accel X (g),Accel Y (g),Accel Z (g),Activity_Label_y,Subject_ID_y
0,6.0,187500.0,-187500.022352,-187500.022352,-187500.022352,-15542.084341,-120418.190410,10376.238570,273.808870,4420.683315,...,-0.001222,-0.013439,Seated Leg Extensions,Subject 2,1.739031e+09,-0.792481,-2.0638018,9.464279,Seated Leg Extensions,Subject 2
1,8.0,187500.0,-187500.022352,-187500.022352,-187500.022352,-15533.501271,-120356.723113,10384.665177,286.012922,4463.442202,...,-0.001222,-0.013439,Seated Leg Extensions,Subject 2,1.739031e+09,-0.792481,-2.0638018,9.464279,Seated Leg Extensions,Subject 2
2,10.0,187500.0,-187500.022352,-187500.022352,-187500.022352,-15519.553783,-119781.009231,10389.627265,257.648558,4434.563748,...,-0.001222,-0.013439,Seated Leg Extensions,Subject 2,1.739031e+09,-0.792481,-2.0638018,9.464279,Seated Leg Extensions,Subject 2
3,12.0,187500.0,-187500.022352,-187500.022352,-187500.022352,-15544.028943,-119762.256117,10369.689509,240.325956,4464.872714,...,-0.001222,-0.013439,Seated Leg Extensions,Subject 2,1.739031e+09,-0.830788,-2.0374656,9.425972,Seated Leg Extensions,Subject 2
4,14.0,187500.0,-187500.022352,-187500.022352,-187500.022352,-15539.849167,-119185.983442,10381.334768,214.487340,4424.326649,...,-0.001222,-0.013439,Seated Leg Extensions,Subject 2,1.739031e+09,-0.830788,-2.0374656,9.425972,Seated Leg Extensions,Subject 2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555733,138.0,187500.0,-187500.022352,-187500.022352,-142125.936404,-14622.712388,-187500.022352,1015.506806,254.273445,-5181.581400,...,-0.030543,-0.155160,Light Stationary Cycling,Subject 2,1.739035e+09,-5.968745,-1.30484,8.207323,Light Stationary Cycling,Subject 2
555734,140.0,187500.0,-187500.022352,-187500.022352,-143851.960463,-14670.343956,-187500.022352,1039.914911,313.952603,-5177.043996,...,-0.030543,-0.155160,Light Stationary Cycling,Subject 2,1.739035e+09,-5.968745,-1.30484,8.207323,Light Stationary Cycling,Subject 2
555735,140.0,187500.0,-187500.022352,-187500.022352,-143851.960463,-14670.343956,-187500.022352,1039.914911,313.952603,-5177.043996,...,-0.030543,-0.155160,Light Stationary Cycling,Subject 2,1.739035e+09,-5.968745,-1.30484,8.207323,Light Stationary Cycling,Subject 2
555736,140.0,187500.0,-187500.022352,-187500.022352,-143851.960463,-14670.343956,-187500.022352,1039.914911,313.952603,-5177.043996,...,-0.030543,-0.155160,Light Stationary Cycling,Subject 2,1.739035e+09,-5.968745,-1.30484,8.207323,Light Stationary Cycling,Subject 2


# Save Merged Data to CSV

In [11]:
try:
    merged_df.to_csv("EEG-HAR_Subject_2_Merged.csv", index=False)
    print("File saved successfully as EEG-HAR_Subject_2_Merged.csv.")
except Exception as e:
    print(f"An error occurred while saving the file: {e}")

File saved successfully as EEG-HAR_Subject_2_Merged.csv.
