# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

# Initial Data Exploration

In [2]:
# Load the raw combined gyroscope/accelerometer export for a first look.
df = pd.read_csv('Combined_Gyro_Acg_Data_Subjects_1_to_6.csv')

# Preview the first rows, then report the dimensions and inferred dtypes.
print(df.head())
print("\nTotal number of rows and columns:", df.shape, sep="\n")
print("\nData types of each column:", df.dtypes, sep="\n")

   Timestamp_Gyro  Gyro X (°/s)  Gyro Y (°/s)  Gyro Z (°/s)  \
0    1.739025e+12      0.009774      0.017104     -0.003665   
1    1.739025e+12      0.009774      0.017104     -0.003665   
2    1.739025e+12      0.009774      0.017104     -0.003665   
3    1.739025e+12      0.009774      0.017104     -0.003665   
4    1.739025e+12      0.009774      0.017104     -0.003665   

        Activity_Label_x Subject_ID_x  Timestamp_Accel  Accel X (g)  \
0  Seated Leg Extensions    Subject 1     1.739025e+12    -2.533065   
1  Seated Leg Extensions    Subject 1     1.739025e+12    -2.533065   
2  Seated Leg Extensions    Subject 1     1.739025e+12    -2.533065   
3  Seated Leg Extensions    Subject 1     1.739025e+12    -2.533065   
4  Seated Leg Extensions    Subject 1     1.739025e+12    -2.533065   

  Accel Y (g)  Accel Z (g)       Activity_Label_y Subject_ID_y  
0   -4.299986     8.389283  Seated Leg Extensions    Subject 1  
1   -4.299986     8.389283  Seated Leg Extensions    Subject 1  

  df = pd.read_csv('Combined_Gyro_Acg_Data_Subjects_1_to_6.csv')


# Extract and Save Important Columns

In [3]:
# Reload the raw export so this cell does not depend on earlier kernel state.
df = pd.read_csv('Combined_Gyro_Acg_Data_Subjects_1_to_6.csv')

# Keep the gyroscope timestamp, the six sensor axes, and the "_x" copies of
# the activity label and subject id.
selected_columns = [
    'Timestamp_Gyro', 'Gyro X (°/s)', 'Gyro Y (°/s)', 'Gyro Z (°/s)',
    'Accel X (g)', 'Accel Y (g)', 'Accel Z (g)',
    'Activity_Label_x', 'Subject_ID_x',
]
df = df[selected_columns]

new_filename = 'Combined_Gyro_Acg_Data_Subjects_1_to_6_Updated.csv'

df.to_csv(new_filename, index=False)

print(f"Shape of the DataFrame: {df.shape}")

print("\nDataFrame Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

  df = pd.read_csv('Combined_Gyro_Acg_Data_Subjects_1_to_6.csv')


Shape of the DataFrame: (944460, 9)

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 944460 entries, 0 to 944459
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Timestamp_Gyro    944459 non-null  float64
 1   Gyro X (°/s)      944444 non-null  float64
 2   Gyro Y (°/s)      944375 non-null  float64
 3   Gyro Z (°/s)      944319 non-null  float64
 4   Accel X (g)       944457 non-null  float64
 5   Accel Y (g)       944402 non-null  object 
 6   Accel Z (g)       944319 non-null  float64
 7   Activity_Label_x  944460 non-null  object 
 8   Subject_ID_x      944460 non-null  object 
dtypes: float64(6), object(3)
memory usage: 64.9+ MB
None


# Rename and Type Conversion

In [4]:
# Rebuild the working frame from the raw export: select columns, rename the
# merged "_x" columns, and coerce every column to its final dtype.
df = pd.read_csv('Combined_Gyro_Acg_Data_Subjects_1_to_6.csv')

df = df[['Timestamp_Gyro', 'Accel X (g)', 'Accel Y (g)', 'Accel Z (g)',
         'Gyro X (°/s)', 'Gyro Y (°/s)', 'Gyro Z (°/s)',
         'Subject_ID_x', 'Activity_Label_x']]

# Only three columns actually change name; the sensor columns keep theirs.
# NOTE(review): sample values such as 1739025164003 look like epoch
# milliseconds rather than microseconds — confirm the unit before relying on
# the column name.
df = df.rename(columns={
    'Timestamp_Gyro': 'Timestamp (microseconds)',
    'Subject_ID_x': 'Subject_ID',
    'Activity_Label_x': 'Activity_Label',
})

# Unparseable or missing timestamps become 0 so the column can be stored as
# int64. Assigning the result back (instead of a chained inplace fillna on the
# column) avoids the pandas FutureWarning and keeps working once column
# access returns a copy.
timestamps = pd.to_numeric(df['Timestamp (microseconds)'], errors='coerce')
df['Timestamp (microseconds)'] = timestamps.fillna(0).astype('int64')

# "Subject 1" -> 1. Raw string avoids the invalid "\d" escape; expand=False
# returns a Series rather than a one-column DataFrame.
df['Subject_ID'] = df['Subject_ID'].str.extract(r'(\d+)', expand=False).astype(int)

# This column is read as object dtype; entries that are not numeric become NaN.
df['Accel Y (g)'] = pd.to_numeric(df['Accel Y (g)'], errors='coerce')

for float_column in ['Accel X (g)', 'Accel Z (g)',
                     'Gyro X (°/s)', 'Gyro Y (°/s)', 'Gyro Z (°/s)']:
    df[float_column] = df[float_column].astype(float)

df['Activity_Label'] = df['Activity_Label'].astype('object')

new_filename = 'Combined_Gyro_Acg_Data_Subjects_1_to_6_Updated.csv'
df.to_csv(new_filename, index=False)

print(f"Shape of the DataFrame: {df.shape}")
print("\nDataFrame Info:")
# info() prints directly and returns None; print(df.info()) emitted "None".
df.info()

  df = pd.read_csv('Combined_Gyro_Acg_Data_Subjects_1_to_6.csv')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Timestamp (microseconds)'].fillna(0, inplace=True)


Shape of the DataFrame: (944460, 9)

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 944460 entries, 0 to 944459
Data columns (total 9 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Timestamp (microseconds)  944460 non-null  int64  
 1   Accel X (g)               944457 non-null  float64
 2   Accel Y (g)               944395 non-null  float64
 3   Accel Z (g)               944319 non-null  float64
 4   Gyro X (°/s)              944444 non-null  float64
 5   Gyro Y (°/s)              944375 non-null  float64
 6   Gyro Z (°/s)              944319 non-null  float64
 7   Subject_ID                944460 non-null  int64  
 8   Activity_Label            944460 non-null  object 
dtypes: float64(6), int64(2), object(1)
memory usage: 64.9+ MB
None


# Verify Updated Dataset

In [5]:
# Read back the file written by the type-conversion step and sanity-check it.
df = pd.read_csv('Combined_Gyro_Acg_Data_Subjects_1_to_6_Updated.csv')

row_count, column_count = df.shape
print(f"Total rows: {row_count}")
print(f"Total columns: {column_count}")

print(df.head())

Total rows: 944460
Total columns: 9
   Timestamp (microseconds)  Accel X (g)  Accel Y (g)  Accel Z (g)  \
0             1739025164003    -2.533065    -4.299986     8.389283   
1             1739025164003    -2.533065    -4.299986     8.389283   
2             1739025164003    -2.533065    -4.299986     8.389283   
3             1739025164003    -2.533065    -4.299986     8.389283   
4             1739025164003    -2.533065    -4.299986     8.389283   

   Gyro X (°/s)  Gyro Y (°/s)  Gyro Z (°/s)  Subject_ID         Activity_Label  
0      0.009774      0.017104     -0.003665           1  Seated Leg Extensions  
1      0.009774      0.017104     -0.003665           1  Seated Leg Extensions  
2      0.009774      0.017104     -0.003665           1  Seated Leg Extensions  
3      0.009774      0.017104     -0.003665           1  Seated Leg Extensions  
4      0.009774      0.017104     -0.003665           1  Seated Leg Extensions  


# Activity Analysis by Subject

In [6]:
# Number of different activity labels recorded for each subject.
# Uses the `df` loaded in the preceding verification cell.
labels_by_subject = df.groupby('Subject_ID')['Activity_Label']
distinct_activities_per_subject = labels_by_subject.nunique()

print("\nTotal distinct activities per subject:", distinct_activities_per_subject, sep="\n")


Total distinct activities per subject:
Subject_ID
1    10
2    10
3    10
4    10
5     4
6    10
Name: Activity_Label, dtype: int64


# Activity Counts Analysis

In [7]:
# Row counts overall, per subject, and per (subject, activity) pair.
# Uses the `df` loaded in the verification cell.
labels_by_subject = df.groupby('Subject_ID')['Activity_Label']

total_activity_count = df['Activity_Label'].count()
total_activity_per_subject = labels_by_subject.count()
activity_counts_by_subject = labels_by_subject.value_counts()

print("Total count of all activities: ", total_activity_count)
print("\nTotal activities per subject:", total_activity_per_subject, sep="\n")
print("\nActivity counts by subject and label:", activity_counts_by_subject, sep="\n")

Total count of all activities:  944460

Total activities per subject:
Subject_ID
1    187577
2    173913
3    186349
4    177520
5     43411
6    175690
Name: Activity_Label, dtype: int64

Activity counts by subject and label:
Subject_ID  Activity_Label             
1           Seated Boxing Hooks            25681
            Seated Side Bends              22799
            Seated Medicine Ball Twists    20265
            Marching in Place              18972
            Side-Stepping                  18657
            Standing Heel-to-Toe Walk      18603
            Chair Squats                   18021
            Light Stationary Cycling       17263
            Wall Push-ups                  16867
            Seated Leg Extensions          10449
2           Standing Heel-to-Toe Walk      20910
            Chair Squats                   19791
            Seated Side Bends              18784
            Seated Leg Extensions          18698
            Wall Push-ups                  1764

# Process Subject 1 Data

In [8]:
# Sampling layout: every retained phase of `samples_per_phase` rows is followed
# by an equally long stretch that is skipped, so phases start
# 2 * samples_per_phase rows apart.
samples_per_phase = 750
total_phases = 5
expected_length = total_phases * samples_per_phase * 2

df = pd.read_csv("Combined_Gyro_Acg_Data_Subjects_1_to_6_Updated.csv")

# Subject 1 only, ordered by activity and then by timestamp.
subject_df = df[df['Subject_ID'] == 1].sort_values(
    by=['Activity_Label', 'Timestamp (microseconds)']
)

filtered_dfs = []
for activity, group in subject_df.groupby('Activity_Label'):
    # An activity without enough rows for all phases is dropped entirely.
    if len(group) < expected_length:
        print(f"Skipping Activity '{activity}' — insufficient data.")
        continue

    group = group.reset_index(drop=True)
    filtered_dfs.extend(
        group.iloc[start_idx:start_idx + samples_per_phase]
        for start_idx in range(0, expected_length, samples_per_phase * 2)
    )

filtered_subject_df = pd.concat(filtered_dfs, ignore_index=True)
filtered_subject_df.to_csv("Subject_1_Active_Only.csv", index=False)

print("CSV file 'Subject_1_Active_Only.csv' has been saved successfully.")
print(f"Original shape (Subject 1): {subject_df.shape}")
print(f"Filtered shape (active only): {filtered_subject_df.shape}")

CSV file 'Subject_1_Active_Only.csv' has been saved successfully.
Original shape (Subject 1): (187577, 9)
Filtered shape (active only): (37500, 9)


# Process Subject 2 Data

In [9]:
# Sampling layout: every retained phase of `samples_per_phase` rows is followed
# by an equally long stretch that is skipped, so phases start
# 2 * samples_per_phase rows apart.
samples_per_phase = 750
total_phases = 5
expected_length = total_phases * samples_per_phase * 2

df = pd.read_csv("Combined_Gyro_Acg_Data_Subjects_1_to_6_Updated.csv")

# Subject 2 only, ordered by activity and then by timestamp.
subject_df = df[df['Subject_ID'] == 2].sort_values(
    by=['Activity_Label', 'Timestamp (microseconds)']
)

filtered_dfs = []
for activity, group in subject_df.groupby('Activity_Label'):
    # An activity without enough rows for all phases is dropped entirely.
    if len(group) < expected_length:
        print(f"Skipping Activity '{activity}' — insufficient data.")
        continue

    group = group.reset_index(drop=True)
    filtered_dfs.extend(
        group.iloc[start_idx:start_idx + samples_per_phase]
        for start_idx in range(0, expected_length, samples_per_phase * 2)
    )

filtered_subject_df = pd.concat(filtered_dfs, ignore_index=True)
filtered_subject_df.to_csv("Subject_2_Active_Only.csv", index=False)

print("CSV file 'Subject_2_Active_Only.csv' has been saved successfully.")
print(f"Original shape (Subject 2): {subject_df.shape}")
print(f"Filtered shape (active only): {filtered_subject_df.shape}")

CSV file 'Subject_2_Active_Only.csv' has been saved successfully.
Original shape (Subject 2): (173913, 9)
Filtered shape (active only): (37500, 9)


# Process Subject 3 Data

In [10]:
# Sampling layout: every retained phase of `samples_per_phase` rows is followed
# by an equally long stretch that is skipped, so phases start
# 2 * samples_per_phase rows apart.
samples_per_phase = 750
total_phases = 5
expected_length = total_phases * samples_per_phase * 2

df = pd.read_csv("Combined_Gyro_Acg_Data_Subjects_1_to_6_Updated.csv")

# Subject 3 only, ordered by activity and then by timestamp.
subject_df = df[df['Subject_ID'] == 3].sort_values(
    by=['Activity_Label', 'Timestamp (microseconds)']
)

filtered_dfs = []
for activity, group in subject_df.groupby('Activity_Label'):
    # An activity without enough rows for all phases is dropped entirely.
    if len(group) < expected_length:
        print(f"Skipping Activity '{activity}' — insufficient data.")
        continue

    group = group.reset_index(drop=True)
    filtered_dfs.extend(
        group.iloc[start_idx:start_idx + samples_per_phase]
        for start_idx in range(0, expected_length, samples_per_phase * 2)
    )

filtered_subject_df = pd.concat(filtered_dfs, ignore_index=True)
filtered_subject_df.to_csv("Subject_3_Active_Only.csv", index=False)

print("CSV file 'Subject_3_Active_Only.csv' has been saved successfully.")
print(f"Original shape (Subject 3): {subject_df.shape}")
print(f"Filtered shape (active only): {filtered_subject_df.shape}")

CSV file 'Subject_3_Active_Only.csv' has been saved successfully.
Original shape (Subject 3): (186349, 9)
Filtered shape (active only): (37500, 9)


# Process Subject 4 Data

In [11]:
# Sampling layout: every retained phase of `samples_per_phase` rows is followed
# by an equally long stretch that is skipped, so phases start
# 2 * samples_per_phase rows apart.
samples_per_phase = 750
total_phases = 5
expected_length = total_phases * samples_per_phase * 2

df = pd.read_csv("Combined_Gyro_Acg_Data_Subjects_1_to_6_Updated.csv")

# Subject 4 only, ordered by activity and then by timestamp.
subject_df = df[df['Subject_ID'] == 4].sort_values(
    by=['Activity_Label', 'Timestamp (microseconds)']
)

filtered_dfs = []
for activity, group in subject_df.groupby('Activity_Label'):
    # An activity without enough rows for all phases is dropped entirely.
    if len(group) < expected_length:
        print(f"Skipping Activity '{activity}' — insufficient data.")
        continue

    group = group.reset_index(drop=True)
    filtered_dfs.extend(
        group.iloc[start_idx:start_idx + samples_per_phase]
        for start_idx in range(0, expected_length, samples_per_phase * 2)
    )

filtered_subject_df = pd.concat(filtered_dfs, ignore_index=True)
filtered_subject_df.to_csv("Subject_4_Active_Only.csv", index=False)

print("CSV file 'Subject_4_Active_Only.csv' has been saved successfully.")
print(f"Original shape (Subject 4): {subject_df.shape}")
print(f"Filtered shape (active only): {filtered_subject_df.shape}")

CSV file 'Subject_4_Active_Only.csv' has been saved successfully.
Original shape (Subject 4): (177520, 9)
Filtered shape (active only): (37500, 9)


# Process Subject 6 Data

In [12]:
# Sampling layout: every retained phase of `samples_per_phase` rows is followed
# by an equally long stretch that is skipped, so phases start
# 2 * samples_per_phase rows apart.
samples_per_phase = 750
total_phases = 5
expected_length = total_phases * samples_per_phase * 2

df = pd.read_csv("Combined_Gyro_Acg_Data_Subjects_1_to_6_Updated.csv")

# Subject 6 only, ordered by activity and then by timestamp.
subject_df = df[df['Subject_ID'] == 6].sort_values(
    by=['Activity_Label', 'Timestamp (microseconds)']
)

filtered_dfs = []
for activity, group in subject_df.groupby('Activity_Label'):
    group = group.reset_index(drop=True)

    # For this subject the cycling recording is kept in full instead of being
    # cut into phases.
    # NOTE(review): the reason is not visible in this notebook — confirm.
    if activity == "Light Stationary Cycling":
        filtered_dfs.append(group)
        continue

    # An activity without enough rows for all phases is dropped entirely.
    if len(group) < expected_length:
        print(f"Skipping Activity '{activity}' — insufficient data.")
        continue

    filtered_dfs.extend(
        group.iloc[start_idx:start_idx + samples_per_phase]
        for start_idx in range(0, expected_length, samples_per_phase * 2)
    )

filtered_subject_df = pd.concat(filtered_dfs, ignore_index=True)
filtered_subject_df.to_csv("Subject_6_Active_Only.csv", index=False)

print("CSV file 'Subject_6_Active_Only.csv' has been saved successfully.")
print(f"Original shape (Subject 6): {subject_df.shape}")
print(f"Filtered shape (active only): {filtered_subject_df.shape}")

CSV file 'Subject_6_Active_Only.csv' has been saved successfully.
Original shape (Subject 6): (175690, 9)
Filtered shape (active only): (40853, 9)


# Process Subject 5 Data

In [13]:
# Subject 5 is exported as-is: no phase extraction is applied to its rows.
df = pd.read_csv("Combined_Gyro_Acg_Data_Subjects_1_to_6_Updated.csv")

is_subject_5 = df['Subject_ID'] == 5
subject_5_df = df[is_subject_5]
subject_5_df.to_csv("Subject_5_Active_Only.csv", index=False)

print("Subject 5 data has been saved as 'Subject_5_Active_Only.csv'.")

Subject 5 data has been saved as 'Subject_5_Active_Only.csv'.


# Combine All Subject Data

In [14]:
# Load the six per-subject exports in subject order.
(subject_1_df, subject_2_df, subject_3_df,
 subject_4_df, subject_5_df, subject_6_df) = (
    pd.read_csv(f'Subject_{subject_id}_Active_Only.csv')
    for subject_id in range(1, 7)
)

# Stack them into one frame with a fresh 0..n-1 index and persist it.
subject_frames = [subject_1_df, subject_2_df, subject_3_df,
                  subject_4_df, subject_5_df, subject_6_df]
combined_df = pd.concat(subject_frames, ignore_index=True)
combined_df.to_csv('Combined_Active_Only_Gyro_and_Accel_Subjects_1_to_6.csv', index=False)

print("All subject active Gyro and Accel data has been combined into 'Combined_Active_Only_Gyro_and_Accel_Subjects_1_to_6.csv'")

All subject active Gyro and Accel data has been combined into 'Combined_Active_Only_Gyro_and_Accel_Subjects_1_to_6.csv'


# Compare Timestamps

In [15]:
# Count how many timestamps of the active-only set have at least one timestamp
# of the full set within +/- `tolerance`.
df_all = pd.read_csv("Combined_Gyro_Acg_Data_Subjects_1_to_6_Updated.csv")
df_active = pd.read_csv("Combined_Active_Only_Gyro_and_Accel_Subjects_1_to_6.csv")

timestamps_all = df_all['Timestamp (microseconds)'].sort_values().reset_index(drop=True)
timestamps_active = df_active['Timestamp (microseconds)'].sort_values().reset_index(drop=True)

tolerance = 50

# Binary search on the sorted values replaces the per-timestamp boolean mask
# over the whole column, which cost len(active) * len(all) comparisons.
# dropna() keeps the old semantics: a missing timestamp never matched.
all_values = timestamps_all.dropna().to_numpy()
active_values = timestamps_active.dropna().to_numpy()

# [first_in_range, past_in_range) is the slice of all_values lying inside
# [ts - tolerance, ts + tolerance]; a non-empty slice means a match exists.
first_in_range = np.searchsorted(all_values, active_values - tolerance, side='left')
past_in_range = np.searchsorted(all_values, active_values + tolerance, side='right')

matching_timestamps = active_values[past_in_range > first_in_range].tolist()

print(f"Number of matching timestamps within {tolerance} microseconds: {len(matching_timestamps)}")

Number of matching timestamps within 50 microseconds: 234264


# Analyze Combined Dataset

In [16]:
# Summary statistics and per-subject / per-activity row counts of the
# combined active-only data.
df_active = pd.read_csv("Combined_Active_Only_Gyro_and_Accel_Subjects_1_to_6.csv")

print(f"Total rows and columns: {df_active.shape}")
print("\nStatistics of the numeric columns:", df_active.describe(), sep="\n")

labels_by_subject = df_active.groupby('Subject_ID')['Activity_Label']
activity_counts_per_person = labels_by_subject.count()
print("\nActivity count per person:", activity_counts_per_person, sep="\n")

activity_distribution = df_active['Activity_Label'].value_counts()
print("\nActivity distribution across all activities:", activity_distribution, sep="\n")

Total rows and columns: (234264, 9)

Statistics of the numeric columns:
       Timestamp (microseconds)    Accel X (g)    Accel Y (g)    Accel Z (g)  \
count              2.342640e+05  234261.000000  234257.000000  234236.000000   
mean               1.739053e+12       1.872733      -2.800427       4.889721   
std                6.224042e+09       7.021548       4.031065       4.052807   
min                0.000000e+00     -41.084500     -30.600292     -33.988087   
25%                1.739035e+12      -3.208230      -5.252879       2.123657   
50%                1.739047e+12       2.719813      -2.355895       5.114015   
75%                1.739051e+12       8.111555      -0.560243       8.161834   
max                1.739337e+12      70.262634      27.023355      31.280245   

        Gyro X (°/s)   Gyro Y (°/s)   Gyro Z (°/s)     Subject_ID  
count  234261.000000  234261.000000  234253.000000  234264.000000  
mean       -0.038985      -0.007815      -0.028342       3.573631  
std

# Sort Activities

In [17]:
# Order rows by subject and then by a fixed activity sequence, and save the
# result as a separate "_Sorted" file.
df_active = pd.read_csv("Combined_Active_Only_Gyro_and_Accel_Subjects_1_to_6.csv")

# Desired position of each activity within a subject's block.
activity_order = [
    "Seated Leg Extensions",
    "Marching in Place",
    "Wall Push-ups",
    "Seated Boxing Hooks",
    "Standing Heel-to-Toe Walk",
    "Side-Stepping",
    "Seated Side Bends",
    "Seated Medicine Ball Twists",
    "Chair Squats",
    "Light Stationary Cycling"
]

# Rows whose label is not in activity_order are discarded. The explicit copy
# makes the column assignment below act on an independent frame instead of a
# slice of df_active (which raised SettingWithCopyWarning).
df_filtered = df_active[df_active['Activity_Label'].isin(activity_order)].copy()

# An ordered categorical makes sort_values follow activity_order rather than
# alphabetical order.
df_filtered['Activity_Label'] = pd.Categorical(df_filtered['Activity_Label'], categories=activity_order, ordered=True)
df_sorted = df_filtered.sort_values(by=['Subject_ID', 'Activity_Label'])

df_sorted.to_csv("Combined_Active_Only_Gyro_and_Accel_Subjects_1_to_6_Sorted.csv", index=False)

print("Filtered and sorted dataset has been saved as 'Combined_Active_Only_Gyro_and_Accel_Subjects_1_to_6_Sorted.csv'.")

Filtered and sorted dataset has been saved as 'Combined_Active_Only_Gyro_and_Accel_Subjects_1_to_6_Sorted.csv'.


# Explore Sorted Dataset

In [18]:
# Read the sorted export back and show a preview, its shape, and statistics.
df_sorted = pd.read_csv("Combined_Active_Only_Gyro_and_Accel_Subjects_1_to_6_Sorted.csv")

print("Head of the dataset:", df_sorted.head(), sep="\n")
print(f"\nTotal rows and columns: {df_sorted.shape}")
print("\nStatistics of the numeric columns:", df_sorted.describe(), sep="\n")

Head of the dataset:
   Timestamp (microseconds)  Accel X (g)  Accel Y (g)  Accel Z (g)  \
0             1739025164003    -2.533065    -4.299986     8.389283   
1             1739025164003    -2.533065    -4.299986     8.389283   
2             1739025164003    -2.533065    -4.299986     8.389283   
3             1739025164003    -2.533065    -4.299986     8.389283   
4             1739025164003    -2.533065    -4.299986     8.389283   

   Gyro X (°/s)  Gyro Y (°/s)  Gyro Z (°/s)  Subject_ID         Activity_Label  
0      0.009774      0.017104     -0.003665           1  Seated Leg Extensions  
1      0.009774      0.017104     -0.003665           1  Seated Leg Extensions  
2      0.009774      0.017104     -0.003665           1  Seated Leg Extensions  
3      0.009774      0.017104     -0.003665           1  Seated Leg Extensions  
4      0.009774      0.017104     -0.003665           1  Seated Leg Extensions  

Total rows and columns: (234264, 9)

Statistics of the numeric columns:

# Clean Dataset

In [19]:
# Drop rows that cannot be used: timestamp equal to 0, or a missing value in
# the timestamp or any of the six sensor columns.
df_active = pd.read_csv("Combined_Active_Only_Gyro_and_Accel_Subjects_1_to_6.csv")

sensor_columns = [
    "Accel X (g)", "Accel Y (g)", "Accel Z (g)",
    "Gyro X (°/s)", "Gyro Y (°/s)", "Gyro Z (°/s)"
]
required_columns = ["Timestamp (microseconds)"] + sensor_columns

has_nonzero_timestamp = df_active["Timestamp (microseconds)"] != 0
df_cleaned = df_active[has_nonzero_timestamp].dropna(subset=required_columns)

df_cleaned.to_csv("Combined_Active_Only_Gyro_and_Accel_Subjects_1_to_6_Cleaned.csv", index=False)

print("Cleaned dataset saved as 'Combined_Active_Only_Gyro_and_Accel_Subjects_1_to_6_Cleaned.csv'")

Cleaned dataset saved as 'Combined_Active_Only_Gyro_and_Accel_Subjects_1_to_6_Cleaned.csv'


# Verify Cleaned Dataset

In [20]:
# Read the cleaned export back and show a preview, its shape, and statistics.
df_cleaned = pd.read_csv("Combined_Active_Only_Gyro_and_Accel_Subjects_1_to_6_Cleaned.csv")

print("Head of the cleaned dataset:", df_cleaned.head(), sep="\n")
print("\nTotal rows and columns:", df_cleaned.shape)
print("\nStatistics of the numeric columns:", df_cleaned.describe(), sep="\n")

Head of the cleaned dataset:
   Timestamp (microseconds)  Accel X (g)  Accel Y (g)  Accel Z (g)  \
0             1739028476725     -5.98311     1.247379     8.365341   
1             1739028476725     -5.98311     1.247379     8.365341   
2             1739028476725     -5.98311     1.247379     8.365341   
3             1739028476725     -5.98311     1.247379     8.365341   
4             1739028476725     -5.98311     1.247379     8.365341   

   Gyro X (°/s)  Gyro Y (°/s)  Gyro Z (°/s)  Subject_ID Activity_Label  
0      0.321315     -0.024435     -0.131947           1   Chair Squats  
1      0.321315     -0.024435     -0.131947           1   Chair Squats  
2      0.321315     -0.024435     -0.131947           1   Chair Squats  
3      0.321315     -0.024435     -0.131947           1   Chair Squats  
4      0.321315     -0.024435     -0.131947           1   Chair Squats  

Total rows and columns: (234228, 9)

Statistics of the numeric columns:
       Timestamp (microseconds)    Acce

# Create Windows

In [21]:
# The six sensor channels that make up one window row.
accel_gyro_cols = [
    'Accel X (g)', 'Accel Y (g)', 'Accel Z (g)',
    'Gyro X (°/s)', 'Gyro Y (°/s)', 'Gyro Z (°/s)',
]

df = pd.read_csv('Combined_Active_Only_Gyro_and_Accel_Subjects_1_to_6_Cleaned.csv')

# NOTE(review): the frame is written out unchanged — no scaling is applied in
# this cell despite the "Scaled_" file name. Confirm whether that is intended.
df.to_csv('Scaled_Combined_Active_Gyro_Accel.csv', index=False)

def create_overlapping_windows(data, window_size, step_size, features):
    """Cut a 2-D sample matrix into fixed-length, possibly overlapping windows.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_columns)
        Samples in row order; converted with ``np.asarray``.
    window_size : int
        Number of consecutive rows per window.
    step_size : int
        Row offset between the starts of consecutive windows.
    features : int
        Number of leading columns to keep in every window.

    Returns
    -------
    numpy.ndarray of shape (n_windows, window_size, n_kept_columns)
        Only complete windows are produced; trailing rows that do not fill a
        window are ignored. When no window fits, an empty array with the same
        trailing dimensions is returned (previously a shapeless ``(0,)``
        array), so callers can rely on ``result.shape[1:]``.
    """
    selected = np.asarray(data)[:, :features]
    starts = range(0, len(selected) - window_size + 1, step_size)

    if len(starts) == 0:
        return np.empty((0, window_size, selected.shape[1]), dtype=selected.dtype)

    return np.stack([selected[start:start + window_size] for start in starts])

def assign_labels_to_windows(labels, window_size, step_size):
    """Give each window the most frequent label among its rows.

    Parameters
    ----------
    labels : 1-D sequence of non-negative ints
        One integer code per sample (``np.bincount`` rejects negative values).
    window_size : int
        Number of consecutive rows per window.
    step_size : int
        Row offset between the starts of consecutive windows.

    Returns
    -------
    numpy.ndarray of shape (n_windows,)
        Majority label per window; on a tie the smallest label wins because
        ``argmax`` returns the first maximum. The result always has an integer
        dtype, including when no window fits (previously the empty result was
        a float array).
    """
    starts = range(0, len(labels) - window_size + 1, step_size)
    majority = [
        np.bincount(labels[start:start + window_size]).argmax()
        for start in starts
    ]
    return np.array(majority, dtype=np.intp)

# 128-row windows advanced by 64 rows, i.e. consecutive windows share half of
# their rows; all six sensor columns are kept.
window_size, step_size, num_features = 128, 64, 6

# Windows are cut over the whole frame in row order, without splitting by
# subject or activity; each window's label is the majority label of its rows.
features = df[accel_gyro_cols].values
labels = pd.factorize(df['Activity_Label'])[0]

X = create_overlapping_windows(features, window_size, step_size, num_features)
y = assign_labels_to_windows(labels, window_size, step_size)

print(f"Full dataset shape (windows): {X.shape}")
print(f"Windowed labels shape: {y.shape}")

print("\nFile saved:", "- Full Data: 'Scaled_Combined_Active_Gyro_Accel.csv'", sep="\n")

Full dataset shape (windows): (3658, 128, 6)
Windowed labels shape: (3658,)

File saved:
- Full Data: 'Scaled_Combined_Active_Gyro_Accel.csv'


# Print Scaled Dataset

In [22]:
# Read back the file written by the windowing cell.
df = pd.read_csv('Scaled_Combined_Active_Gyro_Accel.csv')

# Print the frame's text representation for a visual check.
print(df)

        Timestamp (microseconds)  Accel X (g)  Accel Y (g)  Accel Z (g)  \
0                  1739028476725    -5.983110     1.247379     8.365341   
1                  1739028476725    -5.983110     1.247379     8.365341   
2                  1739028476725    -5.983110     1.247379     8.365341   
3                  1739028476725    -5.983110     1.247379     8.365341   
4                  1739028476725    -5.983110     1.247379     8.365341   
...                          ...          ...          ...          ...   
234223             1739051490559     8.446744    -4.716577     3.725378   
234224             1739051490568     8.446744    -4.716577     3.725378   
234225             1739051490589     8.446744    -4.716577     3.725378   
234226             1739051490609     8.446744    -4.716577     3.725378   
234227             1739051490620     8.446744    -4.716577     3.725378   

        Gyro X (°/s)  Gyro Y (°/s)  Gyro Z (°/s)  Subject_ID Activity_Label  
0           0.321315 

# Analyze Scaled Dataset

In [23]:
# Structural checks on the saved file: dtypes, missing values, exact duplicate
# rows, and summary statistics.
df = pd.read_csv('Scaled_Combined_Active_Gyro_Accel.csv')

print("Basic Data Info:")
# info() prints its report itself and returns None; print(df.info()) added a
# stray "None" line to the output.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

# duplicated() flags every repeat of an earlier identical row (first one kept).
duplicates = df[df.duplicated()]
print("\nDuplicates Found (if any):")
print(duplicates)

print("\nDescriptive Statistics:")
print(df.describe())

Basic Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234228 entries, 0 to 234227
Data columns (total 9 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Timestamp (microseconds)  234228 non-null  int64  
 1   Accel X (g)               234228 non-null  float64
 2   Accel Y (g)               234228 non-null  float64
 3   Accel Z (g)               234228 non-null  float64
 4   Gyro X (°/s)              234228 non-null  float64
 5   Gyro Y (°/s)              234228 non-null  float64
 6   Gyro Z (°/s)              234228 non-null  float64
 7   Subject_ID                234228 non-null  int64  
 8   Activity_Label            234228 non-null  object 
dtypes: float64(6), int64(2), object(1)
memory usage: 16.1+ MB
None

Missing Values:
Timestamp (microseconds)    0
Accel X (g)                 0
Accel Y (g)                 0
Accel Z (g)                 0
Gyro X (°/s)                0
Gyro Y (°/s)          

# Remove Duplicates

In [24]:
# Remove rows that are exact duplicates across all columns and save the result.
df = pd.read_csv('Scaled_Combined_Active_Gyro_Accel.csv')

df_cleaned = df.drop_duplicates()

duplicates_removed = df.shape[0] - df_cleaned.shape[0]
print(f"Duplicates removed: {duplicates_removed}")

# info() prints directly and returns None, so it is called without print()
# to avoid the trailing "None" line.
df_cleaned.info()

df_cleaned.to_csv('Scaled_Combined_Active_Gyro_Accel_Cleaned.csv', index=False)

Duplicates removed: 4063
<class 'pandas.core.frame.DataFrame'>
Index: 230165 entries, 0 to 234227
Data columns (total 9 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Timestamp (microseconds)  230165 non-null  int64  
 1   Accel X (g)               230165 non-null  float64
 2   Accel Y (g)               230165 non-null  float64
 3   Accel Z (g)               230165 non-null  float64
 4   Gyro X (°/s)              230165 non-null  float64
 5   Gyro Y (°/s)              230165 non-null  float64
 6   Gyro Z (°/s)              230165 non-null  float64
 7   Subject_ID                230165 non-null  int64  
 8   Activity_Label            230165 non-null  object 
dtypes: float64(6), int64(2), object(1)
memory usage: 17.6+ MB
None


# Verify Cleaned Scaled Dataset

In [25]:
# Re-run the structural checks on the de-duplicated file.
df_cleaned = pd.read_csv('Scaled_Combined_Active_Gyro_Accel_Cleaned.csv')

print("Basic Data Info:")
# info() prints its report itself and returns None; wrapping it in print()
# added a stray "None" line.
df_cleaned.info()
print("\nMissing Values:")
print(df_cleaned.isnull().sum())

# Any row listed here is an exact repeat that survived de-duplication.
duplicates_cleaned = df_cleaned[df_cleaned.duplicated()]
print("\nDuplicates Found (if any) after cleaning:")
print(duplicates_cleaned)

print("\nDescriptive Statistics of Cleaned Data:")
print(df_cleaned.describe())

Basic Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230165 entries, 0 to 230164
Data columns (total 9 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Timestamp (microseconds)  230165 non-null  int64  
 1   Accel X (g)               230165 non-null  float64
 2   Accel Y (g)               230165 non-null  float64
 3   Accel Z (g)               230165 non-null  float64
 4   Gyro X (°/s)              230165 non-null  float64
 5   Gyro Y (°/s)              230165 non-null  float64
 6   Gyro Z (°/s)              230165 non-null  float64
 7   Subject_ID                230165 non-null  int64  
 8   Activity_Label            230165 non-null  object 
dtypes: float64(6), int64(2), object(1)
memory usage: 15.8+ MB
None

Missing Values:
Timestamp (microseconds)    0
Accel X (g)                 0
Accel Y (g)                 0
Accel Z (g)                 0
Gyro X (°/s)                0
Gyro Y (°/s)          

# Explore Cleaned Scaled Dataset

In [26]:
# Preview the de-duplicated file and report its size and class balance.
df_cleaned = pd.read_csv('Scaled_Combined_Active_Gyro_Accel_Cleaned.csv')

print("First few rows of the dataset:", df_cleaned.head(), sep="\n")

row_count, column_count = df_cleaned.shape
print("\nTotal rows and columns:")
print(f"Rows: {row_count}")
print(f"Columns: {column_count}")

print("\nActivity counts:", df_cleaned['Activity_Label'].value_counts(), sep="\n")

First few rows of the dataset:
   Timestamp (microseconds)  Accel X (g)  Accel Y (g)  Accel Z (g)  \
0             1739028476725     -5.98311     1.247379     8.365341   
1             1739028476778     -5.98311     1.247379     8.365341   
2             1739028476802     -5.98311     1.247379     8.365341   
3             1739028476810     -5.98311     1.247379     8.365341   
4             1739028476820     -5.98311     1.247379     8.365341   

   Gyro X (°/s)  Gyro Y (°/s)  Gyro Z (°/s)  Subject_ID Activity_Label  
0      0.321315     -0.024435     -0.131947           1   Chair Squats  
1      0.321315     -0.024435     -0.131947           1   Chair Squats  
2      0.321315     -0.024435     -0.131947           1   Chair Squats  
3      0.321315     -0.024435     -0.131947           1   Chair Squats  
4      0.321315     -0.024435     -0.131947           1   Chair Squats  

Total rows and columns:
Rows: 230165
Columns: 9

Activity counts:
Activity_Label
Marching in Place           

# Apply SMOTE for HART Models

In [27]:
# Oversample the de-duplicated rows with SMOTE so every activity has as many
# rows as the largest class, then save the balanced table.
df = pd.read_csv('Scaled_Combined_Active_Gyro_Accel_Cleaned.csv')

# Target is the activity name; the timestamp is excluded from the features.
# NOTE(review): Subject_ID stays in X, so it is resampled as an ordinary
# numeric feature — confirm that this is intended.
y = df['Activity_Label']
X = df.drop(columns=['Activity_Label', 'Timestamp (microseconds)'])

print("Class distribution before SMOTE:")
print(y.value_counts())

# SMOTE is fitted on integer-encoded labels; they are decoded again below.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y_encoded)

# Decode once and reuse the names for both the report and the saved frame.
y_resampled_decoded = le.inverse_transform(y_resampled)

print("\nClass distribution after SMOTE:")
print(pd.Series(y_resampled_decoded).value_counts())

df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['Activity_Label'] = y_resampled_decoded

print("\nFirst few rows of the resampled dataframe:")
print(df_resampled.head())

df_resampled.to_csv('SMOTE_Scaled_Combined_Active_Gyro_Accel_Cleaned_V2.csv', index=False)

Class distribution before SMOTE:
Activity_Label
Marching in Place              36556
Wall Push-ups                  27090
Standing Heel-to-Toe Walk      26538
Seated Boxing Hooks            25783
Light Stationary Cycling       21696
Seated Medicine Ball Twists    18570
Seated Side Bends              18554
Chair Squats                   18509
Seated Leg Extensions          18498
Side-Stepping                  18371
Name: count, dtype: int64

Class distribution after SMOTE:
Chair Squats                   36556
Light Stationary Cycling       36556
Marching in Place              36556
Seated Boxing Hooks            36556
Seated Leg Extensions          36556
Seated Medicine Ball Twists    36556
Seated Side Bends              36556
Side-Stepping                  36556
Standing Heel-to-Toe Walk      36556
Wall Push-ups                  36556
Name: count, dtype: int64

First few rows of the resampled dataframe:
   Accel X (g)  Accel Y (g)  Accel Z (g)  Gyro X (°/s)  Gyro Y (°/s)  \
0     -5.98