In [7]:
import pandas as pd
import glob
import os

# Input folder
data_folder = 'Data/'
csv_files = glob.glob(os.path.join(data_folder, 'Participant_*.csv'))

# Define body locations and acceleration axes
body_locations = ['Left_pocket', 'Right_pocket', 'Wrist', 'Upper_arm', 'Belt']
accel_axes = ['Ax', 'Ay', 'Az']

# List to hold all cleaned DataFrames
all_data = []

for file_path in csv_files:
    with open(file_path, 'r') as f:
        line1 = f.readline().strip().split(',')
        line2 = f.readline().strip().split(',')

    # Combine the two header rows
    header = []
    current_prefix = None
    for part1, part2 in zip(line1, line2):
        if part1:
            current_prefix = part1.strip()
        if part2:
            header.append(f"{current_prefix}_{part2.strip()}")
        else:
            header.append("")

    # Read actual data
    df = pd.read_csv(file_path, skiprows=2, header=None)
    df.columns = header

    # Extract participant ID from filename
    participant_id = int(os.path.basename(file_path).split('_')[1].split('.')[0])

    # Get activity from column index 69 (only once — it's global per row)
    if df.shape[1] > 69:
        activity_col = df.iloc[:, 69].values
    else:
        activity_col = ['unknown'] * len(df)

    # Process each position
    for loc in body_locations:
        cols = [f"{loc}_{axis}" for axis in accel_axes if f"{loc}_{axis}" in df.columns]

        if not cols:
            continue

        temp_df = df[cols].copy()
        temp_df.columns = accel_axes
        temp_df['position'] = loc
        temp_df['participant'] = participant_id
        temp_df['activity'] = activity_col

        all_data.append(temp_df)

# Combine everything
combined_df = pd.concat(all_data, ignore_index=True)

# Save
combined_df.to_csv('combined_accel_with_activity.csv', index=False)

# Print summary
print(combined_df.sample(5))
print(f"\n✅ Total rows combined: {combined_df.shape[0]}")
print(f"📁 Output saved as: combined_accel_with_activity.csv")




               Ax       Ay       Az      position  participant    activity
1773387  -0.38137  -9.1120 -2.13840     Upper_arm            5    standing
1384746  -1.72980 -10.2700 -4.82160  Right_pocket            4  downstairs
2496662  13.93400  -2.9011 -4.42660          Belt            7      biking
1768103   0.96704  -7.3958 -0.65378     Upper_arm            5     walking
1537945  19.51800  -2.7922 -4.98500          Belt            4     jogging

✅ Total rows combined: 3150000
📁 Output saved as: combined_accel_with_activity.csv


In [8]:
# Reading the combined data
# Load the preprocessed data
combined_df = pd.read_csv('Data/combined_accel_with_activity.csv')
combined_df.head()

Unnamed: 0,time_stamp,Ax,Ay,Az,position,participant,activity
0,1390000000000.0,-1.8115,-14.873,-1.3484,Left_pocket,1,walking
1,1390000000000.0,0.24517,-14.07,-0.84446,Left_pocket,1,walking
2,1390000000000.0,-0.57205,-14.628,-1.757,Left_pocket,1,walking
3,1390000000000.0,-0.69464,-12.939,-3.0918,Left_pocket,1,walking
4,1390000000000.0,0.8717,-12.0,-1.5663,Left_pocket,1,walking


In [9]:
all_df_axis = combined_df[['time_stamp','Ax','Ay','Az','participant', 'activity']].copy()
# Convert to numpy array
all_df_axis['activity'].unique()

array(['walking', 'standing', 'jogging', 'sitting', 'biking', 'upstairs',
       'downstairs', 'upsatirs'], dtype=object)

In [10]:
all_df_axis.loc[all_df_axis['activity'] == 'upsatirs', 'activity'] = 'upstairs'
all_df_axis.sample(5)

Unnamed: 0,time_stamp,Ax,Ay,Az,participant,activity
2216968,1390000000000.0,1.9205,-10.011,1.6617,7,standing
3041316,1390000000000.0,2.4789,-9.221,-0.36775,9,standing
2721947,1390000000000.0,2.6015,-9.1393,0.50395,8,standing
2318534,1390000000000.0,-2.8194,-9.3436,-1.6481,7,upstairs
2449643,1390000000000.0,-0.78998,-15.282,-0.44947,7,downstairs


In [11]:
cols_to_numeric = ['Ax', 'Ay', 'Az', 'time_stamp']
for col in cols_to_numeric:
    all_df_axis[col] = pd.to_numeric(all_df_axis[col], errors='coerce')
all_df_axis['participant'] = all_df_axis['participant'].astype('category')
all_df_axis['activity'] = all_df_axis['activity'].astype('category')

In [12]:
all_df_axis.describe()

Unnamed: 0,time_stamp,Ax,Ay,Az
count,3150000.0,3150000.0,3150000.0,3150000.0
mean,1390000000000.0,2.970175,-6.2868,-2.207186
std,0.0,5.318678,5.467467,3.765335
min,1390000000000.0,-19.6,-19.668,-19.491
25%,1390000000000.0,-0.081722,-9.7522,-4.413
50%,1390000000000.0,2.5061,-7.1507,-1.6617
75%,1390000000000.0,6.3607,-1.757,0.10896
max,1390000000000.0,19.6,19.259,19.6


In [13]:
data = all_df_axis.copy()
data

Unnamed: 0,time_stamp,Ax,Ay,Az,participant,activity
0,1.390000e+12,-1.81150,-14.8730,-1.34840,1,walking
1,1.390000e+12,0.24517,-14.0700,-0.84446,1,walking
2,1.390000e+12,-0.57205,-14.6280,-1.75700,1,walking
3,1.390000e+12,-0.69464,-12.9390,-3.09180,1,walking
4,1.390000e+12,0.87170,-12.0000,-1.56630,1,walking
...,...,...,...,...,...,...
3149995,1.390000e+12,9.05750,-1.9477,-1.28030,9,downstairs
3149996,1.390000e+12,9.16650,-2.1793,-1.29390,9,downstairs
3149997,1.390000e+12,10.03800,-3.2144,-1.51190,9,downstairs
3149998,1.390000e+12,9.62960,-4.2223,-1.70250,9,downstairs


In [14]:
for i in range(1, len(data['participant'].unique())+1):
    print(f'Participant {i} has {len(data[data["participant"] == i])} rows')

Participant 1 has 315000 rows
Participant 2 has 315000 rows
Participant 3 has 315000 rows
Participant 4 has 315000 rows
Participant 5 has 315000 rows
Participant 6 has 315000 rows
Participant 7 has 315000 rows
Participant 8 has 315000 rows
Participant 9 has 315000 rows
Participant 10 has 315000 rows


In [15]:
import numpy as np
data['accel_magnitude'] = np.sqrt(data['Ax']**2 + data['Ay']**2 + data['Az']**2)

In [16]:
# Check for missing values
missing_values = data.isnull().sum()
print(missing_values[missing_values > 0])

Series([], dtype: int64)


In [17]:
import numpy as np

def replace_sensor_errors(data, threshold=1000):
    data = np.array(data, dtype=float)
    n = data.shape[0]

    # Find indices where data > threshold
    error_idx = np.where(data > threshold)[0] # to get the actual indices array from the tuple returned by np.where.

    # Handle the first element if it's an error
    if 0 in error_idx:
        valid_idx = np.where(data <= threshold)[0]
        if valid_idx.size == 0:
            raise ValueError("All values in this dataset are errors")
        data[0] = data[valid_idx[0]]

    # For the rest, replace with previous valid value
    for i in error_idx:
        if i == 0:
            continue
        data[i] = data[i-1]

    return data

# Apply only to the accel_magnitude column
data['accel_magnitude'] = replace_sensor_errors(data['accel_magnitude'].values)

In [18]:
# raw data before feature extraction and window building
data['accel_magnitude'].groupby(data['activity']).describe() 


  data['accel_magnitude'].groupby(data['activity']).describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
activity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
biking,450000.0,10.039184,1.927517,0.921665,9.044184,10.000935,10.950975,30.916655
downstairs,450000.0,10.405523,4.081392,0.268638,7.641863,9.671814,12.538612,31.968675
jogging,450000.0,13.050862,6.521729,0.117166,7.62076,11.966447,19.041573,33.044148
sitting,450000.0,9.756681,0.226904,4.221222,9.652546,9.801582,9.898556,20.934715
standing,450000.0,9.871296,0.28371,3.870511,9.781635,9.890596,9.991227,18.484682
upstairs,450000.0,10.373926,3.151055,0.333906,8.327612,10.048675,12.252458,29.59737
walking,450000.0,10.67627,3.52824,0.288931,8.320685,10.398329,12.661786,31.462008


In [19]:
combined_df.sample(5) # with the other metrics
data.columns
# Group Column is like a time window of 1 second,
# we will leave it out, because we will use a sliding window of 20 seconds.


Index(['time_stamp', 'Ax', 'Ay', 'Az', 'participant', 'activity',
       'accel_magnitude'],
      dtype='object')

In [20]:
# Keep the essential columns
data_needed = data[['time_stamp','participant','activity','accel_magnitude']].copy()

In [21]:
# Save the cleaned and processed data to a new CSV file
output_file_path = 'Data/preprocessed_all_data.csv'
data_needed.to_csv(output_file_path, index=False)

## Creating Time Window
Info:

- window size: 20 sec
- sliding step : 1 sec = 50 Hz
- overlapping windows
- size: 1000samples = 20 seconds * 50 Hz  
- every participant has 63000 samples
- each window does

In [22]:
window_size = 1000   # 20 seconds * 50 Hz
step_size = 50       # 1 second * 50 Hz

participant_1_data = data[data['participant'] == 1].reset_index(drop=True)
windows = []

for start in range(0, len(participant_1_data) - window_size + 1, step_size):
    end = start + window_size
    window = participant_1_data.iloc[start:end]
    windows.append(window)

print(f"Total windows for participant 1: {len(windows)}")

Total windows for participant 1: 6281


# Feature Extraction
for all positions

#### Creating Time Window
Info:

- window size: 20 sec
- sliding step : 1 sec = 50 Hz
- overlapping windows
- size: 1000samples = 20 seconds * 50 Hz  
- every participant has 63000 samples
- each window does

In [1]:
import numpy as np
import pandas as pd

# Load the preprocessed data
data = pd.read_csv('Data/preprocessed_all_data.csv')
data.head()


Unnamed: 0,time_stamp,participant,activity,accel_magnitude
0,1390000000000.0,1,walking,15.043465
1,1390000000000.0,1,walking,14.097451
2,1390000000000.0,1,walking,14.744242
3,1390000000000.0,1,walking,13.321392
4,1390000000000.0,1,walking,12.133143


### Overlapping windows with 1-second steps and 20-second window length


In [2]:
# Define window size and step size
window_size = 1000   # 20 seconds * 50 Hz
step_size = 50       # 1 second * 50 Hz

# Initialize a list to hold the windows and their corresponding majority activities
windows = []
activities = []

# Create sliding windows
for start in range(0, len(data) - window_size + 1, step_size):
    end = start + window_size
    window = data.iloc[start:end]
    windows.append(window)
    majority_activity = window['activity'].mode()[0]  # Get the majority activity
    activities.append(majority_activity)

print(f'Total windows created: {len(windows)}')

Total windows created: 62981


In [3]:
# Convert the list of windows and activities into a DataFrame
windowed_data = pd.DataFrame({'window': windows, 'majority_activity': activities})

# Display the first few entries of the windowed data
windowed_data.sample(5)

Unnamed: 0,window,majority_activity
7132,time_stamp participant activity ac...,biking
49002,time_stamp participant activity...,downstairs
51008,time_stamp participant activity a...,sitting
19759,time_stamp participant activity ac...,biking
35788,time_stamp participant activity a...,jogging


In [4]:
from scipy.stats import skew
from scipy.signal import welch

def extract_features(window, fs=50):
    x = window['accel_magnitude'].values.astype(float)
    features = {}
    features['mean'] = np.mean(x)
    features['std'] = np.std(x)
    features['skewness'] = skew(x)
    features['max'] = np.max(x)
    features['min'] = np.min(x)
    features['range'] = np.max(x) - np.min(x)
    # Welch's method for power spectral density
    f, Pxx = welch(x, fs=fs)
    #https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.welch.html
    
    # Μπορείς να κρατήσεις όλο το φάσμα ή π.χ. το άθροισμα ή το μέγιστο
    features['psd_sum'] = np.sum(Pxx)
    features['psd_max'] = np.max(Pxx)
    # Αν θέλεις όλο το φάσμα:
    # for i, val in enumerate(Pxx):
    #     features[f'psd_{i}'] = val
    return features

# Εξαγωγή χαρακτηριστικών για όλα τα παράθυρα
feature_list = []
for window in windows:
    feats = extract_features(window)
    feature_list.append(feats)

features_df = pd.DataFrame(feature_list)
features_df['majority_activity'] = activities
features_df.sample(5)

Unnamed: 0,mean,std,skewness,max,min,range,psd_sum,psd_max,majority_activity
2389,11.109897,5.097848,0.297315,28.242444,0.768914,27.47353,131.961316,14.186211,downstairs
14392,10.831273,3.924805,1.243126,23.200958,1.925529,21.275428,80.616765,19.188657,sitting
41977,12.991344,7.519448,0.618191,31.059954,2.552627,28.507328,281.091602,128.641031,jogging
43746,10.194799,2.887675,0.456144,20.493507,3.104862,17.388645,41.669312,10.800412,upstairs
21177,10.56724,3.4864,0.122582,21.517285,2.738536,18.778749,64.551257,10.932152,upstairs


In [9]:
# Ensure 'participant' column exists in features_df before saving

if 'participant' not in features_df.columns:
    # Assign participant to each window based on the original data index
    # Each window is a DataFrame, so get the participant from the first row of each window
    participants = [int(window['participant'].iloc[0]) for window in windows]
    features_df['participant'] = participants

# Save the extracted features to a new CSV file
output_features_file_path = 'Data/features_all_df.csv'
features_df.to_csv(output_features_file_path, index=False)
print(f'Features DataFrame saved to {output_features_file_path}')

Features DataFrame saved to Data/features_all_df.csv


In [10]:
# --- Step 1: Load All Feature Data ---

# Adjust the path if your features file has a different name or location
features_path = 'Data/features_all_df.csv'  # Change if needed
features_df = pd.read_csv(features_path)

In [11]:
# Ensure 'participant' and 'majority_activity' columns exist
assert 'participant' in features_df.columns, "Participant column missing!"
assert 'majority_activity' in features_df.columns, "Activity label column missing!"

participants = features_df['participant'].unique()
print(f"Participants found: {participants}")

Participants found: [ 1 10  2  3  4  5  6  7  8  9]


### all data with the best MODEL MRWHIIIITE


##### last tasks 

- last grid searches for the best model
- change the test set according to the question 5 and use the best model
- combination of labels for better confusion matrix
-  report
- presentation




(the simplest svm model Mean LOSO SVM Accuracy: 0.658 for all data took 30minutes...... so decide the best of all and just one try)

In [12]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# --- Step 4: LOSO Cross-Validation Loop with SVM ---

svm_results = []

for pid in participants:
    # Split data
    train_df = features_df[features_df['participant'] != pid]
    test_df = features_df[features_df['participant'] == pid]
    
    # Features and labels
    X_train = train_df.drop(['majority_activity', 'participant'], axis=1)
    y_train = train_df['majority_activity']
    X_test = test_df.drop(['majority_activity', 'participant'], axis=1)
    y_test = test_df['majority_activity']
    
    # Train SVM classifier
    n_features = X_train.shape[1]
    svm_clf = SVC(kernel='rbf', C=1, gamma=1/n_features, random_state=42)
    svm_clf.fit(X_train, y_train)
    
    # Predict
    y_pred = svm_clf.predict(X_test)
    
    # Evaluate
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    svm_results.append({'participant': pid, 'accuracy': acc, 'report': report})

    print(f"SVM Participant {pid} - Accuracy: {acc:.3f}")

# Optionally, aggregate results
mean_svm_acc = np.mean([r['accuracy'] for r in svm_results])
print(f"\nMean LOSO SVM Accuracy: {mean_svm_acc:.3f}")

SVM Participant 1 - Accuracy: 0.746
SVM Participant 10 - Accuracy: 0.575
SVM Participant 2 - Accuracy: 0.668
SVM Participant 3 - Accuracy: 0.585
SVM Participant 4 - Accuracy: 0.679
SVM Participant 5 - Accuracy: 0.640
SVM Participant 6 - Accuracy: 0.646
SVM Participant 7 - Accuracy: 0.693
SVM Participant 8 - Accuracy: 0.690
SVM Participant 9 - Accuracy: 0.654

Mean LOSO SVM Accuracy: 0.658
