In [None]:
import os
import pandas as pd
import numpy as np
from torch.utils.data import  Dataset
import pickle
from sklearn.preprocessing import StandardScaler
from scipy.interpolate import interp1d
import scipy.io
from scipy.io import loadmat

In [None]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this cell on a segmented_data.pickle that comes from a trusted source.
file_path = './segmented_data.pickle'
with open(file_path, 'rb') as file:
    data = pickle.load(file)

# Subject names are numbered 1..20 in list order.
subject_names = [
    'sensei-103', 'sensei-178', 'sensei-188', 'sensei-201', 'sensei-223',
    'sensei-237', 'sensei-318', 'sensei-359', 'sensei-415', 'sensei-460',
    'sensei-489', 'sensei-616', 'sensei-642', 'sensei-661', 'sensei-715',
    'sensei-862', 'sensei-879', 'sensei-904', 'sensei-972', 'sensei-997'
]
subject_mapping = {name: i for i, name in enumerate(subject_names, start=1)}

sensors = [
    'corsano_wrist', 'cosinuss_ear', 'sensomative_bottom', 'sensomative_back',
    'vivalink_patch', 'zurichmove_wheel'
]

all_sensors_data = []

# Every sensor stream is treated as sampled at 125 Hz and resampled to 20 Hz.
original_sampling_rate = 125
new_sampling_rate = 20

# data is walked as {subject: {label: {sensor: DataFrame}}}.
for subject, labels in data.items():
    for label, sensors_data in labels.items():
        sensor_dfs = []
        for sensor in sensors:
            if sensor in sensors_data:
                df = sensors_data[sensor].copy()
                if not df.empty:
                    # Synthetic time axes: sample k sits at k / 125 s, targets every 1 / 20 s.
                    timestamps = np.linspace(0, len(df)/original_sampling_rate, len(df), endpoint=False)
                    new_timestamps = np.arange(0, timestamps[-1], 1/new_sampling_rate)

                    interpolated_sensor_data = pd.DataFrame()
                    for column in df.columns:
                        interpolator = interp1d(timestamps, df[column], kind='linear', bounds_error=False, fill_value='extrapolate')
                        interpolated_sensor_data[column] = interpolator(new_timestamps)

                    # Prefix with the sensor name so columns stay unique after the side-by-side concat.
                    interpolated_sensor_data = interpolated_sensor_data.add_prefix(sensor + '_')
                    sensor_dfs.append(interpolated_sensor_data)
        if sensor_dfs:
            # Sensors are aligned by row position; shorter streams end up padded with NaN.
            concatenated_df = pd.concat(sensor_dfs, axis=1)
            concatenated_df['label'] = label
            concatenated_df['subject'] = subject_mapping[subject]
            all_sensors_data.append(concatenated_df)

final_combined_df = pd.concat(all_sensors_data, ignore_index=True)

# Renumber the subjects that actually occur so the ids are consecutive from 1.
existing_subjects = sorted(final_combined_df['subject'].unique())
new_subject_mapping = {old: new for new, old in enumerate(existing_subjects, start=1)}
final_combined_df['subject'] = final_combined_df['subject'].map(new_subject_mapping)

# Standardize every feature column; label and subject are carried through untouched.
scaler = StandardScaler()
cols_to_exclude = ['label', 'subject']
features_df = final_combined_df.drop(columns=cols_to_exclude)
standardized_features = scaler.fit_transform(features_df)
standardized_features_df = pd.DataFrame(standardized_features, columns=features_df.columns)

final_standardized_df = pd.concat([standardized_features_df, final_combined_df[cols_to_exclude].reset_index(drop=True)], axis=1)

# root_dir holds the path of the combined CSV (the name is kept for later cells).
root_dir = 'segmented_data/combined_sensor_data.csv'
# to_csv does not create missing folders, so make sure the output folder exists first.
os.makedirs(os.path.dirname(root_dir), exist_ok=True)
final_standardized_df.to_csv(root_dir, index=False)

# Re-read the file just written and store a variant without the ECG channel.
df = pd.read_csv(root_dir)
df = df.drop(columns=['vivalink_patch_ecg'])
modified_csv_file_path = './segmented_data/modified_combined_sensor_data.csv'
df.to_csv(modified_csv_file_path, index=False)
pickle_file_path = './segmented_data/modified_combined_sensor_data.pkl'
df.to_pickle(pickle_file_path)

print(f"DataFrame saved as Pickle file to {pickle_file_path}")

In [None]:
class MhealthDataset(Dataset): # 50hz   
    '''Per-sample view of the MHEALTH logs, backed by a processed CSV cache.

    Activity codes in the raw logs:
    L1: Standing still (1 min)
    L2: Sitting and relaxing (1 min)
    L3: Lying down (1 min)
    L4: Walking (1 min)
    L5: Climbing stairs (1 min)
    L6: Waist bends forward (20x)
    L7: Frontal elevation of arms (20x)
    L8: Knees bending (crouching) (20x)
    L9: Cycling (1 min)
    L10: Jogging (1 min)
    L11: Running (1 min)
    L12: Jump front & back (20x)

    When the cache is built, rows with code 0 are dropped and the remaining
    codes are shifted down by one. Items are (features, label, subject) triples.
    '''
    def __init__(self, csv_path='./Datapool_new/Mhealth_data.csv', transform=None):
        """Load the cached CSV at csv_path, or build it from the raw subject logs.

        Args:
            csv_path: location of the processed CSV (read if present, written otherwise).
            transform: optional callable applied to each feature row in __getitem__.
        """
        meta_columns = ['label', 'subject']
        try:
            processed_data = pd.read_csv(csv_path)
            print(f"Loaded data from {csv_path}")
        except FileNotFoundError:
            print(f"Processed file not found at {csv_path}. Processing data...")

            sensor_columns = [0, 1, 2, 5, 6, 7, 8, 9, 10, 14, 15, 16, 17, 18, 19]
            label_column = 23
            feature_names = ['acx_chest', 'acy_chest', 'acz_chest',
                             'acx_l_ankle', 'acy_l_ankle', 'acz_l_ankle',
                             'gyx_l_ankle', 'gyy_l_ankle', 'gyz_l_ankle',
                             'acx_r_arm', 'acy_r_arm', 'acz_r_arm',
                             'gyx_r_arm', 'gyy_r_arm', 'gyz_r_arm']

            per_subject = []
            for subject_number in range(1, 11):
                raw = np.loadtxt(f'./ADL/Dataset/1.Mhealth/MHEALTHDATASET/mHealth_subject{subject_number}.log')
                signals = raw[:, sensor_columns]
                raw_labels = raw[:, label_column]

                # 50 Hz source grid, 20 Hz target grid.
                source_times = np.arange(signals.shape[0]) / 50.0
                target_times = np.arange(0, source_times[-1], 1/20.0)

                # Signals are interpolated linearly; the label takes the nearest original sample.
                resampled = [
                    interp1d(source_times, signals[:, k], kind='linear')(target_times)
                    for k in range(signals.shape[1])
                ]
                resampled.append(interp1d(source_times, raw_labels, kind='nearest')(target_times))
                with_labels = np.column_stack(resampled)

                subject_ids = np.full(with_labels.shape[0], subject_number, dtype=int)
                per_subject.append(np.column_stack((with_labels, subject_ids)))

            combined = pd.DataFrame(np.vstack(per_subject), columns=feature_names + meta_columns)
            labelled = combined[combined['label'] != 0]

            scaled = StandardScaler().fit_transform(labelled.drop(columns=meta_columns))
            processed_data = pd.concat(
                [
                    pd.DataFrame(scaled, columns=feature_names),
                    pd.DataFrame(labelled['label'].values, columns=['label']),
                    pd.DataFrame(labelled['subject'].values, columns=['subject']),
                ],
                axis=1,
            )
            processed_data = processed_data.astype({'label': int, 'subject': int})

            lowest_label = processed_data['label'].min()
            print(lowest_label)
            if lowest_label > 0:
                print(1)
                processed_data['label'] = processed_data['label'] - 1

            processed_data.to_csv(csv_path, index=False)
            print(f"Processed data saved to {csv_path}")

        self.transform = transform
        self.features = processed_data.drop(columns=meta_columns).values
        self.labels = processed_data['label'].values
        self.subjects = processed_data['subject'].values

    def __getitem__(self, index):
        """Return (features, label, subject) for one row, applying transform to the features if set."""
        sample = self.features[index]
        target = self.labels[index]
        who = self.subjects[index]
        if self.transform:
            sample = self.transform(sample)
        return sample, target, who

    def __len__(self):
        """Number of rows in the processed table."""
        return len(self.features)


In [None]:
class USCHADDataset(Dataset): #100hz
    '''Per-sample view of the USC-HAD recordings, backed by a processed CSV cache.

    Activity numbers in the raw files:
    1. Walking Forward
    2. Walking Left
    3. Walking Right
    4. Walking Upstairs
    5. Walking Downstairs
    6. Running Forward
    7. Jumping Up
    8. Sitting
    9. Standing
    10. Sleeping
    11. Elevator Up
    12. Elevator Down

    Labels are shifted down by one when the cache is built (if the smallest one
    is positive). Items are (features, label, subject) triples.
    '''
    def __init__(self, root_dir='./ADL/Dataset/4.USC-HAD/', csv_path='./Datapool_new/USCHAD_data.csv', transform=None):
        """Load the cached CSV at csv_path, or build it from the .mat files under root_dir.

        Args:
            root_dir: folder containing one sub-folder of .mat recordings per subject.
            csv_path: location of the processed CSV (read if present, written otherwise).
            transform: optional callable applied to each feature row in __getitem__.
        """
        try:
            self.processed_data = pd.read_csv(csv_path)
            print(f"Loaded data from {csv_path}")
        except FileNotFoundError:
            print(f"Processed file not found at {csv_path}. Processing data...")

            self.processed_data = self._build_from_raw(root_dir)

            # to_csv does not create missing folders; without this the processing work is lost.
            cache_dir = os.path.dirname(csv_path)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
            self.processed_data.to_csv(csv_path, index=False)
            print(f"Processed data saved to {csv_path}")

        self.labels = self.processed_data['label'].values
        self.features = self.processed_data.drop(columns=['label', 'subject']).values
        self.subjects = self.processed_data['subject'].values
        self.transform = transform

    @staticmethod
    def _build_from_raw(root_dir):
        """Resample every .mat recording to 20 Hz, standardize the features and return one table."""
        feature_names = ['accx', 'accy', 'accz', 'gyrx', 'gyry', 'gyrz']

        # Kept local so the per-file frames can be freed once they are concatenated.
        frames = []
        subject_id = 1
        for subject_folder in sorted(os.listdir(root_dir)):
            subject_folder_path = os.path.join(root_dir, subject_folder)
            if not os.path.isdir(subject_folder_path):
                continue
            for file in sorted(os.listdir(subject_folder_path)):
                if not file.endswith('.mat'):
                    continue
                mat = scipy.io.loadmat(os.path.join(subject_folder_path, file))
                sensor_readings = mat['sensor_readings']
                activity_number = mat['activity_number']

                # 100 Hz source grid, 20 Hz target grid.
                timestamps = np.arange(len(sensor_readings)) / 100.0
                new_timestamps = np.arange(0, timestamps[-1], 1/20.0)
                resampled = [
                    interp1d(timestamps, sensor_readings[:, j], kind='linear')(new_timestamps)
                    for j in range(sensor_readings.shape[1])
                ]

                data_frame = pd.DataFrame(np.column_stack(resampled), columns=feature_names)
                # One recording holds a single activity, so its code is repeated for every row.
                data_frame['label'] = np.full(len(data_frame), activity_number[0])
                data_frame['subject'] = subject_id
                frames.append(data_frame)
            # Subject ids count the sub-folders in sorted order.
            subject_id += 1

        processed = pd.concat(frames, ignore_index=True)
        processed[feature_names] = StandardScaler().fit_transform(processed[feature_names])

        for col in ['label', 'subject']:
            processed[col] = processed[col].astype(int)

        print(processed['label'].min())
        if processed['label'].min() > 0:
            # Shift labels so they start at 0.
            processed['label'] = processed['label'] - 1
        return processed

    def __getitem__(self, index):
        """Return (features, label, subject) for one row, applying transform to the features if set."""
        x = self.features[index]
        y = self.labels[index]
        subject = self.subjects[index]

        if self.transform:
            x = self.transform(x)
        return x, y, subject

    def __len__(self):
        """Number of rows in the processed table."""
        return len(self.features)

In [None]:
class MotionsenseDataset(Dataset): # 50hz   
    '''Per-sample view of the MotionSense recordings, backed by a processed CSV cache.

    Activity codes assigned from the folder-name prefix:
    1.dws: downstairs
    2.ups: upstairs
    3.sit: sitting
    4.std: standing
    5.wlk: walking
    6.jog: jogging

    Codes are shifted down by one when the cache is built. Items are (x, y)
    pairs; the subject id of every row is available as ``self.subjects``.
    '''
    def __init__(self, root_dir='./ADL/Dataset/5.motionsense/A_DeviceMotion_data', csv_path='./Datapool_new/motionsense_data.csv', transform=None):
        """Load the cached CSV at csv_path, or build it from the recordings under root_dir.

        Args:
            root_dir: folder whose sub-folders start with an activity prefix (dws, ups, ...).
            csv_path: location of the processed CSV (read if present, written otherwise).
            transform: optional callable applied to each feature row in __getitem__.
        """
        activities = {'dws': 1, 'ups': 2, 'sit': 3, 'std': 4, 'wlk': 5, 'jog': 6}
        feature_names = ['accx', 'accy', 'accz', 'gyrx', 'gyry', 'gyrz']

        try:
            processed_data = pd.read_csv(csv_path)
            print(f"Loaded data from {csv_path}")
        except FileNotFoundError:
            print(f"Processed file not found at {csv_path}. Processing data...")
            all_data = []

            for activity, label in activities.items():
                # Sorted so the row order of the cache does not depend on the filesystem.
                activity_folders = sorted(f for f in os.listdir(root_dir) if f.startswith(activity))

                for folder in activity_folders:
                    folder_path = os.path.join(root_dir, folder)
                    recordings = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
                    for file_name in recordings[:24]:
                        # The subject number is the part between '_' and the extension.
                        subject_id = int(file_name.split('_')[1].split('.')[0])
                        file_path = os.path.join(folder_path, file_name)

                        # read_csv ignores the order of usecols and returns columns in file
                        # order (7..12), so move file columns 10-12 in front explicitly.
                        # NOTE(review): assumes columns 10-12 are the accelerometer axes and
                        # 7-9 the gyroscope axes, as the original usecols order implies - confirm.
                        data = pd.read_csv(file_path, usecols=[7, 8, 9, 10, 11, 12])
                        data = data.iloc[:, [3, 4, 5, 0, 1, 2]]

                        # 50 Hz source grid, 20 Hz target grid.
                        timestamps = np.arange(len(data)) / 50.0
                        new_timestamps = np.arange(0, timestamps[-1], 1/20.0)

                        interpolated_data = [
                            interp1d(timestamps, data[column], kind='linear')(new_timestamps)
                            for column in data.columns
                        ]

                        downsampled_data = np.column_stack(interpolated_data)
                        labels = np.full((downsampled_data.shape[0], 1), label, dtype=int)
                        subject_column = np.full((downsampled_data.shape[0], 1), subject_id, dtype=int)
                        all_data.append(np.hstack((downsampled_data, labels, subject_column)))

            processed_data = pd.DataFrame(np.vstack(all_data), columns=feature_names + ['label', 'subject'])

            processed_data[feature_names] = StandardScaler().fit_transform(processed_data[feature_names])
            processed_data['label'] = processed_data['label'].astype(int)
            processed_data['subject'] = processed_data['subject'].astype(int)

            print(processed_data['label'].min())
            if processed_data['label'].min() > 0:
                # Shift labels so they start at 0.
                processed_data['label'] = processed_data['label'] - 1

            # to_csv does not create missing folders; without this the processing work is lost.
            cache_dir = os.path.dirname(csv_path)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
            processed_data.to_csv(csv_path, index=False)
            print(f"Processed data saved to {csv_path}")

        self.labels = processed_data['label'].values
        self.subjects = processed_data['subject'].values
        # The subject id is metadata, not a sensor channel, so it must not be part of x.
        self.features = processed_data.drop(columns=['label', 'subject']).values
        self.transform = transform

    def __getitem__(self, index):
        """Return (features, label) for one row, applying transform to the features if set."""
        x = self.features[index]
        y = self.labels[index]

        if self.transform:
            x = self.transform(x)

        return x, y

    def __len__(self):
        """Number of rows in the processed table."""
        return len(self.features)



In [None]:
class PAMAP2Dataset(Dataset): # 100hz   
    '''Per-sample view of the PAMAP2 protocol recordings, backed by a processed CSV cache.

    Activity order after the activity ids present in the data are renumbered
    consecutively (see __init__):
    1: 'lying',
    2: 'sitting',
    3: 'standing',
    4: 'walking',
    5: 'running',
    6: 'cycling',
    7: 'Nordic_walking',
    8: 'ascending_stairs',
    9: 'descending_stairs',
    10: 'vacuum_cleaning',
    11: 'ironing',
    12: 'rope_jumping

    The renumbered labels are shifted down by one before saving. Items are
    (x, y) pairs; the subject id of every row is available as ``self.subjects``.
    '''
    def __init__(self, root_dir='./ADL/Dataset/6.PAMAP2/PAMAP2_Dataset/Protocol/', csv_path='./Datapool_new/PAMAP2_data.csv', transform=None):
        """Load the cached CSV at csv_path, or build it from the .dat files under root_dir.

        Args:
            root_dir: folder containing the whitespace-separated .dat recordings.
            csv_path: location of the processed CSV (read if present, written otherwise).
            transform: optional callable applied to each feature row in __getitem__.
        """
        try:
            all_data = pd.read_csv(csv_path)
            print(f"Loaded data from {csv_path}")
        except FileNotFoundError:
            print(f"Processed file not found at {csv_path}. Processing data...")

            # File column indices of the selected channels; index 1 is the activity id.
            columns = [4, 5, 6, 10, 11, 12, 21, 22, 23, 27, 28, 29, 38, 39, 40, 44, 45, 46, 1]
            column_names = ['handAccx', 'handAccy', 'handAccz', 'handGyrox', 'handGyroy', 'handGyroz',
                            'chestAccx', 'chestAccy', 'chestAccz', 'chestGyrox', 'chestGyroy', 'chestGyroz',
                            'ankleAccx', 'ankleAccy', 'ankleAccz', 'ankleGyrox', 'ankleGyroy', 'ankleGyroz','label']

            frames = []
            subject_id = 1
            # Sorted so the sequential subject ids do not depend on the filesystem's listing order.
            for file_name in sorted(os.listdir(root_dir)):
                if not file_name.endswith('.dat'):
                    continue
                file_path = os.path.join(root_dir, file_name)
                data = pd.read_table(file_path, header=None, sep=r'\s+')
                data = data[columns]
                data.columns = column_names
                data = data.apply(pd.to_numeric, errors='coerce')
                data = data[data['label'] != 0]
                data = data.interpolate()
                # Keep every 5th row: 100 Hz -> 20 Hz.
                downsampled_data = data.iloc[::5].copy()
                downsampled_data['subject'] = subject_id

                frames.append(downsampled_data)
                subject_id += 1

            all_data = pd.concat(frames, ignore_index=True)

            # Renumber the activity ids that occur to 1..K in ascending order.
            unique_labels = sorted(all_data['label'].unique())
            label_mapping = {old_label: new_label for new_label, old_label in enumerate(unique_labels, start=1)}
            all_data['label'] = all_data['label'].map(label_mapping)

            scaler = StandardScaler()
            features_to_scale = all_data.columns.drop(['label','subject'])
            all_data[features_to_scale] = scaler.fit_transform(all_data[features_to_scale])

            print(all_data['label'].min())
            if all_data['label'].min() > 0:
                # Shift labels so they start at 0.
                all_data['label'] = all_data['label'] - 1

            # to_csv does not create missing folders; without this the processing work is lost.
            cache_dir = os.path.dirname(csv_path)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
            all_data.to_csv(csv_path, index=False)
            print(f"Processed data saved to {csv_path}")

        self.labels = all_data['label'].values
        self.subjects = all_data['subject'].values
        # The subject id is metadata, not a sensor channel, so it must not be part of x.
        self.features = all_data.drop(columns=['label', 'subject']).values
        self.transform = transform

    def __getitem__(self, index):
        """Return (features, label) for one row, applying transform to the features if set."""
        x = self.features[index]
        y = self.labels[index]
        if self.transform:
            x = self.transform(x)
        return x, y

    def __len__(self):
        """Number of rows in the processed table."""
        return len(self.features)



In [None]:
class RealdispDataset(Dataset): # 50hz   

    '''Per-sample view of the REALDISP "ideal" logs, backed by a processed CSV cache.

    Activity codes in the raw logs:
    L1: Walking (1 min) L12: Waist rotation (20x) L23: Shoulders high amplitude rotation (20x)
    L2: Jogging (1 min) L13: Waist bends (reach foot with opposite hand) (20x) L24: Shoulders low amplitude rotation (20x)
    L3: Running (1 min) L14: Reach heels backwards (20x) L25: Arms inner rotation (20x)
    L4: Jump up (20x) L15: Lateral bend (10x to the left + 10x to the right) L26: Knees (alternatively) to the breast (20x)
    L5: Jump front & back (20x) L16: Lateral bend arm up (10x to the left + 10x to the right) L27: Heels (alternatively) to the backside (20x)
    L6: Jump sideways (20x) L17: Repetitive forward stretching (20x) L28: Knees bending (crouching) (20x)
    L7: Jump leg/arms open/closed (20x) L18: Upper trunk and lower body opposite twist (20x) L29: Knees (alternatively) bend forward (20x)
    L8: Jump rope (20x) L19: Arms lateral elevation (20x) L30: Rotation on the knees (20x)
    L9: Trunk twist (arms outstretched) (20x) L20: Arms frontal elevation (20x) L31: Rowing (1 min)
    L10: Trunk twist (elbows bended) (20x) L21: Frontal hand claps (20x) L32: Elliptic bike (1 min)
    L11: Waist bends forward (20x) L22: Arms frontal crossing (20x) L33: Cycling (1 min)

    Rows with code 0 are removed and the remaining codes are shifted down by one
    when the cache is built. Items are (x, y) pairs.
    '''

    def __init__(self, root_dir='./ADL/Dataset/7.REALDISP/', csv_path='./Datapool_new/Realdisp_data.csv', transform=None):
        """Load the cached CSV at csv_path, or build it from the 'ideal' .log files under root_dir.

        Args:
            root_dir: folder containing the tab-separated subject logs.
            csv_path: location of the processed CSV (read if present, written otherwise).
            transform: optional callable applied to each feature row in __getitem__.
        """
        try:
            processed_data = pd.read_csv(csv_path)
            print(f"Loaded data from {csv_path}")
        except FileNotFoundError:
            print(f"Processed file not found at {csv_path}. Processing data...")

            column_names = [
                f'{part}_{channel}'
                for part in ('RLA', 'RUA', 'BACK', 'LUA', 'LLA', 'RC', 'RT', 'LT', 'LC')
                for channel in ('accx', 'accy', 'accz', 'gyrx', 'gyry', 'gyrz')
            ] + ['label', 'subject']

            # Nine blocks of six consecutive file columns, 13 columns apart
            # (2-7, 15-20, ..., 106-111), followed by the label in column 119.
            columns_to_use = [13 * block + offset for block in range(9) for offset in range(2, 8)] + [119]

            chunks = []
            for file_name in os.listdir(root_dir):
                if 'ideal' not in file_name or not file_name.endswith('.log'):
                    continue
                # The subject number sits between 'subject' and the next underscore.
                subject_id = int(file_name.split('subject')[1].split('_')[0])
                data = pd.read_csv(os.path.join(root_dir, file_name), sep='\t', usecols=columns_to_use)

                # 50 Hz source grid, 20 Hz target grid.
                source_times = np.arange(len(data)) / 50.0
                target_times = np.arange(0, source_times[-1], 1/20.0)

                # Signals are interpolated linearly; the label takes the nearest original sample.
                resampled = [
                    interp1d(source_times, data[name], kind='linear')(target_times)
                    for name in data.columns[:-1]
                ]
                nearest_labels = interp1d(source_times, data.iloc[:, -1], kind='nearest')(target_times)

                # Discard rows whose label is 0.
                keep = nearest_labels != 0
                kept_labels = nearest_labels[keep]
                kept_signals = [series[keep] for series in resampled]

                subject_ids = np.full(len(kept_labels), subject_id)
                chunks.append(np.column_stack((np.column_stack(kept_signals), kept_labels, subject_ids)))

            processed_data = pd.DataFrame(np.vstack(chunks), columns=column_names)

            # Standardize everything except the two trailing metadata columns.
            processed_data.iloc[:, :-2] = StandardScaler().fit_transform(processed_data.iloc[:, :-2])

            for meta in ('label', 'subject'):
                processed_data[meta] = processed_data[meta].astype(int)

            lowest_label = processed_data['label'].min()
            print(lowest_label)
            if lowest_label > 0:
                processed_data['label'] = processed_data['label'] - 1
            processed_data.to_csv(csv_path, index=False)
            print(f"Processed data saved to {csv_path}")

        # Positional layout: [..., features ..., label, subject].
        self.transform = transform
        self.features = processed_data.iloc[:, :-2].values
        self.labels = processed_data.iloc[:, -2].values

    def __getitem__(self, index):
        """Return (features, label) for one row, applying transform to the features if set."""
        sample = self.features[index]
        target = self.labels[index]
        if self.transform:
            sample = self.transform(sample)
        return sample, target

    def __len__(self):
        """Number of rows in the processed table."""
        return len(self.features)


In [None]:
class RealworldDataset(Dataset): # 50hz   
    '''Per-sample view of the RealWorld recordings, backed by a processed CSV cache.

    Label codes assigned while building the cache (they are shifted down by one
    before saving, so the stored labels start at 0):
    1.jumping
    2.lying
    3.running
    4.sitting
    5.standing
    6.walking
    7.climbingdown
    8.climbingup

    Items are (x, y) pairs; the subject id of every row is available as
    ``self.subjects``.
    '''
    def __init__(self, root_dir='./ADL/Dataset/9.RealWorld/', csv_path='./Datapool_new/Realworld_data.csv', transform=None):
        """Load the cached CSV at csv_path, or build it from the proband folders under root_dir.

        Args:
            root_dir: folder containing one 'proband<N>' sub-folder per subject.
            csv_path: location of the processed CSV (read if present, written otherwise).
            transform: optional callable applied to each feature row in __getitem__.
        """
        activities = {'climbingdown': 7, 'climbingup': 8, 'jumping': 1, 'lying': 2, 'running': 3, 'sitting': 4, 'standing': 5, 'walking': 6}
        body_parts = ['chest', 'forearm', 'head', 'shin', 'thigh', 'upperarm', 'waist']
        feature_names = ['accx', 'accy', 'accz', 'gyrx', 'gyry', 'gyrz']

        try:
            processed_data = pd.read_csv(csv_path)
            print(f"Loaded data from {csv_path}")
        except FileNotFoundError:
            print(f"Processed file not found at {csv_path}. Processing data...")
            all_data = []

            for subject_folder in sorted(os.listdir(root_dir)):
                subject_path = os.path.join(root_dir, subject_folder)
                # Check the entry first so stray files or folders cannot break the id parsing.
                if not os.path.isdir(subject_path) or not subject_folder.startswith('proband'):
                    continue
                subject_id = int(subject_folder.replace('proband', ''))

                for activity, label in activities.items():
                    for part in body_parts:
                        acc_path = os.path.join(subject_path, f'acc_{activity}_{part}.csv')
                        gyro_path = os.path.join(subject_path, f'Gyroscope_{activity}_{part}.csv')
                        if not (os.path.isfile(acc_path) and os.path.isfile(gyro_path)):
                            continue

                        # One read per file: column 1 is used as the time stamp, 2-4 as the axes.
                        acc_raw = pd.read_csv(acc_path, usecols=[1, 2, 3, 4])
                        gyro_raw = pd.read_csv(gyro_path, usecols=[1, 2, 3, 4])

                        min_length = min(len(acc_raw), len(gyro_raw))
                        if min_length < 2:
                            # Nothing to interpolate between.
                            continue

                        # The shorter recording supplies the time span used for both sensors.
                        time_column = (acc_raw if len(acc_raw) == min_length else gyro_raw).iloc[:, 0]
                        start_time = int(time_column.iloc[0])
                        end_time = int(time_column.iloc[-1])
                        # Time stamps are divided by 1000 to get seconds (treated as milliseconds).
                        total_time = (end_time - start_time) / 1000.0
                        if total_time <= 0:
                            # A zero or negative span yields no 20 Hz samples (and would divide by zero).
                            continue

                        acc_data = acc_raw.iloc[:min_length, 1:]
                        gyro_data = gyro_raw.iloc[:min_length, 1:]

                        # Estimate the source rate from the span, then resample to 20 Hz.
                        original_freq = min_length / total_time
                        timestamps = np.arange(min_length) / original_freq
                        new_timestamps = np.arange(0, timestamps[-1], 1/20.0)

                        interpolated_acc = [interp1d(timestamps, acc_data.iloc[:, i], kind='linear')(new_timestamps) for i in range(acc_data.shape[1])]
                        interpolated_gyro = [interp1d(timestamps, gyro_data.iloc[:, i], kind='linear')(new_timestamps) for i in range(gyro_data.shape[1])]

                        combined_data = np.column_stack((interpolated_acc + interpolated_gyro))
                        labels = np.full(len(combined_data), label)
                        subject_column = np.full(len(combined_data), subject_id)
                        all_data.append(np.column_stack((combined_data, labels, subject_column)))

            processed_data = pd.DataFrame(np.vstack(all_data), columns=feature_names + ['label', 'subject'])

            processed_data[feature_names] = StandardScaler().fit_transform(processed_data[feature_names])
            processed_data['label'] = processed_data['label'].astype(int)
            processed_data['subject'] = processed_data['subject'].astype(int)

            print(processed_data['label'].min())
            if processed_data['label'].min() > 0:
                # Shift labels so they start at 0.
                processed_data['label'] = processed_data['label'] - 1

            # to_csv does not create missing folders; without this the processing work is lost.
            cache_dir = os.path.dirname(csv_path)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
            processed_data.to_csv(csv_path, index=False)
            print(f"Processed data saved to {csv_path}")

        self.labels = processed_data['label'].values
        self.subjects = processed_data['subject'].values
        # The subject id is metadata, not a sensor channel, so it must not be part of x.
        self.features = processed_data.drop(columns=['label', 'subject']).values
        self.transform = transform

    def __getitem__(self, index):
        """Return (features, label) for one row, applying transform to the features if set."""
        x = self.features[index]
        y = self.labels[index]

        if self.transform:
            x = self.transform(x)

        return x, y

    def __len__(self):
        """Number of rows in the processed table."""
        return len(self.features)



In [None]:
def load_and_process_file(file_path):
    """Read the CSV at file_path and return only the columns used downstream, in a fixed order."""
    wanted_columns = ['Arrival_Time', 'x', 'y', 'z', 'User', 'Model', 'Device', 'gt']
    frame = pd.read_csv(file_path)
    return frame[wanted_columns]
def process_group(group, activities):
    """Resample one recording's x/y/z columns to 20 Hz and pick a label per new sample.

    Args:
        group: DataFrame with 'Arrival_Time', 'x', 'y', 'z' and 'gt' columns.
        activities: mapping from the 'gt' value to an integer label.

    Returns:
        (data, labels): data has shape (n, 3) with the resampled x, y, z columns,
        labels has length n. Both are empty when the group has fewer than two
        rows or does not span a positive amount of time.
    """
    if len(group) < 2:
        return np.empty((0, 3)), np.array([], dtype=int)

    start_time = int(group['Arrival_Time'].iloc[0])
    end_time = int(group['Arrival_Time'].iloc[-1])
    # Arrival_Time is divided by 1000 to get seconds (treated as milliseconds).
    total_time = (end_time - start_time) / 1000.0
    if total_time <= 0:
        # No usable time span: dividing by it would fail, and there is nothing to resample.
        return np.empty((0, 3)), np.array([], dtype=int)

    # The source rate is estimated from the row count and the span; rows are
    # then placed on an evenly spaced grid at that rate.
    original_freq = len(group) / total_time
    timestamps = np.arange(len(group)) / original_freq

    new_timestamps = np.arange(0, timestamps[-1], 1/20.0)

    interpolated_sensor_data = [interp1d(timestamps, group[col], kind='linear', fill_value='extrapolate')(new_timestamps) for col in ['x', 'y', 'z']]

    # Labels are taken from source rows spread evenly over the whole group.
    label_indices = np.round(np.linspace(0, len(group) - 1, len(new_timestamps))).astype(int)
    labels = np.array([activities[act] for act in group['gt'].iloc[label_indices]])

    return np.column_stack(interpolated_sensor_data), labels

def process_and_merge_data(acc_file, gyro_file, activities, users):
    """Resample and pair accelerometer and gyroscope recordings per (Device, User).

    Args:
        acc_file: path of the accelerometer CSV.
        gyro_file: path of the gyroscope CSV.
        activities: mapping from the 'gt' value to an integer label.
        users: mapping from the 'User' value to an integer subject id.

    Returns:
        Array with 8 columns: 3 accelerometer axes, 3 gyroscope axes, label and
        subject id. It has zero rows when no (Device, User) pair is present in
        both files.
    """
    data_acc = load_and_process_file(acc_file)
    data_gyro = load_and_process_file(gyro_file)

    # Drop rows without an activity annotation.
    data_acc = data_acc[data_acc['gt'].notna() & (data_acc['gt'] != 'null')]
    data_gyro = data_gyro[data_gyro['gt'].notna() & (data_gyro['gt'] != 'null')]

    # Group the gyroscope rows once instead of masking the whole frame per accelerometer group.
    gyro_groups = data_gyro.groupby(['Device', 'User'])
    gyro_keys = gyro_groups.groups

    merged_data = []
    for (device, user), acc_group in data_acc.groupby(['Device', 'User']):
        if (device, user) not in gyro_keys:
            continue
        gyro_group = gyro_groups.get_group((device, user))

        processed_acc, acc_labels = process_group(acc_group, activities)
        processed_gyro, gyro_labels = process_group(gyro_group, activities)

        # Both streams are cut to the shorter one so rows can be placed side by side.
        min_length = min(len(processed_acc), len(processed_gyro))
        combined_data = np.hstack((processed_acc[:min_length], processed_gyro[:min_length]))

        # Labels come from whichever stream is shorter (gyroscope on a tie).
        labels = acc_labels if len(processed_acc) < len(processed_gyro) else gyro_labels
        labels = labels[:min_length]

        user_id = users[user]
        user_column = np.full(min_length, user_id, dtype=int)

        combined_data_with_labels = np.hstack((combined_data, labels[:, None], user_column[:, None]))
        merged_data.append(combined_data_with_labels)

    if not merged_data:
        # np.vstack rejects an empty list; return a well-formed empty table instead.
        return np.empty((0, 8))

    return np.vstack(merged_data)


class HHARDataset(Dataset):
    '''Per-sample view of the HHAR phone and watch recordings, backed by a processed CSV cache.

    Items are (x, y) pairs where x holds the six sensor columns and y is the
    pair [label, subject].
    '''
    def __init__(self, root_dir='./ADL/Dataset/8.HHAR/', csv_path='./Datapool_new/HHAR_data.csv', transform=None):
        """Load the cached CSV at csv_path, or build it from the raw CSVs under root_dir.

        Args:
            root_dir: folder containing the Phones_/Watch_ accelerometer and gyroscope CSVs.
            csv_path: location of the processed CSV (read if present, written otherwise).
            transform: optional callable applied to each feature row in __getitem__.
        """
        activities = {'null': 0, 'stand': 1, 'sit': 2, 'walk': 3, 'stairsup': 4, 'stairsdown': 5, 'bike': 6}
        users = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9}

        try:
            # Must bind processed_data: it is the name read after the try block.
            processed_data = pd.read_csv(csv_path)
            print(f"Loaded data from {csv_path}")
        except FileNotFoundError:
            print(f"Processed file not found at {csv_path}. Processing data...")
            all_data = []

            for file_type in ['Phones', 'Watch']:
                acc_file = os.path.join(root_dir, f'{file_type}_accelerometer.csv')
                gyro_file = os.path.join(root_dir, f'{file_type}_gyroscope.csv')

                combined_data = process_and_merge_data(acc_file, gyro_file, activities, users)
                all_data.append(combined_data)

            all_data = np.vstack(all_data)

            columns = ['accx', 'accy', 'accz', 'gyrx', 'gyry', 'gyrz', 'label', 'subject']
            processed_data = pd.DataFrame(all_data, columns=columns)

            # Standardize the six sensor columns; label and subject stay as they are.
            scaler = StandardScaler()
            processed_data.iloc[:, :-2] = scaler.fit_transform(processed_data.iloc[:, :-2])
            processed_data['label'] = processed_data['label'].astype(int)
            processed_data['subject'] = processed_data['subject'].astype(int)

            print(processed_data['label'].min())
            if processed_data['label'].min() > 0:
                # Shift labels so they start at 0.
                processed_data['label'] = processed_data['label'] - 1

            # to_csv does not create missing folders; without this the processing work is lost.
            cache_dir = os.path.dirname(csv_path)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
            processed_data.to_csv(csv_path, index=False)
            print(f"Processed data saved to {csv_path}")

        # y carries both the activity label and the subject id.
        self.labels = processed_data[['label', 'subject']].values
        self.features = processed_data.drop(columns=['label', 'subject']).values
        self.transform = transform

    def __getitem__(self, index):
        """Return (features, [label, subject]) for one row, applying transform to the features if set."""
        x = self.features[index]
        y = self.labels[index]
        if self.transform:
            x = self.transform(x)
        return x, y

    def __len__(self):
        """Number of rows in the processed table."""
        return len(self.features)



In [None]:
def remove_semicolon(x):
    """Strip every ``;`` from ``x`` when it is a string; return anything else unchanged."""
    if not isinstance(x, str):
        return x
    return x.replace(';', '')

class WISDMDataset(Dataset): # 50hz
    '''WISDM accelerometer samples, resampled to 20 Hz and standardised.

    Raw activity codes (shifted down by one at the end of processing when the
    smallest code present is positive):
    1.Walking
    2.Jogging
    3.Upstairs
    4.Downstairs
    5.Sitting
    6.Standing

    The source sampling rate is not assumed: it is estimated per contiguous
    segment from the recorded timestamps before interpolating onto a 20 Hz grid.

    Parameters
    ----------
    root_dir : directory containing ``WISDM_ar_v1.1_raw.txt``.
    csv_path : cache file; loaded when present, otherwise built from the raw
        file and written here (missing parent directories are created).
    transform : optional callable applied to the feature row in ``__getitem__``.

    Raises
    ------
    ValueError
        If processing the raw file yields no usable segment.
    '''
    def __init__(self, root_dir='./ADL/Dataset/11.WISDM/', csv_path='./Datapool_new/WISDM_data.csv', transform=None):

        activities = {'Walking': 1, 'Jogging': 2, 'Upstairs': 3, 'Downstairs': 4, 'Sitting': 5, 'Standing': 6}

        try:
            processed_data = pd.read_csv(csv_path)
            print(f"Loaded data from {csv_path}")
        except FileNotFoundError:
            print(f"Processed file not found at {csv_path}. Processing data...")

            file_path = os.path.join(root_dir, 'WISDM_ar_v1.1_raw.txt')
            raw_data = pd.read_csv(file_path, header=None, names=['subject', 'activity', 'timestamp', 'accx', 'accy', 'accz'],
                        converters={'accx': remove_semicolon, 'accy': remove_semicolon, 'accz': remove_semicolon})

            raw_data['activity'] = raw_data['activity'].map(activities)

            # Coerce once for the whole table instead of once per (user, activity).
            # Zeros are treated as missing, exactly as before, but the result is
            # assigned back rather than mutated through a chained
            # ``df[col].replace(..., inplace=True)``, which does not reliably
            # write through to the frame (and never does under Copy-on-Write).
            numeric_columns = ['timestamp', 'accx', 'accy', 'accz']
            for col in numeric_columns:
                raw_data[col] = pd.to_numeric(raw_data[col], errors='coerce').replace(0, np.nan)
            raw_data = raw_data.dropna(subset=numeric_columns)

            all_data = []

            time_interval_threshold = 3600  # seconds; a larger gap starts a new segment
            target_rate = 20.0              # Hz of the resampled output

            for user in range(1, 37):
                for activity in activities.values():
                    user_activity_data = raw_data[(raw_data['subject'] == user) & (raw_data['activity'] == activity)]
                    if user_activity_data.empty:
                        continue

                    seconds = user_activity_data['timestamp'].to_numpy() / 1e9
                    cut_points = np.where(np.abs(np.diff(seconds)) > time_interval_threshold)[0] + 1

                    # Positional slicing replaces np.split(DataFrame, ...), which
                    # depends on the deprecated DataFrame.swapaxes.
                    bounds = [0, *cut_points.tolist(), len(user_activity_data)]
                    for start, stop in zip(bounds[:-1], bounds[1:]):
                        seq = user_activity_data.iloc[start:stop]
                        if len(seq) < 2:
                            continue

                        seq_seconds = seq['timestamp'].to_numpy() / 1e9
                        duration = seq_seconds[-1] - seq_seconds[0]
                        if duration <= 0:
                            # No positive time span: the rate cannot be estimated
                            # and the segment would yield zero output rows anyway.
                            continue

                        length = len(seq_seconds)
                        original_freq = length / duration
                        timestamps = np.arange(length) / original_freq
                        new_timestamps = np.arange(0, timestamps[-1], 1 / target_rate)

                        downsampled_data = np.column_stack([
                            interp1d(timestamps, seq[col].to_numpy(), kind='linear',
                                     bounds_error=False, fill_value='extrapolate')(new_timestamps)
                            for col in ['accx', 'accy', 'accz']
                        ])
                        labels = np.full(downsampled_data.shape[0], activity)
                        subject_column = np.full(downsampled_data.shape[0], user)

                        all_data.append(np.column_stack((downsampled_data, labels, subject_column)))

            if not all_data:
                raise ValueError(f"No usable WISDM segments found in {file_path}")

            all_data = np.vstack(all_data)
            columns = ['accx', 'accy', 'accz', 'label', 'subject']
            processed_data = pd.DataFrame(all_data, columns=columns)

            scaler = StandardScaler()
            features = scaler.fit_transform(processed_data.iloc[:, :-2])

            processed_data.iloc[:, :-2] = features
            processed_data['label'] = processed_data['label'].astype(int)
            processed_data['subject'] = processed_data['subject'].astype(int)

            print(processed_data['label'].min() )
            if processed_data['label'].min() > 0:
                # Make labels zero-based when the smallest one present is positive.
                print(1)
                processed_data['label'] = processed_data['label'] - 1

            # Create the cache directory so the processing work is not lost to a
            # missing-folder error at the very last step.
            os.makedirs(os.path.dirname(csv_path) or '.', exist_ok=True)
            processed_data.to_csv(csv_path, index=False)
            print(f"Processed data saved to {csv_path}")
        self.labels = processed_data['label'].values
        self.features = processed_data.drop(columns=['label', 'subject']).values
        self.subjects = processed_data['subject'].values
        self.transform = transform

    def __getitem__(self, index):
        """Return ``(features, label, subject)`` for row ``index``; ``transform`` is applied to the features only."""
        x = self.features[index]
        y = self.labels[index]
        subject = self.subjects[index]

        if self.transform:
            x = self.transform(x)

        return x, y, subject

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)


In [None]:
class DSADSDataset(Dataset): # 25hz   
    '''UCI DSADS recordings, resampled from 25 Hz to 20 Hz and standardised.

    Raw activity codes (shifted down by one at the end of processing when the
    smallest code present is positive):
    1.sitting, 
    2.standing, 
    3.4.lying on back and on right side, 
    5.6.ascending and descending stairs, 
    7.standing in an elevator still 
    8.and moving around in an elevator, 
    9.walking in a parking lot, 
    10.11.walking on a treadmill with a speed of 4 km/h (in flat and 15 deg inclined positions)
    12.running on a treadmill with a speed of 8 km/h, 
    13.exercising on a stepper, 
    14.exercising on a cross trainer, 
    15.16.cycling on an exercise bike in horizontal and vertical positions,
    17.rowing, 
    18.jumping, 
    19.playing basketball

    Parameters
    ----------
    root_dir : directory holding the ``aXX/pY/`` segment files.
    csv_path : cache file; loaded when present, otherwise built from
        ``root_dir`` and written here (missing parent directories are created).
    transform : optional callable applied to the feature row in ``__getitem__``.
    ''' 
    def __init__(self, root_dir='./ADL/Dataset/12.UCI DSADS/data', csv_path='./Datapool_new/DSADS_data.csv', transform=None):

        try:
            processed_data = pd.read_csv(csv_path)
            print(f"Loaded data from {csv_path}")
        except FileNotFoundError:
            print(f"Processed file not found at {csv_path}. Processing data...")

            source_rate = 25.0   # Hz assumed for the segment files
            target_rate = 20.0   # Hz of the resampled output

            all_data = []
            for activity in range(1, 20):
                activity_folder = f"a{activity:02d}"
                for person in range(1, 9):
                    person_folder = f"p{person}"
                    path = os.path.join(root_dir, activity_folder, person_folder)

                    # os.listdir order is arbitrary; sort so the segments are
                    # concatenated in a reproducible order on every machine.
                    for filename in sorted(os.listdir(path)):
                        file_path = os.path.join(path, filename)
                        df = pd.read_csv(file_path, header=None)

                        # Keep the first six of every nine columns, five times over
                        # (named accx..gyrz for units T, RA, LA, RL, LL below).
                        selected = df.iloc[:, np.r_[:6, 9:15, 18:24, 27:33, 36:42]].to_numpy(dtype=float)
                        timestamps = np.arange(selected.shape[0]) / source_rate
                        new_timestamps = np.arange(0, timestamps[-1], 1 / target_rate)

                        interpolated_data = np.column_stack([
                            interp1d(timestamps, selected[:, j], kind='linear', fill_value='extrapolate')(new_timestamps)
                            for j in range(selected.shape[1])
                        ])
                        subject_column = np.full(interpolated_data.shape[0], person)
                        activity_column = np.full(interpolated_data.shape[0], activity)
                        all_data.append(np.column_stack((interpolated_data, activity_column, subject_column)))

            all_data = np.vstack(all_data)
            units = ['T', 'RA', 'LA', 'RL', 'LL']
            columns = [f'{unit}_{col}' for unit in units for col in ['accx', 'accy', 'accz', 'gyrx', 'gyry', 'gyrz']] + ['label', 'subject']
            processed_data = pd.DataFrame(all_data, columns=columns)

            scaler = StandardScaler()
            features = processed_data.iloc[:, :-2]
            processed_data.iloc[:, :-2] = scaler.fit_transform(features)
            processed_data['label'] = processed_data['label'].astype(int)
            processed_data['subject'] = processed_data['subject'].astype(int)

            print(processed_data['label'].min() )
            if processed_data['label'].min() > 0:
                # Make labels zero-based when the smallest one present is positive.
                print(1)
                processed_data['label'] = processed_data['label'] - 1

            # Create the cache directory so the processing work is not lost to a
            # missing-folder error at the very last step.
            os.makedirs(os.path.dirname(csv_path) or '.', exist_ok=True)
            processed_data.to_csv(csv_path, index=False)
            print(f"Processed data saved to {csv_path}")

        self.labels = processed_data['label'].values
        self.features = processed_data.drop(columns=['label', 'subject']).values
        self.subjects = processed_data['subject'].values
        # Previously never assigned, so __getitem__ failed with AttributeError.
        self.transform = transform

    def __getitem__(self, index):
        """Return ``(features, label, subject)`` for row ``index``; ``transform`` is applied to the features only."""
        x = self.features[index]
        y = self.labels[index]
        subject = self.subjects[index]

        if self.transform:
            x = self.transform(x)

        return x, y, subject

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)


In [None]:
class UniMiBSHARDataset(Dataset): # 50hz   
    '''UniMiB SHAR accelerometer trials, resampled from 50 Hz to 20 Hz and standardised.

    Activity codes assigned by the mapping used in ``__init__`` (shifted down
    by one before caching when the smallest code present is positive):
    1.Walking
    2.Running
    3.GoingUpS
    4.GoingDownS
    5.Jumping
    6.SittingDown
    7.StandingUpFS
    8.LyingDownFS
    9.StandingUpFL

    ``csv_path`` is used as a cache: it is loaded when present, otherwise it is
    built from ``full_data.mat`` under ``root_dir`` and written out.
    ``transform``, when given, is applied to the feature row in ``__getitem__``.
    ''' 
    def __init__(self, root_dir='./ADL/Dataset/13.UniMiB SHAR/data', csv_path='./Datapool_new/UniMiBSHAR_data.csv', transform=None):

        activities = {'Walking': 1, 'Running': 2, 'GoingUpS': 3, 'GoingDownS': 4, 'Jumping': 5, 'SittingDown': 6, 'StandingUpFS': 7, 'LyingDownFS': 8, 'StandingUpFL': 9}

        try:
            processed_data = pd.read_csv(csv_path)
            print(f"Loaded data from {csv_path}")
        except FileNotFoundError:
            print(f"Processed file not found at {csv_path}. Processing data...")

            full_data = loadmat(os.path.join(root_dir, 'full_data.mat'))['full_data']
            chunks = []
            for subject_idx in range(30):
                per_subject = full_data[subject_idx, 0]
                for activity_name, activity_label in activities.items():
                    trials = per_subject[activity_name][0][0]
                    # Only the first two trials of each activity are used.
                    for trial_idx in (0, 1):
                        # First three rows of the trial, transposed to (samples, 3).
                        acc_xyz = trials[trial_idx, 0][:3, :].T
                        source_times = np.arange(acc_xyz.shape[0]) / 50.0
                        target_times = np.arange(0, source_times[-1], 1/20.0)

                        resampled = np.column_stack([
                            interp1d(source_times, acc_xyz[:, axis], kind='linear', fill_value='extrapolate')(target_times)
                            for axis in range(acc_xyz.shape[1])
                        ])

                        row_count = resampled.shape[0]
                        chunks.append(np.column_stack((
                            resampled,
                            np.full(row_count, activity_label),
                            np.full(row_count, subject_idx + 1),  # subjects are stored 1-based
                        )))

            processed_data = pd.DataFrame(
                np.vstack(chunks),
                columns=['accx', 'accy', 'accz', 'label', 'subject'],
            )

            # Standardise every column except the trailing label/subject pair.
            standardised = StandardScaler().fit_transform(processed_data.iloc[:, :-2])
            processed_data.iloc[:, :-2] = standardised
            for id_column in ('label', 'subject'):
                processed_data[id_column] = processed_data[id_column].astype(int)

            print(processed_data['label'].min() )
            if processed_data['label'].min() > 0:
                print(1)
                processed_data['label'] = processed_data['label'] - 1
            processed_data.to_csv(csv_path, index=False)
            print(f"Processed data saved to {csv_path}")

        self.transform = transform
        self.subjects = processed_data['subject'].values
        self.labels = processed_data['label'].values
        self.features = processed_data.drop(columns=['label', 'subject']).values

    def __getitem__(self, index):
        """Return ``(features, label, subject)`` for row ``index``; ``transform`` is applied to the features only."""
        sample = self.features[index]
        target = self.labels[index]
        who = self.subjects[index]
        if self.transform:
            sample = self.transform(sample)
        return sample, target, who

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)


In [None]:
class WARDDataset(Dataset): # 50hz   
    '''WARD recordings from five body-worn sensors, standardised (no resampling is done here).

    Raw activity codes (shifted down by one before caching when the smallest
    code present is positive):
    1. Stand
    2. Sit
    3. Lie down
    4. Walk forward
    5. Walk left-circle
    6. Walk right-circle
    7. Turn left
    8. Turn right
    9. Go upstairs
    10. Go downstairs
    11. Jog
    12. Jump
    13. Push wheelchair

    ``csv_path`` is used as a cache: it is loaded when present, otherwise it is
    built from the ``Subject<N>/*.mat`` files under ``root_dir`` and written out.
    ``transform``, when given, is applied to the feature row in ``__getitem__``.
    ''' 
    def __init__(self, root_dir='./ADL/Dataset/14.WARD/', csv_path='./Datapool_new/WARD_data.csv', transform=None):

        sensor_prefixes = ['LF_', 'RF_', 'WAIST_', 'LA_', 'RA_']
        feature_suffixes = ['accx', 'accy', 'accz', 'gyrx', 'gyry']
        # Sensor-major ordering: all five channels of LF_, then RF_, and so on.
        column_names = []
        for prefix in sensor_prefixes:
            column_names.extend(prefix + suffix for suffix in feature_suffixes)

        try:
            processed_data = pd.read_csv(csv_path)
            print(f"Loaded data from {csv_path}")
        except FileNotFoundError:
            print(f"Processed file not found at {csv_path}. Processing data...")

            chunks = []
            for subject in range(1, 21):
                subject_folder = os.path.join(root_dir, f"Subject{subject}")
                mat_names = (name for name in os.listdir(subject_folder) if name.endswith('.mat'))
                for mat_file in mat_names:
                    wearable = loadmat(os.path.join(subject_folder, mat_file))['WearableData']
                    readings = wearable[0][0][5][0]
                    # Place the first five reading arrays side by side.
                    activity_data = np.hstack([readings[i] for i in range(5)])
                    # The label is the number between the first 'a' and the next 't' of the file name.
                    label = int(mat_file.split('a')[1].split('t')[0])

                    row_count = activity_data.shape[0]
                    combined = np.hstack((
                        activity_data,
                        np.full((row_count, 1), label),
                        np.full((row_count, 1), subject),
                    ))

                    # Discard any row containing an infinite value.
                    inf_rows = np.isinf(combined).any(axis=1)
                    if inf_rows.any():
                        combined = combined[~inf_rows]
                    chunks.append(combined)

            processed_data = pd.DataFrame(np.vstack(chunks), columns=column_names + ['label', 'subject'])

            # Standardise every column except the trailing label/subject pair.
            standardised = StandardScaler().fit_transform(processed_data.iloc[:, :-2])
            processed_data.iloc[:, :-2] = standardised
            for id_column in ('label', 'subject'):
                processed_data[id_column] = processed_data[id_column].astype(int)

            print(processed_data['label'].min() )
            if processed_data['label'].min() > 0:
                print(1)
                processed_data['label'] = processed_data['label'] - 1
            processed_data.to_csv(csv_path, index=False)
            print(f"Processed data saved to {csv_path}")

        self.transform = transform
        self.subjects = processed_data['subject'].values
        self.labels = processed_data['label'].values
        self.features = processed_data.drop(columns=['label', 'subject']).values

    def __getitem__(self, index):
        """Return ``(features, label, subject)`` for row ``index``; ``transform`` is applied to the features only."""
        sample = self.features[index]
        target = self.labels[index]
        who = self.subjects[index]
        if self.transform:
            sample = self.transform(sample)
        return sample, target, who

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)


In [None]:
# Instantiate every dataset once with its default paths. For the dataset
# classes defined in this notebook section, construction loads the cached CSV
# when it exists and otherwise processes the raw files and writes that CSV.
mhealth_dataset = MhealthDataset()
USCHAD_dataset = USCHADDataset()
Motionsense_dataset = MotionsenseDataset()
PAMAP2_dataset = PAMAP2Dataset()
Realdisp_dataset = RealdispDataset()
Realworld_dataset = RealworldDataset()
HHAR_Dataset = HHARDataset()
WISDM_Dataset = WISDMDataset()
DSADS_Dataset = DSADSDataset()
UniMiBSHAR_Dataset = UniMiBSHARDataset()
WARD_Dataset = WARDDataset()

In [None]:
# Mirror every CSV in the data pool as a pickle with the same base name.
root_dir = './Datapool_new/'
for filename in os.listdir(root_dir):
    if not filename.endswith('.csv'):
        continue

    csv_file = os.path.join(root_dir, filename)
    pickle_file = os.path.join(root_dir, os.path.splitext(filename)[0] + '.pkl')

    df = pd.read_csv(csv_file)
    df.to_pickle(pickle_file)

    print(f"Converted {csv_file} to {pickle_file}")