In [1]:
import pandas as pd
import numpy as np
import os
from glob import glob

# Function to extract features from Actigraphy data
def extract_features(data):
    """Summarise one actigraphy recording as a single-row feature table.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``X``, ``Y``, ``Z``, ``enmo`` and ``light``.
        A non-wear flag column is optional and is looked up under either
        ``non_wear_flag`` or ``non-wear_flag``.

    Returns
    -------
    pandas.DataFrame
        One row holding: mean/std of X, Y and Z; mean/std/max/min of ``enmo``;
        mean/std of the per-sample magnitude ``sqrt(X^2 + Y^2 + Z^2)``;
        ``activity_level`` (most frequent magnitude bin, or ``'Unknown'``);
        ``inactivity_percentage`` (share of samples with ``enmo == 0``);
        ``non_wear_percentage`` (share of samples whose flag equals 1, or
        ``None`` when no flag column exists); mean and max of ``light``.

    Raises
    ------
    ValueError
        If any of the required columns is missing.
    """
    required_columns = {'X', 'Y', 'Z', 'enmo', 'light'}
    if not required_columns.issubset(data.columns):
        raise ValueError(f"Missing required columns: {required_columns - set(data.columns)}")

    # Statistical features of the three raw axes
    mean_x = data['X'].mean()
    mean_y = data['Y'].mean()
    mean_z = data['Z'].mean()
    std_x = data['X'].std()
    std_y = data['Y'].std()
    std_z = data['Z'].std()

    mean_enmo = data['enmo'].mean()
    std_enmo = data['enmo'].std()
    max_enmo = data['enmo'].max()
    min_enmo = data['enmo'].min()

    # Per-sample Euclidean norm of the three axes
    magnitude = np.sqrt(data['X']**2 + data['Y']**2 + data['Z']**2)
    mean_magnitude = magnitude.mean()
    std_magnitude = magnitude.std()

    # Most frequent magnitude bin. Samples outside [0, 2.0] (or NaN) fall in no
    # bin, so the mode can be empty even for non-empty input; fall back to
    # 'Unknown' instead of failing on an empty Series lookup.
    activity_level = 'Unknown'
    if len(magnitude) > 0:
        binned_magnitude = pd.cut(
            magnitude,
            bins=[0, 0.5, 1.0, 1.5, 2.0],
            labels=['Very Low', 'Low', 'Medium', 'High'],
            include_lowest=True
        )
        most_common_bins = binned_magnitude.mode()
        if not most_common_bins.empty:
            activity_level = most_common_bins.iloc[0]

    inactivity_percentage = (data['enmo'] == 0).mean() * 100

    # The flag column may be spelled with an underscore or a hyphen; accept
    # both so the feature is not silently left as None.
    non_wear_percentage = None
    for flag_column in ('non_wear_flag', 'non-wear_flag'):
        if flag_column in data.columns:
            non_wear_percentage = (data[flag_column] == 1).mean() * 100
            break

    mean_light = data['light'].mean()
    max_light = data['light'].max()

    features = {
        'mean_x': mean_x,
        'mean_y': mean_y,
        'mean_z': mean_z,
        'std_x': std_x,
        'std_y': std_y,
        'std_z': std_z,
        'mean_enmo': mean_enmo,
        'std_enmo': std_enmo,
        'max_enmo': max_enmo,
        'min_enmo': min_enmo,
        'mean_magnitude': mean_magnitude,
        'std_magnitude': std_magnitude,
        'activity_level': activity_level,
        'inactivity_percentage': inactivity_percentage,
        'non_wear_percentage': non_wear_percentage,
        'mean_light': mean_light,
        'max_light': max_light
    }
    return pd.DataFrame([features])

# Directory containing participant files (replace with actual directory path)
participant_dir = "C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/cleaned_series_train.parquet/"
output_file = "C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/consolidated_features.parquet"

# Find all participant files; sorted so the row order of the consolidated
# table does not depend on the order the file system happens to return.
participant_files = sorted(glob(os.path.join(participant_dir, "*.parquet")))
print(f"Found {len(participant_files)} participant files.")

# Collect the one-row feature frames in a list and concatenate once after the
# loop: concatenating inside the loop re-copies every earlier row on each
# iteration and starts from an empty seed frame.
feature_frames = []

# Process each participant file; a failing file is reported and skipped.
for participant_file in participant_files:
    try:
        participant_data = pd.read_parquet(participant_file)
        participant_features = extract_features(participant_data)

        # Participant ID = text after the last 'id=' in the path, up to the next '.'
        participant_id = participant_file.split('id=')[-1].split('.')[0]
        participant_features['participant_id'] = participant_id

        feature_frames.append(participant_features)
        print(f"Processed participant {participant_id}.")

    except Exception as e:
        print(f"Error processing file {participant_file}: {e}")

# pd.concat rejects an empty list, so fall back to an empty frame when no file
# was processed successfully.
all_features = pd.concat(feature_frames, ignore_index=True) if feature_frames else pd.DataFrame()

# Save the consolidated DataFrame
try:
    all_features.to_parquet(output_file)
    print(f"Consolidated features saved successfully to {output_file}")
    print(all_features.shape)  # Debugging: Check the final DataFrame shape
except Exception as e:
    print(f"Error saving consolidated features: {e}")


Found 996 participant files.
Processed participant 00115b9f.
Processed participant 001f3379.
Processed participant 00f332d1.
Processed participant 01085eb3.
Processed participant 012cadd8.
Processed participant 012e3869.
Processed participant 029a19c9.
Processed participant 02cebf33.
Processed participant 02cf7384.
Processed participant 035c96dd.
Processed participant 03a9019b.
Processed participant 0417c91e.
Processed participant 045a0a94.
Processed participant 04afb6f9.
Processed participant 04bb1a76.
Processed participant 04cb2c30.
Processed participant 04d06a9c.
Processed participant 04f094a8.
Processed participant 051680a0.
Processed participant 055156e2.
Processed participant 059eed01.
Processed participant 05bbed1b.
Processed participant 05db1b9b.
Processed participant 05e94f88.
Processed participant 063b16fc.
Processed participant 064e8da5.
Processed participant 0668373f.
Processed participant 067b9287.
Processed participant 06c8b2fb.
Processed participant 06eb8adb.
Processed p

In [2]:
import pandas as pd

# Source: the consolidated feature table stored as Parquet
input_parquet_file = "C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/consolidated_features.parquet"

# Destination: the same table exported as CSV
output_csv_file = "C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/consolidated_features.csv"

# Convert Parquet -> CSV. A failure in either the read or the write is
# reported on stdout rather than raised.
try:
    df = pd.read_parquet(input_parquet_file)
    print(f"Loaded Parquet file with shape {df.shape}.")

    # index=False keeps the DataFrame's row index out of the CSV
    df.to_csv(output_csv_file, index=False)
    print(f"CSV file saved successfully to {output_csv_file}.")
except Exception as err:
    print(f"Error processing file: {err}")


Loaded Parquet file with shape (996, 18).
CSV file saved successfully to C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/consolidated_features.csv.


In [3]:
import os
import pandas as pd
import numpy as np

# Function to extract features from Actigraphy data
def extract_features(data):
    """Reduce one actigraphy recording to a flat dict of summary statistics.

    Reads the columns ``X``, ``Y``, ``Z``, ``enmo`` and ``non-wear_flag`` from
    ``data`` (a missing column raises ``KeyError``) and returns a dict with:
    mean/std of each axis, mean/std/max/min of ``enmo``, mean/std of the
    per-sample magnitude ``sqrt(X^2 + Y^2 + Z^2)``, the percentage of samples
    with ``enmo == 0`` and the percentage of samples whose non-wear flag is 1.
    """
    x_axis, y_axis, z_axis = data['X'], data['Y'], data['Z']

    # Per-axis location and spread
    features = {
        'mean_x': x_axis.mean(),
        'mean_y': y_axis.mean(),
        'mean_z': z_axis.mean(),
        'std_x': x_axis.std(),
        'std_y': y_axis.std(),
        'std_z': z_axis.std(),
    }

    # ENMO summary
    enmo = data['enmo']
    features['mean_enmo'] = enmo.mean()
    features['std_enmo'] = enmo.std()
    features['max_enmo'] = enmo.max()
    features['min_enmo'] = enmo.min()

    # Euclidean norm of the three axes, sample by sample
    accel_norm = np.sqrt(x_axis**2 + y_axis**2 + z_axis**2)
    features['mean_magnitude'] = accel_norm.mean()
    features['std_magnitude'] = accel_norm.std()

    # Boolean masks averaged into percentages
    features['inactivity_percentage'] = enmo.eq(0).mean() * 100
    features['non_wear_percentage'] = data['non-wear_flag'].eq(1).mean() * 100

    return features

# Input folder with the per-participant parquet files, and the folder that
# receives the extracted feature tables (created if it does not exist yet)
input_directory = "C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/cleaned_series_train.parquet"
output_directory = "C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/extracted_features"
os.makedirs(output_directory, exist_ok=True)

# Full paths of every entry in the input folder whose name ends in '.parquet'
parquet_files = []
for entry in os.listdir(input_directory):
    if file_is_parquet := entry.endswith('.parquet'):
        parquet_files.append(os.path.join(input_directory, entry))

# One feature dict per successfully processed participant
all_features = []

# A file that fails to load or to yield features is reported and skipped
for file in parquet_files:
    try:
        # Participant ID = text after the last 'id=' in the path, up to the next '.'
        participant_id = file.split('id=')[-1].split('.')[0]

        data = pd.read_parquet(file)

        features = extract_features(data)
        features['participant_id'] = participant_id  # tag the row with its participant

        all_features.append(features)
    except Exception as err:
        print(f"Error processing file {file}: {err}")

# Build the final table in one step from the collected dicts
final_features = pd.DataFrame(all_features)

# Destination paths for the two output formats
output_parquet_file = os.path.join(output_directory, "extracted_features_all_participants.parquet")
output_csv_file = os.path.join(output_directory, "extracted_features_all_participants.csv")

# Write parquet first, then CSV; neither file stores the row index
final_features.to_parquet(output_parquet_file, index=False)
final_features.to_csv(output_csv_file, index=False)

# Report where the results were written
print(f"Features successfully saved as a parquet file at: {output_parquet_file}")
print(f"Features successfully saved as a CSV file at: {output_csv_file}")


Features successfully saved as a parquet file at: C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/extracted_features\extracted_features_all_participants.parquet
Features successfully saved as a CSV file at: C:/Users/Lenovo/OneDrive/Desktop/SEM 2/ml/project/extracted_features\extracted_features_all_participants.csv
