In [4]:
import os
import json
import csv
import pandas as pd
from pathlib import Path

def convert_csv_to_json():
    # Main data object to hold all subjects
    all_data = {}
    
    # Base directory containing all subject folders
    base_dir = Path('Subjects')
    
    # Loop through all subject folders (01-29)
    for i in range(1, 30):
        subject_id = f"subject_{i:02d}"  # Format as subject_01, subject_02, etc.
        subject_path = base_dir / subject_id
        
        print(f"Processing {subject_id}...")
        
        # Check if subject directory exists
        if not subject_path.exists():
            print(f"Skipping {subject_id} - directory not found")
            continue
        
        # Initialize subject data
        all_data[subject_id] = {}
        
        # List of CSV files to process
        csv_files = ['ACC.csv', 'BVP.csv', 'EDA.csv', 'HR.csv', 'IBI.csv', 'TEMP.csv']
        
        # Process each CSV file
        for csv_file in csv_files:
            file_path = subject_path / csv_file
            
            if not file_path.exists():
                print(f"  ✗ Skipping {csv_file} - file not found")
                continue
            
            try:
                # Read CSV file using pandas for better handling of various CSV formats
                df = pd.read_csv(file_path)
                
                # Convert dataframe to list of dictionaries (records)
                data_records = df.to_dict('records')
                
                # Store data in the main object (remove .csv from filename)
                data_type = csv_file.replace('.csv', '')
                all_data[subject_id][data_type] = data_records
                
                print(f"  ✓ Processed {csv_file}")
            except Exception as e:
                print(f"  ✗ Error processing {csv_file}: {str(e)}")
    
    # Write the complete dataset to a JSON file
    with open('complete_dataset.json', 'w', encoding='utf-8') as f:
        json.dump(all_data, f, indent=2)
    print("\nConversion complete! All data saved to complete_dataset.json")
    
    # Write a minified version as well
    with open('complete_dataset.min.json', 'w', encoding='utf-8') as f:
        json.dump(all_data, f)
    print("Minified version saved to complete_dataset.min.json")

if __name__ == "__main__":
    convert_csv_to_json()

Processing subject_01...
  ✓ Processed ACC.csv
  ✓ Processed BVP.csv
  ✓ Processed EDA.csv
  ✓ Processed HR.csv
  ✓ Processed IBI.csv
  ✓ Processed TEMP.csv
Processing subject_02...
  ✓ Processed ACC.csv
  ✓ Processed BVP.csv
  ✓ Processed EDA.csv
  ✓ Processed HR.csv
  ✓ Processed IBI.csv
  ✓ Processed TEMP.csv
Processing subject_03...
  ✓ Processed ACC.csv
  ✓ Processed BVP.csv
  ✓ Processed EDA.csv
  ✓ Processed HR.csv
  ✓ Processed IBI.csv
  ✓ Processed TEMP.csv
Processing subject_04...
  ✓ Processed ACC.csv
  ✓ Processed BVP.csv
  ✓ Processed EDA.csv
  ✓ Processed HR.csv
  ✓ Processed IBI.csv
  ✓ Processed TEMP.csv
Processing subject_05...
  ✓ Processed ACC.csv
  ✓ Processed BVP.csv
  ✓ Processed EDA.csv
  ✓ Processed HR.csv
  ✓ Processed IBI.csv
  ✓ Processed TEMP.csv
Processing subject_06...
  ✓ Processed ACC.csv
  ✓ Processed BVP.csv
  ✓ Processed EDA.csv
  ✓ Processed HR.csv
  ✓ Processed IBI.csv
  ✓ Processed TEMP.csv
Processing subject_07...
  ✓ Processed ACC.csv
  ✓ Process

KeyboardInterrupt: 

In [4]:
import json
import numpy as np
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt

def load_data(file_path):
    """Load data from JSON file."""
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        print(f"Loaded data for {len(data)} participants")
        return data
    except FileNotFoundError:
        print(f"Error: File {file_path} not found")
        return None
    except json.JSONDecodeError:
        print(f"Error: File {file_path} is not valid JSON")
        return None

def extract_numeric_value(value):
    """Extract numeric value from various possible formats."""
    if isinstance(value, (int, float)):
        return float(value)
    elif isinstance(value, dict) and 'value' in value:
        return extract_numeric_value(value['value'])
    elif isinstance(value, list):
        # For vector values like accelerometer, compute magnitude
        try:
            return np.sqrt(sum(float(v)**2 for v in value))
        except (TypeError, ValueError):
            print(f"Warning: Could not compute magnitude for vector value: {value}")
            return None
    else:
        print(f"Warning: Unrecognized value format: {type(value)} - {value}")
        return None

def compute_average_metrics(data, exam_duration_minutes=90):
    """
    Compute average metrics across all participants over time.
    
    Since we don't have timestamps in the data, we'll use the array indices 
    as time points and normalize them to the expected exam duration.
    """
    # First, identify all metrics present in the data
    metrics = set()
    for student in data.values():
        for metric in student.keys():
            metrics.add(metric)
    
    print(f"Available metrics: {', '.join(metrics)}")
    
    # For each metric, create arrays to store values at each relative time point
    metric_data = {}
    
    for metric in metrics:
        # Find the maximum length of data arrays for this metric
        max_length = 0
        valid_student_count = 0
        
        for student in data.values():
            if metric in student and isinstance(student[metric], list):
                max_length = max(max_length, len(student[metric]))
                valid_student_count += 1
        
        if max_length == 0:
            print(f"No valid data arrays found for metric {metric}")
            continue
            
        print(f"Found {valid_student_count} students with {metric} data, max length: {max_length}")
        
        # Create arrays to store sums and counts at each time point
        sums = np.zeros(max_length)
        counts = np.zeros(max_length, dtype=int)
        
        # Sum values at each relative time point
        for student in data.values():
            if metric in student and isinstance(student[metric], list):
                values = student[metric]
                
                # Handle different value formats
                for i, value in enumerate(values):
                    # Skip if index is out of bounds
                    if i >= max_length:
                        break
                    
                    # Extract numeric value from whatever format we have
                    numeric_value = extract_numeric_value(value)
                    
                    if numeric_value is not None:
                        sums[i] += numeric_value
                        counts[i] += 1
        
        # Calculate average at each time point
        averages = np.zeros(max_length)
        std_devs = np.zeros(max_length)
        
        for i in range(max_length):
            if counts[i] > 0:
                averages[i] = sums[i] / counts[i]
            # Standard deviations will be calculated separately
        
        # Create time indices normalized to the exam duration
        time_indices = np.arange(max_length)
        minutes_into_exam = time_indices * (exam_duration_minutes / max_length)
        
        # Store results in a dataframe
        metric_data[metric] = pd.DataFrame({
            'time_index': time_indices,
            'minutes_into_exam': minutes_into_exam,
            'average_value': averages,
            'count': counts
        })
        
        print(f"Processed {max_length} time points for {metric}")
    
    return metric_data

def calculate_std_deviations(data, metric_averages, exam_duration_minutes=90):
    """
    Calculate standard deviations for each metric at each time point.
    This requires a second pass through the data after averages are computed.
    """
    for metric, avg_df in metric_averages.items():
        max_length = len(avg_df)
        squared_diffs = np.zeros(max_length)
        counts = np.zeros(max_length, dtype=int)
        
        # Sum squared differences from the mean
        for student in data.values():
            if metric in student and isinstance(student[metric], list):
                values = student[metric]
                
                for i, value in enumerate(values):
                    # Skip if index is out of bounds
                    if i >= max_length:
                        break
                    
                    # Extract numeric value
                    numeric_value = extract_numeric_value(value)
                    
                    if numeric_value is not None:
                        # Compute squared difference from mean
                        squared_diff = (numeric_value - avg_df.loc[i, 'average_value']) ** 2
                        squared_diffs[i] += squared_diff
                        counts[i] += 1
        
        # Calculate standard deviation
        std_devs = np.zeros(max_length)
        for i in range(max_length):
            if counts[i] > 1:  # Need at least 2 points for std dev
                std_devs[i] = np.sqrt(squared_diffs[i] / counts[i])
        
        # Add to dataframe
        avg_df['std_value'] = std_devs
    
    return metric_averages

def save_average_metrics(averages, output_file):
    """Save average metrics to a JSON file."""
    output_data = {}
    
    for metric, df in averages.items():
        output_data[metric] = df.to_dict(orient='records')
    
    with open(output_file, 'w') as f:
        json.dump(output_data, f, indent=2)
    
    print(f"Saved average metrics to {output_file}")

def plot_average_metrics(averages, output_folder='plots', exam_duration_minutes=90, use_reference_values=True):
    """Plot average metrics over time with reference values from the paper."""
    import os
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Reference values based on the paper
    reference_values = {
        'EDA': {
            'stress': {'mean': 0.15, 'std': 0.05},
            'no_stress': {'mean': 0.05, 'std': 0.02}
        },
        'HR': {
            'stress': {'mean': 85, 'std': 12},
            'no_stress': {'mean': 70, 'std': 8}
        },
        'TEMP': {
            'stress': {'mean': 33, 'std': 1.5},
            'no_stress': {'mean': 32, 'std': 1}
        }
    }
    
    for metric, df in averages.items():
        plt.figure(figsize=(12, 6))
        
        # Define colors for the metric line and reference bands
        main_color = '#1d3557'  # Dark blue for the main line
        stress_color = '#e63946'  # Red for stress reference
        no_stress_color = '#a8dadc'  # Light blue for no-stress reference
        
        # Plot average with standard deviation band
        plt.plot(df['minutes_into_exam'], df['average_value'], label=f'Average {metric}', color=main_color, linewidth=2.5)
        
        if 'std_value' in df.columns:
            plt.fill_between(
                df['minutes_into_exam'],
                df['average_value'] - df['std_value'],
                df['average_value'] + df['std_value'],
                alpha=0.2,
                color=main_color,
                label=f'±1 Std Dev'
            )
        
        # Add reference values from the paper if available and selected
        if use_reference_values and metric in reference_values:
            # Get y-axis limits to draw reference bands across the whole plot
            y_min, y_max = plt.ylim()
            x_max = df['minutes_into_exam'].max()
            
            # Stress reference band
            stress_mean = reference_values[metric]['stress']['mean']
            stress_std = reference_values[metric]['stress']['std']
            
            plt.axhspan(
                stress_mean - stress_std,
                stress_mean + stress_std,
                alpha=0.15,
                color=stress_color,
                label=f'Stress Range (Paper)'
            )
            plt.axhline(y=stress_mean, linestyle='--', color=stress_color, alpha=0.7, 
                        label=f'Stress Threshold (Paper): {stress_mean}')
            
            # No-stress reference band
            no_stress_mean = reference_values[metric]['no_stress']['mean']
            no_stress_std = reference_values[metric]['no_stress']['std']
            
            plt.axhspan(
                no_stress_mean - no_stress_std,
                no_stress_mean + no_stress_std,
                alpha=0.15,
                color=no_stress_color,
                label=f'No-Stress Range (Paper)'
            )
            plt.axhline(y=no_stress_mean, linestyle='--', color=no_stress_color, alpha=0.7,
                       label=f'No-Stress Reference (Paper): {no_stress_mean}')
        
        # Add task markers based on the paper's protocol
        if exam_duration_minutes == 90:  # Midterm exam
            # These markers are approximate based on the paper's protocol
            markers = [
                (0, "Exam Start"),
                (10, "10m: First Task Complete"),
                (17, "17m: Second Task Complete"),
                (22, "22m: Third Task Complete"),
                (35, "35m: Mathematical Task"),
                (45, "45m: Oral Presentation"),
                (90, "Exam End")
            ]
        else:  # Final exam (assumed to be 180 minutes)
            # Scaled markers for longer exam
            markers = [
                (0, "Exam Start"),
                (20, "20m: First Task Complete"),
                (34, "34m: Second Task Complete"),
                (44, "44m: Third Task Complete"),
                (70, "70m: Mathematical Task"),
                (90, "90m: Oral Presentation"),
                (180, "Exam End")
            ]
        
        # Add vertical lines for task markers
        for minute, label in markers:
            if minute <= df['minutes_into_exam'].max():
                plt.axvline(x=minute, linestyle=':', color='gray', alpha=0.7)
                plt.text(minute, plt.ylim()[1]*0.98, label, rotation=90, verticalalignment='top', fontsize=8)
        
        # Formatting
        plt.xlabel('Minutes into Exam')
        
        # Add appropriate units to the y-axis label
        units = {
            'EDA': 'μS',
            'HR': 'BPM',
            'TEMP': '°C',
            'BVP': 'a.u.'
        }
        y_label = f'{metric} Value'
        if metric in units:
            y_label = f'{metric} ({units[metric]})'
        plt.ylabel(y_label)
        
        plt.title(f'Average {metric} Across All Participants')
        plt.legend(loc='upper right')
        plt.grid(True, alpha=0.3)
        
        # Ensure a reasonable y-axis range
        if metric == 'EDA':
            # EDA typically ranges from 0-20 μS, but focus on lower range
            plt.ylim(0, min(1, plt.ylim()[1]))
        elif metric == 'HR':
            # Heart rate typically 60-100 BPM
            plt.ylim(max(60, plt.ylim()[0] * 0.9), min(120, plt.ylim()[1] * 1.1))
        
        # Add count information in the corner
        min_count = df['count'].min()
        max_count = df['count'].max()
        avg_count = df['count'].mean()
        plt.figtext(0.01, 0.01, f"Data points: min={min_count}, max={max_count}, avg={avg_count:.1f}", fontsize=8)
        
        # Save the plot
        plt.tight_layout()
        plt.savefig(f'{output_folder}/average_{metric}.png', dpi=300)
        plt.close()
    
    print(f"Saved plots to {output_folder} folder")

def classify_stress_periods(averages, reference_values=None):
    """
    Classify each time period as stress or no-stress based on metric values.
    
    Returns a DataFrame with timestamps and stress classifications.
    """
    # Default reference values from the paper if not provided
    if reference_values is None:
        reference_values = {
            'EDA': {'threshold': 0.1, 'weight': 1.0},
            'HR': {'threshold': 78, 'weight': 0.8},
            'TEMP': {'threshold': 32.5, 'weight': 0.6}
        }
    
    # Get common time indices across all metrics
    common_metrics = [m for m in averages.keys() if m in reference_values]
    if not common_metrics:
        print("No metrics available for stress classification")
        return None
    
    # Use the first metric's time points as reference
    reference_metric = common_metrics[0]
    time_points = averages[reference_metric]['minutes_into_exam'].values
    time_indices = averages[reference_metric]['time_index'].values
    
    # Create a DataFrame to store stress classifications
    stress_df = pd.DataFrame({
        'time_index': time_indices,
        'minutes_into_exam': time_points,
        'stress_score': np.zeros(len(time_points)),
        'is_stressed': np.zeros(len(time_points), dtype=bool)
    })
    
    # Calculate weighted stress score for each time point
    total_weight = 0
    
    for metric in common_metrics:
        if metric in averages and metric in reference_values:
            threshold = reference_values[metric]['threshold']
            weight = reference_values[metric]['weight']
            total_weight += weight
            
            # Get average values for this metric
            values = averages[metric]['average_value'].values
            
            # Calculate stress contribution for each time point
            for i, value in enumerate(values):
                if metric == 'EDA' and value > threshold:
                    stress_df.loc[i, 'stress_score'] += weight
                elif metric == 'HR' and value > threshold:
                    stress_df.loc[i, 'stress_score'] += weight
                elif metric == 'TEMP' and value > threshold:
                    stress_df.loc[i, 'stress_score'] += weight
    
    # Normalize stress score and classify
    if total_weight > 0:
        stress_df['stress_score'] = stress_df['stress_score'] / total_weight
        stress_df['is_stressed'] = stress_df['stress_score'] > 0.5
    
    return stress_df

def plot_stress_classification(averages, stress_df, output_folder='plots', exam_duration_minutes=90):
    """Plot stress classification over time."""
    import os
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    plt.figure(figsize=(12, 8))
    
    # Create multiple subplot panels
    gs = plt.GridSpec(3, 1, height_ratios=[1, 1, 0.5])
    
    # EDA plot
    if 'EDA' in averages:
        ax1 = plt.subplot(gs[0])
        df = averages['EDA']
        ax1.plot(df['minutes_into_exam'], df['average_value'], color='#1d3557', linewidth=2, label='EDA')
        ax1.set_ylabel('EDA (μS)')
        ax1.set_title('Stress Indicators During Exam')
        ax1.grid(True, alpha=0.3)
        ax1.legend(loc='upper right')
    
    # HR plot
    if 'HR' in averages:
        ax2 = plt.subplot(gs[1])
        df = averages['HR']
        ax2.plot(df['minutes_into_exam'], df['average_value'], color='#e63946', linewidth=2, label='HR')
        ax2.set_ylabel('HR (BPM)')
        ax2.grid(True, alpha=0.3)
        ax2.legend(loc='upper right')
    
    # Stress classification
    ax3 = plt.subplot(gs[2])
    ax3.fill_between(
        stress_df['minutes_into_exam'],
        0,
        stress_df['stress_score'],
        where=stress_df['is_stressed'],
        color='#e63946',
        alpha=0.6,
        label='Stress Period'
    )
    ax3.fill_between(
        stress_df['minutes_into_exam'],
        0,
        stress_df['stress_score'],
        where=~stress_df['is_stressed'],
        color='#a8dadc',
        alpha=0.6,
        label='No-Stress Period'
    )
    ax3.set_ylabel('Stress Level')
    ax3.set_xlabel('Minutes into Exam')
    ax3.set_ylim(0, 1)
    ax3.legend(loc='upper right')
    
    # Add task markers based on the paper's protocol
    if exam_duration_minutes == 90:  # Midterm exam
        markers = [
            (0, "Exam Start"),
            (10, "10m: First Task"),
            (17, "17m: Second Task"),
            (22, "22m: Third Task"),
            (35, "35m: Math Task"),
            (45, "45m: Oral Presentation"),
            (90, "Exam End")
        ]
    else:  # Final exam
        markers = [
            (0, "Exam Start"),
            (20, "20m: First Task"),
            (34, "34m: Second Task"),
            (44, "44m: Third Task"),
            (70, "70m: Math Task"),
            (90, "90m: Oral Presentation"),
            (180, "Exam End")
        ]
    
    # Add vertical lines for task markers to all subplots
    for subplot in [ax1, ax2, ax3]:
        for minute, label in markers:
            if minute <= stress_df['minutes_into_exam'].max():
                subplot.axvline(x=minute, linestyle=':', color='gray', alpha=0.7)
                if subplot == ax3:  # Only add text to the bottom subplot
                    subplot.text(minute, 0, label, rotation=90, verticalalignment='bottom', fontsize=8)
    
    plt.tight_layout()
    plt.savefig(f'{output_folder}/stress_classification.png', dpi=300)
    plt.close()
    
    print(f"Saved stress classification plot to {output_folder}")

def inspect_data_structure(data):
    """Inspect the structure of the data to help with debugging."""
    print("\n=== DATA STRUCTURE INSPECTION ===")
    
    # Get a list of students
    students = list(data.keys())
    print(f"Found {len(students)} students: {', '.join(students[:5])}{'...' if len(students) > 5 else ''}")
    
    # Check what metrics are available
    all_metrics = set()
    for student in data.values():
        all_metrics.update(student.keys())
    
    print(f"Found {len(all_metrics)} metrics: {', '.join(all_metrics)}")
    
    # Sample the first student's data for each metric
    if students:
        first_student = students[0]
        print(f"\nSample data for student {first_student}:")
        
        for metric in all_metrics:
            if metric in data[first_student]:
                metric_data = data[first_student][metric]
                
                if isinstance(metric_data, list):
                    print(f"\n{metric}: List with {len(metric_data)} items")
                    
                    # Check first few items
                    if len(metric_data) > 0:
                        print(f"  First item type: {type(metric_data[0])}")
                        if isinstance(metric_data[0], dict):
                            print(f"  First item keys: {list(metric_data[0].keys())}")
                            for key, value in metric_data[0].items():
                                print(f"    {key}: {type(value)} - {value}")
                        else:
                            print(f"  First item: {metric_data[0]}")
                            
                        # If there are many items, check a middle item too
                        if len(metric_data) > 10:
                            mid_idx = len(metric_data) // 2
                            print(f"\n  Middle item ({mid_idx}) type: {type(metric_data[mid_idx])}")
                            if isinstance(metric_data[mid_idx], dict):
                                print(f"  Middle item keys: {list(metric_data[mid_idx].keys())}")
                                for key, value in metric_data[mid_idx].items():
                                    print(f"    {key}: {type(value)} - {value}")
                            else:
                                print(f"  Middle item: {metric_data[mid_idx]}")
                else:
                    print(f"\n{metric}: {type(metric_data)} (not a list)")
                    if isinstance(metric_data, dict):
                        print(f"  Keys: {list(metric_data.keys())}")
    
    print("\n=== END OF INSPECTION ===\n")

def main():
    # File paths
    input_file = 'complete_dataset.json'
    output_file = 'average_metrics.json'
    
    # Exam duration in minutes (from the paper: 90 minutes for midterms, 180 for final)
    exam_duration_minutes = 90  # Change to 180 for final exam
    
    # Load data
    data = load_data(input_file)
    if data is None:
        return
    
    # Inspect data structure to help with debugging
    inspect_data_structure(data)
    
    # Compute average metrics
    averages = compute_average_metrics(data, exam_duration_minutes)
    
    # Calculate standard deviations
    averages = calculate_std_deviations(data, averages, exam_duration_minutes)
    
    # Save average metrics
    save_average_metrics(averages, output_file)
    
    # Plot average metrics
    plot_average_metrics(averages, exam_duration_minutes=exam_duration_minutes)
    
    # Classify stress periods
    stress_df = classify_stress_periods(averages)
    
    # Plot stress classification
    if stress_df is not None:
        plot_stress_classification(averages, stress_df, exam_duration_minutes=exam_duration_minutes)
    
    print("Analysis complete!")

if __name__ == "__main__":
    main()

In [3]:
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import os
from collections import defaultdict

def load_data(file_path):
    """Load data from JSON file."""
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        print(f"Loaded data for {len(data)} participants")
        return data
    except FileNotFoundError:
        print(f"Error: File {file_path} not found")
        return None
    except json.JSONDecodeError:
        print(f"Error: File {file_path} is not valid JSON")
        return None

def load_average_data(file_path):
    """Load precomputed average data from JSON file."""
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        print(f"Loaded average data for {len(data)} metrics")
        
        # Convert the loaded data back to DataFrames
        averages = {}
        for metric, records in data.items():
            averages[metric] = pd.DataFrame.from_records(records)
        
        return averages
    except FileNotFoundError:
        print(f"Error: Average data file {file_path} not found")
        return None
    except json.JSONDecodeError:
        print(f"Error: File {file_path} is not valid JSON")
        return None

def extract_numeric_value(value):
    """Extract numeric value from various possible formats."""
    if isinstance(value, (int, float)):
        return float(value)
    elif isinstance(value, dict) and 'value' in value:
        return extract_numeric_value(value['value'])
    elif isinstance(value, list):
        # For vector values like accelerometer, compute magnitude
        try:
            return np.sqrt(sum(float(v)**2 for v in value))
        except (TypeError, ValueError):
            return None
    else:
        return None

def process_participant_data(participant_data, exam_duration_minutes=90):
    """Process a single participant's data into DataFrames for each metric."""
    processed_data = {}
    
    for metric, values in participant_data.items():
        if not isinstance(values, list) or len(values) == 0:
            continue
        
        # Extract numeric values
        numeric_values = []
        for val in values:
            num_val = extract_numeric_value(val)
            if num_val is not None:
                numeric_values.append(num_val)
        
        if not numeric_values:
            continue
        
        # Create time points based on array length
        time_indices = np.arange(len(numeric_values))
        minutes_into_exam = time_indices * (exam_duration_minutes / len(numeric_values))
        
        # Create DataFrame
        processed_data[metric] = pd.DataFrame({
            'time_index': time_indices,
            'minutes_into_exam': minutes_into_exam,
            'value': numeric_values
        })
    
    return processed_data

def sample_participants(data, num_samples=5):
    """Randomly sample a specified number of participants."""
    all_participants = list(data.keys())
    
    if num_samples >= len(all_participants):
        print(f"Requested {num_samples} samples but only {len(all_participants)} participants available")
        return all_participants
    
    return random.sample(all_participants, num_samples)

def plot_individual_vs_average(
    raw_data, 
    average_data, 
    sampled_participants, 
    metrics_to_plot=None, 
    output_folder='individual_plots',
    exam_duration_minutes=90
):
    """
    Plot individual participant data against the average for specified metrics.
    
    Parameters:
    - raw_data: Dictionary of all participant data
    - average_data: Dictionary of average metrics
    - sampled_participants: List of participant IDs to plot
    - metrics_to_plot: List of metrics to plot (default: all available metrics)
    - output_folder: Folder to save plots
    - exam_duration_minutes: Duration of exam in minutes
    """
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # If no metrics specified, use all available in average data
    if metrics_to_plot is None:
        metrics_to_plot = list(average_data.keys())
    
    # Filter to only include metrics that exist in the average data
    metrics_to_plot = [m for m in metrics_to_plot if m in average_data]
    
    if not metrics_to_plot:
        print("No valid metrics to plot")
        return
    
    print(f"Plotting {len(metrics_to_plot)} metrics for {len(sampled_participants)} participants")
    
    # Define a color palette for participants
    participant_colors = plt.cm.tab10(np.linspace(0, 1, len(sampled_participants)))
    
    # Define task markers based on the paper's protocol
    if exam_duration_minutes == 90:  # Midterm exam
        markers = [
            (0, "Start"),
            (10, "Task 1"),
            (17, "Task 2"),
            (22, "Task 3"),
            (35, "Math"),
            (45, "Oral"),
            (90, "End")
        ]
    else:  # Final exam (assumed to be 180 minutes)
        markers = [
            (0, "Start"),
            (20, "Task 1"),
            (34, "Task 2"),
            (44, "Task 3"),
            (70, "Math"),
            (90, "Oral"),
            (180, "End")
        ]
    
    # Process each participant's data
    processed_participants = {}
    for participant_id in sampled_participants:
        if participant_id in raw_data:
            processed_participants[participant_id] = process_participant_data(
                raw_data[participant_id],
                exam_duration_minutes
            )
    
    # Create separate plot for each metric
    for metric in metrics_to_plot:
        plt.figure(figsize=(12, 8))
        
        # Plot average data with thicker line and shaded std dev area
        if metric in average_data:
            avg_df = average_data[metric]
            
            # Check if required columns exist
            if 'minutes_into_exam' not in avg_df.columns:
                print(f"Warning: 'minutes_into_exam' column missing in average data for {metric}")
                # Create it from time_index if available
                if 'time_index' in avg_df.columns:
                    max_index = avg_df['time_index'].max()
                    avg_df['minutes_into_exam'] = avg_df['time_index'] * (exam_duration_minutes / max_index)
                else:
                    print(f"Warning: Cannot plot {metric} due to missing time data")
                    continue
            
            plt.plot(
                avg_df['minutes_into_exam'], 
                avg_df['average_value'], 
                color='black', 
                linewidth=3, 
                label='Average (All Participants)'
            )
            
            if 'std_value' in avg_df.columns:
                plt.fill_between(
                    avg_df['minutes_into_exam'],
                    avg_df['average_value'] - avg_df['std_value'],
                    avg_df['average_value'] + avg_df['std_value'],
                    color='black',
                    alpha=0.1
                )
        
        # Plot individual participant data
        for i, (participant_id, processed_data) in enumerate(processed_participants.items()):
            if metric in processed_data:
                participant_df = processed_data[metric]
                
                # Plot participant data
                plt.plot(
                    participant_df['minutes_into_exam'],
                    participant_df['value'],
                    color=participant_colors[i],
                    linewidth=1.5,
                    alpha=0.8,
                    label=f'Participant {participant_id}'
                )
        
        # Add task markers
        for minute, label in markers:
            if minute <= avg_df['minutes_into_exam'].max():
                plt.axvline(x=minute, linestyle=':', color='gray', alpha=0.7)
                plt.text(minute, plt.ylim()[1]*0.98, label, rotation=90, 
                         verticalalignment='top', fontsize=8)
        
        # Add appropriate units to the y-axis label
        units = {
            'EDA': 'μS',
            'HR': 'BPM',
            'TEMP': '°C',
            'BVP': 'a.u.',
            'IBI': 's'
        }
        y_label = f'{metric} Value'
        if metric in units:
            y_label = f'{metric} ({units[metric]})'
        
        # Set axis labels and title
        plt.xlabel('Minutes into Exam')
        plt.ylabel(y_label)
        plt.title(f'{metric} - Individual Participants vs Average')
        
        # Add legend with smaller font and placed outside the plot
        plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=9)
        
        # Add grid
        plt.grid(True, alpha=0.3)
        
        # Apply specific y-axis limits for better visibility
        if metric == 'EDA':
            plt.ylim(0, min(1, plt.ylim()[1]))
        elif metric == 'HR':
            plt.ylim(max(60, plt.ylim()[0] * 0.9), min(120, plt.ylim()[1] * 1.1))
        
        # Save the plot
        plt.tight_layout()
        plt.savefig(f'{output_folder}/{metric}_individual_vs_average.png', dpi=300, bbox_inches='tight')
        plt.close()
    
    print(f"Saved {len(metrics_to_plot)} plots to {output_folder}")

def create_multi_participant_comparison(
    raw_data, 
    average_data, 
    sampled_participants, 
    metrics=['EDA', 'HR'], 
    output_folder='individual_plots',
    exam_duration_minutes=90
):
    """
    Create a single comparison plot with multiple metrics and participants.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Filter to only include metrics that exist in the average data
    valid_metrics = [m for m in metrics if m in average_data]
    
    if not valid_metrics:
        print("No valid metrics to plot")
        return
    
    # Process each participant's data
    processed_participants = {}
    for participant_id in sampled_participants:
        if participant_id in raw_data:
            processed_participants[participant_id] = process_participant_data(
                raw_data[participant_id],
                exam_duration_minutes
            )
    
    # Define a color palette for participants
    participant_colors = plt.cm.tab10(np.linspace(0, 1, len(sampled_participants)))
    
    # Create subplots - one row per metric
    fig = plt.figure(figsize=(12, 4 * len(valid_metrics)))
    gs = GridSpec(len(valid_metrics), 1, figure=fig)
    
    # Define task markers based on the paper's protocol
    if exam_duration_minutes == 90:  # Midterm exam
        markers = [
            (0, "Start"),
            (10, "Task 1"),
            (17, "Task 2"),
            (22, "Task 3"),
            (35, "Math"),
            (45, "Oral"),
            (90, "End")
        ]
    else:  # Final exam (assumed to be 180 minutes)
        markers = [
            (0, "Start"),
            (20, "Task 1"),
            (34, "Task 2"),
            (44, "Task 3"),
            (70, "Math"),
            (90, "Oral"),
            (180, "End")
        ]
    
    # Plot each metric
    for i, metric in enumerate(valid_metrics):
        ax = fig.add_subplot(gs[i])
        
        # Plot average data
        if metric in average_data:
            avg_df = average_data[metric]
            
            # Check if required columns exist
            if 'minutes_into_exam' not in avg_df.columns:
                print(f"Warning: 'minutes_into_exam' column missing in average data for {metric}")
                # Create it from time_index if available
                if 'time_index' in avg_df.columns:
                    max_index = avg_df['time_index'].max()
                    avg_df['minutes_into_exam'] = avg_df['time_index'] * (exam_duration_minutes / max_index)
                else:
                    print(f"Warning: Cannot plot {metric} due to missing time data")
                    continue
            
            ax.plot(
                avg_df['minutes_into_exam'], 
                avg_df['average_value'], 
                color='black', 
                linewidth=3, 
                label='Average'
            )
            
            if 'std_value' in avg_df.columns:
                ax.fill_between(
                    avg_df['minutes_into_exam'],
                    avg_df['average_value'] - avg_df['std_value'],
                    avg_df['average_value'] + avg_df['std_value'],
                    color='black',
                    alpha=0.1
                )
        
        # Plot individual participant data
        for j, (participant_id, processed_data) in enumerate(processed_participants.items()):
            if metric in processed_data:
                participant_df = processed_data[metric]
                
                # Plot participant data
                ax.plot(
                    participant_df['minutes_into_exam'],
                    participant_df['value'],
                    color=participant_colors[j],
                    linewidth=1.5,
                    alpha=0.8,
                    label=f'P{participant_id}'
                )
        
        # Add task markers
        y_min, y_max = ax.get_ylim()
        for minute, label in markers:
            if minute <= avg_df['minutes_into_exam'].max():
                ax.axvline(x=minute, linestyle=':', color='gray', alpha=0.7)
                # Only add text to the bottom plot
                if i == len(valid_metrics) - 1:
                    ax.text(minute, y_min, label, rotation=90, 
                          verticalalignment='bottom', fontsize=8)
        
        # Add appropriate units to the y-axis label
        units = {
            'EDA': 'μS',
            'HR': 'BPM',
            'TEMP': '°C',
            'BVP': 'a.u.',
            'IBI': 's'
        }
        y_label = metric
        if metric in units:
            y_label = f'{metric} ({units[metric]})'
        
        # Set axis labels and title
        if i == len(valid_metrics) - 1:
            ax.set_xlabel('Minutes into Exam')
        ax.set_ylabel(y_label)
        ax.set_title(f'{metric} Comparison')
        
        # Apply specific y-axis limits for better visibility
        if metric == 'EDA':
            ax.set_ylim(0, min(1, y_max))
        elif metric == 'HR':
            ax.set_ylim(max(60, y_min * 0.9), min(120, y_max * 1.1))
        
        # Add grid
        ax.grid(True, alpha=0.3)
        
        # Only add legend to the first plot
        if i == 0:
            ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=9)
    
    plt.tight_layout()
    plt.savefig(f'{output_folder}/multi_metric_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    print(f"Saved multi-metric comparison plot to {output_folder}")

def create_stress_level_plot(
    raw_data, 
    average_data, 
    sampled_participants, 
    output_folder='individual_plots',
    exam_duration_minutes=90
):
    """
    Create a plot showing estimated stress levels for individual participants vs average.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Use EDA and HR for stress level estimation if available
    stress_metrics = []
    if 'EDA' in average_data:
        stress_metrics.append('EDA')
    if 'HR' in average_data:
        stress_metrics.append('HR')
    
    if not stress_metrics:
        print("No stress metrics (EDA or HR) available for stress level plot")
        return
    
    # Process each participant's data
    processed_participants = {}
    for participant_id in sampled_participants:
        if participant_id in raw_data:
            processed_participants[participant_id] = process_participant_data(
                raw_data[participant_id],
                exam_duration_minutes
            )
    
    # Define reference values for stress classification
    reference_values = {
        'EDA': {'threshold': 0.1, 'weight': 1.0, 'max_normal': 0.05, 'min_stress': 0.15},
        'HR': {'threshold': 78, 'weight': 0.8, 'max_normal': 70, 'min_stress': 85}
    }
    
    # Calculate normalized stress scores for each participant
    participant_stress = {}
    
    # First, calculate average stress
    avg_stress = None
    avg_stress_scores = None
    
    for metric in stress_metrics:
        if metric in average_data and metric in reference_values:
            avg_df = average_data[metric]
            
            # Check if required columns exist
            if 'minutes_into_exam' not in avg_df.columns:
                print(f"Warning: 'minutes_into_exam' column missing in average data for {metric}")
                # Create it from time_index if available
                if 'time_index' in avg_df.columns:
                    max_index = avg_df['time_index'].max()
                    avg_df['minutes_into_exam'] = avg_df['time_index'] * (exam_duration_minutes / max_index)
                else:
                    print(f"Warning: Cannot use {metric} for stress plot due to missing time data")
                    continue
            
            max_normal = reference_values[metric]['max_normal']
            min_stress = reference_values[metric]['min_stress']
            
            # Normalize values between 0-1 where:
            # 0 = definitely not stressed (at or below max_normal)
            # 1 = definitely stressed (at or above min_stress)
            normalized_values = np.clip(
                (avg_df['average_value'] - max_normal) / (min_stress - max_normal),
                0, 1
            )
            
            if avg_stress is None:
                avg_stress = avg_df[['time_index', 'minutes_into_exam']].copy()
                avg_stress_scores = normalized_values * reference_values[metric]['weight']
            else:
                # Add to existing scores, weighted
                avg_stress_scores += normalized_values * reference_values[metric]['weight']
    
    # If we couldn't calculate stress from average data, return
    if avg_stress is None:
        print("Could not calculate average stress levels")
        return
    
    # Normalize by total weight
    total_weight = sum(reference_values[m]['weight'] for m in stress_metrics)
    if total_weight > 0:
        avg_stress['stress_score'] = avg_stress_scores / total_weight
    
    # Now calculate individual participant stress
    for participant_id, processed_data in processed_participants.items():
        # Check if participant has required metrics
        if not any(metric in processed_data for metric in stress_metrics):
            print(f"Participant {participant_id} is missing required stress metrics")
            continue
            
        participant_stress[participant_id] = None
        stress_scores = None
        
        for metric in stress_metrics:
            if metric in processed_data and metric in reference_values:
                part_df = processed_data[metric]
                max_normal = reference_values[metric]['max_normal']
                min_stress = reference_values[metric]['min_stress']
                
                # Normalized values for this participant
                normalized_values = np.clip(
                    (part_df['value'] - max_normal) / (min_stress - max_normal),
                    0, 1
                )
                
                if participant_stress[participant_id] is None:
                    participant_stress[participant_id] = part_df[['time_index', 'minutes_into_exam']].copy()
                    stress_scores = normalized_values * reference_values[metric]['weight']
                else:
                    # Add if we can align the time indices
                    stress_scores += normalized_values * reference_values[metric]['weight']
        
        # Normalize by total weight
        if total_weight > 0 and stress_scores is not None:
            participant_stress[participant_id]['stress_score'] = stress_scores / total_weight
    
    # Create the plot
    plt.figure(figsize=(12, 6))
    
    # Define a color palette for participants
    participant_colors = plt.cm.tab10(np.linspace(0, 1, len(sampled_participants)))
    
    # Plot average stress
    plt.plot(
        avg_stress['minutes_into_exam'],
        avg_stress['stress_score'],
        color='black',
        linewidth=3,
        label='Average Stress Level'
    )
    
    # Plot individual participant stress
    for i, (participant_id, stress_df) in enumerate(participant_stress.items()):
        if stress_df is not None and 'stress_score' in stress_df.columns:
            plt.plot(
                stress_df['minutes_into_exam'],
                stress_df['stress_score'],
                color=participant_colors[i],
                linewidth=1.5,
                alpha=0.8,
                label=f'Participant {participant_id}'
            )
    
    # Add task markers
    if exam_duration_minutes == 90:  # Midterm exam
        markers = [
            (0, "Start"),
            (10, "Task 1 (Lego without instructions)"),
            (17, "Task 2 (Lego with instructions)"),
            (22, "Task 3 (Lego + counting backwards)"),
            (35, "Mathematical Task"),
            (45, "Oral Presentation"),
            (90, "End")
        ]
    else:  # Final exam
        markers = [
            (0, "Start"),
            (20, "Task 1"),
            (34, "Task 2"),
            (44, "Task 3"),
            (70, "Math Task"),
            (90, "Oral Presentation"),
            (180, "End")
        ]
    
    # Get current y limits
    y_min, y_max = plt.ylim()
    
    for minute, label in markers:
        if minute <= avg_stress['minutes_into_exam'].max():
            plt.axvline(x=minute, linestyle=':', color='gray', alpha=0.7)
            plt.text(minute, y_min - 0.05, label, rotation=90, verticalalignment='top', fontsize=8)
    
    # Add stress level indicators
    plt.axhspan(0, 0.3, alpha=0.1, color='green', label='Low Stress')
    plt.axhspan(0.3, 0.7, alpha=0.1, color='yellow', label='Moderate Stress')
    plt.axhspan(0.7, 1, alpha=0.1, color='red', label='High Stress')
    
    # Set axis labels and title
    plt.xlabel('Minutes into Exam')
    plt.ylabel('Stress Level')
    plt.title('Estimated Stress Levels During Exam')
    plt.ylim(0, 1)
    
    # Add legend with smaller font and placed outside the plot
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=9)
    
    # Add grid
    plt.grid(True, alpha=0.3)
    
    # Save the plot
    plt.tight_layout()
    plt.savefig(f'{output_folder}/stress_level_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    print(f"Saved stress level comparison plot to {output_folder}")

def main():
    # File paths
    raw_data_file = 'complete_dataset.json'
    average_data_file = 'average_metrics.json'
    output_folder = 'individual_plots'
    
    # Exam duration in minutes (from the paper: 90 minutes for midterms, 180 for final)
    exam_duration_minutes = 90  # Change to 180 for final exam
    
    # Set random seed for reproducibility
    random.seed(42)
    
    # Load raw data
    raw_data = load_data(raw_data_file)
    if raw_data is None:
        return
    
    # Load precomputed average data
    average_data = load_average_data(average_data_file)
    if average_data is None:
        print("Average data not found. Please run the average computation script first.")
        return
    
    # Print columns in average data for debugging
    for metric, df in average_data.items():
        print(f"Columns in {metric} average data: {df.columns.tolist()}")
        # If 'minutes_into_exam' is missing, add it
        if 'minutes_into_exam' not in df.columns and 'time_index' in df.columns:
            max_index = df['time_index'].max()
            df['minutes_into_exam'] = df['time_index'] * (exam_duration_minutes / max_index)
            print(f"Added 'minutes_into_exam' column to {metric} data")
    
    # Sample 5 random participants
    sampled_participants = sample_participants(raw_data, num_samples=5)
    print(f"Sampled participants: {', '.join(sampled_participants)}")
    
    # Create individual vs average plots
    metrics_to_plot = ['EDA', 'HR', 'TEMP', 'BVP', 'IBI']  # Add or remove metrics as needed
    plot_individual_vs_average(
        raw_data, 
        average_data, 
        sampled_participants, 
        metrics_to_plot,
        output_folder,
        exam_duration_minutes
    )
    
    # Create multi-participant comparison plot
    create_multi_participant_comparison(
        raw_data, 
        average_data, 
        sampled_participants, 
        metrics=['EDA', 'HR'],
        output_folder=output_folder,
        exam_duration_minutes=exam_duration_minutes
    )
    
    # Create stress level comparison
    create_stress_level_plot(
        raw_data,
        average_data,
        sampled_participants,
        output_folder=output_folder,
        exam_duration_minutes=exam_duration_minutes
    )
    
    print("Analysis complete!")

if __name__ == "__main__":
    main()

Loaded data for 29 participants
Loaded average data for 6 metrics
Columns in ACC average data: ['time_index', 'minutes_into_exam', 'average_value', 'count', 'std_value']
Columns in TEMP average data: ['time_index', 'minutes_into_exam', 'average_value', 'count', 'std_value']
Columns in IBI average data: ['time_index', 'minutes_into_exam', 'average_value', 'count', 'std_value']
Columns in EDA average data: ['time_index', 'minutes_into_exam', 'average_value', 'count', 'std_value']
Columns in BVP average data: ['time_index', 'minutes_into_exam', 'average_value', 'count', 'std_value']
Columns in HR average data: ['time_index', 'minutes_into_exam', 'average_value', 'count', 'std_value']
Sampled participants: subject_21, subject_04, subject_01, subject_24, subject_09
Plotting 5 metrics for 5 participants
Saved 5 plots to individual_plots
Saved multi-metric comparison plot to individual_plots
Participant subject_21 is missing required stress metrics
Participant subject_04 is missing required s

In [7]:
def get_metric_color(metric):
    """Return a consistent color for each metric."""
    colors = {
        'EDA': '#1d3557',  # Dark blue
        'HR': '#e63946',   # Red
        'TEMP': '#457b9d', # Medium blue
        'BVP': '#a8dadc',  # Light blue
        'IBI': '#f1faee',  # Off-white
        'ACC': '#e9c46a'   # Yellow
    }
    return colors.get(metric, '#666666')  # Gray for unknown metrics

def get_stress_color(stress_level):
    """Return a color based on stress level."""
    colors = {
        'Very Low': '#a8dadc',   # Light blue
        'Low': '#90be6d',        # Green
        'Moderate': '#f9c74f',   # Yellow
        'High': '#f8961e',       # Orange
        'Very High': '#e63946'   # Red
    }
    return colors.get(stress_level, '#666666')  # Gray for unknown levels

def inspect_first_entry(data):
    """Inspect the first entry in the data to understand structure."""
    if not data:
        print("No data to inspect")
        return
    
    try:
        first_participant = list(data.keys())[0]
        print(f"\nInspecting first participant ({first_participant}):")
        
        participant_data = data[first_participant]
        if not isinstance(participant_data, dict):
            print(f"  Warning: Participant data is not a dictionary, it's {type(participant_data)}")
            return
        
        for metric, values in participant_data.items():
            print(f"  Metric: {metric}")
            print(f"  Type: {type(values)}")
            print(f"  Length: {len(values) if hasattr(values, '__len__') else 'N/A'}")
            
            if isinstance(values, list) and len(values) > 0:
                print(f"  First value type: {type(values[0])}")
                print(f"  First value: {values[0]}")
                
                if len(values) > 1:
                    print(f"  Second value type: {type(values[1])}")
                    print(f"  Second value: {values[1]}")
        
        print("\n")
    except Exception as e:
        print(f"Error inspecting first entry: {str(e)}")

def validate_json_file(file_path):
    """Validate a JSON file and provide diagnostics."""
    try:
        # Check if file exists
        if not os.path.exists(file_path):
            print(f"Error: File {file_path} does not exist")
            return False
        
        # Check file size
        file_size = os.path.getsize(file_path) / (1024 * 1024)  # Size in MB
        print(f"File size: {file_size:.2f} MB")
        
        # Try to read the first few lines
        print("Checking file format...")
        with open(file_path, 'r') as f:
            first_lines = [next(f) for _ in range(10) if f]
        
        # Check if it starts with a valid JSON marker
        first_line = first_lines[0].strip() if first_lines else ""
        
        if not first_line.startswith('{') and not first_line.startswith('['):
            print("Warning: File does not start with a valid JSON marker ({ or [)")
            
        # Try to load a small portion of the file
        try:
            with open(file_path, 'r') as f:
                content = f.read(10000)  # Read first 10KB
                json.loads(content + ']' if content.startswith('[') else content + '}')
                print("File appears to have valid JSON format at the beginning")
        except json.JSONDecodeError as e:
            print(f"Error parsing first 10KB: {str(e)}")
        
        # Try to validate full file
        try:
            with open(file_path, 'r') as f:
                json.load(f)
                print("Successfully validated the entire JSON file")
                return True
        except json.JSONDecodeError as e:
            print(f"Error parsing complete file: {str(e)}")
            print(f"Error occurred at position {e.pos}, line {e.lineno}, column {e.colno}")
            
            # Try to show problematic section
            with open(file_path, 'r') as f:
                lines = f.readlines()
                
                # Show context around the error
                start_line = max(0, e.lineno - 3)
                end_line = min(len(lines), e.lineno + 3)
                
                print("\nContext around error:")
                for i in range(start_line, end_line):
                    prefix = ">>> " if i + 1 == e.lineno else "    "
                    print(f"{prefix}Line {i+1}: {lines[i].rstrip()}")
                
                # Show position indicator
                if e.lineno <= len(lines):
                    print(" " * (e.colno + 8) + "^-- Error position")
                    
            return False
    except Exception as e:
        print(f"Error validating JSON file: {str(e)}")
        return False

def get_alternative_json_path():
    """Ask user for alternative JSON file path."""
    print("\nPlease provide an alternative path to your JSON data file:")
    file_path = input("Enter file path (or press Enter to use dummy data): ").strip()
    
    if file_path:
        if os.path.exists(file_path):
            return file_path
        else:
            print(f"File {file_path} does not exist")
            return None
    else:
        return None

def main():
    # File paths
    input_file = 'complete_dataset.json'
    output_file = 'average_metrics_windowed.json'
    output_folder = 'windowed_plots'
    
    # First, validate the JSON file
    print(f"Validating JSON file: {input_file}")
    is_valid = validate_json_file(input_file)
    
    # Window parameters
    window_size = 30  # Number of data points per window
    window_overlap = 0.5  # 50% overlap between windows
    
    # Load data with improved error handling
    data = None
    if is_valid:
        data = load_data(input_file)
    
    # If loading failed, try to get alternative file path or use dummy data
    if data is None:
        alternative_path = get_alternative_json_path()
        if alternative_path:
            data = load_data(alternative_path)
        
        # If still no data, create dummy data for testing
        if data is None:
            data = create_dummy_data()
    
    # Inspect data structure to check first entry
    inspect_first_entry(data)
    
    # Compute average metrics using windowing
    averages = compute_windowed_average_metrics(data, window_size, window_overlap)
    
    if not averages:
        print("Error: No average metrics computed. Cannot continue.")
        return
    
    # Save average metrics
    success = save_average_metrics(averages, output_file)
    
    if success:
        # Plot average metrics
        plot_average_metrics(averages, output_folder)
        
        # Create multi-metric summary
        create_multi_metric_summary(averages, output_folder)
        
        print("Analysis complete!")
    else:
        print("Error: Failed to save average metrics. Analysis aborted.")

if __name__ == "__main__":
    main()

Validating JSON file: complete_dataset.json
File size: 225.39 MB
Checking file format...
Error parsing first 10KB: Expecting ',' delimiter: line 660 column 7 (char 10001)
Error parsing complete file: Expecting property name enclosed in double quotes: line 13388360 column 9 (char 236337703)
Error occurred at position 236337703, line 13388360, column 9

Context around error:
    Line 13388358:       },
    Line 13388359:       {
>>> Line 13388360: 
                 ^-- Error position

Please provide an alternative path to your JSON data file:


KeyboardInterrupt: Interrupted by user

In [9]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm

def convert_csv_to_json(data_dir='./Subjects/', max_participants=None):
    """
    Convert participant CSV files to a JSON file using windowing approach from the paper.
    
    Parameters:
    - data_dir: Directory containing participant folders
    - max_participants: Maximum number of participants to process (None for all)
    """
    # Get list of participant folders
    participant_folders = [f for f in os.listdir(data_dir) 
                          if os.path.isdir(os.path.join(data_dir, f)) 
                          and f.startswith('subject_')]
    
    # Sort folders to ensure consistent order
    participant_folders.sort()
    
    # Limit number of participants if specified
    if max_participants is not None:
        participant_folders = participant_folders[:max_participants]
    
    # Key metrics mentioned in the paper
    paper_metrics = ['EDA', 'HR', 'TEMP', 'BVP', 'IBI']
    
    # Dictionary to store all data
    all_data = {}
    
    # Process each participant
    for participant in tqdm(participant_folders, desc="Processing participants"):
        print(f"Processing {participant}...")
        all_data[participant] = {}
        
        # Process each metric mentioned in the paper
        for metric in paper_metrics:
            csv_path = os.path.join(data_dir, participant, f"{metric}.csv")
            
            # Check if file exists
            if not os.path.exists(csv_path):
                print(f"  ✗ {metric}.csv not found")
                continue
            
            try:
                # Load and process the CSV file
                df = pd.read_csv(csv_path)
                
                # Simplify the data structure based on metric type
                if metric == 'ACC':
                    # For accelerometer data
                    if len(df.columns) >= 3:
                        # Extract x, y, z columns
                        values = df.iloc[:, 0:3].values.tolist()
                    else:
                        # If columns are missing, use the available ones
                        values = df.values.tolist()
                elif metric == 'IBI':
                    # For IBI data, get the IBI column if available
                    if 'IBI' in df.columns:
                        values = df['IBI'].values.tolist()
                    else:
                        # Otherwise just use the first column
                        values = df.iloc[:, 0].values.tolist()
                else:
                    # For other metrics (EDA, HR, TEMP, BVP)
                    values = df.iloc[:, 0].values.tolist()
                
                # Apply windowing as described in the paper
                window_size = 30  # From paper: "a 30-point window for EDA"
                window_overlap = 0.5  # 50% overlap between windows
                
                # Calculate step size based on overlap
                step_size = int(window_size * (1 - window_overlap))
                
                # Ensure step size is at least 1
                step_size = max(1, step_size)
                
                # Calculate number of windows
                num_windows = (len(values) - window_size) // step_size + 1
                
                # Process each window
                windowed_values = []
                for window_idx in range(num_windows):
                    start_idx = window_idx * step_size
                    end_idx = start_idx + window_size
                    
                    # Ensure end index doesn't exceed data length
                    end_idx = min(end_idx, len(values))
                    
                    # Get window data
                    window_data = values[start_idx:end_idx]
                    
                    # For the JSON file, we're keeping the raw values, not averages
                    # We'll compute averages in the analysis phase
                    windowed_values.append(window_data)
                
                # Store the windowed data
                all_data[participant][metric] = windowed_values
                
                print(f"  ✓ Processed {metric}.csv - {len(windowed_values)} windows")
            except Exception as e:
                print(f"  ✗ Error processing {metric}.csv: {str(e)}")
    
    # Save the data to a JSON file
    output_filename = 'complete_dataset.json'
    
    try:
        with open(output_filename, 'w') as f:
            # Use compact JSON format (no pretty printing) to reduce file size
            json.dump(all_data, f)
        
        print(f"\nSuccessfully saved data to {output_filename}")
        print(f"File size: {os.path.getsize(output_filename) / (1024*1024):.2f} MB")
        return True
    except Exception as e:
        print(f"Error saving data to {output_filename}: {str(e)}")
        
        # Try saving to an alternative file with less indentation
        try:
            with open('complete_dataset_minimal.json', 'w') as f:
                json.dump(all_data, f, separators=(',', ':'))
            print(f"Saved minimal version to complete_dataset_minimal.json")
            return True
        except Exception as e2:
            print(f"Error saving minimal file: {str(e2)}")
            return False

def process_windows_to_average(json_path='complete_dataset.json'):
    """
    Process the windows in the dataset to calculate averages.
    
    This function reads the windowed data and calculates average metrics
    according to the approach mentioned in the paper.
    """
    try:
        # Load the dataset
        with open(json_path, 'r') as f:
            data = json.load(f)
        
        print(f"Loaded data for {len(data)} participants")
        
        # Dictionary to store average metrics
        averages = {}
        
        # Process each metric
        metrics = set()
        for participant_data in data.values():
            for metric in participant_data.keys():
                metrics.add(metric)
        
        print(f"Available metrics: {', '.join(metrics)}")
        
        for metric in metrics:
            print(f"Processing {metric}...")
            
            # Collect all window values across participants
            all_window_data = []
            for participant, participant_data in data.items():
                if metric in participant_data:
                    for window_idx, window in enumerate(participant_data[metric]):
                        # Each window is a list of values
                        # We need to flatten the nested windows from multiple participants
                        
                        # For ACC data which might be multi-dimensional
                        if metric == 'ACC' and isinstance(window[0], list):
                            # Calculate magnitude for each point in the window
                            magnitudes = []
                            for point in window:
                                if len(point) >= 3:
                                    # Calculate magnitude of 3D vector
                                    magnitude = np.sqrt(sum(v*v for v in point[:3]))
                                    magnitudes.append(magnitude)
                            
                            # Store window index and magnitudes
                            if magnitudes:
                                all_window_data.append({
                                    'participant': participant,
                                    'window_idx': window_idx,
                                    'values': magnitudes
                                })
                        else:
                            # For other metrics, store the window values directly
                            all_window_data.append({
                                'participant': participant,
                                'window_idx': window_idx,
                                'values': window
                            })
            
            print(f"  Collected {len(all_window_data)} windows from all participants")
            
            # Calculate average for each window position
            window_averages = {}
            for window_data in all_window_data:
                window_idx = window_data['window_idx']
                values = window_data['values']
                
                # Initialize window stats if not already present
                if window_idx not in window_averages:
                    window_averages[window_idx] = {
                        'sum': sum(values),
                        'count': len(values),
                        'values': values
                    }
                else:
                    # Update existing window stats
                    window_averages[window_idx]['sum'] += sum(values)
                    window_averages[window_idx]['count'] += len(values)
                    window_averages[window_idx]['values'].extend(values)
            
            # Calculate final averages and standard deviations
            result_data = []
            for window_idx, stats in sorted(window_averages.items()):
                if stats['count'] > 0:
                    avg_value = stats['sum'] / stats['count']
                    std_value = np.std(stats['values']) if len(stats['values']) > 1 else 0
                    
                    result_data.append({
                        'window_index': window_idx,
                        'average_value': avg_value,
                        'std_value': std_value,
                        'count': stats['count']
                    })
            
            if result_data:
                # Convert to DataFrame for easier manipulation
                df = pd.DataFrame(result_data)
                
                # Add normalized time for visualization
                max_window = df['window_index'].max()
                if max_window > 0:
                    df['normalized_time'] = df['window_index'] / max_window
                else:
                    df['normalized_time'] = 0.5
                
                # Add minutes_into_exam (assuming 90-minute exam as in the paper)
                df['minutes_into_exam'] = df['normalized_time'] * 90
                
                # Store the processed data
                averages[metric] = df
                
                print(f"  Processed {len(df)} windows with averages for {metric}")
            else:
                print(f"  No valid windows processed for {metric}")
        
        # Save average metrics to a JSON file
        output_file = 'average_metrics.json'
        output_data = {}
        
        for metric, df in averages.items():
            output_data[metric] = df.to_dict(orient='records')
        
        with open(output_file, 'w') as f:
            json.dump(output_data, f, indent=2)
        
        print(f"Saved average metrics to {output_file}")
        print(f"File size: {os.path.getsize(output_file) / 1024:.2f} KB")
        
        return averages
    except Exception as e:
        print(f"Error processing windows to average: {str(e)}")
        return None

def main():
    # Set the directory containing participant folders
    data_dir = '.'  # Current directory - update if your data is in a subfolder
    
    # Optional: Limit to a specific number of participants for testing
    max_participants = None  # Set to a number (e.g., 10) to limit, or None for all
    
    # Convert CSV files to JSON
    success = convert_csv_to_json(data_dir, max_participants)
    
    if success:
        # Process the JSON data to calculate averages
        averages = process_windows_to_average()
        
        if averages:
            print("Dataset and averages successfully generated!")
            return True
        else:
            print("Failed to calculate averages.")
            return False
    else:
        print("Failed to generate dataset.")
        return False

if __name__ == "__main__":
    main()

Processing participants: 0it [00:00, ?it/s]


Successfully saved data to complete_dataset.json
File size: 0.00 MB
Loaded data for 0 participants
Available metrics: 
Saved average metrics to average_metrics.json
File size: 0.00 KB
Failed to calculate averages.





In [2]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm

def convert_csv_to_json(data_dir='subjects', max_participants=None, indent=2):
    """
    Convert participant CSV files to a well-formatted JSON file.
    
    Parameters:
    - data_dir: Directory containing participant folders (default: 'subjects')
    - max_participants: Maximum number of participants to process (None for all)
    - indent: Number of spaces for JSON indentation (default: 2)
    """
    # Check if the subjects directory exists
    if not os.path.exists(data_dir):
        print(f"Error: Directory '{data_dir}' not found. Please check the path.")
        return False
    
    # Get list of participant folders within the subjects directory
    participant_folders = [f for f in os.listdir(data_dir) 
                          if os.path.isdir(os.path.join(data_dir, f))]
    
    # Sort folders to ensure consistent order
    participant_folders.sort()
    
    # Limit number of participants if specified
    if max_participants is not None:
        participant_folders = participant_folders[:max_participants]
    
    print(f"Found {len(participant_folders)} participant folders")
    
    # Key metrics mentioned in the paper
    paper_metrics = ['EDA', 'HR', 'TEMP', 'BVP', 'IBI']
    
    # Dictionary to store all data
    all_data = {}
    
    # Process each participant
    for participant in tqdm(participant_folders, desc="Processing participants"):
        print(f"Processing {participant}...")
        all_data[participant] = {}
        
        # Process each metric mentioned in the paper
        for metric in paper_metrics:
            csv_path = os.path.join(data_dir, participant, f"{metric}.csv")
            
            # Check if file exists
            if not os.path.exists(csv_path):
                print(f"  ✗ {metric}.csv not found")
                continue
            
            try:
                # Load and process the CSV file
                df = pd.read_csv(csv_path)
                
                # Simplify the data structure based on metric type
                if metric == 'ACC':
                    # For accelerometer data
                    if len(df.columns) >= 3:
                        # Extract x, y, z columns
                        values = df.iloc[:, 0:3].values.tolist()
                    else:
                        # If columns are missing, use the available ones
                        values = df.values.tolist()
                elif metric == 'IBI':
                    # For IBI data, get the IBI column if available
                    if 'IBI' in df.columns:
                        values = df['IBI'].values.tolist()
                    else:
                        # Otherwise just use the first column
                        values = df.iloc[:, 0].values.tolist()
                else:
                    # For other metrics (EDA, HR, TEMP, BVP)
                    values = df.iloc[:, 0].values.tolist()
                
                # Store the raw data (no windowing in the raw data file)
                all_data[participant][metric] = values
                
                print(f"  ✓ Processed {metric}.csv - {len(values)} data points")
            except Exception as e:
                print(f"  ✗ Error processing {metric}.csv: {str(e)}")
    
    # Save the data to a JSON file with proper formatting
    output_filename = 'complete_dataset.json'
    
    try:
        print(f"\nSaving formatted JSON to {output_filename}...")
        
        # Write the JSON file with proper indentation
        with open(output_filename, 'w') as f:
            json.dump(all_data, f, indent=indent)
        
        print(f"Successfully saved formatted data to {output_filename}")
        print(f"File size: {os.path.getsize(output_filename) / (1024*1024):.2f} MB")
        
        # If the file is extremely large, offer to create a minified version as well
        file_size_mb = os.path.getsize(output_filename) / (1024*1024)
        if file_size_mb > 100:  # If larger than 100MB
            print("\nWarning: The formatted JSON file is quite large.")
            create_minified = input("Would you like to create a minified version as well? (y/n): ").lower() == 'y'
            
            if create_minified:
                minified_filename = 'complete_dataset_min.json'
                with open(minified_filename, 'w') as f:
                    json.dump(all_data, f, separators=(',', ':'))
                print(f"Saved minified version to {minified_filename}")
                print(f"Minified file size: {os.path.getsize(minified_filename) / (1024*1024):.2f} MB")
        
        return True
    except Exception as e:
        print(f"Error saving data to {output_filename}: {str(e)}")
        
        # Try saving to an alternative file with less formatting
        try:
            print("\nAttempting to save with minimal formatting...")
            with open('complete_dataset_min.json', 'w') as f:
                json.dump(all_data, f, separators=(',', ':'))
            print(f"Saved minimal version to complete_dataset_min.json")
            return True
        except Exception as e2:
            print(f"Error saving minimal file: {str(e2)}")
            return False

def split_large_json(filename='complete_dataset.json', max_participants_per_file=5):
    """
    Split a large JSON file into smaller parts.
    Useful if the complete dataset is too large to work with.
    
    Parameters:
    - filename: Path to the large JSON file
    - max_participants_per_file: Maximum number of participants per split file
    """
    try:
        # Load the dataset
        with open(filename, 'r') as f:
            data = json.load(f)
        
        print(f"Loaded data for {len(data)} participants")
        
        # Get participant IDs
        participants = list(data.keys())
        
        # Calculate number of files needed
        num_files = (len(participants) + max_participants_per_file - 1) // max_participants_per_file
        
        print(f"Splitting into {num_files} files with {max_participants_per_file} participants each...")
        
        # Create splits
        for i in range(num_files):
            start_idx = i * max_participants_per_file
            end_idx = min((i + 1) * max_participants_per_file, len(participants))
            
            split_participants = participants[start_idx:end_idx]
            split_data = {p: data[p] for p in split_participants}
            
            # Save split
            split_filename = f'dataset_part_{i+1}of{num_files}.json'
            with open(split_filename, 'w') as f:
                json.dump(split_data, f, indent=2)
            
            print(f"Saved {len(split_participants)} participants to {split_filename}")
        
        return True
    except Exception as e:
        print(f"Error splitting file: {str(e)}")
        return False

def process_windowed_averages(json_path='complete_dataset.json', window_size=30, window_overlap=0.5, indent=2):
    """
    Process the dataset to calculate windowed averages.
    
    This function reads the raw data and applies the windowing technique
    from the paper to calculate average metrics.
    
    Parameters:
    - json_path: Path to the JSON file with raw data
    - window_size: Size of the window in data points (default: 30 from paper)
    - window_overlap: Overlap between consecutive windows (default: 0.5 or 50%)
    - indent: Number of spaces for JSON indentation (default: 2)
    """
    try:
        # Load the dataset
        with open(json_path, 'r') as f:
            data = json.load(f)
        
        print(f"Loaded data for {len(data)} participants")
        
        # Dictionary to store average metrics
        averages = {}
        
        # Get all available metrics
        metrics = set()
        for participant_data in data.values():
            for metric in participant_data.keys():
                metrics.add(metric)
        
        print(f"Available metrics: {', '.join(metrics)}")
        
        # Calculate step size based on overlap
        step_size = int(window_size * (1 - window_overlap))
        if step_size < 1:
            step_size = 1
        
        # Process each metric
        for metric in metrics:
            print(f"Processing {metric} with window size {window_size}, step size {step_size}...")
            
            # Determine the maximum length for this metric across all participants
            max_length = 0
            for participant_data in data.values():
                if metric in participant_data:
                    max_length = max(max_length, len(participant_data[metric]))
            
            if max_length == 0:
                print(f"  No data found for {metric}")
                continue
            
            # Calculate number of windows
            num_windows = (max_length - window_size) // step_size + 1
            if num_windows < 1:
                num_windows = 1
            
            print(f"  Maximum data length: {max_length}, will create {num_windows} windows")
            
            # Initialize arrays to store window data
            window_data = {
                'window_index': [],
                'average_value': [],
                'std_value': [],
                'count': []
            }
            
            # Process each window
            for window_idx in range(num_windows):
                start_idx = window_idx * step_size
                end_idx = start_idx + window_size
                
                # Adjust end index if it exceeds max length
                if end_idx > max_length:
                    end_idx = max_length
                
                # Collect values from all participants for this window
                window_values = []
                
                for participant, participant_data in data.items():
                    if metric in participant_data:
                        participant_values = participant_data[metric]
                        
                        # Only consider windows that fit within this participant's data
                        if start_idx < len(participant_values):
                            # Adjust end index for this participant
                            participant_end_idx = min(end_idx, len(participant_values))
                            
                            # Extract values for this window
                            for i in range(start_idx, participant_end_idx):
                                try:
                                    value = participant_values[i]
                                    
                                    # Extract numeric value based on data type
                                    numeric_value = None
                                    if isinstance(value, (int, float)):
                                        numeric_value = float(value)
                                    elif isinstance(value, list) and len(value) > 0:
                                        # For accelerometer data (3D vectors)
                                        if len(value) >= 3:
                                            numeric_value = np.sqrt(sum(float(v)**2 for v in value[:3]))
                                        else:
                                            numeric_value = float(value[0])
                                    
                                    if numeric_value is not None:
                                        window_values.append(numeric_value)
                                except (IndexError, TypeError, ValueError) as e:
                                    # Skip problematic values
                                    continue
                
                # Calculate statistics if we have values
                if window_values:
                    window_data['window_index'].append(window_idx)
                    window_data['average_value'].append(np.mean(window_values))
                    window_data['std_value'].append(np.std(window_values))
                    window_data['count'].append(len(window_values))
            
            # Create DataFrame
            if window_data['window_index']:
                df = pd.DataFrame(window_data)
                # Add normalized time for visualization
                if len(df) > 1:
                    df['normalized_time'] = df['window_index'] / df['window_index'].max()
                else:
                    df['normalized_time'] = 0.5  # Middle point if only one window
                
                # Add minutes_into_exam field (estimated based on 90-minute exam from paper)
                df['minutes_into_exam'] = df['normalized_time'] * 90
                
                # Store the processed data
                averages[metric] = df
                
                print(f"  Processed {len(df)} windows with averages for {metric}")
            else:
                print(f"  No valid windows processed for {metric}")
        
        # Save average metrics to a JSON file with nice formatting
        output_file = 'average_metrics.json'
        output_data = {}
        
        for metric, df in averages.items():
            output_data[metric] = df.to_dict(orient='records')
        
        with open(output_file, 'w') as f:
            json.dump(output_data, f, indent=indent)
        
        print(f"Saved average metrics to {output_file}")
        print(f"File size: {os.path.getsize(output_file) / 1024:.2f} KB")
        
        return averages
    except Exception as e:
        print(f"Error processing windowed averages: {str(e)}")
        return None

def main():
    # Set the directory containing participant folders
    data_dir = 'subjects'  # Subfolder containing participant data
    
    # Optional: Limit to a specific number of participants for testing
    max_participants = None  # Set to a number (e.g., 10) to limit, or None for all
    
    # JSON indentation level (set to 2 or 4 spaces for nice formatting)
    indent_level = 2
    
    # Convert CSV files to JSON with proper formatting
    success = convert_csv_to_json(data_dir, max_participants, indent=indent_level)
    
    if success:
        # Check if the file is too large and needs to be split
        file_size_mb = os.path.getsize('complete_dataset.json') / (1024*1024)
        if file_size_mb > 200:  # If larger than 200MB
            print(f"\nWarning: The dataset file is quite large ({file_size_mb:.2f} MB).")
            split_file = input("Would you like to split it into smaller parts? (y/n): ").lower() == 'y'
            
            if split_file:
                split_large_json(max_participants_per_file=5)
        
        # Process the JSON data to calculate windowed averages
        # Using window parameters from the paper: 30-point window with 50% overlap
        averages = process_windowed_averages(
            window_size=30, 
            window_overlap=0.5,
            indent=indent_level
        )
        
        if averages:
            print("Dataset and averages successfully generated with proper formatting!")
            return True
        else:
            print("Failed to calculate averages.")
            return False
    else:
        print("Failed to generate dataset.")
        return False

if __name__ == "__main__":
    main()

Found 29 participant folders


Processing participants:  14%|█▍        | 4/29 [00:00<00:00, 33.31it/s]

Processing subject_01...
  ✓ Processed EDA.csv - 10019 data points
  ✓ Processed HR.csv - 2495 data points
  ✓ Processed TEMP.csv - 10015 data points
  ✓ Processed BVP.csv - 160368 data points
  ✓ Processed IBI.csv - 449 data points
Processing subject_02...
  ✓ Processed EDA.csv - 9419 data points
  ✓ Processed HR.csv - 2345 data points
  ✓ Processed TEMP.csv - 9425 data points
  ✓ Processed BVP.csv - 150776 data points
  ✓ Processed IBI.csv - 209 data points
Processing subject_03...
  ✓ Processed EDA.csv - 8303 data points
  ✓ Processed HR.csv - 2066 data points
  ✓ Processed TEMP.csv - 8305 data points
  ✓ Processed BVP.csv - 132879 data points
  ✓ Processed IBI.csv - 690 data points
Processing subject_04...
  ✓ Processed EDA.csv - 8435 data points
  ✓ Processed HR.csv - 2098 data points
  ✓ Processed TEMP.csv - 8433 data points
  ✓ Processed BVP.csv - 134958 data points
  ✓ Processed IBI.csv - 570 data points
Processing subject_05...
  ✓ Processed EDA.csv - 8333 data points
  ✓ Proc

Processing participants:  55%|█████▌    | 16/29 [00:00<00:00, 50.20it/s]

  ✓ Processed EDA.csv - 8885 data points
  ✓ Processed HR.csv - 2211 data points
  ✓ Processed TEMP.csv - 8881 data points
  ✓ Processed BVP.csv - 142152 data points
  ✓ Processed IBI.csv - 1290 data points
Processing subject_10...
  ✓ Processed EDA.csv - 8831 data points
  ✓ Processed HR.csv - 2198 data points
  ✓ Processed TEMP.csv - 8833 data points
  ✓ Processed BVP.csv - 141349 data points
  ✓ Processed IBI.csv - 90 data points
Processing subject_11...
  ✓ Processed EDA.csv - 8483 data points
  ✓ Processed HR.csv - 2111 data points
  ✓ Processed TEMP.csv - 8481 data points
  ✓ Processed BVP.csv - 135761 data points
  ✓ Processed IBI.csv - 1350 data points
Processing subject_12...
  ✓ Processed EDA.csv - 8921 data points
  ✓ Processed HR.csv - 2220 data points
  ✓ Processed TEMP.csv - 8921 data points
  ✓ Processed BVP.csv - 142790 data points
  ✓ Processed IBI.csv - 210 data points
Processing subject_13...
  ✓ Processed EDA.csv - 8771 data points
  ✓ Processed HR.csv - 2183 data p

Processing participants: 100%|██████████| 29/29 [00:00<00:00, 49.55it/s]

  ✓ Processed BVP.csv - 137356 data points
  ✓ Processed IBI.csv - 870 data points
Processing subject_21...
  ✓ Processed EDA.csv - 8681 data points
  ✓ Processed HR.csv - 2163 data points
  ✓ Processed TEMP.csv - 8681 data points
  ✓ Processed BVP.csv - 138962 data points
  ✓ Processed IBI.csv - 480 data points
Processing subject_22...
  ✓ Processed EDA.csv - 8375 data points
  ✓ Processed HR.csv - 2085 data points
  ✓ Processed TEMP.csv - 8377 data points
  ✓ Processed BVP.csv - 134001 data points
  ✓ Processed IBI.csv - 1020 data points
Processing subject_23...
  ✓ Processed EDA.csv - 8213 data points
  ✓ Processed HR.csv - 2045 data points
  ✓ Processed TEMP.csv - 8217 data points
  ✓ Processed BVP.csv - 131438 data points
  ✓ Processed IBI.csv - 630 data points
Processing subject_24...
  ✓ Processed EDA.csv - 8333 data points
  ✓ Processed HR.csv - 2075 data points
  ✓ Processed TEMP.csv - 8337 data points
  ✓ Processed BVP.csv - 133363 data points
  ✓ Processed IBI.csv - 870 data




Successfully saved formatted data to complete_dataset.json
File size: 61.43 MB
Loaded data for 29 participants
Available metrics: TEMP, IBI, EDA, HR, BVP
Processing TEMP with window size 30, step size 15...
  Maximum data length: 10617, will create 706 windows
  Processed 706 windows with averages for TEMP
Processing IBI with window size 30, step size 15...
  Maximum data length: 1770, will create 117 windows
  Processed 117 windows with averages for IBI
Processing EDA with window size 30, step size 15...
  Maximum data length: 10619, will create 706 windows
  Processed 668 windows with averages for EDA
Processing HR with window size 30, step size 15...
  Maximum data length: 2647, will create 175 windows
  Processed 175 windows with averages for HR
Processing BVP with window size 30, step size 15...
  Maximum data length: 169949, will create 11328 windows
  Processed 11328 windows with averages for BVP
Saved average metrics to average_metrics.json
File size: 2947.42 KB
Dataset and ave