# MMASH Data Overview
This notebook audits the raw MMASH (Multilevel Monitoring of Activity and Sleep in Healthy people) files, checks schema consistency, and prepares notes for the preprocessing pipelines.

## Checklist
- Verify presence of required raw CSVs under `data/raw/`.
- Inspect timestamp coverage and timezone information.
- Summarise missing values and participant coverage per modality.
- Draft hypotheses connecting light exposure, heart rate variability (HRV), and circadian phase.

In [None]:
# Imports for exploratory analysis
import os
from pathlib import Path

import numpy as np
import pandas as pd

from src import preprocess, feature_extraction, solar_features

In [None]:
# Data locations, resolved relative to the notebook's working directory.
RAW_DIR, PROCESSED_DIR = Path('data/raw'), Path('data/processed')

# Make sure both folders exist (creating parents as needed) before they are used.
for _folder in (RAW_DIR, PROCESSED_DIR):
    _folder.mkdir(parents=True, exist_ok=True)
del _folder  # leave only the two directory constants in the namespace

# Final expression of the cell: the notebook displays the sorted raw CSV paths.
sorted(RAW_DIR.glob('*.csv'))

In [None]:
# Visualize heart rate, activity, and sleep alignment for the first available processed user file
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Alignment check: overlay heart rate (left axis), accelerometer vector
# magnitude (right axis), and shaded sleep periods for one processed user file,
# so that misaligned timestamps between modalities are visible at a glance.
processed_dir = Path('data/processed')
# Pick the first available processed file (first in sorted path order).
files = sorted(processed_dir.glob('User_*_merged.csv'))
if not files:
    print('No processed user files found in data/processed/. Run process_all_users() first.')
else:
    sample = files[0]
    df = pd.read_csv(sample, parse_dates=['datetime'])
    fig, ax1 = plt.subplots(figsize=(12, 4))
    ax2 = ax1.twinx()  # second y-axis sharing the same time axis
    # Heart rate if available
    if 'heart_rate' in df.columns:
        ax1.plot(df['datetime'], df['heart_rate'], color='tab:red', label='Heart rate (bpm)', linewidth=1)
        ax1.set_ylabel('Heart rate (bpm)', color='tab:red')
        ax1.tick_params(axis='y', labelcolor='tab:red')
    # Activity magnitude if accelerometer axes present.
    # NOTE(review): every column whose name starts with 'acc_' is folded into
    # the magnitude; confirm the merged files carry only the raw axis columns.
    acc_axes = [c for c in df.columns if c.startswith('acc_')]
    if len(acc_axes) >= 3:
        # Euclidean norm across the accelerometer columns, one value per row.
        acc_vm = np.sqrt((df[acc_axes] ** 2).sum(axis=1))
        ax2.plot(df['datetime'], acc_vm, color='tab:blue', label='Activity VM', alpha=0.6)
        ax2.set_ylabel('Activity VM', color='tab:blue')
        ax2.tick_params(axis='y', labelcolor='tab:blue')
    # Shade sleep periods
    if 'is_sleep' in df.columns:
        # NaN casts to True under astype(bool), so mask missing flags out
        # explicitly; otherwise gaps in the sleep column are drawn as sleep.
        sleep_flag = df['is_sleep']
        asleep = (sleep_flag.notna() & sleep_flag.astype(bool)).to_numpy()
        # x is in data coordinates, y in axes coordinates (0 = bottom, 1 = top):
        # the band always spans the full plot height and does not disturb the
        # heart-rate autoscaling, whatever the y-limits end up being.
        ax1.fill_between(
            df['datetime'].to_numpy(),
            0,
            1,
            where=asleep,
            color='gray',
            alpha=0.15,
            step='pre',
            label='Sleep',
            transform=ax1.get_xaxis_transform(),
        )
    ax1.set_title(f'Alignment check: {sample.name}')
    ax1.set_xlabel('Time')
    fig.tight_layout()
    # Compose legend from both axes
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines + lines2, labels + labels2, loc='upper right')
    plt.show()