# Setup

## Function to read iMotions sensor file

In [35]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

def read_imotions(path, metadata=None):
    """
    Load an iMotions CSV export and pick out selected metadata entries.

    The file is expected to start with a run of metadata rows, i.e. rows
    whose first comma-separated field contains a '#'. Rows are collected
    until the first one without a '#', and that row is used as the column
    header of the data table.

    Parameters:
        path (str): Location of the iMotions CSV file.
        metadata (list[str], optional): Metadata keys to return. When
            omitted or empty, no metadata is returned.

    Returns:
        df (pd.DataFrame): Table parsed from the rows below the metadata.
        meta_dict (dict): Requested keys mapped to the raw text following
            the first comma of their metadata row (later commas are kept).
    """
    wanted_keys = metadata or []

    # Gather the leading metadata rows, dropping everything up to and
    # including the first '#'.
    header_rows = []
    with open(path, 'r') as handle:
        for raw_line in handle:
            if '#' not in raw_line.split(',')[0]:
                break
            header_rows.append(raw_line.strip().split('#', 1)[1])

    # Keep only the requested keys; a row without a comma carries no value.
    meta_dict = {}
    for row in header_rows:
        key, comma, value = row.partition(',')
        key = key.strip()
        if comma and key in wanted_keys:
            meta_dict[key] = value

    # The column header sits directly below the metadata rows.
    df = pd.read_csv(path, header=len(header_rows), low_memory=False)
    return df, meta_dict

def get_files(folder, tags=('',)):
    """
    List the visible entries of ``folder`` whose names contain every tag.

    Parameters:
        folder (str): Directory to list.
        tags (iterable[str], optional): Substrings that must all occur in
            an entry name. The default, a single empty string, matches
            every name.

    Returns:
        list[str]: Matching entry names (not full paths), sorted so the
        result does not depend on the platform's directory order. Names
        starting with '.' are skipped.
    """
    # Materialise once so a one-shot iterable is not exhausted by the first name.
    required = tuple(tags)
    return sorted(
        name
        for name in os.listdir(folder)
        if not name.startswith('.') and all(tag in name for tag in required)
    )


def _count_event_runs(frame, column):
    """Count runs of consecutive rows in which ``column`` equals 1."""
    events = frame[['Timestamp', column]].dropna()
    active = events[column] == 1
    # A new run id starts wherever the flag differs from the previous row.
    run_ids = (active != active.shift()).cumsum()
    return int(run_ids[active].nunique())


def get_biometric_data(in_folder, results_folder, respondents=None):
    """
    Extract one set of summary features per respondent and stimulus.

    For every respondent the first matching CSV in ``{in_folder}/Sensors/``
    is read with ``read_imotions`` and, for each value of
    ``SourceStimuliName``, facial-coding, EEG, GSR and blink features are
    computed. Two files are written to ``results_folder``:
    ``biometric_results.csv`` (one row per respondent) and
    ``errors_biometric.csv`` (which signal groups could not be computed).

    Parameters:
        in_folder (str): Folder containing a ``Sensors`` sub-folder.
        results_folder (str): Output folder; created if it does not exist.
        respondents (iterable, optional): Respondent ids to process. Each
            id is matched as a substring of the sensor file names.
            Defaults to ``[1, 2, 3]``.

    Returns:
        None
    """
    ######## Define ########
    # Define paths
    out_path = f"{results_folder}/"
    os.makedirs(out_path, exist_ok=True)

    if respondents is None:
        respondents = [1, 2, 3]  # define list of respondent ids

    # Define signal columns
    cols_afdex = [
        "Anger", "Contempt", "Disgust", "Fear", "Joy", "Sadness",
        "Surprise", "Engagement", "Valence", "Sentimentality",
        "Confusion", "Neutral",
    ]
    cols_eeg = [
        'High Engagement',
        'Low Engagement',
        'Distraction',
        'Drowsy',
        'Workload Average',
        'Frontal Asymmetry Alpha',
    ]

    # Define window lengths in seconds (not used by any feature yet)
    window_lengths = [3,]

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    # fall back to the old name on older NumPy versions.
    integrate = getattr(np, 'trapezoid', None) or np.trapz

    ######## Read Inputs #######
    # Get input files
    sensor_files = get_files(f'{in_folder}/Sensors/', tags=['.csv',])

    ### Begin ###
    results = []
    errors = []
    for respondent in respondents:
        error = {'respondent':respondent, 'FAC':None, 'EEG':None, 'GSR':None, 'Blinks':None, 'ET':None}
        interaction = {'respondent':respondent}

        # Ids may be ints, and `int in str` raises TypeError, so compare as
        # text. This is a plain substring match and may need adjustment.
        matches = [f for f in sensor_files if str(respondent) in f]
        if not matches:
            print(f'>>> Could not find {respondent} sensor data')
            continue

        try:
            df_sens_resp, _ = read_imotions(f'{in_folder}/Sensors/{matches[0]}')

            # Get sensor data per stimulus; rows without a stimulus name are skipped
            for task in df_sens_resp['SourceStimuliName'].dropna().unique():
                df_sens_task = df_sens_resp.loc[df_sens_resp['SourceStimuliName'] == task]

                # Get facial coding data
                for a in cols_afdex:
                    try:
                        values = df_sens_task[a].dropna()
                        interaction[f'sens_{task}_FAC_{a}_mean'] = values.mean()
                        auc_data = df_sens_task[['Timestamp', a]].dropna()
                        # Timestamps are divided by 1000 (presumably ms -> s; confirm)
                        interaction[f'sens_{task}_FAC_{a}_AUC'] = integrate(auc_data[a], x=auc_data['Timestamp']) / 1000
                        interaction[f'sens_{task}_FAC_{a}_Binary'] = values.max() >= 50
                    except Exception:
                        error['FAC'] = 'Missing'

                # Get EEG data; values at or below -9000 are excluded
                for e in cols_eeg:
                    try:
                        series = df_sens_task[e]
                        valid = series > -9000  # NaN compares False, so missing values drop out too
                        interaction[f'sens_{task}_EEG_{e}_mean'] = series[valid].mean()
                        auc_data = df_sens_task.loc[valid, ['Timestamp', e]].dropna()
                        interaction[f'sens_{task}_EEG_{e}_AUC'] = integrate(auc_data[e], x=auc_data['Timestamp']) / 1000
                    except Exception:
                        error['EEG'] = 'Missing'

                # Get GSR peaks: any peak at all, and number of separate peak runs
                try:
                    interaction[f'sens_{task}_GSR_PeakDetected_Binary'] = 1 if df_sens_task['Peak Detected'].sum() > 0 else 0
                    interaction[f'sens_{task}_GSR_Peaks_Count'] = _count_event_runs(df_sens_task, 'Peak Detected')
                except Exception:
                    error['GSR'] = 'Missing'

                # Get blink count and blinks per minute over the stimulus span
                try:
                    blink_count = _count_event_runs(df_sens_task, 'Blink Detected')
                    interaction[f'sens_{task}_ET_Blink_Count'] = blink_count
                    timestamps = df_sens_task['Timestamp'].values
                    minutes = (timestamps[-1] - timestamps[0]) / (1000 * 60)
                    # A zero-length span has no meaningful rate
                    interaction[f'sens_{task}_ET_Blink_Rate'] = blink_count / minutes if minutes > 0 else float('nan')
                except Exception:
                    error['ET'] = 'Missing'

            # TODO Get sensor data for non-interaction
            ##################################### Add this in
        except Exception as exc:
            # Keep going with the next respondent, but say what went wrong.
            print(f'>>> Failed {respondent}: {exc}')
            continue

        results.append(interaction)
        errors.append(error)

    results = pd.DataFrame(results)
    results.to_csv(f'{out_path}biometric_results.csv')

    errors = pd.DataFrame(errors)
    errors.to_csv(f'{out_path}errors_biometric.csv')

## Explanation of functions

The functions above read the sensor data files one CSV at a time, extract a single value per feature and stimulus, and write these features to a simple results file.

The functions must be adjusted to:
- Discern between long form and short form
- Isolate key moments from timings file provided by client
- Extract time series
- Compute group-wide features such as inter-subject correlation

# Preparation
- Create naming dictionary for all stims
- Get total times of all stims
- Prepare key_moments

## Stimulus duration scan
We load one sensor recording per group, extract the unique stimulus names, and estimate the exposure duration of each stimulus as the span between its first and last `Timestamp` value in that recording.

In [28]:
# Locate one sensor export per group for duration scanning.
# Searched layout: <project_root>/data/Export/Group */Analyses/*/Sensor Data/*.csv
project_root = Path.cwd().parent
data_export_dir = project_root / "data" / "Export"

# Group folder name -> first CSV in sorted order, or None when the group has none
group_sensor_files = {}
for group_dir in sorted(data_export_dir.glob("Group *")):
    if not group_dir.is_dir():
        continue
    sensor_dirs = sorted(group_dir.glob("Analyses/*/Sensor Data"))
    csv_candidates = []
    for sensor_dir in sensor_dirs:
        csv_candidates.extend(sorted(sensor_dir.glob("*.csv")))
    group_sensor_files[group_dir.name] = csv_candidates[0] if csv_candidates else None

# Naming the columns up front keeps sort_values("group") from raising
# KeyError when no group folder was found and the record list is empty.
sensor_selection = pd.DataFrame(
    [
        {
            "group": group,
            "sensor_file": path.name if path else None
        }
        for group, path in group_sensor_files.items()
    ],
    columns=["group", "sensor_file"],
).sort_values("group").reset_index(drop=True)

sensor_selection

Unnamed: 0,group,sensor_file
0,Group A,001_116.csv
1,Group B,001_58.csv
2,Group C,001_114.csv
3,Group D,001_102.csv
4,Group E,001_108.csv
5,Group F,001_107.csv


In [16]:
# Collect per-group stimulus durations without aggregating across groups
summary_columns = ["group", "stimulus_name", "duration_seconds", "duration_minutes"]
duration_tables = []
issues = {}  # group name -> reason the group was skipped

for group, path in group_sensor_files.items():
    if path is None:
        issues[group] = "No sensor CSV found"
        continue
    try:
        df_group, _ = read_imotions(path)
    except Exception as exc:
        issues[group] = f"read_imotions failed: {exc}"
        continue

    required_cols = {"SourceStimuliName", "Timestamp"}
    if not required_cols.issubset(df_group.columns):
        issues[group] = "Missing SourceStimuliName or Timestamp"
        continue

    # Keep only rows with a stimulus name and a numeric timestamp
    df_clean = df_group[["SourceStimuliName", "Timestamp"]].copy()
    df_clean = df_clean.dropna(subset=["SourceStimuliName"])
    df_clean["Timestamp"] = pd.to_numeric(df_clean["Timestamp"], errors="coerce")
    df_clean = df_clean.dropna(subset=["Timestamp"])
    if df_clean.empty:
        issues[group] = "No valid timestamp data"
        continue

    # Duration = span between the first and last timestamp of each stimulus
    group_duration = (
        df_clean.groupby("SourceStimuliName")["Timestamp"]
        .apply(lambda s: s.max() - s.min())
        .reset_index(name="duration_ms")
    )

    if group_duration.empty:
        issues[group] = "No stimuli with duration"
        continue

    group_duration["duration_seconds"] = group_duration["duration_ms"] / 1000.0
    group_duration["duration_minutes"] = group_duration["duration_seconds"] / 60.0
    group_duration.insert(0, "group", group)
    group_duration.rename(columns={"SourceStimuliName": "stimulus_name"}, inplace=True)
    duration_tables.append(group_duration[summary_columns])

if duration_tables:
    stimulus_summary = pd.concat(duration_tables, ignore_index=True)
    stimulus_summary.sort_values(["group", "stimulus_name"], inplace=True)
    stimulus_summary["duration_seconds"] = stimulus_summary["duration_seconds"].round(2)
    stimulus_summary["duration_minutes"] = stimulus_summary["duration_minutes"].round(2)
    stimulus_summary.reset_index(drop=True, inplace=True)
else:
    # Keep the name defined so later cells see an empty table instead of a NameError.
    stimulus_summary = pd.DataFrame(columns=summary_columns)
    print("No duration records computed.")

# A bare expression inside an if-block is never rendered by the notebook,
# so the issue table is printed explicitly.
if issues:
    issue_table = pd.DataFrame(
        {"group": list(issues.keys()), "issue": list(issues.values())}
    ).sort_values("group").reset_index(drop=True)
    print(issue_table.to_string(index=False))

stimulus_summary

In [17]:
# Preview the first rows of the per-group stimulus duration table.
stimulus_summary.head()

Unnamed: 0,group,stimulus_name,duration_seconds,duration_minutes
0,Group A,A STAR IS BORN,248.67,4.14
1,Group A,HOME ALONE,115.12,1.92
2,Group A,MAD MAX FURY ROAD,226.47,3.77
3,Group A,THE CONJURING,171.31,2.86
4,Group A,THE TOWN,1744.51,29.08


In [18]:
# Show the groups that were skipped during the duration scan (empty dict = none).
issues

{}

In [19]:
# (rows, columns) of the duration table: one row per group/stimulus pair.
stimulus_summary.shape

(36, 4)

In [21]:
# Shortest and longest stimulus duration in seconds, rounded to 2 decimals.
stimulus_summary['duration_seconds'].agg(['min','max']).round(2)

min      59.82
max    1811.54
Name: duration_seconds, dtype: float64

In [22]:
# Count the distinct stimulus names recorded for each group.
names_by_group = stimulus_summary.groupby('group')['stimulus_name']
stimuli_per_group = names_by_group.nunique().reset_index(name='unique_stimuli')
stimuli_per_group

Unnamed: 0,group,unique_stimuli
0,Group A,6
1,Group B,6
2,Group C,6
3,Group D,6
4,Group E,6
5,Group F,6


In [26]:
# Write the duration table to <project_root>/results. The folder is created
# first because DataFrame.to_csv does not create missing parent directories.
results_dir = project_root / "results"
results_dir.mkdir(parents=True, exist_ok=True)
stimulus_summary.to_csv(results_dir / "stimulus_summary.csv", index=False)

# Feature Extraction

## Stimulus Annotation Overview
- `stimulus_rename` links each group-specific stimulus from `stimulus_summary` to a clean `title` and its presentation `Form` (`Long` or `Short`).
- Some titles appear in both forms; the long cut (≈30 min) includes the short-form key moment as an embedded segment.
- `key_moments` pinpoints, for every long-form title, when the key moment begins (`Lead-up Duration`) and how long it lasts (`Key moment Duration_LF`).
- These tables let us align short-form clips with the corresponding segment inside the long-form presentation for downstream comparisons.

## Stage 1: Demographics
We extract respondent-level identifiers and timing information from the metadata embedded in each sensor export to seed the unified view (UV). This pass scans every sensor CSV, captures study name, respondent attributes, and recording timestamps, and prepares the foundation for later feature merges.

In [36]:
import re

# Metadata rows requested from the header of every sensor export.
metadata_keys = [
    "Study name",
    "Respondent Name",
    "Respondent Age",
    "Respondent Gender",
    "Respondent Group",
    "Recording time",
]

# Every sensor CSV of every group folder, in sorted path order.
export_dir = project_root / "data" / "Export"
sensor_file_paths = sorted(export_dir.glob("Group */Analyses/*/Sensor Data/*.csv"))

def first_segment(value):
    """Return the text before the first comma of ``value``, stripped.

    ``None`` is passed through unchanged; any other value is converted
    with ``str`` first.
    """
    if value is not None:
        head, _, _ = str(value).partition(',')
        return head.strip()
    return None

def parse_gender(raw_gender):
    """Normalise a free-text gender value.

    Returns ``None`` for an empty or missing value, ``"Female"``,
    ``"Male"`` or ``"Other"`` when the corresponding word occurs in the
    text (case-insensitive), and the title-cased input otherwise.
    """
    if not raw_gender:
        return None
    lowered = raw_gender.lower()
    # Order matters: "female" contains "male", so it has to be tested first.
    for needle, label in (("female", "Female"), ("male", "Male"), ("other", "Other")):
        if needle in lowered:
            return label
    return raw_gender.title()

def extract_group_letter(study_value, fallback_values):
    """Derive an upper-case group letter for a recording.

    The last ASCII letter found in ``study_value`` wins. When
    ``study_value`` is empty or holds no letter, the first entry of
    ``fallback_values`` containing "Group <A-F>" (case-insensitive)
    supplies the letter. Returns ``None`` when nothing matches.
    """
    if study_value:
        # A letter at the very end of the stripped value is necessarily the
        # last letter of the string, so a single lookup covers both cases.
        ascii_letters = re.findall(r"[A-Za-z]", study_value)
        if ascii_letters:
            return ascii_letters[-1].upper()

    for candidate in fallback_values:
        if not candidate:
            continue
        found = re.search(r"Group\s*([A-F])", str(candidate), flags=re.IGNORECASE)
        if found:
            return found.group(1).upper()
    return None

# Build one demographics record per sensor export from its header metadata.
demographic_records = []

for csv_path in sensor_file_paths:
    # Only the metadata is used here; the data frame returned first is discarded.
    # NOTE(review): read_imotions still parses the whole CSV on every call,
    # which is presumably why this cell is slow on large exports.
    _, meta = read_imotions(csv_path, metadata=metadata_keys)

    # Group letter: taken from the study name, else from the respondent
    # group or the file path.
    study_clean = first_segment(meta.get("Study name"))
    respondent_group_clean = first_segment(meta.get("Respondent Group"))
    group_letter = extract_group_letter(study_clean, [respondent_group_clean, csv_path.as_posix()])

    # Respondent id: first run of digits in the respondent name, else the
    # raw name, else the file stem when the metadata has no name.
    respondent_raw = first_segment(meta.get("Respondent Name"))
    respondent_value = csv_path.stem
    if respondent_raw:
        respondent_digits = re.search(r"\d+", respondent_raw)
        if respondent_digits:
            respondent_value = respondent_digits.group(0)
        else:
            respondent_value = respondent_raw

    # Age: integer when numeric, pd.NA otherwise.
    age_raw = first_segment(meta.get("Respondent Age"))
    age_numeric = pd.to_numeric(age_raw, errors="coerce")
    if pd.notna(age_numeric):
        age_value = int(age_numeric)
    else:
        age_value = pd.NA

    gender_raw = first_segment(meta.get("Respondent Gender"))
    gender_value = parse_gender(gender_raw)

    # Recording time: comma-separated fragments, of which the ones starting
    # with "Date:" and "Time:" are used.
    recording_raw = meta.get("Recording time")
    date_study = None
    time_study = None
    if recording_raw:
        fragments = [frag.strip() for frag in str(recording_raw).split(',') if frag.strip()]
        date_part = None
        time_part = None
        for fragment in fragments:
            if fragment.lower().startswith("date:"):
                date_part = fragment.split(':', 1)[1].strip()
            elif fragment.lower().startswith("time:"):
                time_part = fragment.split(':', 1)[1].strip()
        if date_part and time_part:
            dt_string = f"{date_part} {time_part}"
            # NOTE(review): utc=True presumably treats an offset-free string
            # as UTC before the conversion to America/Chicago below; confirm
            # that matches how the recording time is written.
            ts = pd.to_datetime(dt_string, utc=True, errors="coerce")
            if pd.isna(ts):
                # Second attempt without utc=True, localising a naive result.
                ts = pd.to_datetime(dt_string, errors="coerce")
                if pd.notna(ts) and ts.tzinfo is None:
                    try:
                        ts = ts.tz_localize("America/Chicago")
                    except Exception:
                        ts = ts.tz_localize("UTC")
            if pd.notna(ts):
                # Report date and time in America/Chicago.
                if ts.tzinfo is None:
                    ts = ts.tz_localize("America/Chicago")
                else:
                    ts = ts.tz_convert("America/Chicago")
                date_study = ts.strftime("%m/%d/%Y")
                time_study = ts.strftime("%H:%M:%S")
            else:
                # Unparseable timestamp: keep the raw date text, leave the time empty.
                date_study = date_part
        elif date_part:
            date_study = date_part

    demographic_records.append({
        "source_file": csv_path.name,
        "group": group_letter,
        "respondent": respondent_value,
        "age": age_value,
        "gender": gender_value,
        "date_study": date_study,
        "time_study": time_study
    })

uv_stage1 = pd.DataFrame(demographic_records)

if not uv_stage1.empty:
    # respondent holds strings, so this sort is lexical ("104" sorts before "11").
    uv_stage1 = uv_stage1.sort_values(["group", "respondent"]).reset_index(drop=True)
    uv_stage1["respondent"] = uv_stage1["respondent"].astype(str)
    # Nullable integer dtype so missing ages stay <NA>.
    uv_stage1["age"] = uv_stage1["age"].astype("Int64")

# Seed the unified view (uv) with the stage-1 demographics.
uv = uv_stage1.copy()

uv_stage1

KeyboardInterrupt: 

In [38]:
# Hand-entered gender corrections, keyed by respondent id (as a string).
manual_gender_overrides = {
    "8": "Female",
    "56": "Male",
    "16": "Male",
    "6": "Male",
    "46": "Female",
    "69": "Female",
    "44": "Male",
    "50": "Male",
}


def _gender_with_override(row):
    """Return the manual override for this respondent, else the parsed gender."""
    return manual_gender_overrides.get(row["respondent"], row["gender"])


uv_stage1["gender"] = uv_stage1.apply(_gender_with_override, axis=1)
uv = uv_stage1.copy()

# Show the rows covered by an override for a visual check.
has_override = uv_stage1["respondent"].isin(list(manual_gender_overrides))
uv_stage1.loc[has_override, ["respondent", "gender"]]

Unnamed: 0,respondent,gender
7,8,Female
16,56,Male
28,16,Male
31,6,Male
45,46,Female
61,69,Female
77,44,Male
78,50,Male


In [40]:
# Flag respondent ids that occur in more than one sensor export.
duplicate_respondents = uv_stage1[uv_stage1.duplicated(subset="respondent", keep=False)]
if duplicate_respondents.empty:
    print("No duplicate respondents detected.")
else:
    # A bare expression inside a branch is never rendered by the notebook,
    # so the table is printed explicitly.
    print(duplicate_respondents.sort_values("respondent").to_string())

In [42]:
# Display the stage-1 demographics table (one row per sensor export).
uv_stage1

Unnamed: 0,source_file,group,respondent,age,gender,date_study,time_study
0,003_104.csv,A,104,59,Male,10/16/2025,18:09:03
1,002_106.csv,A,106,30,Male,10/16/2025,19:35:05
2,006_11.csv,A,11,33,Female,10/11/2025,09:32:42
3,001_116.csv,A,116,19,Male,10/18/2025,12:37:40
4,007_3.csv,A,3,34,Female,10/10/2025,09:19:22
...,...,...,...,...,...,...,...
78,005_50.csv,F,50,63,Male,10/14/2025,09:54:03
79,004_60.csv,F,60,66,Male,10/15/2025,09:34:06
80,003_70.csv,F,70,61,Female,10/16/2025,09:49:14
81,002_85.csv,F,85,34,Female,10/17/2025,14:37:41
