# Setup

## Function to read iMotions sensor file

In [1]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

def read_imotions(path, metadata=None):
    """
    Reads an iMotions CSV file while extracting optional metadata fields.

    Parameters:
        path (str): Path to the iMotions CSV file.
        metadata (list[str], optional): List of metadata keys to extract.

    Returns:
        df (pd.DataFrame): The data as a DataFrame.
        meta_dict (dict): Dictionary containing requested metadata fields.
    """
    meta_dict = {}
    metadata = metadata or []
    meta_lines = []
    count = 0

    with open(path, 'r') as file:
        while True:
            line = file.readline()
            if not line:
                break
            if '#' in line.split(',')[0]:
                meta_lines.append('#'.join(line.strip().split('#')[1:]))
                count += 1
            else:
                break

    # Parse requested metadata
    for line in meta_lines:
        # Remove leading '#' and split by comma
        parts = line.split(',')
        if len(parts) > 1:
            key, value = parts[0].strip(), ','.join(parts[1:])
            if key in metadata:
                meta_dict[key] = value

    # Read data using header row after metadata
    df = pd.read_csv(path, header=count, low_memory=True)
    return df, meta_dict

def get_files(folder, tags=['',]):
    return [f for f in os.listdir(folder) if not f.startswith('.') and all(x in f for x in tags)] 


def get_biometric_data(in_folder, results_folder):

    ######## Define ########
    # Define paths
    out_path = f"{results_folder}/"
    os.makedirs(out_path, exist_ok=True)

    respondents = [1,2,3] #define list of respondent ids

    # Define signal columns
    cols_afdex = [
                "Anger", "Contempt", "Disgust", "Fear", "Joy", "Sadness",
                "Surprise", "Engagement", "Valence", "Sentimentality",
                "Confusion", "Neutral"
        ]
    cols_eeg = ['High Engagement',
        'Low Engagement',
        'Distraction',
        'Drowsy',
        'Workload Average',
        'Frontal Asymmetry Alpha',
        ]


    #Define window lengths in seconds
    window_lengths = [3,]

    ######## Read Inputs #######
    #Get input files
    sensor_files = get_files(f'{in_folder}/Sensors/',tags=['.csv',])

    ### Begin ###

    results = []
    errors = []
    for respondent in respondents:
        error = {'respondent':respondent, 'FAC':None, 'EEG':None, 'GSR':None, 'Blinks':None, 'ET':None}
        interaction = {'respondent':respondent}
        try:
            file = [f for f in sensor_files if respondent in f][0] #may need adjustment
            df_sens_resp,_ = read_imotions(f'{in_folder}/Sensors/{file}')

            # Get sensor data per stimulus
            for task in df_sens_resp['SourceStimuliName'].unique():
                df_sens_task = df_sens_resp.loc[(df_sens_resp['SourceStimuliName']==task)]
                window = task

                # Get facial coding data
                for a in cols_afdex:
                    try:
                        interaction[f'sens_{window}_FAC_{a}_mean']=df_sens_task[a].dropna().mean()
                        auc_data = df_sens_task[['Timestamp',a]].dropna()
                        interaction[f'sens_{window}_FAC_{a}_AUC']=np.trapz(auc_data[a],x=auc_data['Timestamp'])/1000
                        interaction[f'sens_{window}_FAC_{a}_Binary']=df_sens_task[a].dropna().max()>= 50
                    except:
                        error['FAC']='Missing'

                for e in cols_eeg:
                    try:
                        interaction[f'sens_{window}_EEG_{e}_mean']=df_sens_task[e].dropna()[df_sens_int[e] > -9000].mean()
                        auc_data = df_sens_task.loc[df_sens_task[e].notna() & (df_sens_task[e] > -9000), ['Timestamp', e]]
                        interaction[f'sens_{window}_EEG_{e}_AUC']=np.trapz(auc_data[e],x=auc_data['Timestamp'])/1000
                    except:
                        error['EEG']='Missing'

                try:
                    interaction[f'sens_{window}_GSR_PeakDetected_Binary'] =1 if df_sens_task['Peak Detected'].sum()>0 else 0
                    gsr_data = df_sens_task[['Timestamp','Peak Detected']].dropna()
                    mask = gsr_data['Peak Detected'] == 1
                    segments = (mask != mask.shift()).cumsum()  # Assign unique numbers to patches
                    count_patches = gsr_data.loc[mask, 'Peak Detected'].groupby(segments).ngroup().nunique()
                    interaction[f'sens_{window}_GSR_Peaks_Count'] =count_patches
                except:
                    error['GSR']='Missing'

                try:
                    blink_data = df_sens_task[['Timestamp','Blink Detected']].dropna()
                    mask = blink_data['Blink Detected'] == 1
                    segments = (mask != mask.shift()).cumsum()  # Assign unique numbers to patches
                    count_patches = blink_data.loc[mask, 'Blink Detected'].groupby(segments).ngroup().nunique()
                    interaction[f'sens_{window}_ET_Blink_Count'] =count_patches
                    interaction[f'sens_{window}_ET_Blink_Rate'] =count_patches/((df_sens_task['Timestamp'].values[-1]-df_sens_task['Timestamp'].values[0])/(1000 * 60))
                except:
                    error['ET']='Missing'

            # TODO Get sensor data for non-interaction
            ##################################### Add this in
            results.append(interaction)
            errors.append(error)

            pass
        except IndexError:
            print(f'>>> Could not find {respondent} sensor data')
        except:
            print(f'>>> Failed {respondent}')

    results = pd.DataFrame(results)
    results.to_csv(f'{out_path}biometric_results.csv')

    errors = pd.DataFrame(errors)
    errors.to_csv(f'{out_path}errors_biometric.csv')

project_root = Path.cwd().parent
data_export_dir = project_root / "data" / "Export"

## Explanation of functions

The above functions are used to read in the sesor data files, one csv at a time, and extract single features per stimulus, and write these features to a simple results file.

The functions must be adjusted to:
- Discern between long form and short form
- Isolate key moments from timings file provided by client
- Extract time series
- Compute group-wide features such as inter-subject correlation

# Preparation
- Create naming dictionary for all stims
- Get total times of all stims
- Prepare key_moments

## Stimulus duration scan
We load one sensor recording per group, extract the unique stimulus names, and estimate the average exposure duration per stimulus using the `Timestamp` column.

In [2]:
# Locate one sensor export per group for duration scanning


group_sensor_files = {}
for group_dir in sorted(data_export_dir.glob("Group *")):
    if not group_dir.is_dir():
        continue
    sensor_dirs = sorted(group_dir.glob("Analyses/*/Sensor Data"))
    csv_candidates = []
    for sensor_dir in sensor_dirs:
        csv_candidates.extend(sorted(sensor_dir.glob("*.csv")))
    group_sensor_files[group_dir.name] = csv_candidates[0] if csv_candidates else None

sensor_selection = pd.DataFrame([
    {
        "group": group,
        "sensor_file": path.name if path else None
    }
    for group, path in group_sensor_files.items()
]).sort_values("group").reset_index(drop=True)

sensor_selection

Unnamed: 0,group,sensor_file
0,Group A,001_116.csv
1,Group B,001_58.csv
2,Group C,001_114.csv
3,Group D,001_102.csv
4,Group E,001_108.csv
5,Group F,001_107.csv


In [3]:
# Collect per-group stimulus durations without aggregating across groups
duration_tables = []
issues = {}

for group, path in group_sensor_files.items():
    if path is None:
        issues[group] = "No sensor CSV found"
        continue
    try:
        df_group, _ = read_imotions(path)
    except Exception as exc:
        issues[group] = f"read_imotions failed: {exc}"
        continue

    required_cols = {"SourceStimuliName", "Timestamp"}
    if not required_cols.issubset(df_group.columns):
        issues[group] = "Missing SourceStimuliName or Timestamp"
        continue

    df_clean = df_group[["SourceStimuliName", "Timestamp"]].copy()
    df_clean = df_clean.dropna(subset=["SourceStimuliName"])
    df_clean["Timestamp"] = pd.to_numeric(df_clean["Timestamp"], errors="coerce")
    df_clean = df_clean.dropna(subset=["Timestamp"])
    if df_clean.empty:
        issues[group] = "No valid timestamp data"
        continue

    group_duration = (
        df_clean.groupby("SourceStimuliName")["Timestamp"]
        .apply(lambda s: s.max() - s.min())
        .reset_index(name="duration_ms")
    )

    if group_duration.empty:
        issues[group] = "No stimuli with duration"
        continue

    group_duration["duration_seconds"] = group_duration["duration_ms"] / 1000.0
    group_duration["duration_minutes"] = group_duration["duration_seconds"] / 60.0
    group_duration.insert(0, "group", group)
    group_duration.rename(columns={"SourceStimuliName": "stimulus_name"}, inplace=True)
    duration_tables.append(group_duration[["group", "stimulus_name", "duration_seconds", "duration_minutes"]])

if duration_tables:
    stimulus_summary = pd.concat(duration_tables, ignore_index=True)
    stimulus_summary.sort_values(["group", "stimulus_name"], inplace=True)
    stimulus_summary["duration_seconds"] = stimulus_summary["duration_seconds"].round(2)
    stimulus_summary["duration_minutes"] = stimulus_summary["duration_minutes"].round(2)
    stimulus_summary.reset_index(drop=True, inplace=True)
    stimulus_summary
else:
    print("No duration records computed.")

if issues:
    pd.DataFrame(
        {"group": list(issues.keys()), "issue": list(issues.values())}
    ).sort_values("group").reset_index(drop=True)

  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)


In [4]:
stimulus_summary.head()

Unnamed: 0,group,stimulus_name,duration_seconds,duration_minutes
0,Group C,07 The Notebook,65.39,1.09
1,Group C,09 I Am Legend - Infected encounter,118.49,1.97
2,Group C,10 The Town - Bank robbery in nun masks,263.21,4.39
3,Group C,Abbott Elementary - S1E9 - Step Class,1291.75,21.53
4,Group C,HOME ALONE,115.09,1.92


In [5]:
issues

{'Group A': 'read_imotions failed: Unable to allocate 853. MiB for an array with shape (210, 532611) and data type float64',
 'Group B': 'read_imotions failed: Unable to allocate 838. MiB for an array with shape (212, 518407) and data type float64',
 'Group D': 'read_imotions failed: Unable to allocate 843. MiB for an array with shape (221, 499863) and data type float64',
 'Group E': 'read_imotions failed: Unable to allocate 825. MiB for an array with shape (212, 510043) and data type float64',
 'Group F': 'read_imotions failed: Unable to allocate 701. MiB for an array with shape (212, 433145) and data type float64'}

In [6]:
stimulus_summary.shape

(6, 4)

In [7]:
stimulus_summary['duration_seconds'].agg(['min','max']).round(2)

min      65.39
max    1291.75
Name: duration_seconds, dtype: float64

In [8]:
stimuli_per_group = stimulus_summary.groupby('group')['stimulus_name'].nunique().reset_index(name='unique_stimuli')
stimuli_per_group

Unnamed: 0,group,unique_stimuli
0,Group C,6


In [9]:
stimulus_summary.to_csv(project_root / "results" / "stimulus_summary.csv", index=False)

# Feature Extraction

## Stimulus Annotation Overview
- `stimulus_rename` links each group-specific stimulus from `stimulus_summary` to a clean `title` and its presentation `Form` (`Long` or `Short`).
- Some titles appear in both forms; the long cut (≈30 min) includes the short-form key moment as an embedded segment.
- `key_moments` pinpoints, for every long-form title, when the key moment begins (`Lead-up Duration`) and how long it lasts (`Key moment Duration_LF`).
- These tables let us align short-form clips with the corresponding segment inside the long-form presentation for downstream comparisons.

## Stage 1: Demographics
We extract respondent-level identifiers and timing information from the metadata embedded in each sensor export to seed the unified view (UV). This pass scans every sensor CSV, captures study name, respondent attributes, and recording timestamps, and prepares the foundation for later feature merges.

In [10]:
import re

metadata_keys = [
    "Study name",
    "Respondent Name",
    "Respondent Age",
    "Respondent Gender",
    "Respondent Group",
    "Recording time"
]

sensor_file_paths = sorted(
    (project_root / "data" / "Export").glob("Group */Analyses/*/Sensor Data/*.csv")
)

def first_segment(value):
    if value is None:
        return None
    return str(value).split(',')[0].strip()

def parse_gender(raw_gender):
    if not raw_gender:
        return None
    gender_lower = raw_gender.lower()
    if "female" in gender_lower:
        return "Female"
    if "male" in gender_lower:
        return "Male"
    if "other" in gender_lower:
        return "Other"
    return raw_gender.title()

def extract_group_letter(study_value, fallback_values):
    if study_value:
        terminal_match = re.search(r"([A-Za-z])$", study_value.strip())
        if terminal_match:
            return terminal_match.group(1).upper()
        letters = re.findall(r"[A-Za-z]", study_value)
        if letters:
            return letters[-1].upper()
    for candidate in fallback_values:
        if candidate:
            match = re.search(r"Group\s*([A-F])", str(candidate), flags=re.IGNORECASE)
            if match:
                return match.group(1).upper()
    return None

demographic_records = []

for csv_path in sensor_file_paths:
    _, meta = read_imotions(csv_path, metadata=metadata_keys)

    study_clean = first_segment(meta.get("Study name"))
    respondent_group_clean = first_segment(meta.get("Respondent Group"))
    group_letter = extract_group_letter(study_clean, [respondent_group_clean, csv_path.as_posix()])

    respondent_raw = first_segment(meta.get("Respondent Name"))
    respondent_value = csv_path.stem
    if respondent_raw:
        respondent_digits = re.search(r"\d+", respondent_raw)
        if respondent_digits:
            respondent_value = respondent_digits.group(0)
        else:
            respondent_value = respondent_raw

    age_raw = first_segment(meta.get("Respondent Age"))
    age_numeric = pd.to_numeric(age_raw, errors="coerce")
    if pd.notna(age_numeric):
        age_value = int(age_numeric)
    else:
        age_value = pd.NA

    gender_raw = first_segment(meta.get("Respondent Gender"))
    gender_value = parse_gender(gender_raw)

    recording_raw = meta.get("Recording time")
    date_study = None
    time_study = None
    if recording_raw:
        fragments = [frag.strip() for frag in str(recording_raw).split(',') if frag.strip()]
        date_part = None
        time_part = None
        for fragment in fragments:
            if fragment.lower().startswith("date:"):
                date_part = fragment.split(':', 1)[1].strip()
            elif fragment.lower().startswith("time:"):
                time_part = fragment.split(':', 1)[1].strip()
        if date_part and time_part:
            dt_string = f"{date_part} {time_part}"
            ts = pd.to_datetime(dt_string, utc=True, errors="coerce")
            if pd.isna(ts):
                ts = pd.to_datetime(dt_string, errors="coerce")
                if pd.notna(ts) and ts.tzinfo is None:
                    try:
                        ts = ts.tz_localize("America/Chicago")
                    except Exception:
                        ts = ts.tz_localize("UTC")
            if pd.notna(ts):
                if ts.tzinfo is None:
                    ts = ts.tz_localize("America/Chicago")
                else:
                    ts = ts.tz_convert("America/Chicago")
                date_study = ts.strftime("%m/%d/%Y")
                time_study = ts.strftime("%H:%M:%S")
            else:
                date_study = date_part
        elif date_part:
            date_study = date_part

    demographic_records.append({
        "source_file": csv_path.name,
        "group": group_letter,
        "respondent": respondent_value,
        "age": age_value,
        "gender": gender_value,
        "date_study": date_study,
        "time_study": time_study
    })

uv_stage1 = pd.DataFrame(demographic_records)

if not uv_stage1.empty:
    uv_stage1 = uv_stage1.sort_values(["group", "respondent"]).reset_index(drop=True)
    uv_stage1["respondent"] = uv_stage1["respondent"].astype(str)
    uv_stage1["age"] = uv_stage1["age"].astype("Int64")

uv = uv_stage1.copy()

uv_stage1

  df = pd.read_csv(path, header=count, low_memory=True)


MemoryError: Unable to allocate 853. MiB for an array with shape (210, 532611) and data type float64

In [None]:
manual_gender_overrides = {
    "8": "Female",
    "56": "Male",
    "16": "Male",
    "6": "Male",
    "46": "Female",
    "69": "Female",
    "44": "Male",
    "50": "Male"
}

uv_stage1["gender"] = uv_stage1.apply(
    lambda row: manual_gender_overrides.get(row["respondent"], row["gender"]), axis=1
)
uv = uv_stage1.copy()

uv_stage1[uv_stage1["respondent"].isin(manual_gender_overrides.keys())][["respondent", "gender"]]

Unnamed: 0,respondent,gender


In [None]:
# Attach supplemental demographics from grid.csv
grid_path = project_root / "data" / "grid.csv"
grid_rename_map = {
    "QB2. Age.1": "age_group",
    "QC. Ethnicity": "ethnicity",
    "QD. Income": "income_group",
    "Q1. Content Hours Per Week": "content_consumption",
    "Q2. Program Type %- Movies": "content_consumption_movies",
    "Q2. Program Type %- Series": "content_consumption_series",
    "Q2. Program Type %- Short": "content_consumption_short",
}
grid_columns = ["respondent", *grid_rename_map.keys()]
grid_raw = pd.read_csv(grid_path, encoding="latin1", usecols=grid_columns)
grid_raw = grid_raw.dropna(subset=["respondent"])
grid_raw["respondent"] = pd.to_numeric(grid_raw["respondent"], errors="coerce").astype("Int64")
grid_raw = grid_raw.dropna(subset=["respondent"])
grid_subset = grid_raw.rename(columns=grid_rename_map).copy()

grid_subset["respondent"] = grid_subset["respondent"].astype(str).str.strip()
uv_stage1["respondent"] = uv_stage1["respondent"].astype(str).str.strip()

text_cols = ["age_group", "ethnicity", "income_group", "content_consumption"]
for col in text_cols:
    grid_subset[col] = grid_subset[col].apply(lambda value: value.strip() if isinstance(value, str) else value)

numeric_cols = ["content_consumption_movies", "content_consumption_series", "content_consumption_short"]
for col in numeric_cols:
    grid_subset[col] = pd.to_numeric(grid_subset[col], errors="coerce").astype("Int64")

grid_subset = grid_subset.drop_duplicates(subset="respondent", keep="first")

uv_stage1 = uv_stage1.merge(grid_subset, on="respondent", how="left", validate="many_to_one")
uv = uv_stage1.copy()

uv_stage1.loc[:, [
    "respondent",
    "age",
    "age_group",
    "ethnicity",
    "income_group",
    "content_consumption",
    "content_consumption_movies",
    "content_consumption_series",
    "content_consumption_short",
]].head()

Unnamed: 0,respondent,age,age_group,ethnicity,income_group,content_consumption,content_consumption_movies,content_consumption_series,content_consumption_short
0,104,59,44-59,White,"$60,000 or more per year",More than 24 hours per week,10,90,0
1,106,30,28-43,White,"$60,000 or more per year",3 to 12 hours per week,25,50,25
2,11,33,28-43,White,"$35,000  $60,000 per year",12 to 24 hours per week,40,40,20
3,116,19,18-27,White,"$35,000  $60,000 per year",3 to 12 hours per week,25,50,25
4,3,34,28-43,White,"$60,000 or more per year",12 to 24 hours per week,10,70,20


In [None]:
duplicate_respondents = uv_stage1[uv_stage1.duplicated(subset="respondent", keep=False)]
if duplicate_respondents.empty:
    print("No duplicate respondents detected.")
else:
    duplicate_respondents.sort_values("respondent")

In [None]:
uv_stage1.to_csv(project_root / "results" / "uv_stage1_demographics.csv", index=False)
uv_stage1

Unnamed: 0,source_file,group,respondent,age,gender,date_study,time_study,age_group,ethnicity,income_group,content_consumption,content_consumption_movies,content_consumption_series,content_consumption_short
0,003_104.csv,A,104,59,Male,10/16/2025,18:09:03,44-59,White,"$60,000 or more per year",More than 24 hours per week,10,90,0
1,002_106.csv,A,106,30,Male,10/16/2025,19:35:05,28-43,White,"$60,000 or more per year",3 to 12 hours per week,25,50,25
2,006_11.csv,A,11,33,Female,10/11/2025,09:32:42,28-43,White,"$35,000  $60,000 per year",12 to 24 hours per week,40,40,20
3,001_116.csv,A,116,19,Male,10/18/2025,12:37:40,18-27,White,"$35,000  $60,000 per year",3 to 12 hours per week,25,50,25
4,007_3.csv,A,3,34,Female,10/10/2025,09:19:22,28-43,White,"$60,000 or more per year",12 to 24 hours per week,10,70,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,005_50.csv,F,50,63,Other,10/14/2025,09:54:03,60-69,Black/African American,"$60,000 or more per year",12 to 24 hours per week,70,20,10
79,004_60.csv,F,60,66,Male,10/15/2025,09:34:06,60-69,White,"$35,000  $60,000 per year",3 to 12 hours per week,10,80,10
80,003_70.csv,F,70,61,Female,10/16/2025,09:49:14,60-69,Black/African American,"$35,000  $60,000 per year",12 to 24 hours per week,30,30,40
81,002_85.csv,F,85,34,Female,10/17/2025,14:37:41,28-43,White,"$60,000 or more per year",12 to 24 hours per week,40,60,0


## Stage 2: Sensor Data
After validating the workflow on the pilot cohort, we apply the sensor feature extraction pipeline to the complete respondent roster, using the finalized mappings, windowing logic, and feature naming to generate the full unified view.

In [None]:
# Load stimulus annotations and key-moment timing tables
uv_stage1 = pd.read_csv(project_root / "results" / "uv_stage1_demographics.csv")

stimulus_map = pd.read_csv(project_root / "data" / "stimulus_rename.csv")
stimulus_map["group_letter"] = stimulus_map["group"].str.extract(r"Group\s*([A-F])", expand=False).str.upper()
stimulus_map_lookup = stimulus_map.set_index(["group_letter", "stimulus_name"]).sort_index()

key_moments_raw = pd.read_csv(project_root / "data" / "key_moments.csv")
time_columns = ["Lead-up Duration", "Key moment Duration_LF"]
key_moments = key_moments_raw[["title", *time_columns]].dropna(subset=["title"]).copy()

def hhmmss_to_ms(value):
    """Convert hh:mm:ss strings to integer milliseconds (None on failure)."""
    if pd.isna(value):
        return None
    text = str(value).strip()
    if not text:
        return None
    try:
        duration = pd.to_timedelta(text)
    except ValueError:
        # Handle mm:ss formatted entries by padding hours when possible
        parts = text.split(":")
        if len(parts) == 2:
            try:
                duration = pd.to_timedelta(f"00:{text}")
            except ValueError:
                return None
        else:
            return None
    return int(duration.total_seconds() * 1000)

key_moments["lead_up_ms"] = key_moments["Lead-up Duration"].apply(hhmmss_to_ms)
key_moments["key_moment_ms"] = key_moments["Key moment Duration_LF"].apply(hhmmss_to_ms)
key_moment_lookup = key_moments.set_index("title")["lead_up_ms"].to_dict()
key_duration_lookup = key_moments.set_index("title")["key_moment_ms"].to_dict()

stimulus_map_lookup.head()

# Clear in-memory sensor dataframes to free memory
import gc
_sensor_df_names = []
for _name, _obj in list(globals().items()):
    if isinstance(_obj, pd.DataFrame) and "SourceStimuliName" in _obj.columns:
        _sensor_df_names.append(_name)
        del globals()[_name]
gc.collect()
print(f"Cleared sensor dataframes: {_sensor_df_names}")

Unnamed: 0_level_0,Unnamed: 1_level_0,group,title,form
group_letter,stimulus_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,A STAR IS BORN,Group A,A Star Is Born,Short
A,HOME ALONE,Group A,Home Alone,Short
A,MAD MAX FURY ROAD,Group A,Mad Max,Short
A,THE CONJURING,Group A,The Conjuring,Short
A,THE TOWN,Group A,The Town,Long


In [28]:
from typing import Dict
import numpy as np

fac_columns = [
    "Anger", "Contempt", "Disgust", "Fear", "Joy", "Sadness",
    "Surprise", "Engagement", "Sentimentality",
    "Confusion", "Neutral",
 ]
fac_adaptive_metrics = {
    "AdaptiveEngagement": "Adaptive Engagement",
    "PositiveAdaptiveValence": "Positive Adaptive Valence",
    "NegativeAdaptiveValence": "Negative Adaptive Valence",
    "NeutralAdaptiveValence": "Neutral Adaptive Valence",
}
eeg_columns = [
    "High Engagement",
    "Low Engagement",
    "Distraction",
    "Drowsy",
    "Workload Average",
    "Frontal Alpha Asymmetry",
 ]
eeg_alternate_columns = {
    "Frontal Alpha Asymmetry": ["Frontal Alpha Asymmetry", "Frontal Asymmetry Alpha"]
}
eeg_metric_alias = {
    "High Engagement": "HighEngagement",
    "Low Engagement": "LowEngagement",
    "Distraction": "Distraction",
    "Drowsy": "Drowsy",
    "Workload Average": "Workload",
    "Frontal Alpha Asymmetry": "FrontalAlphaAsymmetry",
}
sensor_required_columns = {
    "FAC": fac_columns + list(fac_adaptive_metrics.values()),
    "EEG": eeg_columns,
    "GSR": ["Peak Detected"],
    "ET": [
        "Blink Detected",
        "Fixation Dispersion",
        "Fixation Index",
        "Fixation Duration",
    ],
}

def _trapezoid_integral(values: np.ndarray, time_axis: np.ndarray) -> float:
    """Integrate using numpy.trapezoid when available, falling back to trapz."""
    integrate = getattr(np, "trapezoid", np.trapz)
    return float(integrate(values, x=time_axis) / 1000.0)

def prepare_stimulus_segment(df_sensor: pd.DataFrame, raw_name: str, form: str, title: str) -> pd.DataFrame:
    """Return the time-zeroed slice for the requested stimulus, clipping to key moments when needed."""
    if "SourceStimuliName" not in df_sensor.columns or "Timestamp" not in df_sensor.columns:
        return pd.DataFrame()
    subset = df_sensor.loc[df_sensor["SourceStimuliName"] == raw_name].copy()
    if subset.empty:
        return subset
    subset["Timestamp"] = pd.to_numeric(subset["Timestamp"], errors="coerce")
    subset = subset.dropna(subset=["Timestamp"])
    if subset.empty:
        return subset
    subset.sort_values("Timestamp", inplace=True)
    if "SlideEvent" in subset.columns:
        slide_events = subset["SlideEvent"].astype(str)
        start_candidates = subset.loc[slide_events == "StartMedia", "Timestamp"]
    else:
        start_candidates = pd.Series(dtype=float)
    if not start_candidates.empty:
        start_timestamp = start_candidates.iloc[0]
    else:
        start_timestamp = subset["Timestamp"].min()
    subset["time_from_start"] = subset["Timestamp"] - start_timestamp
    if form == "Long":
        lead_ms = key_moment_lookup.get(title)
        duration_ms = key_duration_lookup.get(title)
        if lead_ms is None or duration_ms is None:
            return pd.DataFrame()
        window_start = lead_ms
        window_end = lead_ms + duration_ms
        subset = subset.loc[(subset["time_from_start"] >= window_start) & (subset["time_from_start"] <= window_end)].copy()
        if subset.empty:
            return subset
        subset["time_from_start"] = subset["time_from_start"] - window_start
    return subset

def register_feature(container: Dict[str, float], form: str, title: str, sensor: str, metric: str, method: str, value: float) -> None:
    if value is None:
        return
    if isinstance(value, float) and np.isnan(value):
        return
    key = f"{form}_{title}_{sensor}_{metric}_{method}"
    container[key] = value

def compute_sensor_features(segment: pd.DataFrame, form: str, title: str) -> Dict[str, float]:
    """Compute FAC, EEG, GSR, and ET summary statistics for a stimulus segment."""
    features: Dict[str, float] = {}
    if segment.empty:
        return features
    if "time_from_start" not in segment.columns or segment["time_from_start"].empty:
        return features
    # Duration in seconds for this stimulus window
    duration_ms = float(segment["time_from_start"].max() - segment["time_from_start"].min())
    if duration_ms <= 0:
        return features
    duration_seconds = duration_ms / 1000.0
    features[f"{form}_{title}_duration"] = duration_seconds
    duration_minutes = duration_seconds / 60.0
    # Facial coding summaries
    for metric in fac_columns:
        if metric not in segment.columns:
            continue
        values = pd.to_numeric(segment[metric], errors="coerce").dropna()
        if values.empty:
            continue
        time_axis = segment.loc[values.index, "time_from_start"].values
        register_feature(features, form, title, "FAC", metric, "Mean", float(values.mean()))
        register_feature(features, form, title, "FAC", metric, "AUC", _trapezoid_integral(values.values, time_axis))
        register_feature(features, form, title, "FAC", metric, "Binary", int(values.max() >= 50))
    for metric, column_name in fac_adaptive_metrics.items():
        if column_name not in segment.columns:
            continue
        values = pd.to_numeric(segment[column_name], errors="coerce").dropna()
        if values.empty:
            continue
        time_axis = segment.loc[values.index, "time_from_start"].values
        register_feature(features, form, title, "FAC", metric, "Mean", float(values.mean()))
        register_feature(features, form, title, "FAC", metric, "AUC", _trapezoid_integral(values.values, time_axis))
    # EEG summaries
    for metric in eeg_columns:
        candidate_columns = [metric, *eeg_alternate_columns.get(metric, [])]
        actual_column = next((col for col in candidate_columns if col in segment.columns), None)
        if actual_column is None:
            continue
        values = pd.to_numeric(segment[actual_column], errors="coerce")
        valid = values.loc[values > -9000].dropna()
        if valid.empty:
            continue
        time_axis = segment.loc[valid.index, "time_from_start"].values
        label = eeg_metric_alias.get(metric, metric)
        register_feature(features, form, title, "EEG", label, "Mean", float(valid.mean()))
        register_feature(features, form, title, "EEG", label, "AUC", _trapezoid_integral(valid.values, time_axis))
    # GSR summaries
    if "Peak Detected" in segment.columns:
        peak_series = pd.to_numeric(segment["Peak Detected"], errors="coerce").fillna(0)
        peak_mask = peak_series >= 1
        register_feature(features, form, title, "GSR", "PeakDetected", "Binary", int(peak_mask.any()))
        if peak_mask.any():
            segments = (peak_mask != peak_mask.shift()).cumsum()
            peak_blocks = segments.loc[peak_mask]
            peak_count = int(peak_blocks.nunique())
            register_feature(features, form, title, "GSR", "Peaks", "Count", peak_count)
            if duration_minutes > 0:
                register_feature(features, form, title, "GSR", "Peaks", "PerMinute", float(peak_count / duration_minutes))
    # ET metrics
    if "Blink Detected" in segment.columns:
        blink_series = pd.to_numeric(segment["Blink Detected"], errors="coerce").fillna(0)
        blink_mask = blink_series >= 1
        if blink_mask.any():
            segments = (blink_mask != blink_mask.shift()).cumsum()
            blink_blocks = segments.loc[blink_mask]
            blink_count = int(blink_blocks.nunique())
            register_feature(features, form, title, "ET", "Blink", "Count", blink_count)
            if duration_minutes > 0:
                register_feature(features, form, title, "ET", "Blink", "Rate", float(blink_count / duration_minutes))
    if "Fixation Dispersion" in segment.columns:
        dispersion = pd.to_numeric(segment["Fixation Dispersion"], errors="coerce").dropna()
        if not dispersion.empty:
            register_feature(features, form, title, "ET", "FixationDispersion", "Mean", float(dispersion.mean()))
    if "Fixation Index" in segment.columns:
        fixation_index = pd.to_numeric(segment["Fixation Index"], errors="coerce").dropna()
        if not fixation_index.empty:
            fixation_count = int(fixation_index.nunique())
            register_feature(features, form, title, "ET", "Fixation", "Count", fixation_count)
            if duration_minutes > 0:
                register_feature(features, form, title, "ET", "Fixation", "PerMinute", float(fixation_count / duration_minutes))
    if "Fixation Duration" in segment.columns:
        fixation_duration = pd.to_numeric(segment["Fixation Duration"], errors="coerce").dropna()
        if not fixation_duration.empty:
            register_feature(features, form, title, "ET", "FixationDuration", "Mean", float(fixation_duration.mean()))
    return features

In [29]:
import gc

# Compute pilot sensor features for selected respondents
sensor_file_index = {
    path.name: path
    for path in (project_root / "data" / "Export").glob("Group */Analyses/*/Sensor Data/*.csv")
}

pilot_ids = ["2", "58", "116"]
pilot_subset = uv_stage1.loc[uv_stage1["respondent"].astype(str).isin(pilot_ids)].copy()
pilot_subset["respondent_numeric"] = pd.to_numeric(pilot_subset["respondent"], errors="coerce")
pilot_subset = pilot_subset.sort_values(["respondent_numeric", "respondent"])

pilot_feature_rows = []
pilot_issue_log = []

for _, row in pilot_subset.iterrows():
    respondent_id = str(row["respondent"]).strip()
    group_letter = str(row.get("group", "")).strip().upper() if pd.notna(row.get("group")) else None
    source_file = row.get("source_file")
    if not source_file or source_file not in sensor_file_index:
        pilot_issue_log.append({
            "respondent": respondent_id,
            "stimulus": None,
            "issue": "Sensor export not located.",
        })
        continue
    df_sensor, _ = read_imotions(sensor_file_index[source_file])
    feature_row: Dict[str, float] = {"respondent": respondent_id}
    try:
        if df_sensor.empty or "SourceStimuliName" not in df_sensor.columns:
            pilot_issue_log.append({
                "respondent": respondent_id,
                "stimulus": None,
                "issue": "Sensor export missing SourceStimuliName column.",
            })
            continue
        # Assess sensor coverage for this respondent
        for sensor_label, required_columns in sensor_required_columns.items():
            if sensor_label == "EEG":
                missing_metrics = []
                for metric in required_columns:
                    candidates = [metric, *eeg_alternate_columns.get(metric, [])]
                    if not any(column in df_sensor.columns for column in candidates):
                        missing_metrics.append(metric)
                feature_row[f"{sensor_label}_data_missing"] = int(bool(missing_metrics))
                if missing_metrics:
                    metrics_display = ", ".join(missing_metrics)
                    pilot_issue_log.append({
                        "respondent": respondent_id,
                        "stimulus": None,
                        "issue": f"Missing EEG columns: {metrics_display}.",
                    })
                continue
            missing_columns = [col for col in required_columns if col not in df_sensor.columns]
            feature_row[f"{sensor_label}_data_missing"] = int(bool(missing_columns))
            if missing_columns:
                pilot_issue_log.append({
                    "respondent": respondent_id,
                    "stimulus": None,
                    "issue": f"Missing {sensor_label} columns: {', '.join(missing_columns)}.",
                })
        unique_stimuli = sorted({str(s).strip() for s in df_sensor["SourceStimuliName"].dropna().unique()})
        for raw_stimulus in unique_stimuli:
            lookup_key = (group_letter, raw_stimulus)
            if lookup_key not in stimulus_map_lookup.index:
                pilot_issue_log.append({
                    "respondent": respondent_id,
                    "stimulus": raw_stimulus,
                    "issue": "Stimulus missing from rename map.",
                })
                continue
            map_row = stimulus_map_lookup.loc[lookup_key]
            if isinstance(map_row, pd.DataFrame):
                map_row = map_row.iloc[0]
            title = map_row["title"]
            form = map_row["form"]
            if form == "Long" and (key_moment_lookup.get(title) is None or key_duration_lookup.get(title) is None):
                pilot_issue_log.append({
                    "respondent": respondent_id,
                    "stimulus": raw_stimulus,
                    "issue": "Key moment timing not defined for long-form title.",
                })
                continue
            segment = prepare_stimulus_segment(df_sensor, raw_stimulus, form, title)
            if segment.empty:
                pilot_issue_log.append({
                    "respondent": respondent_id,
                    "stimulus": raw_stimulus,
                    "issue": "No data after windowing (check key moment timings).",
                })
                del segment
                continue
            try:
                features = compute_sensor_features(segment, form, title)
                if not features:
                    pilot_issue_log.append({
                        "respondent": respondent_id,
                        "stimulus": raw_stimulus,
                        "issue": "No features computed for segment.",
                    })
                    continue
                feature_row.update(features)
            finally:
                del segment
        pilot_feature_rows.append(feature_row)
    finally:
        del df_sensor
        gc.collect()

pilot_features = pd.DataFrame(pilot_feature_rows)
pilot_features

  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)


Unnamed: 0,respondent,FAC_data_missing,EEG_data_missing,GSR_data_missing,ET_data_missing,Short_The Big Bang Theory_duration,Short_The Big Bang Theory_FAC_Anger_Mean,Short_The Big Bang Theory_FAC_Anger_AUC,Short_The Big Bang Theory_FAC_Anger_Binary,Short_The Big Bang Theory_FAC_Contempt_Mean,...,Short_Titanic_EEG_FrontalAlphaAsymmetry_AUC,Short_Titanic_GSR_PeakDetected_Binary,Short_Titanic_GSR_Peaks_Count,Short_Titanic_GSR_Peaks_PerMinute,Short_Titanic_ET_Blink_Count,Short_Titanic_ET_Blink_Rate,Short_Titanic_ET_FixationDispersion_Mean,Short_Titanic_ET_Fixation_Count,Short_Titanic_ET_Fixation_PerMinute,Short_Titanic_ET_FixationDuration_Mean
0,2,0,1,0,0,186.82,0.0,0.0,0.0,0.001964,...,,,,,,,,,,
1,58,0,0,0,0,,,,,,...,,,,,,,,,,
2,116,0,0,0,0,,,,,,...,4.751817,1.0,590.0,591.261358,46.0,46.098343,0.261135,156.0,156.333511,459.838233


In [30]:
# Merge pilot features back into the UV and review any issues
uv_stage1["respondent"] = uv_stage1["respondent"].astype(str)
if not pilot_features.empty:
    pilot_features["respondent"] = pilot_features["respondent"].astype(str)
pilot_uv = (
    uv_stage1.loc[uv_stage1["respondent"].isin(pilot_ids)]
    .copy()
    .merge(pilot_features, on="respondent", how="left")
)
output_path = project_root / "results" / "uv_pilot_features.csv"
try:
    pilot_uv.to_csv(output_path, index=False)
except PermissionError:
    fallback_path = output_path.with_name(
        f"{output_path.stem}_{pd.Timestamp.utcnow().strftime('%Y%m%d%H%M%S')}.csv"
)
    pilot_uv.to_csv(fallback_path, index=False)
    print(
        f"Permission denied for {output_path}. Saved pilot UV to {fallback_path.name} instead.")

In [31]:
issues_df = pd.DataFrame(pilot_issue_log)
issues_df.sort_values(["respondent", "stimulus"], na_position="last") if not issues_df.empty else "No issues logged."

Unnamed: 0,respondent,stimulus,issue
0,2,,"Missing EEG columns: High Engagement, Low Enga..."


In [32]:
# Validate long-form durations against key-moment specifications
tolerance_seconds = 3
duration_columns = [col for col in pilot_features.columns if col.endswith("_duration")]
validation_records = []
for _, feat_row in pilot_features.iterrows():
    respondent_id = feat_row.get("respondent")
    for col in duration_columns:
        value = feat_row.get(col)
        if pd.isna(value):
            continue
        form = col.split('_', 1)[0]
        if form != "Long":
            continue
        title = col[len(form) + 1: -len("_duration")]
        expected_ms = key_duration_lookup.get(title)
        if expected_ms is None:
            continue
        observed_ms = float(value) * 1000.0
        diff_seconds = abs(observed_ms - expected_ms) / 1000.0
        validation_records.append({
            "respondent": respondent_id,
            "title": title,
            "observed_seconds": round(observed_ms / 1000.0, 2),
            "expected_seconds": round(expected_ms / 1000.0, 2),
            "diff_seconds": round(diff_seconds, 2),
            "within_tolerance": diff_seconds <= tolerance_seconds,
        })
duration_validation = pd.DataFrame(validation_records)
duration_validation if not duration_validation.empty else "No long-form durations to validate."

Unnamed: 0,respondent,title,observed_seconds,expected_seconds,diff_seconds,within_tolerance
0,2,The Town,262.0,262.0,0.01,True
1,58,Mad Max,225.0,225.0,0.0,True
2,116,The Town,262.0,262.0,0.0,True


In [33]:
def run_sensor_feature_pipeline(respondent_ids=None, export_label="uv_stage2_full", save_outputs=True):
    """Compute sensor features for the specified respondents and optionally persist outputs."""
    sensor_file_index = {
        path.name: path
        for path in (project_root / "data" / "Export").glob("Group */Analyses/*/Sensor Data/*.csv")
    }
    if respondent_ids is None:
        target_ids = sorted({str(r).strip() for r in uv_stage1["respondent"].astype(str)})
    else:
        cleaned_ids = [str(r).strip() for r in respondent_ids if pd.notna(r)]
        target_ids = sorted(set(cleaned_ids))
    if not target_ids:
        raise ValueError("No respondents provided for sensor feature processing.")
    subset = uv_stage1.loc[uv_stage1["respondent"].astype(str).isin(target_ids)].copy()
    if subset.empty:
        raise ValueError("No matching respondents found in uv_stage1 for the requested IDs.")
    subset["respondent_numeric"] = pd.to_numeric(subset["respondent"], errors="coerce")
    subset = subset.sort_values(["respondent_numeric", "respondent"])
    feature_rows = []
    issue_rows = []

    def log_issue(respondent_id, stimulus, message):
        issue_rows.append({
            "respondent": respondent_id,
            "stimulus": stimulus,
            "issue": message,
        })

    for _, row in subset.iterrows():
        respondent_id = str(row["respondent"]).strip()
        group_letter = str(row.get("group", "")).strip().upper() if pd.notna(row.get("group")) else None
        source_file = row.get("source_file")
        if not source_file or source_file not in sensor_file_index:
            log_issue(respondent_id, None, "Sensor export not located.")
            continue
        df_sensor, _ = read_imotions(sensor_file_index[source_file])
        feature_row: Dict[str, float] = {"respondent": respondent_id}
        try:
            if df_sensor.empty or "SourceStimuliName" not in df_sensor.columns:
                log_issue(respondent_id, None, "Sensor export missing SourceStimuliName column.")
                continue
            for sensor_label, required_columns in sensor_required_columns.items():
                if sensor_label == "EEG":
                    missing_metrics = []
                    for metric in required_columns:
                        candidates = [metric, *eeg_alternate_columns.get(metric, [])]
                        if not any(column in df_sensor.columns for column in candidates):
                            missing_metrics.append(metric)
                    feature_row[f"{sensor_label}_data_missing"] = int(bool(missing_metrics))
                    if missing_metrics:
                        log_issue(respondent_id, None, f"Missing EEG columns: {', '.join(missing_metrics)}.")
                    continue
                missing_columns = [col for col in required_columns if col not in df_sensor.columns]
                feature_row[f"{sensor_label}_data_missing"] = int(bool(missing_columns))
                if missing_columns:
                    log_issue(respondent_id, None, f"Missing {sensor_label} columns: {', '.join(missing_columns)}.")
            unique_stimuli = sorted({str(s).strip() for s in df_sensor["SourceStimuliName"].dropna().unique()})
            for raw_stimulus in unique_stimuli:
                lookup_key = (group_letter, raw_stimulus)
                if lookup_key not in stimulus_map_lookup.index:
                    log_issue(respondent_id, raw_stimulus, "Stimulus missing from rename map.")
                    continue
                map_row = stimulus_map_lookup.loc[lookup_key]
                if isinstance(map_row, pd.DataFrame):
                    map_row = map_row.iloc[0]
                title = map_row["title"]
                form = map_row["form"]
                if form == "Long" and (key_moment_lookup.get(title) is None or key_duration_lookup.get(title) is None):
                    log_issue(respondent_id, raw_stimulus, "Key moment timing not defined for long-form title.")
                    continue
                segment = prepare_stimulus_segment(df_sensor, raw_stimulus, form, title)
                if segment.empty:
                    log_issue(respondent_id, raw_stimulus, "No data after windowing (check key moment timings).")
                    del segment
                    continue
                try:
                    features = compute_sensor_features(segment, form, title)
                    if not features:
                        log_issue(respondent_id, raw_stimulus, "No features computed for segment.")
                        continue
                    feature_row.update(features)
                finally:
                    del segment
            feature_rows.append(feature_row)
        finally:
            del df_sensor
            gc.collect()

    features_df = pd.DataFrame(feature_rows)
    issues_df = pd.DataFrame(issue_rows)
    merged_uv = (
        uv_stage1.loc[uv_stage1["respondent"].astype(str).isin(target_ids)]
        .copy()
    )
    merged_uv["respondent"] = merged_uv["respondent"].astype(str)
    if not features_df.empty:
        features_df["respondent"] = features_df["respondent"].astype(str)
        merged_uv = merged_uv.merge(features_df, on="respondent", how="left")

    def safe_to_csv(df, path):
        if df is None:
            return
        try:
            df.to_csv(path, index=False)
        except PermissionError:
            fallback_path = path.with_name(
                f"{path.stem}_{pd.Timestamp.utcnow().strftime('%Y%m%d%H%M%S')}.csv"
            )
            df.to_csv(fallback_path, index=False)
            print(f"Permission denied for {path}. Saved to {fallback_path.name} instead.")

    if save_outputs:
        results_dir = project_root / "results"
        results_dir.mkdir(parents=True, exist_ok=True)
        safe_to_csv(features_df, results_dir / f"{export_label}_features.csv")
        safe_to_csv(merged_uv, results_dir / f"{export_label}_uv.csv")
        if not issues_df.empty:
            issues_sorted = issues_df.sort_values(["respondent", "stimulus"], na_position="last")
            safe_to_csv(issues_sorted, results_dir / f"{export_label}_issues.csv")
    gc.collect()
    return features_df, issues_df, merged_uv

In [34]:
# Run Stage 2 sensor feature extraction for the full respondent list
full_features, full_issues, full_uv = run_sensor_feature_pipeline()
print(f"Computed features for {len(full_features)} respondents; {len(full_issues)} issues logged.")
full_features.shape

  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memory=True)
  df = pd.read_csv(path, header=count, low_memor

Computed features for 83 respondents; 38 issues logged.


(83, 1076)

## Stage 3: Survey Data

### Overview
Stage 3 integrates the post-exposure survey responses into the unified view so downstream modeling can pair perceptual metrics with the Stage 1 demographics and Stage 2 sensor features. The pipeline now also ingests the screening familiarity composites, aligning them to the survey schema before export. This section documents the ingestion logic and feature engineering steps applied to the raw TSV exports and the supplemental screening file.

### Workflow Summary
1. Load the survey question catalog and rename map to build a metadata lookup for scoring rules.
2. Read every group-level `MERGED_SURVEY_RESPONSE_MATRIX` export, standardize respondent identifiers, and apply group-specific column renames.
3. Convert Likert-style answers to numeric scales (including familiarity and recency variants), compute enjoyment/familiarity composites, and split open-ended responses for archival review.
4. Merge the engineered survey metrics back into the unified view.
5. Append screening familiarity composites from `results/individual_composite_scores.csv`, canonicalizing title strings, inferring `Long`/`Short` form per group, and emitting columns named `{form}_{title}_Screening_Familiarity_{question_code}`.
6. Write the Stage 3 feature, open-ended, and UV exports with timestamped fallbacks when the base filenames are locked.

In [212]:
if "uv_stage1" not in globals():
    uv_stage1 = pd.read_csv(project_root / "results" / "uv_stage1_demographics.csv")
    uv_stage1["respondent"] = uv_stage1["respondent"].astype(str).str.strip()

if "full_uv" not in globals() and (project_root / "results" / "uv_stage2_full_uv.csv").exists():
    full_uv = pd.read_csv(project_root / "results" / "uv_stage2_full_uv.csv")
    full_uv["respondent"] = full_uv["respondent"].astype(str).str.strip()

uv_stage1.shape

(83, 14)

In [213]:
import csv
from collections import defaultdict
import re

survey_rename_map = pd.read_csv(project_root / "data" / "survey_column_rename_stage3.csv")
survey_questions = pd.read_csv(project_root / "data" / "survey_questions.csv")

survey_questions["question_code"] = survey_questions["question_code"].astype(str).str.strip()
survey_questions["question_type"] = survey_questions["question_type"].str.lower()
survey_questions["subscale"] = survey_questions["subscale"].fillna("")
survey_questions["polarity"] = survey_questions["polarity"].fillna("")

survey_metadata = (
    survey_rename_map
    .merge(survey_questions, on="question_code", how="left", suffixes=("", "_details"))
)

survey_metadata["question_type"] = survey_metadata["question_type"].fillna("likert")
survey_metadata["subscale"] = survey_metadata["subscale"].fillna("")
survey_metadata["polarity"] = survey_metadata["polarity"].fillna("")
survey_metadata_lookup = (
    survey_metadata
    .drop_duplicates(subset=["target_column"])
    .set_index("target_column")
)

survey_files = sorted(
    (project_root / "data" / "Export").glob("Group */Analyses/*/Survey/MERGED_SURVEY_RESPONSE_MATRIX-*.txt")
)

if not survey_files:
    raise FileNotFoundError("No survey response text files detected under data/Export/*/Survey/.")

len(survey_files)

12

In [214]:
LIKERT_PATTERN = re.compile(r"^\s*(\d+)(?:\.\d+)?")

LIKERT_KEYWORDS = [
    ("strongly disagree", 1.0),
    ("disagree", 2.0),
    ("neither agree nor disagree", 3.0),
    ("strongly agree", 5.0),
    ("agree", 4.0),
]

FAMILIARITY_KEY_PATTERNS = [
    (0.0, ("never heard", "not familiar")),
    (1.0, ("heard of it, but never watched", "heard of it only")),
    (2.0, ("seen a clip", "seen clips", "seen part")),
    (3.0, ("watched it in full", "just once")),
    (4.0, ("watched multiple", "very familiar")),
]

LASTWATCHED_KEY_PATTERNS = [
    (4.0, ("past week",)),
    (3.0, ("past month", "past 6 months", "past six months")),
    (2.0, ("past 3 months", "past three months")),
    (1.0, ("more than 3 months", "over 3 months")),
    (0.0, ("more than 6 months", "don't remember", "never watched this movie in full")),
]


def _clean_response(value):
    if pd.isna(value):
        return np.nan
    text = str(value).strip()
    if not text or text.upper() == "EMPTY FIELD":
        return np.nan
    return text


def _parse_likert_value(value):
    text = _clean_response(value)
    if pd.isna(text):
        return np.nan
    match = LIKERT_PATTERN.match(text)
    if match:
        return float(match.group(1))
    lowered = text.lower()
    for keyword, score in LIKERT_KEYWORDS:
        if keyword in lowered:
            return score
    try:
        return float(text)
    except ValueError:
        return np.nan


def _score_familiarity(value):
    text = _clean_response(value)
    if pd.isna(text):
        return np.nan
    match = LIKERT_PATTERN.match(text)
    if match:
        numeric = float(match.group(1))
        return float(np.clip(numeric - 1.0, 0.0, 4.0))
    lowered = text.lower()
    for score, patterns in FAMILIARITY_KEY_PATTERNS:
        if any(pattern in lowered for pattern in patterns):
            return score
    try:
        numeric = float(text)
        return float(np.clip(numeric - 1.0, 0.0, 4.0))
    except ValueError:
        return np.nan


def _score_last_watched(value):
    text = _clean_response(value)
    if pd.isna(text):
        return np.nan
    match = LIKERT_PATTERN.match(text)
    if match:
        numeric = float(match.group(1))
        return float(np.clip(numeric - 1.0, 0.0, 4.0))
    lowered = text.lower()
    for score, patterns in LASTWATCHED_KEY_PATTERNS:
        if any(pattern in lowered for pattern in patterns):
            return score
    try:
        numeric = float(text)
        return float(np.clip(numeric - 1.0, 0.0, 4.0))
    except ValueError:
        return np.nan


def _reverse_likert(value):
    if pd.isna(value):
        return np.nan
    return 6.0 - float(value)


In [215]:
def _extract_group_letter(path: Path) -> str:
    for part in path.parts:
        if part.startswith("Group ") and "-" not in part:
            return part.split()[-1].strip().upper()
    raise ValueError(f"Unable to determine group letter from path: {path}")


def _rename_survey_columns(df: pd.DataFrame, group_letter: str) -> pd.DataFrame:
    rename_subset = survey_metadata.loc[survey_metadata["group"] == group_letter]
    rename_dict = {
        raw: target
        for raw, target in zip(rename_subset["raw_column"], rename_subset["target_column"])
        if raw in df.columns
    }
    df = df.rename(columns=rename_dict)
    columns_to_keep = [
        "respondent",
        "survey_group",
        "survey_study",
        "survey_gender",
        "survey_age",
        "survey_file",
        *sorted(rename_dict.values()),
    ]
    existing_columns = [col for col in columns_to_keep if col in df.columns]
    return df.loc[:, existing_columns]


def _load_survey_file(path: Path) -> pd.DataFrame:
    # Some open-ended answers contain newline characters and stray quotes; use python engine with minimal parsing assumptions.
    return pd.read_csv(
        path,
        sep="\t",
        dtype=str,
        engine="python",
        quoting=csv.QUOTE_NONE,
        encoding="utf-8",
        on_bad_lines="warn",
    )


survey_frames = []

for survey_path in survey_files:
    group_letter = _extract_group_letter(survey_path)
    df = _load_survey_file(survey_path)
    df.columns = [col.strip() for col in df.columns]
    df = df.replace({"EMPTY FIELD": np.nan})
    df["RESPONDENT"] = df["RESPONDENT"].astype(str).str.strip()
    df = df.rename(
        columns={
            "RESPONDENT": "respondent",
            "GROUP": "survey_group",
            "STUDY": "survey_study",
            "GENDER": "survey_gender",
            "AGE": "survey_age",
        }
    )
    if "survey_group" not in df.columns:
        df["survey_group"] = group_letter
    df["survey_group"] = df["survey_group"].fillna(group_letter).astype(str).str.strip()
    if "survey_study" in df.columns:
        df["survey_study"] = df["survey_study"].astype(str).str.strip()
    else:
        df["survey_study"] = np.nan
    if "survey_gender" in df.columns:
        df["survey_gender"] = df["survey_gender"].astype(str).str.strip()
    else:
        df["survey_gender"] = np.nan
    df["survey_file"] = survey_path.name
    if "survey_age" in df.columns:
        df["survey_age"] = pd.to_numeric(df["survey_age"], errors="coerce")
    else:
        df["survey_age"] = np.nan
    df = _rename_survey_columns(df, group_letter)
    survey_frames.append(df)

survey_responses = pd.concat(survey_frames, ignore_index=True)
survey_responses

  df = df.replace({"EMPTY FIELD": np.nan})
  df = df.replace({"EMPTY FIELD": np.nan})
  df = df.replace({"EMPTY FIELD": np.nan})
  df = df.replace({"EMPTY FIELD": np.nan})
  df = df.replace({"EMPTY FIELD": np.nan})
  df = df.replace({"EMPTY FIELD": np.nan})
  df = df.replace({"EMPTY FIELD": np.nan})
  df = df.replace({"EMPTY FIELD": np.nan})
  df = df.replace({"EMPTY FIELD": np.nan})
  df = df.replace({"EMPTY FIELD": np.nan})
  df = df.replace({"EMPTY FIELD": np.nan})
  df = df.replace({"EMPTY FIELD": np.nan})


Unnamed: 0,respondent,survey_group,survey_study,survey_gender,survey_age,survey_file,Long_The Town_Survey_Enjoyment_E1,Long_The Town_Survey_Enjoyment_E12,Long_The Town_Survey_Enjoyment_E14,Long_The Town_Survey_Enjoyment_E15,...,Short_Abbot Elementary_Survey_Enjoyment_E6,Short_Abbot Elementary_Survey_Enjoyment_E8,Short_Abbot Elementary_Survey_Enjoyment_WBD1,Short_Abbot Elementary_Survey_Enjoyment_WBD2,Short_Abbot Elementary_Survey_Enjoyment_WBD3,Short_Abbot Elementary_Survey_Enjoyment_WBD4,Short_Abbot Elementary_Survey_Enjoyment_WBD5,Short_Abbot Elementary_Survey_Familiarity_F1,Short_Abbot Elementary_Survey_Familiarity_F2,Short_Abbot Elementary_Survey_Familiarity_F3
0,83,Default,Group A,MALE,69,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5 = Strongly Agree,5 = Strongly Agree,5 = Strongly Agree,5 = Strongly Agree,...,,,,,,,,,,
1,81,Default,Group A,FEMALE,24,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5 = Strongly Agree,5 = Strongly Agree,5 = Strongly Agree,5 = Strongly Agree,...,,,,,,,,,,
2,99,Default,Group A,FEMALE,25,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,4 = Agree,4 = Agree,4 = Agree,4 = Agree,...,,,,,,,,,,
3,52,Default,Group A,FEMALE,50,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5 = Strongly Agree,4 = Agree,5 = Strongly Agree,5 = Strongly Agree,...,,,,,,,,,,
4,8,Default,Group A,OTHER,51,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5 = Strongly Agree,5 = Strongly Agree,5 = Strongly Agree,5 = Strongly Agree,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,85,Default,Group F,FEMALE,34,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,,,,,,,,,
78,70,Default,Group F,FEMALE,61,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,,,,,,,,,
79,96,Default,Group F,FEMALE,29,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,,,,,,,,,
80,41,Default,Group F,FEMALE,53,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,,,,,,,,,


In [216]:
survey_responses["respondent"] = survey_responses["respondent"].astype(str).str.strip()

duplicate_ids = sorted(survey_responses.loc[survey_responses["respondent"].duplicated(), "respondent"].unique())
if duplicate_ids:
    print(f"Warning: duplicate survey rows detected for respondents: {duplicate_ids}")

survey_numeric = survey_responses.drop_duplicates(subset=["respondent"], keep="first").copy()
survey_numeric



Unnamed: 0,respondent,survey_group,survey_study,survey_gender,survey_age,survey_file,Long_The Town_Survey_Enjoyment_E1,Long_The Town_Survey_Enjoyment_E12,Long_The Town_Survey_Enjoyment_E14,Long_The Town_Survey_Enjoyment_E15,...,Short_Abbot Elementary_Survey_Enjoyment_E6,Short_Abbot Elementary_Survey_Enjoyment_E8,Short_Abbot Elementary_Survey_Enjoyment_WBD1,Short_Abbot Elementary_Survey_Enjoyment_WBD2,Short_Abbot Elementary_Survey_Enjoyment_WBD3,Short_Abbot Elementary_Survey_Enjoyment_WBD4,Short_Abbot Elementary_Survey_Enjoyment_WBD5,Short_Abbot Elementary_Survey_Familiarity_F1,Short_Abbot Elementary_Survey_Familiarity_F2,Short_Abbot Elementary_Survey_Familiarity_F3
0,83,Default,Group A,MALE,69,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5 = Strongly Agree,5 = Strongly Agree,5 = Strongly Agree,5 = Strongly Agree,...,,,,,,,,,,
1,81,Default,Group A,FEMALE,24,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5 = Strongly Agree,5 = Strongly Agree,5 = Strongly Agree,5 = Strongly Agree,...,,,,,,,,,,
2,99,Default,Group A,FEMALE,25,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,4 = Agree,4 = Agree,4 = Agree,4 = Agree,...,,,,,,,,,,
3,52,Default,Group A,FEMALE,50,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5 = Strongly Agree,4 = Agree,5 = Strongly Agree,5 = Strongly Agree,...,,,,,,,,,,
4,8,Default,Group A,OTHER,51,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5 = Strongly Agree,5 = Strongly Agree,5 = Strongly Agree,5 = Strongly Agree,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,107,Default,Group F,MALE,33,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,,,,,,,,,
77,85,Default,Group F,FEMALE,34,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,,,,,,,,,
78,70,Default,Group F,FEMALE,61,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,,,,,,,,,
79,96,Default,Group F,FEMALE,29,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,,,,,,,,,


In [217]:
for column, meta in survey_metadata_lookup.iterrows():
    if column not in survey_numeric.columns:
        continue

    question_code = (meta.get("question_code") or "").strip().upper()
    question_type = (meta.get("question_type") or "").strip().lower()
    polarity = (meta.get("polarity") or "").strip().lower()

    if question_code in {"F1", "F3"} or column.endswith("_Survey_Familiarity_F1") or column.endswith("_Survey_Familiarity_F3"):
        survey_numeric[column] = survey_numeric[column].apply(_score_familiarity)
    elif question_code == "F2" or column.endswith("_Survey_Familiarity_F2"):
        survey_numeric[column] = survey_numeric[column].apply(_score_last_watched)
    elif question_type == "likert":
        survey_numeric[column] = survey_numeric[column].apply(_parse_likert_value)
        if polarity == "negative":
            survey_numeric[column] = survey_numeric[column].apply(_reverse_likert)

def _clip_zero_to_four(value):
    if pd.isna(value):
        return np.nan
    try:
        numeric = float(value)
    except (TypeError, ValueError):
        return np.nan
    return float(np.clip(numeric, 0.0, 4.0))

familiarity_columns = [
    column
    for column in survey_numeric.columns
    if column.endswith("_Survey_Familiarity_F1")
    or column.endswith("_Survey_Familiarity_F2")
    or column.endswith("_Survey_Familiarity_F3")
]

for column in familiarity_columns:
    survey_numeric[column] = survey_numeric[column].apply(_clip_zero_to_four)

survey_numeric

Unnamed: 0,respondent,survey_group,survey_study,survey_gender,survey_age,survey_file,Long_The Town_Survey_Enjoyment_E1,Long_The Town_Survey_Enjoyment_E12,Long_The Town_Survey_Enjoyment_E14,Long_The Town_Survey_Enjoyment_E15,...,Short_Abbot Elementary_Survey_Enjoyment_E6,Short_Abbot Elementary_Survey_Enjoyment_E8,Short_Abbot Elementary_Survey_Enjoyment_WBD1,Short_Abbot Elementary_Survey_Enjoyment_WBD2,Short_Abbot Elementary_Survey_Enjoyment_WBD3,Short_Abbot Elementary_Survey_Enjoyment_WBD4,Short_Abbot Elementary_Survey_Enjoyment_WBD5,Short_Abbot Elementary_Survey_Familiarity_F1,Short_Abbot Elementary_Survey_Familiarity_F2,Short_Abbot Elementary_Survey_Familiarity_F3
0,83,Default,Group A,MALE,69,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5.0,5.0,5.0,5.0,...,,,,,,,,,,
1,81,Default,Group A,FEMALE,24,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5.0,5.0,5.0,5.0,...,,,,,,,,,,
2,99,Default,Group A,FEMALE,25,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,4.0,4.0,4.0,4.0,...,,,,,,,,,,
3,52,Default,Group A,FEMALE,50,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5.0,4.0,5.0,5.0,...,,,,,,,,,,
4,8,Default,Group A,OTHER,51,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5.0,5.0,5.0,5.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,107,Default,Group F,MALE,33,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,,,,,,,,,
77,85,Default,Group F,FEMALE,34,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,,,,,,,,,
78,70,Default,Group F,FEMALE,61,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,,,,,,,,,
79,96,Default,Group F,FEMALE,29,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,,,,,,,,,


In [218]:
subscale_columns = defaultdict(list)
overall_enjoyment_columns = defaultdict(list)

for column, meta in survey_metadata_lookup.iterrows():
    if column not in survey_numeric.columns:
        continue
    if meta.get("topic") != "enjoyment":
        continue
    if (meta.get("question_type") or "").strip().lower() != "likert":
        continue
    prefix = column.split("_Survey_")[0]
    subscale = (meta.get("subscale") or "").strip()
    overall_enjoyment_columns[prefix].append(column)
    if subscale:
        subscale_columns[(prefix, subscale)].append(column)

for (prefix, subscale), cols in subscale_columns.items():
    values = survey_numeric[cols]
    sum_col = f"{prefix}_Survey_{subscale}_Sum"
    count_col = f"{prefix}_Survey_{subscale}_Count"
    mean_col = f"{prefix}_Survey_{subscale}_Mean"
    norm_col = f"{prefix}_Survey_{subscale}_Normalized"
    count_values = values.notna().sum(axis=1)
    sum_values = values.sum(axis=1, min_count=1)
    survey_numeric[count_col] = count_values
    survey_numeric[sum_col] = sum_values
    survey_numeric[mean_col] = np.where(count_values > 0, sum_values / count_values, np.nan)
    survey_numeric[norm_col] = np.where(
        count_values > 0,
        np.clip((sum_values - count_values) / (4.0 * count_values), 0, 1),
        np.nan,
    )

for prefix, cols in overall_enjoyment_columns.items():
    values = survey_numeric[cols]
    sum_col = f"{prefix}_Survey_EnjoymentComposite_Sum"
    count_col = f"{prefix}_Survey_EnjoymentComposite_Count"
    mean_col = f"{prefix}_Survey_EnjoymentComposite_Mean"
    norm_col = f"{prefix}_Survey_EnjoymentComposite_Normalized"
    norm_corrected_col = f"{prefix}_Survey_EnjoymentComposite_NormalizedCorrected"
    corrected_col = f"{prefix}_Survey_EnjoymentComposite_Corrected"
    count_values = values.notna().sum(axis=1)

    raw_components = pd.DataFrame(index=values.index, dtype=float)
    corrected_components = pd.DataFrame(index=values.index, dtype=float)

    for column in cols:
        polarity_meta = ""
        if column in survey_metadata_lookup.index:
            polarity_meta = (survey_metadata_lookup.loc[column].get("polarity") or "").strip().lower()
        if column in survey_responses.columns:
            raw_series = survey_responses.loc[values.index, column]
            parsed_series = raw_series.apply(_parse_likert_value)
        else:
            fallback_series = pd.to_numeric(survey_numeric.loc[values.index, column], errors="coerce")
            if polarity_meta == "negative":
                parsed_series = fallback_series.apply(_reverse_likert)
            else:
                parsed_series = fallback_series
        raw_components[column] = pd.to_numeric(parsed_series, errors="coerce")
        corrected_series = pd.to_numeric(parsed_series, errors="coerce")
        if polarity_meta == "negative":
            corrected_series = corrected_series.apply(_reverse_likert)
        corrected_components[column] = corrected_series

    raw_sum_values = raw_components.sum(axis=1, min_count=1)
    corrected_sum = corrected_components.sum(axis=1, min_count=1)
    raw_normalized = np.where(
        count_values > 0,
        np.clip((raw_sum_values - count_values) / (4.0 * count_values), 0, 1),
        np.nan,
    )
    corrected_normalized = np.where(
        count_values > 0,
        np.clip((corrected_sum - count_values) / (4.0 * count_values), 0, 1),
        np.nan,
    )
    corrected_mean = np.where(count_values > 0, corrected_sum / count_values, np.nan)

    survey_numeric[count_col] = count_values
    survey_numeric[sum_col] = raw_sum_values
    survey_numeric[corrected_col] = corrected_sum
    survey_numeric[mean_col] = corrected_mean
    survey_numeric[norm_col] = raw_normalized
    survey_numeric[norm_corrected_col] = corrected_normalized

survey_numeric

  survey_numeric[count_col] = count_values
  survey_numeric[sum_col] = sum_values
  survey_numeric[mean_col] = np.where(count_values > 0, sum_values / count_values, np.nan)
  survey_numeric[norm_col] = np.where(
  survey_numeric[count_col] = count_values
  survey_numeric[sum_col] = sum_values
  survey_numeric[mean_col] = np.where(count_values > 0, sum_values / count_values, np.nan)
  survey_numeric[norm_col] = np.where(
  survey_numeric[count_col] = count_values
  survey_numeric[sum_col] = sum_values
  survey_numeric[mean_col] = np.where(count_values > 0, sum_values / count_values, np.nan)
  survey_numeric[norm_col] = np.where(
  survey_numeric[count_col] = count_values
  survey_numeric[sum_col] = sum_values
  survey_numeric[mean_col] = np.where(count_values > 0, sum_values / count_values, np.nan)
  survey_numeric[norm_col] = np.where(
  survey_numeric[count_col] = count_values
  survey_numeric[sum_col] = sum_values
  survey_numeric[mean_col] = np.where(count_values > 0, sum_values / c

Unnamed: 0,respondent,survey_group,survey_study,survey_gender,survey_age,survey_file,Long_The Town_Survey_Enjoyment_E1,Long_The Town_Survey_Enjoyment_E12,Long_The Town_Survey_Enjoyment_E14,Long_The Town_Survey_Enjoyment_E15,...,Long_Abbot Elementary_Survey_EnjoymentComposite_Corrected,Long_Abbot Elementary_Survey_EnjoymentComposite_Mean,Long_Abbot Elementary_Survey_EnjoymentComposite_Normalized,Long_Abbot Elementary_Survey_EnjoymentComposite_NormalizedCorrected,Short_Abbot Elementary_Survey_EnjoymentComposite_Count,Short_Abbot Elementary_Survey_EnjoymentComposite_Sum,Short_Abbot Elementary_Survey_EnjoymentComposite_Corrected,Short_Abbot Elementary_Survey_EnjoymentComposite_Mean,Short_Abbot Elementary_Survey_EnjoymentComposite_Normalized,Short_Abbot Elementary_Survey_EnjoymentComposite_NormalizedCorrected
0,83,Default,Group A,MALE,69,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5.0,5.0,5.0,5.0,...,,,,,0,,,,,
1,81,Default,Group A,FEMALE,24,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5.0,5.0,5.0,5.0,...,,,,,0,,,,,
2,99,Default,Group A,FEMALE,25,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,4.0,4.0,4.0,4.0,...,,,,,0,,,,,
3,52,Default,Group A,FEMALE,50,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5.0,4.0,5.0,5.0,...,,,,,0,,,,,
4,8,Default,Group A,OTHER,51,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5.0,5.0,5.0,5.0,...,,,,,0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,107,Default,Group F,MALE,33,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,43.0,3.583333,0.645833,0.645833,0,,,,,
77,85,Default,Group F,FEMALE,34,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,43.0,3.583333,0.687500,0.645833,0,,,,,
78,70,Default,Group F,FEMALE,61,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,59.0,4.916667,0.895833,0.979167,0,,,,,
79,96,Default,Group F,FEMALE,29,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,53.0,4.416667,0.770833,0.854167,0,,,,,


In [219]:
familiarity_prefixes = sorted({
    column.split("_Survey_")[0]
    for column in survey_numeric.columns
    if column.endswith("_Survey_Familiarity_F1")
})

for prefix in familiarity_prefixes:
    f1_col = f"{prefix}_Survey_Familiarity_F1"
    f2_col = f"{prefix}_Survey_Familiarity_F2"
    if f1_col not in survey_numeric.columns or f2_col not in survey_numeric.columns:
        continue
    c1_col = f"{prefix}_Survey_Familiarity_C1"
    count_col = f"{prefix}_Survey_Familiarity_C1_Count"
    norm_col = f"{prefix}_Survey_Familiarity_C1_Normalized"
    pair = survey_numeric[[f1_col, f2_col]]
    sum_values = pair.fillna(0).sum(axis=1)
    count_values = pair.notna().sum(axis=1)
    survey_numeric[c1_col] = sum_values
    survey_numeric[count_col] = count_values
    survey_numeric[norm_col] = np.where(
        count_values > 0,
        np.clip(sum_values / (4 * count_values), 0, 1),
        np.nan,
    )

survey_numeric

  survey_numeric[c1_col] = sum_values
  survey_numeric[count_col] = count_values
  survey_numeric[norm_col] = np.where(
  survey_numeric[c1_col] = sum_values
  survey_numeric[count_col] = count_values
  survey_numeric[norm_col] = np.where(
  survey_numeric[c1_col] = sum_values
  survey_numeric[count_col] = count_values
  survey_numeric[norm_col] = np.where(
  survey_numeric[c1_col] = sum_values
  survey_numeric[count_col] = count_values
  survey_numeric[norm_col] = np.where(
  survey_numeric[c1_col] = sum_values
  survey_numeric[count_col] = count_values
  survey_numeric[norm_col] = np.where(
  survey_numeric[c1_col] = sum_values
  survey_numeric[count_col] = count_values
  survey_numeric[norm_col] = np.where(


Unnamed: 0,respondent,survey_group,survey_study,survey_gender,survey_age,survey_file,Long_The Town_Survey_Enjoyment_E1,Long_The Town_Survey_Enjoyment_E12,Long_The Town_Survey_Enjoyment_E14,Long_The Town_Survey_Enjoyment_E15,...,Long_The Town_Survey_Familiarity_C1_Normalized,Short_Abbot Elementary_Survey_Familiarity_C1,Short_Abbot Elementary_Survey_Familiarity_C1_Count,Short_Abbot Elementary_Survey_Familiarity_C1_Normalized,Short_Mad Max_Survey_Familiarity_C1,Short_Mad Max_Survey_Familiarity_C1_Count,Short_Mad Max_Survey_Familiarity_C1_Normalized,Short_The Town_Survey_Familiarity_C1,Short_The Town_Survey_Familiarity_C1_Count,Short_The Town_Survey_Familiarity_C1_Normalized
0,83,Default,Group A,MALE,69,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5.0,5.0,5.0,5.0,...,0.000,0.0,0,,0.0,2,0.000,0.0,0,
1,81,Default,Group A,FEMALE,24,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5.0,5.0,5.0,5.0,...,0.750,0.0,0,,4.0,2,0.500,0.0,0,
2,99,Default,Group A,FEMALE,25,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,4.0,4.0,4.0,4.0,...,0.000,0.0,0,,1.0,2,0.125,0.0,0,
3,52,Default,Group A,FEMALE,50,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5.0,4.0,5.0,5.0,...,0.000,0.0,0,,0.0,2,0.000,0.0,0,
4,8,Default,Group A,OTHER,51,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5.0,5.0,5.0,5.0,...,0.125,0.0,0,,3.0,2,0.375,0.0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,107,Default,Group F,MALE,33,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,0.0,0,,4.0,2,0.500,0.0,0,
77,85,Default,Group F,FEMALE,34,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,0.0,0,,5.0,2,0.625,0.0,0,
78,70,Default,Group F,FEMALE,61,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,0.0,0,,0.0,2,0.000,0.0,0,
79,96,Default,Group F,FEMALE,29,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,0.0,0,,3.0,2,0.375,0.0,0,


In [220]:
open_ended_columns = [
    column
    for column, meta in survey_metadata_lookup.iterrows()
    if column in survey_numeric.columns and (meta.get("question_type") or "").strip().lower() == "open ended"
]

survey_open_ended = survey_responses[[
    "respondent",
    "survey_group",
    "survey_study",
    "survey_gender",
    "survey_age",
    "survey_file",
    *open_ended_columns,
]].copy()

survey_features = survey_numeric.drop(columns=[col for col in open_ended_columns if col in survey_numeric.columns])

# Integrate screening familiarity composites
screening_path = project_root / "results" / "individual_composite_scores.csv"
if screening_path.exists():
    screening_raw = pd.read_csv(screening_path)
    screening_raw["respondent"] = screening_raw["respondent"].astype(str).str.strip()

    screening_value_columns = [
        col
        for col in screening_raw.columns
        if col.endswith("_Survey_Familiarity_F1")
        or col.endswith("_Survey_Familiarity_F2")
        or col.endswith("_Survey_Familiarity_F3")
        or col.endswith("_Survey_Familiarity_C1")
    ]

    respondent_groups = (
        uv_stage1
        .loc[:, ["respondent", "group"]]
        .assign(group=lambda df: df["group"].astype(str).str.strip().str.upper())
        .set_index("respondent")
        .to_dict()
    )["group"]

    title_normalization = {
        "mad max fury road": "Mad Max",
        "mad max": "Mad Max",
        "the town": "The Town",
        "abbot elementary": "Abbot Elementary",
        "abbott elementary": "Abbot Elementary",
    }

    def canonicalize_title(raw_title: str) -> str:
        cleaned = str(raw_title).strip()
        return title_normalization.get(cleaned.lower(), cleaned)

    stimulus_map = pd.read_csv(project_root / "data" / "stimulus_rename.csv")
    stimulus_map["group_letter"] = stimulus_map["group"].str.extract(r"([A-F])", expand=False)
    stimulus_map["title_clean"] = stimulus_map["title"].astype(str).str.strip()

    group_title_form_lookup = {}
    default_form_per_title = {}

    for row in stimulus_map.itertuples():
        if pd.isna(row.group_letter) or pd.isna(row.title_clean) or pd.isna(row.form):
            continue
        canonical_title = canonicalize_title(row.title_clean)
        form_value = str(row.form).title()
        group_title_form_lookup[(row.group_letter, canonical_title)] = form_value
        default_form_per_title.setdefault(canonical_title, form_value)

    screening_records = []

    for _, row in screening_raw.iterrows():
        respondent_id = row.get("respondent")
        if respondent_id is None:
            continue
        respondent_id = str(respondent_id).strip()
        group_letter = respondent_groups.get(respondent_id)
        for column in screening_value_columns:
            value = row.get(column)
            if pd.isna(value) or value == "":
                continue
            base_part, _, suffix_part = column.partition("_Survey_Familiarity_")
            if not suffix_part:
                continue
            question_code = suffix_part.strip()
            canonical_title = canonicalize_title(base_part.strip())
            form_value = None
            if group_letter:
                form_value = group_title_form_lookup.get((group_letter, canonical_title))
            if form_value is None:
                form_value = default_form_per_title.get(canonical_title, "Long")
            target_column = f"{form_value}_{canonical_title}_Screening_Familiarity_{question_code}"
            screening_records.append({
                "respondent": respondent_id,
                "target_column": target_column,
                "value": pd.to_numeric(value, errors="coerce")
            })

    if screening_records:
        screening_features = (
            pd.DataFrame(screening_records)
            .pivot_table(index="respondent", columns="target_column", values="value", aggfunc="first")
            .reset_index()
        )
        screening_features.columns.name = None
        survey_features = survey_features.merge(screening_features, on="respondent", how="left")

survey_features

Unnamed: 0,respondent,survey_group,survey_study,survey_gender,survey_age,survey_file,Long_The Town_Survey_Enjoyment_E1,Long_The Town_Survey_Enjoyment_E12,Long_The Town_Survey_Enjoyment_E14,Long_The Town_Survey_Enjoyment_E15,...,Long_The Town_Screening_Familiarity_F2,Short_Abbot Elementary_Screening_Familiarity_C1,Short_Abbot Elementary_Screening_Familiarity_F1,Short_Abbot Elementary_Screening_Familiarity_F2,Short_Mad Max_Screening_Familiarity_C1,Short_Mad Max_Screening_Familiarity_F1,Short_Mad Max_Screening_Familiarity_F2,Short_The Town_Screening_Familiarity_C1,Short_The Town_Screening_Familiarity_F1,Short_The Town_Screening_Familiarity_F2
0,83,Default,Group A,MALE,69,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5.0,5.0,5.0,5.0,...,,5.0,4.0,1.0,1.0,1.0,,,,
1,81,Default,Group A,FEMALE,24,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5.0,5.0,5.0,5.0,...,2.0,1.0,1.0,,4.0,3.0,1.0,,,
2,99,Default,Group A,FEMALE,25,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,4.0,4.0,4.0,4.0,...,,2.0,2.0,,4.0,3.0,1.0,,,
3,52,Default,Group A,FEMALE,50,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5.0,4.0,5.0,5.0,...,,1.0,1.0,,2.0,2.0,,,,
4,8,Default,Group A,OTHER,51,MERGED_SURVEY_RESPONSE_MATRIX-A1.txt,5.0,5.0,5.0,5.0,...,1.0,4.0,3.0,1.0,4.0,3.0,1.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,107,Default,Group F,MALE,33,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,,,,5.0,4.0,1.0,5.0,4.0,1.0
77,85,Default,Group F,FEMALE,34,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,,,,4.0,3.0,1.0,0.0,0.0,
78,70,Default,Group F,FEMALE,61,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,,,,1.0,1.0,,1.0,1.0,
79,96,Default,Group F,FEMALE,29,MERGED_SURVEY_RESPONSE_MATRIX-F2.txt,,,,,...,,,,,4.0,3.0,1.0,4.0,3.0,1.0


In [221]:
def _safe_write_csv(df: pd.DataFrame, path: Path) -> Path:
    try:
        df.to_csv(path, index=False)
        return path
    except PermissionError:
        fallback = path.with_name(f"{path.stem}_{pd.Timestamp.utcnow().strftime('%Y%m%d%H%M%S')}.csv")
        df.to_csv(fallback, index=False)
        print(f"Permission denied for {path}. Saved to {fallback.name} instead.")
        return fallback

base_uv = full_uv.copy() if "full_uv" in globals() else uv_stage1.copy()
base_uv["respondent"] = base_uv["respondent"].astype(str).str.strip()

survey_features["respondent"] = survey_features["respondent"].astype(str).str.strip()
uv_stage3 = base_uv.merge(survey_features, on="respondent", how="left")

results_dir = project_root / "results"
results_dir.mkdir(parents=True, exist_ok=True)

features_path = _safe_write_csv(survey_features, results_dir / "uv_stage3_full_features.csv")
open_ended_path = _safe_write_csv(survey_open_ended, results_dir / "uv_stage3_full_open_ended.csv")
uv_path = _safe_write_csv(uv_stage3, results_dir / "uv_stage3_full_uv.csv")

print(
    f"Stage 3 survey features saved to {features_path.name} with {survey_features.shape[0]} respondents and "
    f"{survey_features.shape[1] - 1} feature columns."
)
print(f"Open-ended responses archived to {open_ended_path.name}.")
print(f"Unified view with Stage 3 survey data exported to {uv_path.name}.")

uv_stage3

Stage 3 survey features saved to uv_stage3_full_features.csv with 81 respondents and 356 feature columns.
Open-ended responses archived to uv_stage3_full_open_ended.csv.
Unified view with Stage 3 survey data exported to uv_stage3_full_uv.csv.


Unnamed: 0,source_file,group,respondent,age,gender,date_study,time_study,age_group,ethnicity,income_group,...,Long_The Town_Screening_Familiarity_F2,Short_Abbot Elementary_Screening_Familiarity_C1,Short_Abbot Elementary_Screening_Familiarity_F1,Short_Abbot Elementary_Screening_Familiarity_F2,Short_Mad Max_Screening_Familiarity_C1,Short_Mad Max_Screening_Familiarity_F1,Short_Mad Max_Screening_Familiarity_F2,Short_The Town_Screening_Familiarity_C1,Short_The Town_Screening_Familiarity_F1,Short_The Town_Screening_Familiarity_F2
0,003_104.csv,A,104,59,Male,10/16/2025,18:09:03,44-59,White,"$60,000 or more per year",...,1.0,0.0,0.0,,1.0,1.0,,,,
1,002_106.csv,A,106,30,Male,10/16/2025,19:35:05,28-43,White,"$60,000 or more per year",...,,4.0,3.0,1.0,4.0,3.0,1.0,,,
2,006_11.csv,A,11,33,Female,10/11/2025,09:32:42,28-43,White,"$35,000  $60,000 per year",...,,,,,1.0,1.0,,0.0,0.0,
3,006_11.csv,A,11,33,Female,10/11/2025,09:32:42,28-43,White,"$35,000  $60,000 per year",...,,,,,1.0,1.0,,0.0,0.0,
4,001_116.csv,A,116,19,Male,10/18/2025,12:37:40,18-27,White,"$35,000  $60,000 per year",...,,1.0,1.0,,4.0,3.0,1.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,005_50.csv,F,50,63,Other,10/14/2025,09:54:03,60-69,Black/African American,"$60,000 or more per year",...,,,,,6.0,4.0,2.0,6.0,3.0,3.0
81,004_60.csv,F,60,66,Male,10/15/2025,09:34:06,60-69,White,"$35,000  $60,000 per year",...,,,,,1.0,1.0,,1.0,1.0,
82,003_70.csv,F,70,61,Female,10/16/2025,09:49:14,60-69,Black/African American,"$35,000  $60,000 per year",...,,,,,1.0,1.0,,1.0,1.0,
83,002_85.csv,F,85,34,Female,10/17/2025,14:37:41,28-43,White,"$60,000 or more per year",...,,,,,4.0,3.0,1.0,0.0,0.0,


In [222]:
print(survey_features.loc[(survey_features["respondent"].astype(int)==82)]['Long_Abbot Elementary_Survey_EnjoymentComposite_Sum'])
print(survey_features.loc[(survey_features["respondent"].astype(int)==82)]['Long_Abbot Elementary_Survey_Enjoyment_E18'])
print(survey_features.loc[(survey_features["respondent"].astype(int)==82)]['Long_Abbot Elementary_Survey_EnjoymentComposite_Corrected'])

31    25.0
Name: Long_Abbot Elementary_Survey_EnjoymentComposite_Sum, dtype: float64
31    2.0
Name: Long_Abbot Elementary_Survey_Enjoyment_E18, dtype: float64
31    23.0
Name: Long_Abbot Elementary_Survey_EnjoymentComposite_Corrected, dtype: float64


In [223]:
prefix = "Long_Abbot Elementary"
columns_to_show = [
    f"{prefix}_Survey_EnjoymentComposite_Sum",
    f"{prefix}_Survey_EnjoymentComposite_Corrected",
    f"{prefix}_Survey_EnjoymentComposite_Count",
    f"{prefix}_Survey_EnjoymentComposite_Normalized",
    f"{prefix}_Survey_EnjoymentComposite_NormalizedCorrected",
    f"{prefix}_Survey_EnjoymentComposite_Mean",
]
comparison = survey_features.loc[
    survey_features[f"{prefix}_Survey_EnjoymentComposite_Count"].gt(0),
    columns_to_show
].head(10).copy()
comparison["normalized_from_corrected"] = np.clip(
    (comparison[f"{prefix}_Survey_EnjoymentComposite_Corrected"]
     - comparison[f"{prefix}_Survey_EnjoymentComposite_Count"])
    / (4.0 * comparison[f"{prefix}_Survey_EnjoymentComposite_Count"]),
    0,
    1,
 )
comparison

Unnamed: 0,Long_Abbot Elementary_Survey_EnjoymentComposite_Sum,Long_Abbot Elementary_Survey_EnjoymentComposite_Corrected,Long_Abbot Elementary_Survey_EnjoymentComposite_Count,Long_Abbot Elementary_Survey_EnjoymentComposite_Normalized,Long_Abbot Elementary_Survey_EnjoymentComposite_NormalizedCorrected,Long_Abbot Elementary_Survey_EnjoymentComposite_Mean,normalized_from_corrected
22,41.0,41.0,12,0.604167,0.604167,3.416667,0.604167
23,51.0,53.0,12,0.8125,0.854167,4.416667,0.854167
24,50.0,54.0,12,0.791667,0.875,4.5,0.875
25,45.0,49.0,12,0.6875,0.770833,4.083333,0.770833
26,51.0,55.0,12,0.8125,0.895833,4.583333,0.895833
27,35.0,37.0,12,0.479167,0.520833,3.083333,0.520833
28,19.0,15.0,12,0.145833,0.0625,1.25,0.0625
29,19.0,15.0,12,0.145833,0.0625,1.25,0.0625
30,55.0,55.0,12,0.895833,0.895833,4.583333,0.895833
31,25.0,23.0,12,0.270833,0.229167,1.916667,0.229167
