## Aux Functions

In [15]:
from pathlib import Path
from datetime import datetime
import json
import os
import shutil
import subprocess
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import random
import seaborn as sns
import statsmodels.formula.api as smf

### Others

In [16]:
def dedupe_inplace(seq):
    """Remove repeated items from ``seq`` in place, keeping first occurrences.

    The relative order of the surviving items is preserved. Items must be
    hashable because membership is tracked with a set. Nothing is returned;
    the list passed in is modified directly.
    """
    already_seen = set()
    kept = 0
    for candidate in seq:
        if candidate in already_seen:
            continue
        already_seen.add(candidate)
        # Compact unique items towards the front; `kept` never overtakes the read position.
        seq[kept] = candidate
        kept += 1
    # Everything past the compacted prefix is leftover and gets trimmed.
    del seq[kept:]

### Add a simulation to the list

In [17]:
# add simulations to list
def add(subject=None, story=None, simulations=None):
    """Append simulation file paths to ``simulations`` and remove duplicates.

    Selection modes:
      * ``subject`` only  -> all simulations of that subject (via ``_add_subject``)
      * ``story`` only    -> all simulations of that story (via ``_add_story``)
      * both              -> the single simulation for that subject and story

    Args:
        subject: Subject identifier, or None.
        story: Story identifier, or None.
        simulations: List to extend; it is modified in place.

    Returns:
        The same ``simulations`` list, deduplicated with first occurrences kept.

    Raises:
        ValueError: If ``simulations`` is None, if neither ``subject`` nor
            ``story`` is given, or if both are given and the corresponding
            simulation file does not exist.
    """
    if simulations is None:
        raise ValueError("Simulations list is required")
    if subject is None and story is None:
        raise ValueError("Provide at least subject or story")

    # Identity checks (`is None`) instead of `== None`; the three cases are
    # mutually exclusive, so a single if/elif/else chain expresses them.
    if story is None:
        _add_subject(subject, simulations)
    elif subject is None:
        _add_story(story, simulations)
    else:
        sim_path = _make_simulation_path(subject, story)
        if not os.path.exists(sim_path):
            raise ValueError(f"Simulation does not exist {sim_path}")
        simulations.append(sim_path)

    dedupe_inplace(simulations)
    return simulations
        
# add all simulations of a subject
def _add_subject(subject, simulations):
    suffix = f"{subject}.tsv"
    simulations.extend(str(p) for p in Path("simulations").rglob("*.tsv")
                       if p.name.lower().endswith(suffix))

# add all simulations of a story
def _add_story(story, simulations):
    story_dir = Path("simulations") / story
    if story_dir.is_dir():
        simulations.extend(str(p) for p in story_dir.rglob("*.tsv"))


def _make_simulation_path(subject, story):
    return f'simulations/{story}/{story}_ob1_{subject}.tsv'

### Correlation

In [18]:
def load_pkl(file_path):
    """Load and return the object stored in the pickle file at ``file_path``.

    Thin wrapper around ``pd.read_pickle``.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files you trust.
    loaded = pd.read_pickle(file_path)
    return loaded

In [19]:
def load_all_subjects(folder):
    """Load every ``.pkl`` file in ``folder`` and stack them into one DataFrame.

    Each file is read with ``pd.read_pickle`` and gets a ``"subj"`` column
    holding the file name without its ``.pkl`` extension. Files are processed
    in sorted name order so the row order of the result is reproducible.

    Args:
        folder: Directory containing one pickle file per subject.

    Returns:
        The concatenation of all loaded frames with a fresh integer index.

    Raises:
        ValueError: If ``folder`` contains no ``.pkl`` files.
    """
    extension = ".pkl"
    # os.listdir returns entries in arbitrary order; sort for deterministic output.
    pkl_names = sorted(name for name in os.listdir(folder) if name.endswith(extension))
    if not pkl_names:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(f"No .pkl files found in {folder}")

    all_dfs = []
    for fname in pkl_names:
        # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
        df = pd.read_pickle(os.path.join(folder, fname))
        # Strip only the trailing extension; str.replace would also remove a
        # ".pkl" that appears in the middle of the file name.
        df["subj"] = fname[:-len(extension)]
        all_dfs.append(df)
    return pd.concat(all_dfs, ignore_index=True)

In [20]:
def average_metrics_over_subjects(df):
    """Average the numeric metrics of each word across subjects.

    Rows are grouped by ``sentence_idx``, ``word_idx`` and ``word`` and the
    mean of every numeric column is taken. For the duration metrics a value of
    0 is treated as missing and ignored by the mean. Groups left without any
    value end up as 0 in the result.

    Args:
        df: Per-subject measures containing the grouping columns and the
            duration metric columns listed below. It is not modified.

    Returns:
        A new DataFrame with one row per (sentence_idx, word_idx, word).
    """
    # Metrics where 0 means there was no fixation -> set to NaN so they are
    # not taken into account by the mean.
    duration_metrics = ["FFD", "SFD", "FPRT", "RPD", "TFD", "RRT", "SPRT"]
    # Work on a copy: assigning into `df` directly would silently overwrite the
    # caller's zeros with NaN.
    cleaned = df.copy()
    cleaned[duration_metrics] = cleaned[duration_metrics].replace(0, np.nan)
    # Drop the excluded rows (currently disabled).
    #cleaned = cleaned[cleaned["excluded"] == False]
    # Average by sentence index, word index and word.
    mean_df = cleaned.groupby(["sentence_idx", "word_idx", "word"], as_index=False).mean(numeric_only=True)
    mean_df = mean_df.fillna(0)
    return mean_df

In [21]:
# Paths of the simulations selected for this experiment; filled through add().
simulations_list = []

## Setup

Add simulations to `simulations_list` using the `add` function. The possible parameter combinations are:

- subject and story, to add a specific subject-story simulation
- only subject, to add all simulations for that subject
- only story, to add all simulations for that story

In [22]:
# Usage: add(subject=..., story=..., simulations=simulations_list)
# Pass a subject and a story for one simulation, or only one of them to add all of its simulations.
add(subject='subject0', story='Axolotl', simulations=simulations_list)
# Other examples (disabled):
#add(subject='perfil5', story='Axolotl', simulations=simulations_list)
#add(story='cuento2', simulations=simulations_list)
#add(subject='perfil4',simulations=simulations_list)

['simulations/Axolotl/Axolotl_ob1_subject0.tsv']

## Experiment

### Make experiment folder & save files used

Creates a folder to save experiment information

In [23]:
# Name the experiment folder after the current date and time (minute resolution).
stamp = f"{datetime.now():%Y_%m_%d_%H%M}"
experiment_folder = Path("experiments", stamp)
# exist_ok: re-running within the same minute reuses the folder instead of failing.
experiment_folder.mkdir(parents=True, exist_ok=True)
print("Created:", experiment_folder)

Created: experiments/2026_01_13_2248


In [24]:
# Record which simulations were used, next to the experiment's other files.
out_path = Path(experiment_folder, "simulations_list.json")
with open(out_path, "w", encoding="utf-8") as f:
    f.write(json.dumps(simulations_list, ensure_ascii=False, indent=2))

### Generate processed folder

Generates the processed folder the analysis script uses

Creates the folder

In [25]:
processed_path = "../data/processed"

# Start from a clean slate: remove any processed folder left by a previous run.
if not os.path.exists(processed_path):
    print(f"Folder '{processed_path}' does not exist.")
else:
    try:
        shutil.rmtree(processed_path)
    except OSError as e:
        # Deletion problems are reported but do not stop the notebook.
        print(f"Error: {processed_path} : {e.strerror}")
    else:
        print(f"Folder '{processed_path}' and its contents deleted successfully.")

Folder '../data/processed' and its contents deleted successfully.


In [26]:
# Folder that will hold one sub-folder per subject.
trials_path = f"{processed_path}/trials"
os.makedirs(trials_path)
print(f"Folder '{trials_path}' created successfully.")

Folder '../data/processed/trials' created successfully.


Populates the folder in the correct format with the simulations in simulations_list

In [27]:
# Folder with one template sub-folder per story; each is copied for every subject below.
examples_directory = '../data/processed_examples'

In [28]:
def add_screens(simulation, destination_directory):
    """Write one ``fixations.pkl`` per screen from a simulation TSV file.

    The simulation is read as a tab-separated table. For every ``i`` in
    ``range(number of distinct text_id values)`` the rows with
    ``text_id == i`` are turned into a fixation table and pickled to
    ``<destination_directory>/screen_<i+1>/fixations.pkl``, replacing any
    existing file. Start and end times are the running sum of
    ``fixation_duration`` within the screen, starting at 0. Eye, position and
    pupil columns are filled with constant placeholder values.

    Args:
        simulation: Path of the simulation ``.tsv`` file.
        destination_directory: Folder that contains the ``screen_<n>`` folders.
    """
    fixations = pd.read_csv(simulation, sep="\t")
    screen_count = fixations["text_id"].nunique()

    for i in range(screen_count):
        screen_rows = fixations[fixations["text_id"] == i]
        records = []
        elapsed = 0

        # iterrows is kept on purpose so the value types stored in the pickle stay the same.
        for _, fixation in screen_rows.iterrows():
            started_at = elapsed
            elapsed += fixation["fixation_duration"]
            records.append({
                "index": fixation["fixation_counter"],
                "eye": 'R',
                "tStart": started_at,
                "tEnd": elapsed,
                "duration": fixation["fixation_duration"],
                "xAvg": 0,
                "yAvg": 0,
                "pupilAvg": -1,
            })

        pickle_path = destination_directory + f"/screen_{i+1}/fixations.pkl"
        # Remove the file that came with the copied template before writing ours.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
        pd.DataFrame(records).to_pickle(pickle_path)

    print(f"Screens added to {destination_directory}")


In [29]:
for simulation in simulations_list:

    # Simulation files are named "{story}_ob1_{subject}.tsv" (see _make_simulation_path).
    # Splitting on the "_ob1_" marker keeps stories and subjects that contain
    # underscores intact; names without the marker fall back to the positional
    # tokens (first and third) used before.
    stem = Path(simulation).stem
    story, marker, subject = stem.partition("_ob1_")
    if not marker:
        tokens = stem.split("_")
        story, subject = tokens[0], tokens[2]
    subject_path = trials_path + '/' + subject
    print(f"Processing story {story} for subject {subject}")

    # Make sure the subject folder exists. The previous guard tested
    # processed_path, which already exists at this point, so this folder was
    # never created by that branch.
    os.makedirs(subject_path, exist_ok=True)
    source_directory = examples_directory + '/' + story
    destination_directory = subject_path + '/' + story

    # Copy the story template; the screens inside it are overwritten below.
    try:
        shutil.copytree(source_directory, destination_directory)
        print(f"Folder '{source_directory}' successfully copied to '{destination_directory}'")
    except FileExistsError:
        print(f"Error: Destination directory '{destination_directory}' already exists.")
    except Exception as e:
        print(f"An error occurred: {e}")

    add_screens(simulation, destination_directory)
    print()
    

Processing story Axolotl for subject subject0
Folder '../data/processed_examples/Axolotl' successfully copied to '../data/processed/trials/subject0/Axolotl'
Screens added to ../data/processed/trials/subject0/Axolotl



### Run analysis script

In [30]:
# Collect the distinct stories (second path component, i.e. the folder under
# "simulations") in first-seen order.
stories = []
for sim in simulations_list:
    # Path.parts does not depend on the separator used in the string, unlike
    # sim.split('/'), and ignores a leading "./".
    story = Path(sim).parts[1]
    print(story)
    if story not in stories:
        stories.append(story)

Axolotl


In [31]:
%%bash
# Remove everything (except dotfiles) under ../results before the analysis runs.
rm -rf ../results/*

In [32]:
# Run the analysis script once per story, from the repository root (cwd="..").
for story in stories:
    analysis_cmd = ["python3", "em_analysis.py", "--item", story, "--reprocess"]
    # check=True: a failing analysis stops the notebook with CalledProcessError.
    subprocess.run(analysis_cmd, cwd="..", check=True)

Traceback (most recent call last):
  File "/home/lucy/Documents/facu/reading-et/tesis/lib/python3.12/site-packages/scipy/io/matlab/_mio.py", line 39, in _open_file
    return open(file_like, mode), True
           ^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: 'metadata/stimuli_questions.mat'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/lucy/Documents/facu/reading-et/em_analysis.py", line 244, in <module>
    subjects_associations, words_associations = parse_wa_task(questions_file, participants_path)
                                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/lucy/Documents/facu/reading-et/scripts/data_processing/wa_task.py", line 27, in parse_wa_task
    questions = load_matfile(str(questions_file))['stimuli_questions']
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/lucy/Documents/facu/reading-et/scripts/data_proce

CalledProcessError: Command '['python3', 'em_analysis.py', '--item', 'Axolotl', '--reprocess']' returned non-zero exit status 1.

In [None]:
# Keep a copy of the analysis results inside the experiment folder.
source_directory = "../results"
destination_directory = experiment_folder / "results"

try:
    shutil.copytree(source_directory, destination_directory)
except FileExistsError:
    print(f"Error: Destination directory '{destination_directory}' already exists.")
except Exception as e:
    print(f"An error occurred: {e}")
else:
    print(f"Folder '{source_directory}' successfully copied to '{destination_directory}'")
    
    

Folder '../results' successfully copied to 'experiments/2025_12_17_2302/results'


### Correlations

In [None]:
# Show the experiment folder in use; fails with NameError unless the cell that creates it has run in this kernel.
experiment_folder

NameError: name 'experiment_folder' is not defined

In [None]:
# Means over all the original subjects
folder_original = "results_og_all/measures/Axolotl"
all_data_original = load_all_subjects(folder_original)
mean_metrics_original = average_metrics_over_subjects(all_data_original)
# Same for the ob1 results
folder_ob1 = "results_ob1_normal_1/measures/Axolotl"
all_data_ob1 = load_all_subjects(folder_ob1)
mean_metrics_ob1 = average_metrics_over_subjects(all_data_ob1)