In [14]:
# from preprocessing.data_collection.multipleye_data_collection import prepare_language_folder
from preprocessing.data_collection.multipleye_data_collection import MultipleyeDataCollection
from pathlib import Path

import preprocessing
from preprocessing import constants

## Pre-processing MultiplEYE Data

In [15]:
data_collection_name = "MultiplEYE_SQ_CH_Zurich_1_2025"
# data_collection_name = 'MultiplEYE_SL_SI_Ljubljana_1_2025'

If necessary, prepare the data folder by unzipping the downloaded files. Works only for MultiplEYE and MeRID data collections so far. Also, there might be some manual steps necessary.

In [16]:
this_repo = Path().resolve()
data_folder_path = this_repo / "data" / data_collection_name

# MultipleyeDataCollection.create_from_data_folder(data_folder_path)


In [17]:
multipleye_sq = MultipleyeDataCollection.create_from_data_folder(data_folder_path)

preprocessed_data_folder = this_repo / "preprocessed_data" / data_collection_name
preprocessed_data_folder.mkdir(parents=True, exist_ok=True)

Folder test_sessions does not match the regex pattern \d\d\d_SQ_CH_1_ET\d. Not considered as session.
Folder pilot_sessions does not match the regex pattern \d\d\d_SQ_CH_1_ET\d. Not considered as session.


In [24]:
multipleye_sq.convert_edf_to_asc()
multipleye_sq.prepare_session_level_information()

Converting EDF to ASC: 100%|██████████| 6/6 [00:00<00:00, 1579.38it/s]
Preparing session 011_SQ_CH_1_ET1: 100%|██████████| 6/6 [00:08<00:00,  1.50s/it]


In [25]:
multipleye_sq

Title	MultiplEYE_SQ_CH_Zurich_1_2025
Dataset_type	MultiplEYE
Number_of_sessions	6
Number_of_pilots	0
Tested_language	SQ
Country	CH
Year	2025
Number of eye-tracking (ET) sessions per participant	1

In [26]:
sessions = [s for s in multipleye_sq]
sess = sessions[0]
idf = sess.session_identifier

## Creating Gaze Frame from ASCII File

In [27]:
asc = sess.asc_path
output_folder = preprocessed_data_folder / idf
output_folder.mkdir(parents=True, exist_ok=True)

In [29]:
gaze = preprocessing.load_gaze_data(
    asc_file=asc,
    lab_config=sess.lab_config,
    session_idf=idf,
    trial_cols=constants.TRIAL_COLS,
)

In [30]:
preprocessing.save_raw_data(output_folder / "raw_data", sess.session_identifier, gaze)

This usually happens if you did not specify any column content and the content could not be autodetected from the column names. 
Please specify 'pixel_columns', 'position_columns', 'velocity_columns' or 'acceleration_columns' explicitly during initialization. Otherwise, transformation methods may fail.
  warn(


## Coordinate and Velocity Preprocessing

Eye movements are recorded in screen pixel coordinates, which depend on stimulus size and monitor setup. To compare gaze behavior across participants, screens, or datasets, it is standard to convert pixel positions 
into **degrees of visual angle (dva)**. Next, we compute **gaze velocity**, which allows us to detect saccades and distinguish them from fixations.

In [31]:
preprocessing.preprocess_gaze(gaze)

## Detect Events and Compute Their Properties

Eye-tracking data are typically segmented into events, i.e. `fixations` and `saccades`. Fixations represent moments when the eyes remain relatively still, allowing visual information to be processed, while saccades are the rapid movements between fixations that reposition the gaze. Detecting these events and computing their properties, such as `dispersion`, fixation `duration`, saccade `amplitude`, and `peak velocity`, provides the foundation for analyzing visual behavior and understanding how participants explore a stimulus.

### Fixations

We can detect fixations by applying the `I-VT` or the `I-DT` method.

The **I-VT (Velocity-Threshold Identification)** method distinguishes fixation and saccade points based on their point-to-point velocities. Each point is classified as a fixation if its velocity is below the specified threshold. Consecutive fixation points are then merged into a single fixation. A threshold of 20 degrees/second is commonly used as a default maximum value. Read more about [the IVT algorithm in the documentation](https://pymovements.readthedocs.io/en/stable/reference/api/pymovements.events.detection.ivt.html) 

The **I-DT (Dispersion-Threshold Identification)** method finds fixations by grouping consecutive points within a maximum separation (dispersion) threshold and a minimum duration threshold. The algorithm slides a moving window across the data: if the dispersion within the window is below the threshold, the window represents a fixation and is gradually expanded until the dispersion exceeds the threshold.
Read more about [our implementation of the IDT method](https://pymovements.readthedocs.io/en/stable/reference/api/pymovements.events.detection.idt.html).

We use the `I-VT` algorithm with the following key deafault parameters:
- `minimum duration`: 100 ms 
- `velocity threshold`: 20.0

Such properties as `location`, containing the centroid coordinates of each fixation, and `dispersion` will also be calculated.

In [32]:
preprocessing.detect_fixations(
    gaze,
)

### Saccades

Saccades are rapid eye movements that shift the point of fixation from one location to another. We detect saccades (or micro-saccades) from the velocity sequence of gaze data using the [microsaccades algorithm](https://pymovements.readthedocs.io/en/stable/reference/api/pymovements.events.detection.microsaccades.html#pymovements.events.detection.microsaccades). This algorithm implements a noise-adaptive velocity threshold, meaning that the detection threshold automatically scales with the noise level of the velocity signal. Such properties as `amplitude` and `peak velocity` of the detected saccades will also be calcuated.

The key default parameters are:
- `threshold_factor`: Multiplier used to determine the velocity threshold relative to the noise level of the signal. The default value is 6. A higher factor makes the algorithm more conservative (detects fewer saccades), while a lower factor makes it more sensitive.
- `minimum_duration`: Defines how long a velocity peak must persist to be classified as a saccade. The duration is expressed in the same units as timesteps. If no timesteps are provided, the value refers to the number of samples (default = 6), which corresponds to about 12 ms at a 500 Hz sampling rate. Shorter events are ignored as noise. 

In [33]:
preprocessing.detect_saccades(
    gaze,
)

In [36]:
preprocessing.map_fixations_to_aois(
    gaze,
    sess.stimuli,
)

In [37]:
gaze.save(output_folder / 'preprocessed_gaze', save_events=True, save_samples=True, verbose=2)

Saving events to  /Users/debor/repos/multipleye-preprocessing/preprocessed_data/MultiplEYE_SQ_CH_Zurich_1_2025/006_SQ_CH_1_ET1/preprocessed_gaze/events.feather
Saving samples to /Users/debor/repos/multipleye-preprocessing/preprocessed_data/MultiplEYE_SQ_CH_Zurich_1_2025/006_SQ_CH_1_ET1/preprocessed_gaze/samples.feather
Saving experiment file to /Users/debor/repos/multipleye-preprocessing/preprocessed_data/MultiplEYE_SQ_CH_Zurich_1_2025/006_SQ_CH_1_ET1/preprocessed_gaze


time,pupil,activity,stimulus,trial,practice,page,session,pixel,position,velocity
i64,f64,str,str,str,bool,str,str,list[f64],list[f64],list[f64]
2629468,1085.0,"""reading""","""Enc_WikiMoon_13""","""PRACTICE_trial_1""",true,"""page_1""","""006_SQ_CH_1_ET1""","[46.3, 126.2]","[-15.974782, -9.88542]","[-1.204677, -0.592322]"
2629469,1093.0,"""reading""","""Enc_WikiMoon_13""","""PRACTICE_trial_1""",true,"""page_1""","""006_SQ_CH_1_ET1""","[44.7, 124.9]","[-16.01472, -9.919118]","[-1.191805, -0.608509]"
2629470,1083.0,"""reading""","""Enc_WikiMoon_13""","""PRACTICE_trial_1""",true,"""page_1""","""006_SQ_CH_1_ET1""","[44.1, 126.3]","[-16.029693, -9.882828]","[-1.215066, -0.604523]"
2629471,1088.0,"""reading""","""Enc_WikiMoon_13""","""PRACTICE_trial_1""",true,"""page_1""","""006_SQ_CH_1_ET1""","[44.7, 126.4]","[-16.01472, -9.880235]","[-1.261583, -0.63971]"
2629472,1087.0,"""reading""","""Enc_WikiMoon_13""","""PRACTICE_trial_1""",true,"""page_1""","""006_SQ_CH_1_ET1""","[44.3, 124.7]","[-16.024702, -9.924302]","[-1.251194, -0.60687]"
…,…,…,…,…,…,…,…,…,…,…
10558289,877.0,"""question""","""Arg_PISARapaNui_11""","""trial_10""",false,"""question_11131""","""006_SQ_CH_1_ET1""","[278.9, 887.1]","[-10.015618, 10.229845]","[-0.553149, -0.989099]"
10558290,880.0,"""question""","""Arg_PISARapaNui_11""","""trial_10""",false,"""question_11131""","""006_SQ_CH_1_ET1""","[279.7, 887.1]","[-9.99466, 10.229845]","[-0.434833, -0.882126]"
10558291,878.0,"""question""","""Arg_PISARapaNui_11""","""trial_10""",false,"""question_11131""","""006_SQ_CH_1_ET1""","[278.1, 888.0]","[-10.036573, 10.253125]","[-0.353749, -0.792705]"
10558292,879.0,"""question""","""Arg_PISARapaNui_11""","""trial_10""",false,"""question_11131""","""006_SQ_CH_1_ET1""","[278.5, 885.7]","[-10.026096, 10.193624]","[-0.32388, -0.776548]"

trial,stimulus,page,name,onset,offset,duration,dispersion,amplitude,peak_velocity,dispersion_right,location_x,location_y,char_idx,char,top_left_x,top_left_y,width,height,char_idx_in_line,line_idx,word_idx,word_idx_in_line,word
str,str,str,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,i64,str,f64,f64,i64,i64,i64,i64,i64,i64,str
"""PRACTICE_trial_1""","""Enc_WikiMoon_13""","""page_1""","""fixation""",2629468,2629981,513,0.583651,,,,44.820233,120.750584,,,,,,,,,,,
"""PRACTICE_trial_1""","""Enc_WikiMoon_13""","""page_1""","""fixation""",2630016,2630288,272,0.543708,,,,92.486447,123.882784,,,,,,,,,,,
"""PRACTICE_trial_1""","""Enc_WikiMoon_13""","""page_1""","""fixation""",2630429,2630555,126,0.331975,,,,108.837795,204.895276,5,"""t""",95.0,184.7,14,33,1,1,1,0,"""https://sq.wikipedia.org/wiki/…"
"""PRACTICE_trial_1""","""Enc_WikiMoon_13""","""page_1""","""fixation""",2630599,2630794,195,0.484497,,,,194.780612,224.294388,,,,,,,,,,,
"""PRACTICE_trial_1""","""Enc_WikiMoon_13""","""page_1""","""fixation""",2630834,2631374,540,1.651275,,,,121.55915,214.301479,6,"""t""",109.0,184.7,14,33,2,1,1,0,"""https://sq.wikipedia.org/wiki/…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""trial_10""","""Arg_PISARapaNui_11""","""question_11131""","""saccade""",10556812,10556851,39,,1.31632,29.422723,1.655,,,,,,,,,,,,,
"""trial_10""","""Arg_PISARapaNui_11""","""question_11131""","""saccade""",10557104,10557144,40,,1.476838,35.549238,1.635209,,,,,,,,,,,,,
"""trial_10""","""Arg_PISARapaNui_11""","""question_11131""","""saccade""",10557229,10557320,91,,18.353944,359.532422,21.382693,,,,,,,,,,,,,
"""trial_10""","""Arg_PISARapaNui_11""","""question_11131""","""saccade""",10557444,10557460,16,,0.477121,12.711628,0.632066,,,,,,,,,,,,,


## Calculate Reading Measures

In [38]:
from preprocessing.metrics.words import all_tokens_from_aois, mark_skipped_tokens, repair_word_labels
from preprocessing.metrics.fixations import annotate_fixations
from preprocessing.metrics.reading_measures import build_word_level_table

import polars as pl

trial = 4
trial_label = "trial_" + str(trial)
aois = sess.stimuli[trial].text_stimulus.aois

In [39]:
# add word label to blank spaces between words in AOIs
aois_clean = repair_word_labels(aois)

# collect all words from AOIs for the given trial
all_tokens = all_tokens_from_aois(aois_clean, trial=trial_label)

### Fixation-based Metrics

In [44]:
# create a fixation table
fixation_table = annotate_fixations(gaze.events.frame)
fixation_table.head(20)

trial,page,fixation_id,onset,word_idx,char_idx,char,run_id,is_first_pass,duration,word,prev_word_idx,next_word_idx,is_reg_in,is_reg_out,is_first_fix
str,str,u32,i64,i64,i64,str,i64,bool,i64,str,i64,i64,bool,bool,bool
"""PRACTICE_trial_1""","""page_1""",0,2630429,1,5,"""t""",1,true,126,"""https://sq.wikipedia.org/wiki/…",,1,,false,true
"""PRACTICE_trial_1""","""page_1""",1,2630834,1,6,"""t""",1,true,540,"""https://sq.wikipedia.org/wiki/…",1,3,false,false,false
"""PRACTICE_trial_1""","""page_1""",2,2634207,3,45,"""i""",2,true,108,"""Wikipedia,""",1,4,false,false,true
"""PRACTICE_trial_1""","""page_1""",3,2635216,4,57,"""k""",3,true,200,"""enciklopedia""",3,7,false,false,true
"""PRACTICE_trial_1""","""page_1""",4,2636548,7,74,"""n""",4,true,306,"""Hëna""",4,8,false,false,true
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""PRACTICE_trial_1""","""page_1""",15,2640797,21,154,"""t""",14,true,290,"""distancë""",20,22,false,false,true
"""PRACTICE_trial_1""","""page_1""",16,2641130,22,161,"""e""",15,true,180,"""mesatare""",21,22,false,false,true
"""PRACTICE_trial_1""","""page_1""",17,2641350,22,166,"""r""",15,true,169,"""mesatare""",22,23,false,false,false
"""PRACTICE_trial_1""","""page_1""",18,2641562,23,172,"""j""",16,true,178,"""prej""",22,24,false,false,true


In [41]:
#  annotate skipped words based on fixation table and all tokens
words_with_skip = mark_skipped_tokens(all_tokens, fixation_table)

In [42]:
# calculate word-level reading measures
word_level_table = build_word_level_table(
    words=words_with_skip,
    fix=fixation_table,
)

  pl.DataFrame(


In [45]:
with pl.Config(tbl_rows=50):
    print(word_level_table.filter(pl.col("page") == "page_1").select([
        "word_idx", "word", "skipped", "FPF", "TFC", "SL_in", "RPD_inc", "RBRT", "TFT"
    ]))

shape: (109, 9)
┌──────────┬────────────┬─────────┬─────┬───┬───────┬─────────┬──────┬─────┐
│ word_idx ┆ word       ┆ skipped ┆ FPF ┆ … ┆ SL_in ┆ RPD_inc ┆ RBRT ┆ TFT │
│ ---      ┆ ---        ┆ ---     ┆ --- ┆   ┆ ---   ┆ ---     ┆ ---  ┆ --- │
│ i64      ┆ str        ┆ i8      ┆ i8  ┆   ┆ i64   ┆ i64     ┆ i64  ┆ i64 │
╞══════════╪════════════╪═════════╪═════╪═══╪═══════╪═════════╪══════╪═════╡
│ 0        ┆ Mali       ┆ 0       ┆ 1   ┆ … ┆ 0     ┆ 115     ┆ 115  ┆ 636 │
│ 1        ┆ Magjik     ┆ 0       ┆ 1   ┆ … ┆ 1     ┆ 1078    ┆ 557  ┆ 557 │
│ 2        ┆ -          ┆ 1       ┆ 0   ┆ … ┆ 0     ┆ 0       ┆ 0    ┆ 0   │
│ 3        ┆ Dy         ┆ 1       ┆ 0   ┆ … ┆ 0     ┆ 0       ┆ 0    ┆ 0   │
│ 4        ┆ fjalë      ┆ 0       ┆ 1   ┆ … ┆ 3     ┆ 273     ┆ 273  ┆ 273 │
│ 5        ┆ Hyrjeje    ┆ 0       ┆ 1   ┆ … ┆ 1     ┆ 450     ┆ 450  ┆ 450 │
│ 6        ┆ Historia   ┆ 0       ┆ 1   ┆ … ┆ 1     ┆ 251     ┆ 251  ┆ 251 │
│ 7        ┆ për        ┆ 0       ┆ 1   ┆ … ┆ 1     ┆ 146   

## The END
