### For users: just run Part 1 and Part 3 (Part 2 is a single-subject walkthrough)

# 1. Loading the data that we need

In [1]:
from cloudpathlib import S3Path, S3Client
from pathlib import Path

# Local directory handed to the S3 client as its cache location.
cache_path = Path('/tmp/cache')
# Idempotent creation: builds missing parents and does not fail when the
# directory already exists (no separate exists() check needed).
cache_path.mkdir(parents=True, exist_ok=True)

# Root S3Path of the Natural Scenes Dataset (NSD) bucket. The client is created
# with no_sign_request=True, so no credentials are passed here.
nsd_base_path = S3Path(
    's3://natural-scenes-dataset/',
    client=S3Client(
        no_sign_request=True,
        local_cache_dir=cache_path))


In [2]:
# Base S3 location holding the per-subject folders (subjXX) that load_behav_data reads from.
behav_basepath = nsd_base_path / 'nsddata' / 'ppdata' 

import pandas as pd

def load_behav_data(participant_id, local_cache_dir=(Path.home() / 'cache')):
    """Load the trial-level behavioural responses of one NSD participant.

    Reads ``subjXX/behav/responses.tsv`` below ``behav_basepath`` and parses it
    as a tab-separated table.

    Parameters
    ----------
    participant_id : int
        Participant number; it is zero-padded to two digits to build the
        ``subjXX`` folder name.
    local_cache_dir : path-like or None, optional
        Directory that is created if it does not exist yet. It is not used for
        the read itself. Pass ``None`` to skip creating it.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``responses.tsv``.
    """
    if local_cache_dir is not None:
        # parents=True so that a nested cache location does not fail.
        Path(local_cache_dir).mkdir(parents=True, exist_ok=True)

    tsv_path = behav_basepath / f'subj{participant_id:02d}' / 'behav' / 'responses.tsv'

    with tsv_path.open('r') as f:
        return pd.read_csv(f, sep='\t')

In [3]:
# Trial-level behavioural data for the example subject selected by `example_id`.
# NOTE(review): the variable keeps the name `sub01_behav` even though
# `example_id` is 2 here — confirm which subject is intended.
example_id = 2
sub01_behav = load_behav_data(participant_id=example_id)

sub01_behav

Unnamed: 0,SUBJECT,SESSION,RUN,TRIAL,73KID,10KID,TIME,ISOLD,ISCORRECT,RT,CHANGEMIND,MEMORYRECENT,MEMORYFIRST,ISOLDCURRENT,ISCORRECTCURRENT,TOTAL1,TOTAL2,BUTTON,MISSINGDATA
0,2,1,1,1,46003,626,0.731077,0,1.0,871.814177,0.0,,,0,1.0,1,0,1.0,0
1,2,1,1,2,42020,5013,0.731123,0,1.0,899.369976,0.0,,,0,1.0,1,0,1.0,0
2,2,1,1,3,22500,4850,0.731169,0,1.0,534.988883,0.0,,,0,1.0,1,0,1.0,0
3,2,1,1,4,61983,8823,0.731216,0,1.0,525.075803,0.0,,,0,1.0,1,0,1.0,0
4,2,1,1,5,48145,9538,0.731262,0,1.0,551.546343,0.0,,,0,1.0,1,0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,2,40,12,58,22218,8984,294.717581,1,1.0,922.193575,0.0,20963.0,21540.0,0,0.0,0,1,2.0,0
29996,2,40,12,59,30294,6026,294.717627,1,1.0,1103.050162,0.0,16.0,17622.0,1,1.0,0,1,2.0,0
29997,2,40,12,60,61376,4841,294.717673,1,1.0,1078.099882,0.0,9483.0,11912.0,0,0.0,0,1,2.0,0
29998,2,40,12,61,42648,7323,294.717719,1,1.0,725.315119,0.0,83.0,12162.0,1,1.0,0,1,2.0,0


In [4]:
# Stimulus information table: whether an image is part of shared1000, on which
# trial(s) it was presented to each subject, etc.
stiminfo = nsd_base_path / 'nsddata' / 'experiments' / 'nsd' / 'nsd_stim_info_merged.csv'

with stiminfo.open('r') as stiminfo_file:
    stimdata = pd.read_csv(stiminfo_file)

stimdata

Unnamed: 0.1,Unnamed: 0,cocoId,cocoSplit,cropBox,loss,nsdId,flagged,BOLD5000,shared1000,subject1,...,subject5_rep2,subject6_rep0,subject6_rep1,subject6_rep2,subject7_rep0,subject7_rep1,subject7_rep2,subject8_rep0,subject8_rep1,subject8_rep2
0,0,532481,val2017,"(0, 0, 0.1671875, 0.1671875)",0.100000,0,False,False,False,0,...,0,0,0,0,0,0,0,0,0,0
1,1,245764,val2017,"(0, 0, 0.125, 0.125)",0.000000,1,False,False,False,0,...,0,0,0,0,13985,14176,28603,0,0,0
2,2,385029,val2017,"(0, 0, 0.125, 0.125)",0.000000,2,False,False,False,0,...,0,0,0,0,0,0,0,0,0,0
3,3,311303,val2017,"(0, 0, 0.16640625, 0.16640625)",0.125000,3,False,False,False,0,...,0,0,0,0,0,0,0,0,0,0
4,4,393226,val2017,"(0, 0, 0.125, 0.125)",0.133333,4,False,False,False,0,...,0,13720,22861,23023,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72995,72995,518071,train2017,"(0, 0, 0.125, 0.125)",0.000000,72995,False,False,False,0,...,0,0,0,0,0,0,0,6083,11650,26531
72996,72996,255930,train2017,"(0, 0, 0.125, 0.125)",0.125000,72996,False,False,False,0,...,0,0,0,0,0,0,0,10402,10434,10625
72997,72997,255934,train2017,"(0, 0, 0.1, 0.1)",0.000000,72997,False,False,False,0,...,0,0,0,0,0,0,0,0,0,0
72998,72998,518080,train2017,"(0.125, 0.125, 0, 0)",0.000000,72998,False,False,False,0,...,0,0,0,0,5585,11846,14495,0,0,0


In [5]:
# Keep only the shared1000 images, drop the leftover "Unnamed: 0" column and
# append the 1-based stimulus id (73KID = nsdId + 1), which is the key compared
# against the behavioural data's "73KID" column later on.
shared1000 = (
    stimdata.loc[stimdata["shared1000"] == True]
    .drop(columns=["Unnamed: 0"])
    .assign(**{"73KID": lambda frame: frame["nsdId"] + 1})
)

shared1000

Unnamed: 0,cocoId,cocoSplit,cropBox,loss,nsdId,flagged,BOLD5000,shared1000,subject1,subject2,...,subject6_rep0,subject6_rep1,subject6_rep2,subject7_rep0,subject7_rep1,subject7_rep2,subject8_rep0,subject8_rep1,subject8_rep2,73KID
2950,262145,train2017,"(0, 0, 0.16640625, 0.16640625)",0.09375,2950,False,True,True,1,1,...,2616,9716,27566,2616,9716,27566,2616,9716,27566,2951
2990,262239,train2017,"(0, 0, 0.1671875, 0.1671875)",0.10000,2990,False,True,True,1,1,...,18458,18697,27711,18458,18697,27711,18458,18697,27711,2991
3049,262414,train2017,"(0, 0, 0.125, 0.125)",0.00000,3049,False,True,True,1,1,...,6299,6448,6697,6299,6448,6697,6299,6448,6697,3050
3077,524646,train2017,"(0, 0, 0.1671875, 0.1671875)",0.00000,3077,False,True,True,1,1,...,4289,4515,4537,4289,4515,4537,4289,4515,4537,3078
3146,262690,train2017,"(0, 0, 0.16640625, 0.16640625)",0.00000,3146,False,True,True,1,1,...,8087,8443,26807,8087,8443,26807,8087,8443,26807,3147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72312,515508,train2017,"(0, 0, 0.125, 0.125)",0.00000,72312,False,True,True,1,1,...,2920,3086,3373,2920,3086,3373,2920,3086,3373,72313
72510,254130,train2017,"(0, 0, 0.16640625, 0.16640625)",0.00000,72510,False,True,True,1,1,...,18619,19110,25310,18619,19110,25310,18619,19110,25310,72511
72605,516634,train2017,"(0.125, 0.125, 0, 0)",0.00000,72605,False,True,True,1,1,...,6893,7036,7046,6893,7036,7046,6893,7036,7046,72606
72719,304627,train2017,"(0.125, 0.125, 0, 0)",0.00000,72719,False,True,True,1,1,...,2926,11601,21500,2926,11601,21500,2926,11601,21500,72720


In [6]:
# 73KIDs of the stimuli that belong to shared1000.
# The file is read without a header row, so the ids end up in column 0.
stim_shared1000 = nsd_base_path / 'nsddata' / 'stimuli' / 'nsd' / 'shared1000.tsv'

with stim_shared1000.open('r') as shared1000_file:
    stimlabel_shared1000 = pd.read_csv(shared1000_file, sep="\t", header=None)

stimlabel_shared1000

Unnamed: 0,0
0,2951
1,2991
2,3050
3,3078
4,3147
...,...
995,72313
996,72511
997,72606
998,72720


In [7]:
# 73KIDs of the stimuli that belong to special100.
# The file is read without a header row, so the ids end up in column 0.
stim_special100 = nsd_base_path / 'nsddata' / 'stimuli' / 'nsd' / 'special100.tsv'

with stim_special100.open('r') as special100_file:
    stimlabel_special100 = pd.read_csv(special100_file, sep="\t", header=None)

stimlabel_special100

Unnamed: 0,0
0,3078
1,3172
2,3914
3,4424
4,4668
...,...
95,70233
96,70506
97,71411
98,72016


# 2. Adding relevant info columns to sub01_behav

In [8]:
# column 'trial_full': 1-based running trial number spanning all the sessions (1 - 30,000).
# It is derived from the row index, so it assumes the frame still carries its
# default 0..N-1 index in presentation order — TODO confirm for every subject.

sub01_behav["trial_full"] = sub01_behav.index + 1

In [9]:
# column 'is_shared1000': 1 when the presented stimulus (73KID) is listed in shared1000, else 0
shared1000_ids = stimlabel_shared1000.iloc[:, 0]
sub01_behav["is_shared1000"] = sub01_behav["73KID"].isin(shared1000_ids).astype(int)

In [10]:
# column 'is_special100': 1 when the presented stimulus (73KID) is listed in special100, else 0
special100_ids = stimlabel_special100.iloc[:, 0]
sub01_behav["is_special100"] = sub01_behav["73KID"].isin(special100_ids).astype(int)

In [11]:
# column 'shared1000_repNum': for stimuli included in shared1000, the repetition number
# (1 = first presentation, 2 = second, 3 = third); 0 for every other trial.

# Map (73KID, global trial number) -> repetition number, using the rep columns
# of the subject that is actually loaded (`example_id`) rather than a
# hand-edited subject number.
rep_lookup = {}
for rep_num in range(3):
    rep_col = f"subject{example_id}_rep{rep_num}"
    for stim_id, trial_val in zip(shared1000["73KID"], shared1000[rep_col]):
        rep_lookup[(stim_id, trial_val)] = rep_num + 1

# One pass over the trials; a trial is labelled only when both its stimulus id
# and its global trial number match an entry of shared1000.
sub01_behav["shared1000_repNum"] = [
    rep_lookup.get(key, 0)
    for key in zip(sub01_behav["73KID"], sub01_behav["trial_full"])
]


In [12]:
# Inspect the frame with the columns added so far in Part 2.
sub01_behav

Unnamed: 0,SUBJECT,SESSION,RUN,TRIAL,73KID,10KID,TIME,ISOLD,ISCORRECT,RT,...,ISOLDCURRENT,ISCORRECTCURRENT,TOTAL1,TOTAL2,BUTTON,MISSINGDATA,trial_full,is_shared1000,is_special100,shared1000_repNum
0,2,1,1,1,46003,626,0.731077,0,1.0,871.814177,...,0,1.0,1,0,1.0,0,1,1,0,1
1,2,1,1,2,42020,5013,0.731123,0,1.0,899.369976,...,0,1.0,1,0,1.0,0,2,0,0,0
2,2,1,1,3,22500,4850,0.731169,0,1.0,534.988883,...,0,1.0,1,0,1.0,0,3,0,0,0
3,2,1,1,4,61983,8823,0.731216,0,1.0,525.075803,...,0,1.0,1,0,1.0,0,4,0,0,0
4,2,1,1,5,48145,9538,0.731262,0,1.0,551.546343,...,0,1.0,1,0,1.0,0,5,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,2,40,12,58,22218,8984,294.717581,1,1.0,922.193575,...,0,0.0,0,1,2.0,0,29996,0,0,0
29996,2,40,12,59,30294,6026,294.717627,1,1.0,1103.050162,...,1,1.0,0,1,2.0,0,29997,0,0,0
29997,2,40,12,60,61376,4841,294.717673,1,1.0,1078.099882,...,0,0.0,0,1,2.0,0,29998,0,0,0
29998,2,40,12,61,42648,7323,294.717719,1,1.0,725.315119,...,1,1.0,0,1,2.0,0,29999,0,0,0


In [13]:
# Sanity check: show only the trials whose stimulus is flagged as special100.
# NOTE(review): the variable name says "shared1000" although the filter uses
# `is_special100` — confirm which subset was meant.
sub01_shared1000_check = sub01_behav.loc[sub01_behav["is_special100"].eq(1)]
sub01_shared1000_check

Unnamed: 0,SUBJECT,SESSION,RUN,TRIAL,73KID,10KID,TIME,ISOLD,ISCORRECT,RT,...,ISOLDCURRENT,ISCORRECTCURRENT,TOTAL1,TOTAL2,BUTTON,MISSINGDATA,trial_full,is_shared1000,is_special100,shared1000_repNum
55,2,1,1,56,53053,727,0.733854,0,1.0,593.146577,...,0,1.0,1,0,1.0,0,56,1,1,1
345,2,1,6,33,45596,616,0.757509,0,1.0,1168.589350,...,0,1.0,1,0,1.0,0,346,1,1,1
353,2,1,6,41,53053,727,0.757926,1,1.0,921.801170,...,1,1.0,0,1,2.0,0,354,1,1,2
504,2,1,9,5,11943,145,0.771090,0,1.0,1334.250302,...,0,1.0,1,0,1.0,0,505,1,1,1
599,2,1,10,37,45596,616,0.777415,1,1.0,857.316533,...,1,1.0,0,1,2.0,0,600,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22265,2,30,9,16,13614,163,233.659378,1,1.0,1251.044418,...,0,0.0,0,1,2.0,0,22266,1,1,3
22284,2,30,9,35,16617,197,233.660350,1,0.0,1941.883593,...,0,1.0,1,0,1.0,0,22285,1,1,3
22348,2,30,10,36,64499,882,233.664674,1,1.0,3502.778591,...,0,0.0,1,1,2.0,0,22349,1,1,3
22394,2,30,11,20,25372,326,233.668224,1,1.0,931.165750,...,0,0.0,0,1,2.0,0,22395,1,1,3


In [14]:
# Presentation condition of every shared1000 trial for the loaded subject:
#   'new'  - first presentation (rep0)
#   'easy' - repeat shown in the same session as the previous presentation
#   'hard' - repeat shown in a different session than the previous presentation
# All other trials are labelled 'unknown'.

# Use the rep columns of the subject that was actually loaded (`example_id`);
# a fixed subject number mislabels the trials whenever another subject is loaded.
rep_cols = [f"subject{example_id}_rep{rep_num}" for rep_num in range(3)]

# Global trial number -> session number, for the trials present in the data.
trial_to_session = dict(zip(sub01_behav["trial_full"], sub01_behav["SESSION"]))

# Global trial number -> label. Trials that never get an entry stay 'unknown'.
trial_to_label = {}
for _, row in shared1000.iterrows():
    reps = [row[col] for col in rep_cols]

    # rep0 is the first presentation, provided that trial exists in the data.
    if reps[0] in trial_to_session:
        trial_to_label[reps[0]] = 'new'

    # rep1 is compared with rep0, rep2 with rep1. A repeat is labelled only
    # when both trials of the pair exist in the data (missing/NaN trial
    # numbers are simply not found in the mapping).
    for prev_trial, cur_trial in zip(reps, reps[1:]):
        prev_session = trial_to_session.get(prev_trial)
        cur_session = trial_to_session.get(cur_trial)
        if prev_session is None or cur_session is None:
            continue
        trial_to_label[cur_trial] = 'easy' if prev_session == cur_session else 'hard'

# Assign by trial number rather than by row position, so the result does not
# depend on the index labels matching the row order.
sub01_behav['presentation_type'] = (
    sub01_behav["trial_full"].map(trial_to_label).fillna('unknown')
)

In [15]:
# Inspect the frame including the new 'presentation_type' column.
sub01_behav

Unnamed: 0,SUBJECT,SESSION,RUN,TRIAL,73KID,10KID,TIME,ISOLD,ISCORRECT,RT,...,ISCORRECTCURRENT,TOTAL1,TOTAL2,BUTTON,MISSINGDATA,trial_full,is_shared1000,is_special100,shared1000_repNum,presentation_type
0,1,1,1,1,46003,626,0.505082,0,1.0,803.529781,...,1.0,1,0,1.0,0,1,1,0,1,new
1,1,1,1,2,61883,5013,0.505128,0,1.0,972.261383,...,1.0,1,0,1.0,0,2,0,0,0,unknown
2,1,1,1,3,829,4850,0.505175,0,1.0,742.351236,...,1.0,1,0,1.0,0,3,0,0,0,unknown
3,1,1,1,4,67574,8823,0.505221,0,1.0,747.518479,...,1.0,1,0,1.0,0,4,0,0,0,unknown
4,1,1,1,5,16021,9538,0.505267,0,1.0,547.422774,...,1.0,1,0,1.0,0,5,0,0,0,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,1,40,12,58,13774,8984,262.629551,1,0.0,1275.300175,...,1.0,1,0,1.0,0,29996,0,0,0,unknown
29996,1,40,12,59,66768,6026,262.629597,1,1.0,661.379768,...,1.0,0,1,2.0,0,29997,0,0,0,unknown
29997,1,40,12,60,53168,4841,262.629644,1,1.0,786.811781,...,0.0,0,1,2.0,0,29998,0,0,0,unknown
29998,1,40,12,61,1944,7323,262.629690,1,1.0,502.626801,...,1.0,0,1,2.0,0,29999,0,0,0,unknown


In [17]:
# Name the output after the subject that was actually loaded (`example_id`)
# rather than a fixed "sub01", so the file name always matches its contents.
sub01_behav.to_csv(f"sub{example_id:02d}_behav.tsv", sep="\t", index=False)

# 3. Loop over all subjects (only Part 1 needs to be run first)

In [20]:
import os


def _add_trial_and_set_flags(behav, shared_ids, special_ids):
    """Add 'trial_full', 'is_shared1000' and 'is_special100' to `behav` in place.

    'trial_full' is the 1-based running trial number derived from the row
    index; the two flags are 1 when the trial's 73KID is contained in
    `shared_ids` / `special_ids` and 0 otherwise.
    """
    behav["trial_full"] = behav.index + 1
    behav["is_shared1000"] = behav["73KID"].isin(shared_ids).astype(int)
    behav["is_special100"] = behav["73KID"].isin(special_ids).astype(int)


def _add_repetition_number(behav, shared, sub_id):
    """Add 'shared1000_repNum' to `behav` in place.

    The value is 1, 2 or 3 for the first, second or third presentation of a
    stimulus listed in `shared`, and 0 for every other trial. A trial is
    labelled only when both its 73KID and its 'trial_full' match the
    `subject{sub_id}_rep{0,1,2}` entry of that stimulus.
    """
    rep_lookup = {}
    for rep_num in range(3):
        rep_col = f"subject{sub_id}_rep{rep_num}"
        for stim_id, trial_val in zip(shared["73KID"], shared[rep_col]):
            rep_lookup[(stim_id, trial_val)] = rep_num + 1

    # One pass over the trials instead of a full-frame comparison per
    # (stimulus, repetition) pair.
    behav["shared1000_repNum"] = [
        rep_lookup.get(key, 0)
        for key in zip(behav["73KID"], behav["trial_full"])
    ]


def _add_presentation_type(behav, shared, sub_id):
    """Add 'presentation_type' to `behav` in place.

    'new' marks the first presentation (rep0) of a stimulus listed in
    `shared`; a repeat is 'easy' when it falls in the same session as the
    previous presentation and 'hard' otherwise. All other trials are
    'unknown'.
    """
    rep_cols = [f"subject{sub_id}_rep{rep_num}" for rep_num in range(3)]

    # Global trial number -> session number, for the trials present in the data.
    trial_to_session = dict(zip(behav["trial_full"], behav["SESSION"]))

    trial_to_label = {}
    for _, row in shared.iterrows():
        reps = [row[col] for col in rep_cols]

        if reps[0] in trial_to_session:
            trial_to_label[reps[0]] = 'new'

        # rep1 is compared with rep0, rep2 with rep1. Pairs with a trial that
        # is not present in the data (or is NaN) are skipped.
        for prev_trial, cur_trial in zip(reps, reps[1:]):
            prev_session = trial_to_session.get(prev_trial)
            cur_session = trial_to_session.get(cur_trial)
            if prev_session is None or cur_session is None:
                continue
            trial_to_label[cur_trial] = 'easy' if prev_session == cur_session else 'hard'

    # Assign by trial number, not by row position.
    behav['presentation_type'] = behav["trial_full"].map(trial_to_label).fillna('unknown')


## create "behav" folder once if it doesn't exist
os.makedirs("behav", exist_ok=True)

for sub_id in range(1, 9):

    ## load the subject's trial-level behavior data
    sub_behav = load_behav_data(sub_id)

    ## columns 'trial_full', 'is_shared1000', 'is_special100'
    _add_trial_and_set_flags(
        sub_behav,
        stimlabel_shared1000.iloc[:, 0],
        stimlabel_special100.iloc[:, 0],
    )

    ## column 'shared1000_repNum': == 1 when presented for the first time, == 2 second, == 3 third
    _add_repetition_number(sub_behav, shared1000, sub_id)

    ## column 'presentation_type': == new for novel presentation, == easy when repeated in the
    ## same session, == hard when repeated in a different session
    _add_presentation_type(sub_behav, shared1000, sub_id)

    sub_behav.to_csv(f"behav/sub{sub_id:02d}_behav.tsv", sep="\t", index=False)

    