In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np
from scipy import stats
from scipy.stats import zscore
from scipy.interpolate import interp1d
from glob import glob
import csv
import seaborn as sns
import string
from string import digits
import colorcet as cc
import ast
import os
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

sns.set(font_scale=1)


This notebook load raw PsychoPy data in C:\Users\nguye\Box\DCL_ARCHIVE\Documents\Events\exp152_fMRIneuralmechanisms\exp152_Session_2\data and extract recognition, recall, and segmentation data in the second session. Recognition and Recall output is in memory_df.csv, segmentation output is in segmentation_df_08_15_2023.csv (last run to include all subjects).

General notes (05/03/2023): whether a subject is cf or fc depends on the output of PsychoPy (recognition and segmentation, recall is the same). Moreover, a subject can be cf for recognition but fc for segmentation. This is because some subjects did not finish session 2 in a single round, but needs multiple rounds due to computer crashes, participants have to leave and come back, etc. Moreover, even with segmentation data, if participants complete fine and coarse segmentation in different rounds, they can have different cf/fc status. To determine exactly cf/fc status, we need to look at the recorded data files. 
In the Memory session, cf/fc status was finalized by Tan on 05/03/2023. 
In the Segmentation session, cf/fc status was finalized by Tan on 05/03/2023. 

# Memory

In [50]:
# There are differences in the column names psychopy output, so it's important to load participants separately by condition
# note that cf doesn't necessarily mean segmentation data for the participant is coarse-fine, for example: subject 26's recognition/recall data is in cf format, but they did the fine then coarse segmentation. This is because psychopy crashed while doing segmentation, so they had to restart the experiment in a different order.
cf_subjects = ['e152004', 'e152007', 'e152009', 'e152010', 'e152014', 'e152016',
               'e152018', 'e152020v2', 'e152022', 'e152024', 'e152026', 'e152027', 'e152028', 'e152029', 'e152030', 'e152032', 'e152034', 'e152036', 'e152038', 'e152042', 'e152045', 'e152047', 'e152050', 'e152052']
fc_subjects = ['e152003', 'e152008', 'e152011', 'e152013', 'e152015', 'e152017', 'e152019',
               'e152021', 'e152023', 'e152025', 'e152031', 'e152033', 'e152035', 'e152037', 'e152039', 'e152040', 'e152041', 'e152043', 'e152044', 'e152046', 'e152049', 'e152051', 'e152053']

# sum to 47 corresponding to 47 fMRI participants, good!
len(cf_subjects), len(fc_subjects)


(24, 23)

In [51]:
def check_in_cf_fc(s2file):
    sub = os.path.basename(s2file).split('_')[0]
    if sub in cf_subjects + fc_subjects:
        return True
    else:
        return False


# Load recall/recognition output files from PsychoPy:
s2files = glob(
    r'C:\Users\nguye\Box\DCL_ARCHIVE\Documents\Events\exp152_fMRIneuralmechanisms\exp152_Session_2\data\e*.csv')

# there are duplicates of the same participant, and we want to use the most updated version only, which is in the cf/fc lists.
s2files = [s2file for s2file in s2files if check_in_cf_fc(s2file)]


## Recall

In [52]:
# TODO: overall, make sure to select the right columns, and make sure the difference between FC and CF (that has different column names) is accounted for.
# TODO: pay attention to idyo subjects raw data files and how they're processed.
# TODO: there're lots of exceptions raised here, come back later
'''
FREE RECALL
'''
# subject 28 miss one recall data for video 2.4.1
# Recall rows can be indexed by actor face shown:
faces = ['images/1.2.3.face.png', 'images/6.3.9.face.png',
         'images/3.1.3.face.png', 'images/2.4.1.face.png']
recalldf = pd.DataFrame()

for s2file in s2files:
    try:
        # s2file = r'C:\Users\nguye\Box\DCL_ARCHIVE\Documents\Events\exp152_fMRIneuralmechanisms\exp152_Session_2\data\e152028_exp152_session_2_2021_Dec_17_0751.csv'
        sub = os.path.basename(s2file).split('_')[0]

        df = pd.read_csv(s2file)
        for i, face in enumerate(faces):
            # recall is in textbox.text column
            # print(df[df.face==face]['textbox.text'].values[0])
            recall = df[df.face == face]['textbox.text'].values[0]
            filtered_sentence = []
            # Remove digits
            remove_digits = str.maketrans('', '', digits)
            try:
                res = recall.translate(remove_digits)
            except AttributeError:
                print("Attribute Error: ", recall, s2file)
            # Make lower case
            sl = res.lower()
            # Remove punctuation
            table = str.maketrans(
                {key: None for key in string.punctuation})
            slnp = sl.translate(table)
            split_str = slnp.split()
            filtered_sentence = split_str
            # Remove stopwords
            #filtered_sentence = [w for w in split_str if not w in s_words]

            # print(filtered_sentence)
            tdf = pd.DataFrame({'sub': [sub], 'movie': [face.split('/')[1][:-9]], 'recall_raw': [
                                recall], 'recall_filtered': [filtered_sentence], 'filtered_length': len(filtered_sentence)})
            recalldf = pd.concat([recalldf, tdf], ignore_index=True)
    except Exception as e:
        print(e)
        print(s2file)


Attribute Error:  nan C:\Users\nguye\Box\DCL_ARCHIVE\Documents\Events\exp152_fMRIneuralmechanisms\exp152_Session_2\data\e152004_exp152_session_2_2021_May_18_1411.csv
Attribute Error:  nan C:\Users\nguye\Box\DCL_ARCHIVE\Documents\Events\exp152_fMRIneuralmechanisms\exp152_Session_2\data\e152028_exp152_session_2_2021_Dec_17_0751.csv


In [None]:
recalldf[recalldf['sub'] == 'e152004']

In [55]:
recalldf[recalldf['sub'] == 'e152028']

Unnamed: 0,sub,movie,recall_raw,recall_filtered,filtered_length,filtered_length_scaled
88,e152028,1.2.3,\n,[],0,-1.260477
89,e152028,6.3.9,,[],0,-1.260477
90,e152028,3.1.3,\n,[],0,-1.260477
91,e152028,2.4.1,\nThis man walks into a bedroom carrying a ham...,"[this, man, walks, into, a, bedroom, carrying,...",97,-0.273061


In [56]:
recalldf['filtered_length_scaled'] = stats.zscore(recalldf['filtered_length'])

In [57]:
recalldf['sub'].unique()

array(['e152003', 'e152004', 'e152007', 'e152008', 'e152009', 'e152010',
       'e152011', 'e152013', 'e152014', 'e152015', 'e152016', 'e152017',
       'e152018', 'e152019', 'e152020v2', 'e152021', 'e152022', 'e152023',
       'e152024', 'e152025', 'e152026', 'e152027', 'e152028', 'e152029',
       'e152030', 'e152031', 'e152032', 'e152033', 'e152034', 'e152035',
       'e152036', 'e152037', 'e152038', 'e152039', 'e152040', 'e152041',
       'e152042', 'e152043', 'e152044', 'e152045', 'e152046', 'e152047',
       'e152049', 'e152050', 'e152051', 'e152052', 'e152053'],
      dtype=object)

## Recognition

In [58]:
'''
RECOGNITION
'''
# NOTE: There's a bug in the experiment script for auto-calculating accuracy on run 3 (3.1.3) for subjects e152003 and e152004,
# so accuracy should be re-calculated based on the correct response and subject response columns
# Also, for these two participants, the successive runs are in different columns, requiring work-arounds.

recogdf = pd.DataFrame()
for s2file in s2files:
    sub = os.path.basename(s2file).split('_')[0]
    try:
        # s2file='/Users/bezdek/Box/DCL_ARCHIVE/Documents/Events/exp152_fMRIneuralmechanisms/exp152_Session_2/data/e152011_exp152_session_2_2021_Aug_30_1305.csv'
        if sub in ['e152003', 'e152004']:
            df = pd.read_csv(s2file)
            df['key_resp.keys'] = df['key_resp.keys'].combine_first(
                df['key_resp_7.keys'])
            df['key_resp.rt'] = df['key_resp.rt'].combine_first(
                df['key_resp_7.rt'])
            df['key_resp.keys'] = df['key_resp.keys'].combine_first(
                df['key_resp_8.keys'])
            df['key_resp.rt'] = df['key_resp.rt'].combine_first(
                df['key_resp_8.rt'])
            df['key_resp.keys'] = df['key_resp.keys'].combine_first(
                df['key_resp_9.keys'])
            df['key_resp.rt'] = df['key_resp.rt'].combine_first(
                df['key_resp_9.rt'])
            df['recog_resp.corr'] = np.where(((df['key_resp.keys'] == 'right') & (df['corrAns'] == 'right')) | (
                (df['key_resp.keys'] == 'left') & (df['corrAns'] == 'left')), 1, 0)
            df = df.rename(
                columns={'key_resp.keys': 'recog_resp.keys', 'key_resp.rt': 'recog_resp.rt'})
            df = df[['movie', 'task', 'image', 'corrAns', 'trial_type',
                     'recog_resp.keys', 'recog_resp.corr', 'recog_resp.rt']]
        elif sub in fc_subjects:
            df = pd.read_csv(s2file)
            # select columns and rows:
            df = df[['movie', 'task', 'image', 'corrAns', 'trial_type',
                     'recog_resp.keys', 'recog_resp.corr', 'recog_resp.rt']]
        elif sub in cf_subjects:
            df = pd.read_csv(s2file)
            # select columns and rows:
            df = df[['movie', 'task', 'image', 'corrAns', 'trial_type',
                     'key_resp.keys', 'key_resp.corr', 'key_resp.rt']]
            df = df.rename(columns={'key_resp.keys': 'recog_resp.keys',
                           'key_resp.corr': 'recog_resp.corr', 'key_resp.rt': 'recog_resp.rt'})

        df = df[(df.task == 'recog') & (df.movie != 'legos2')]
        df = df.groupby(['movie', 'trial_type']).mean(numeric_only=True)
        df['sub'] = sub
        df['movie'] = [x[0] for x in df.index]
        df['trial_type'] = [x[1] for x in df.index]
        recogdf = pd.concat([recogdf, df], ignore_index=True)
    except Exception as e:
        print(e, sub)
        print(s2file)


In [59]:
recogdf['sub'].unique()

array(['e152003', 'e152004', 'e152007', 'e152008', 'e152009', 'e152010',
       'e152011', 'e152013', 'e152014', 'e152015', 'e152016', 'e152017',
       'e152018', 'e152019', 'e152020v2', 'e152021', 'e152022', 'e152023',
       'e152024', 'e152025', 'e152026', 'e152027', 'e152028', 'e152029',
       'e152030', 'e152031', 'e152032', 'e152033', 'e152034', 'e152035',
       'e152036', 'e152037', 'e152038', 'e152039', 'e152040', 'e152041',
       'e152042', 'e152043', 'e152044', 'e152045', 'e152046', 'e152047',
       'e152049', 'e152050', 'e152051', 'e152052', 'e152053'],
      dtype=object)

In [60]:
recogdf

Unnamed: 0,recog_resp.corr,recog_resp.rt,sub,movie,trial_type
0,0.750000,1.228249,e152003,1.2.3,lure
1,0.842105,1.542761,e152003,1.2.3,old
2,0.850000,1.623031,e152003,2.4.1,lure
3,0.666667,1.607780,e152003,2.4.1,old
4,0.650000,1.670984,e152003,3.1.3,lure
...,...,...,...,...,...
371,0.952381,1.657716,e152053,2.4.1,old
372,0.750000,4.222865,e152053,3.1.3,lure
373,0.947368,2.427225,e152053,3.1.3,old
374,0.450000,2.537419,e152053,6.3.9,lure


In [47]:
# pivot to wide format
recogdf_wide = recogdf.pivot_table(
    index=['sub', 'movie'], columns='trial_type', values='recog_resp.corr').reset_index()
recogdf_wide

trial_type,sub,movie,lure,old
0,e152005,1.2.3,0.55,0.894737
1,e152005,2.4.1,0.65,0.857143
2,e152005,3.1.3,0.70,1.000000
3,e152005,6.3.9,0.50,0.904762
4,e152006,1.2.3,0.90,0.947368
...,...,...,...,...
183,e152052,6.3.9,0.80,0.666667
184,e152053,1.2.3,0.95,1.000000
185,e152053,2.4.1,0.40,0.952381
186,e152053,3.1.3,0.75,0.947368


In [13]:
# compute sensitivity (d') and bias/criterion
recogdf_wide['dprime'] = 0
recogdf_wide['bias'] = 0
for sub in recogdf_wide['sub'].unique():
    for movie in recogdf_wide.movie.unique():
        tempdf = recogdf_wide[(recogdf_wide['sub'] == sub)
                              & (recogdf_wide['movie'] == movie)]
        # add correction for 100 percent hits / correct rejections
        hit = tempdf.old.item()
        # Correct for perfect
        if hit == 1:
            hit = .975
        if hit == 0:
            hit = .025
        miss = 1 - hit
        cr = tempdf.lure.item()
        if cr == 1:
            cr = .975
        if cr == 0:
            cr = .025
        fa = 1 - cr

        dprime = stats.norm.ppf(hit) - stats.norm.ppf(fa)
        c = -(stats.norm.ppf(hit) + stats.norm.ppf(fa))/2.0
        recogdf_wide.loc[(recogdf_wide['sub'] == sub) & (
            recogdf_wide['movie'] == movie), 'dprime'] = dprime.item()
        recogdf_wide.loc[(recogdf_wide['sub'] == sub) & (
            recogdf_wide['movie'] == movie), 'bias'] = c.item()


In [14]:
recogdf_wide


trial_type,sub,movie,lure,old,dprime,bias
0,e152005,1.2.3,0.55,0.894737,1.377781,-0.563229
1,e152005,2.4.1,0.65,0.857143,1.452891,-0.341125
2,e152005,3.1.3,0.70,1.000000,2.484364,-0.717782
3,e152005,6.3.9,0.50,0.904762,1.309172,-0.654586
4,e152006,1.2.3,0.90,0.947368,2.901408,-0.169152
...,...,...,...,...,...,...
183,e152052,6.3.9,0.80,0.666667,1.272349,0.205447
184,e152053,1.2.3,0.95,1.000000,3.604818,-0.157555
185,e152053,2.4.1,0.40,0.952381,1.415044,-0.960869
186,e152053,3.1.3,0.75,0.947368,2.294346,-0.472683


## Merge Recall and Recognition

In [15]:
memory_df = pd.merge(recalldf, recogdf_wide, on=['sub', 'movie'])
memory_df

Unnamed: 0,sub,movie,recall_raw,recall_filtered,filtered_length,filtered_length_scaled,lure,old,dprime,bias
0,e152005,1.2.3,This woman walked into a room with a table on...,"[this, woman, walked, into, a, room, with, a, ...",193,0.606046,0.55,0.894737,1.377781,-0.563229
1,e152005,6.3.9,This actor was folding laundry in the video. H...,"[this, actor, was, folding, laundry, in, the, ...",98,-0.330204,0.50,0.904762,1.309172,-0.654586
2,e152005,3.1.3,This actor was preparing breakfast in the vide...,"[this, actor, was, preparing, breakfast, in, t...",140,0.083717,0.70,1.000000,2.484364,-0.717782
3,e152005,2.4.1,This actor was in the bathroom in the video. H...,"[this, actor, was, in, the, bathroom, in, the,...",63,-0.675138,0.65,0.857143,1.452891,-0.341125
4,e152006,1.2.3,This lady came in from the left side of the r...,"[this, lady, came, in, from, the, left, side, ...",287,1.532441,0.90,0.947368,2.901408,-0.169152
...,...,...,...,...,...,...,...,...,...,...
183,e152052,2.4.1,\nThis guy walked into his bathroom and I beli...,"[this, guy, walked, into, his, bathroom, and, ...",217,0.842573,0.65,0.857143,1.452891,-0.341125
184,e152053,1.2.3,A woman enters a room from the left side of t...,"[a, woman, enters, a, room, from, the, left, s...",159,0.270967,0.95,1.000000,3.604818,-0.157555
185,e152053,6.3.9,A man entered the room with a basket of laundr...,"[a, man, entered, the, room, with, a, basket, ...",78,-0.527309,0.45,0.523810,-0.065944,-0.092689
186,e152053,3.1.3,A man walks into what appears to be an apartme...,"[a, man, walks, into, what, appears, to, be, a...",182,0.497638,0.75,0.947368,2.294346,-0.472683


In [30]:
px.scatter(memory_df.dropna(axis=0), x='filtered_length_scaled', 
           marginal_x='histogram', marginal_y='histogram',
           y='dprime', 
        #    color='sub',
           hover_data=['sub', 'bias', 'movie'], trendline='ols')

In [28]:
px.scatter(memory_df.dropna(axis=0), x='bias', 
           marginal_x='histogram',
           marginal_y='histogram',
           y='dprime', 
       #     color='sub',
           hover_data=['sub', 'bias'], trendline='ols')

In [75]:
memory_df[(memory_df['sub'] == 'e152022') & (memory_df['movie'] == '3.1.3')].recall_raw

66    This actor came into the kitchen, retrieved a ...
Name: recall_raw, dtype: object

In [76]:
memory_df.to_csv('memory_df.csv')

# Segmentation

## Load counterbalance, e152003 <-> sub-01 mapping

In [2]:
counter_balance = pd.read_excel(r"C:\Users\nguye\Box\DCL_ARCHIVE\Documents\Events\exp152_fMRIneuralmechanisms\forms&instructions\e152_counterbalance.xls", sheet_name='OA Participants')
# convert column VC##### and fmriPrepNumber to a dictionary
counter_balance_dict = counter_balance.set_index('VC#####').to_dict()['fmriPrep Number']
# convert fmriPrepNumber column and VC##### to a dictionary
counter_balance_dict2 = counter_balance.set_index('fmriPrep Number').to_dict()['VC#####']

In [3]:
counter_balance_dict2

{nan: 'e152048',
 'sub-01': 'e152003',
 'sub-02': 'e152004',
 'sub-03': 'e152007',
 'sub-04': 'e152008',
 'sub-05': 'e152009',
 'sub-06': 'e152010',
 'sub-07': 'e152011',
 'sub-08': 'e152013',
 'sub-09': 'e152014',
 'sub-10': 'e152015',
 'sub-11': 'e152016',
 'sub-12': 'e152017',
 'sub-13': 'e152018',
 'sub-14': 'e152019',
 'sub-15': 'e152020',
 'sub-16': 'e152021',
 'sub-17': 'e152022',
 'sub-18': 'e152023',
 'sub-19': 'e152024',
 'sub-20': 'e152025',
 'sub-21': 'e152026',
 'sub-22': 'e152027',
 'sub-23': 'e152028',
 'sub-24': 'e152029',
 'sub-25': 'e152030',
 'sub-26': 'e152031',
 'sub-27': 'e152032',
 'sub-28': 'e152033',
 'sub-29': 'e152034',
 'sub-30': 'e152035',
 'sub-31': 'e152036',
 'sub-32': 'e152037',
 'sub-33': 'e152038',
 'sub-34': 'e152039',
 'sub-35': 'e152040',
 'sub-36': 'e152041',
 'sub-37': 'e152042',
 'sub-38': 'e152043',
 'sub-39': 'e152044',
 'sub-40': 'e152045',
 'sub-41': 'e152046',
 'sub-42': 'e152047',
 'sub-43': 'e152049',
 'sub-44': 'e152050',
 'sub-45': 'e15

In [4]:
counter_balance_dict

{'e152001': nan,
 'e152002': nan,
 'e152003': 'sub-01',
 'e152004': 'sub-02',
 'e152005': nan,
 'e152006': nan,
 'e152007': 'sub-03',
 'e152008': 'sub-04',
 'e152009': 'sub-05',
 'e152010': 'sub-06',
 'e152011': 'sub-07',
 'e152012': nan,
 'e152013': 'sub-08',
 'e152014': 'sub-09',
 'e152015': 'sub-10',
 'e152016': 'sub-11',
 'e152017': 'sub-12',
 'e152018': 'sub-13',
 'e152019': 'sub-14',
 'e152020': 'sub-15',
 'e152021': 'sub-16',
 'e152022': 'sub-17',
 'e152023': 'sub-18',
 'e152024': 'sub-19',
 'e152025': 'sub-20',
 'e152026': 'sub-21',
 'e152027': 'sub-22',
 'e152028': 'sub-23',
 'e152029': 'sub-24',
 'e152030': 'sub-25',
 'e152031': 'sub-26',
 'e152032': 'sub-27',
 'e152033': 'sub-28',
 'e152034': 'sub-29',
 'e152035': 'sub-30',
 'e152036': 'sub-31',
 'e152037': 'sub-32',
 'e152038': 'sub-33',
 'e152039': 'sub-34',
 'e152040': 'sub-35',
 'e152041': 'sub-36',
 'e152042': 'sub-37',
 'e152043': 'sub-38',
 'e152044': 'sub-39',
 'e152045': 'sub-40',
 'e152046': 'sub-41',
 'e152047': '

## Process data from Scratch from raw data

In [5]:
# Tan 05/03/2023: This list is modified from Sophie's updated the list of participants, which was updated last on July 11.2022
cf_subjects = ['e152003','e152004','e152007','e152009','e152010','e152014','e152016',
               'e152018','e152020v2', 'e152022','e152024','e152028v2','e152030','e152032','e152034','e152036','e152037v2','e152038','e152042',
               'e152045','e152047','e152050','e152052']
fc_subjects = ['e152008','e152011','e152013','e152015v4', 'e152017','e152019',
               'e152021','e152023','e152025','e152026v2','e152027v2','e152031','e152033','e152035','e152039','e152040','e152041',
               'e152044','e152046','e152049','e152051', 'e152053']
# note that for e153043, fine segmentation is in e152043_exp152_session_2_2022_Apr_11_1247.csv but coarse segmentation is in e152043v2_exp152_session_2_2022_Apr_15_1158.csv. Similar things for e152029 => need workarounds for these participants.
# Thus, we should expect only 45 participants here.
len(cf_subjects), len(fc_subjects)

(23, 22)

In [6]:
def check_in_cf_fc(s2file):
    sub = os.path.basename(s2file).split('_')[0]
    if sub in cf_subjects + fc_subjects:
        return True
    else:
        return False


# Load recall/recognition output files from PsychoPy:
s2files = glob(
    r'C:\Users\nguye\Box\DCL_ARCHIVE\Documents\Events\exp152_fMRIneuralmechanisms\exp152_Session_2\data\e*.csv')

# there are duplicates of the same participant, and we want to use the most updated version only, which is in the cf/fc lists.
s2files = [s2file for s2file in s2files if check_in_cf_fc(s2file)]
len(s2files)

45

In [7]:
segdf = pd.DataFrame()
for s2file in s2files:
    try:
        sub = os.path.basename(s2file).split('_')[0]
        if sub in fc_subjects:
            df=pd.read_csv(s2file)
            # select columns and rows:
            # some participants don't have both segment_coarse.rt and segment_fine.rt, so we need to check for that
            if 'segment_coarse.rt' not in df.columns:
                df['segment_coarse.rt'] = np.nan
            if 'segment_fine.rt' not in df.columns:
                df['segment_fine.rt'] = np.nan
            df = df[['movie','task','segment_coarse.rt','segment_fine.rt']]
            df['order'] = 'fine_coarse'
            df = df[(df.task == 'segment') & (df.movie != 'legos2')]
            df['sub'] = sub
            segdf = pd.concat([segdf, df], ignore_index=True) 
        elif sub in cf_subjects:
            #s2file='/Users/bezdek/Library/CloudStorage/Box-Box/DCL_ARCHIVE/Documents/Events/exp152_fMRIneuralmechanisms/exp152_Session_2/data/e152037_exp152_session_2_2022_Mar_12_1229.csv'
            df = pd.read_csv(s2file)
            # select columns and rows:
            # some participants don't have both segment_2.rt and segment.rt, so we need to check for that
            if 'segment_2.rt' not in df.columns:
                df['segment_2.rt'] = np.nan
            if 'segment.rt' not in df.columns:
                df['segment.rt'] = np.nan
            df = df[['movie','task','segment_2.rt','segment.rt']]
            df = df.rename(columns={'segment_2.rt':'segment_coarse.rt','segment.rt':'segment_fine.rt'})
            df['order'] = 'coarse_fine'
            df = df[(df.task == 'segment') & (df.movie != 'legos2')]
            df['sub'] = sub
            segdf = pd.concat([segdf, df], ignore_index=True) 
    except Exception as e:
        print(e, s2file)

In [8]:
segdf.groupby(['sub']).count()

Unnamed: 0_level_0,movie,task,segment_coarse.rt,segment_fine.rt,order
sub,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
e152003,4,4,0,4,4
e152004,4,4,0,4,4
e152007,8,8,4,4,8
e152008,8,8,4,4,8
e152009,8,8,4,4,8
e152010,8,8,4,4,8
e152011,8,8,4,4,8
e152013,8,8,4,4,8
e152014,8,8,4,4,8
e152015v4,4,4,0,4,4


In [9]:
## add e152043, e153043 complete fine segment and the file is in fc format, however, the v2 with coarse segmentation is in cf format.
df = pd.read_csv("../exp152_Session_2/data/e152043_exp152_session_2_2022_Apr_11_1247.csv")
# select columns and rows:
df = df[['movie','task','segment_fine.rt']]
df['order'] = 'fine_coarse'
df = df[(df.task == 'segment') & (df.movie != 'legos2')]
df['sub'] = 'e152043'
# add to the segmentation_df, be sure to align by columns names
segdf = pd.concat([segdf, df], ignore_index=True)

df_coarse = pd.read_csv("../exp152_Session_2/data/e152043v2_exp152_session_2_2022_Apr_15_1158.csv")
# select columns and rows:
df_coarse = df_coarse[['movie','task','segment_2.rt']]
df_coarse = df_coarse.rename(columns={'segment_2.rt':'segment_coarse.rt'})
df_coarse['order'] = 'coarse_fine'
df_coarse = df_coarse[(df_coarse.task == 'segment') & (df_coarse.movie != 'legos2')]
df_coarse['sub'] = 'e152043'
# add to the segmentation_df, be sure to align by columns names
segdf = pd.concat([segdf, df_coarse], ignore_index=True)

segdf

Unnamed: 0,movie,task,segment_coarse.rt,segment_fine.rt,order,sub
0,1.2.3,segment,,"[-3.0535317070025485, 82.44245017500361, 153.6...",coarse_fine,e152003
1,6.3.9,segment,,"[-6.316542810003739, -1.676559024002927, 284.5...",coarse_fine,e152003
2,3.1.3,segment,,"[-8.52994464999938, -7.521954813004413, -5.097...",coarse_fine,e152003
3,2.4.1,segment,,"[-6.891140856001584, -6.715163246000884, -1.59...",coarse_fine,e152003
4,1.2.3,segment,,"[-3.6784942579979543, 85.08150436398864, 153.2...",coarse_fine,e152004
...,...,...,...,...,...,...
343,2.4.1,segment,,"[32.867885480999576, 89.6438594329993, 153.499...",fine_coarse,e152043
344,1.2.3,segment,"[35.97722811299991, 121.68124132999992, 173.92...",,coarse_fine,e152043
345,6.3.9,segment,"[171.68109439299997, 246.07310757899995, 324.4...",,coarse_fine,e152043
346,3.1.3,segment,"[171.60011225899962, 509.42413506699995, 568.3...",,coarse_fine,e152043


In [10]:
## add e152029, e153029 complete coarse segment and the file is in cf format and the v2 with fine segmentation is in cf format.
df = pd.read_csv("../exp152_Session_2/data/e152029_exp152_session_2_2021_Dec_18_1018.csv")
# select columns and rows:
df = df[['movie','task','segment_2.rt']]
df = df.rename(columns={'segment_2.rt':'segment_coarse.rt'})
df['order'] = 'coarse_fine'
df = df[(df.task == 'segment') & (df.movie != 'legos2')]
df['sub'] = 'e152029'
# add to the segmentation_df, be sure to align by columns names
segdf = pd.concat([segdf, df], ignore_index=True)

df_coarse = pd.read_csv("../exp152_Session_2/data/e152029v2_exp152_session_2_2021_Dec_18_1153.csv")
# select columns and rows:
df_coarse = df_coarse[['movie','task','segment.rt']]
df_coarse = df_coarse.rename(columns={'segment.rt':'segment_fine.rt'})
df_coarse['order'] = 'coarse_fine'
df_coarse = df_coarse[(df_coarse.task == 'segment') & (df_coarse.movie != 'legos2')]
df_coarse['sub'] = 'e152029'
# add to the segmentation_df, be sure to align by columns names
segdf = pd.concat([segdf, df_coarse], ignore_index=True)

segdf

Unnamed: 0,movie,task,segment_coarse.rt,segment_fine.rt,order,sub
0,1.2.3,segment,,"[-3.0535317070025485, 82.44245017500361, 153.6...",coarse_fine,e152003
1,6.3.9,segment,,"[-6.316542810003739, -1.676559024002927, 284.5...",coarse_fine,e152003
2,3.1.3,segment,,"[-8.52994464999938, -7.521954813004413, -5.097...",coarse_fine,e152003
3,2.4.1,segment,,"[-6.891140856001584, -6.715163246000884, -1.59...",coarse_fine,e152003
4,1.2.3,segment,,"[-3.6784942579979543, 85.08150436398864, 153.2...",coarse_fine,e152004
...,...,...,...,...,...,...
351,2.4.1,segment,"[223.40938365300008, 334.8414090819997, 404.28...",,coarse_fine,e152029
352,1.2.3,segment,,"[56.48353854800007, 80.93153294100011, 152.203...",coarse_fine,e152029
353,6.3.9,segment,,"[57.32472520800002, 201.42077444300048, 279.60...",coarse_fine,e152029
354,3.1.3,segment,,"[164.76395759499974, 335.3319954260005, 410.37...",coarse_fine,e152029


In [11]:
segdf.groupby(['sub']).count()

Unnamed: 0_level_0,movie,task,segment_coarse.rt,segment_fine.rt,order
sub,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
e152003,4,4,0,4,4
e152004,4,4,0,4,4
e152007,8,8,4,4,8
e152008,8,8,4,4,8
e152009,8,8,4,4,8
e152010,8,8,4,4,8
e152011,8,8,4,4,8
e152013,8,8,4,4,8
e152014,8,8,4,4,8
e152015v4,4,4,0,4,4


In [12]:
segdf['sub'] = segdf['sub'].apply(lambda x: x[:7])

In [13]:
# use counter_balance to map the sub to sub_id
segdf['sub_id'] = segdf['sub'].apply(lambda x: counter_balance_dict[x])
segdf

Unnamed: 0,movie,task,segment_coarse.rt,segment_fine.rt,order,sub,sub_id
0,1.2.3,segment,,"[-3.0535317070025485, 82.44245017500361, 153.6...",coarse_fine,e152003,sub-01
1,6.3.9,segment,,"[-6.316542810003739, -1.676559024002927, 284.5...",coarse_fine,e152003,sub-01
2,3.1.3,segment,,"[-8.52994464999938, -7.521954813004413, -5.097...",coarse_fine,e152003,sub-01
3,2.4.1,segment,,"[-6.891140856001584, -6.715163246000884, -1.59...",coarse_fine,e152003,sub-01
4,1.2.3,segment,,"[-3.6784942579979543, 85.08150436398864, 153.2...",coarse_fine,e152004,sub-02
...,...,...,...,...,...,...,...
351,2.4.1,segment,"[223.40938365300008, 334.8414090819997, 404.28...",,coarse_fine,e152029,sub-24
352,1.2.3,segment,,"[56.48353854800007, 80.93153294100011, 152.203...",coarse_fine,e152029,sub-24
353,6.3.9,segment,,"[57.32472520800002, 201.42077444300048, 279.60...",coarse_fine,e152029,sub-24
354,3.1.3,segment,,"[164.76395759499974, 335.3319954260005, 410.37...",coarse_fine,e152029,sub-24


## Build up from Sophie's data

In [14]:
# this segmentation.csv file was created by Sophie Su on 07/11/2022 for all subjects (the last subject were collected on 05/28/2022), use it. However, there are two subjects not there: e153053 (sub-47), e152043 (sub-38) => add them manually. Moreover, segmentation data for e152003 and e152004 should be fine instead of coarse => fix it manually.
segmentation_df = pd.read_csv("./segmentation.csv")
segmentation_df["sub"].unique()

array(['e152005', 'e152006', 'e152007', 'e152008', 'e152009', 'e152010',
       'e152011', 'e152013', 'e152014', 'e152016', 'e152017', 'e152018',
       'e152021', 'e152022', 'e152023', 'e152024', 'e152025', 'e152026',
       'e152028', 'e152030', 'e152031', 'e152032', 'e152033', 'e152034',
       'e152035', 'e152036', 'e152037', 'e152038', 'e152039', 'e152040',
       'e152041', 'e152042', 'e152044', 'e152045', 'e152046', 'e152047',
       'e152049', 'e152050', 'e152051', 'e152052', 'e152015', 'e152019',
       'e152020', 'e152027', 'e152029', 'e152003', 'e152004'],
      dtype=object)

In [15]:
# remove 'e152005' and 'e152006' because they do not have fMRI data
segmentation_df = segmentation_df[segmentation_df['sub'] != 'e152005']
segmentation_df = segmentation_df[segmentation_df['sub'] != 'e152006']

In [16]:
# segmentation for e152003 and e152004 should be fine instead of coarse, which means that there is no data in column segment_coarse.rt, so we need to replace it with segment_fine.rt and vice versa
segmentation_df.loc[segmentation_df["sub"].isin(["e152003", "e152004"]), ["segment_coarse.rt", "segment_fine.rt"]] = segmentation_df.loc[segmentation_df["sub"].isin(["e152003", "e152004"]), ["segment_fine.rt", "segment_coarse.rt"]].values
# also, we need to change the order column to fine
segmentation_df.loc[segmentation_df["sub"].isin(["e152003", "e152004"]), "order"] = "fine"

In [17]:
## add e152053, in fine-coarse format, this person didn't complete coarse segmentation
df = pd.read_csv("../exp152_Session_2/data/e152053_exp152_session_2_2022_Jun_04_1513.csv")
# select columns and rows:
df = df[['movie', 'task', 'segment_fine.rt']]
df['order']='fine_coarse'
df = df[(df.task == 'segment') & (df.movie != 'legos2')]
df['sub'] = 'e152053'
# add to the segmentation_df, be sure to align by columns names
segmentation_df = pd.concat([segmentation_df, df], ignore_index=True)
segmentation_df

Unnamed: 0,movie,task,segment_coarse.rt,segment_fine.rt,order,sub
0,1.2.3,segment,"[81.22465172300144, 152.4806256030024, 275.136...",,coarse_fine,e152007
1,6.3.9,segment,"[201.5615719640009, 275.17759370800195, 303.28...",,coarse_fine,e152007
2,3.1.3,segment,"[22.026569180998194, 168.62662580000324, 342.4...",,coarse_fine,e152007
3,2.4.1,segment,"[216.21561364099762, 329.53564650000044, 359.3...",,coarse_fine,e152007
4,1.2.3,segment,,"[56.83075998399727, 80.3107490820039, 93.07074...",coarse_fine,e152007
...,...,...,...,...,...,...
343,2.4.1,segment,,"[240.3569225300016, 314.90095041399763, 401.16...",fine,e152004
344,1.2.3,segment,,"[9.209173883002222, 55.849169659002655, 79.873...",fine_coarse,e152053
345,6.3.9,segment,,"[57.332311460999335, 270.49236585199833, 307.1...",fine_coarse,e152053
346,3.1.3,segment,,"[20.217718727002648, 72.72971740900175, 131.64...",fine_coarse,e152053


In [18]:
## add e152043, e153043 complete fine segment and the file is in fc format, however, the v2 with coarse segmentation is in cf format.
df = pd.read_csv("../exp152_Session_2/data/e152043_exp152_session_2_2022_Apr_11_1247.csv")
# select columns and rows:
df = df[['movie','task','segment_fine.rt']]
df['order'] = 'fine_coarse'
df = df[(df.task == 'segment') & (df.movie != 'legos2')]
df['sub'] = 'e152043'
# add to the segmentation_df, be sure to align by columns names
segmentation_df = pd.concat([segmentation_df, df], ignore_index=True)

df_coarse = pd.read_csv("../exp152_Session_2/data/e152043v2_exp152_session_2_2022_Apr_15_1158.csv")
# select columns and rows:
df_coarse = df_coarse[['movie','task','segment_2.rt']]
df_coarse = df_coarse.rename(columns={'segment_2.rt':'segment_coarse.rt'})
df_coarse['order'] = 'fine_coarse'
df_coarse = df_coarse[(df_coarse.task == 'segment') & (df_coarse.movie != 'legos2')]
df_coarse['sub'] = 'e152043'
# add to the segmentation_df, be sure to align by columns names
segmentation_df = pd.concat([segmentation_df, df_coarse], ignore_index=True)

segmentation_df

Unnamed: 0,movie,task,segment_coarse.rt,segment_fine.rt,order,sub
0,1.2.3,segment,"[81.22465172300144, 152.4806256030024, 275.136...",,coarse_fine,e152007
1,6.3.9,segment,"[201.5615719640009, 275.17759370800195, 303.28...",,coarse_fine,e152007
2,3.1.3,segment,"[22.026569180998194, 168.62662580000324, 342.4...",,coarse_fine,e152007
3,2.4.1,segment,"[216.21561364099762, 329.53564650000044, 359.3...",,coarse_fine,e152007
4,1.2.3,segment,,"[56.83075998399727, 80.3107490820039, 93.07074...",coarse_fine,e152007
...,...,...,...,...,...,...
351,2.4.1,segment,,"[32.867885480999576, 89.6438594329993, 153.499...",fine_coarse,e152043
352,1.2.3,segment,"[35.97722811299991, 121.68124132999992, 173.92...",,fine_coarse,e152043
353,6.3.9,segment,"[171.68109439299997, 246.07310757899995, 324.4...",,fine_coarse,e152043
354,3.1.3,segment,"[171.60011225899962, 509.42413506699995, 568.3...",,fine_coarse,e152043


In [19]:
segmentation_df.groupby('sub').count()

Unnamed: 0_level_0,movie,task,segment_coarse.rt,segment_fine.rt,order
sub,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
e152003,4,4,0,4,4
e152004,4,4,0,4,4
e152007,8,8,4,4,8
e152008,8,8,4,4,8
e152009,8,8,4,4,8
e152010,8,8,4,4,8
e152011,8,8,4,4,8
e152013,8,8,4,4,8
e152014,8,8,4,4,8
e152015,4,4,0,4,4


In [32]:
segmentation_df.to_csv('segmentation_df_08_15_2023.csv', index=False)

## Compare the two approaches

In [20]:
def expand_list(segmentation_df):
    segmentation_df_long = pd.DataFrame()
    i = 0
    while i < len(segmentation_df):
        test = segmentation_df.iloc[i]
        if pd.notna(test['segment_fine.rt']):
            bds = ast.literal_eval(test['segment_fine.rt'])
            bds = [x for x in bds if x >= 0]
            testdf = pd.DataFrame(({'boundaries':bds}))
            testdf['boundary_id'] = list(range(1, len(bds)+1))
            testdf['grain'] = 'fine'
            testdf['sub'] = test['sub']
            testdf['order'] = test['order']
            testdf['movie'] = test['movie']   
            segmentation_df_long = pd.concat([segmentation_df_long,testdf],ignore_index=True)
        elif pd.notna(test['segment_coarse.rt']):
            bds = ast.literal_eval(test['segment_coarse.rt'])
            bds = [x for x in bds if x >= 0]
            testdf = pd.DataFrame(({'boundaries':bds}))
            testdf['boundary_id'] = list(range(1, len(bds)+1))
            testdf['grain'] = 'coarse'
            testdf['sub'] = test['sub']
            testdf['order'] = test['order']
            testdf['movie'] = test['movie'] 
            segmentation_df_long = pd.concat([segmentation_df_long,testdf],ignore_index=True)
        i+=1
    return segmentation_df_long

In [21]:
segdf_long = expand_list(segdf)
segdf_long

Unnamed: 0,boundaries,boundary_id,grain,sub,order,movie
0,82.442450,1,fine,e152003,coarse_fine,1.2.3
1,153.634449,2,fine,e152003,coarse_fine,1.2.3
2,275.714474,3,fine,e152003,coarse_fine,1.2.3
3,346.298503,4,fine,e152003,coarse_fine,1.2.3
4,440.986492,5,fine,e152003,coarse_fine,1.2.3
...,...,...,...,...,...,...
2807,215.582213,1,fine,e152029,coarse_fine,2.4.1
2808,321.686243,2,fine,e152029,coarse_fine,2.4.1
2809,402.142240,3,fine,e152029,coarse_fine,2.4.1
2810,533.038285,4,fine,e152029,coarse_fine,2.4.1


In [22]:
segmentation_df_long = expand_list(segmentation_df)
segmentation_df_long

Unnamed: 0,boundaries,boundary_id,grain,sub,order,movie
0,81.224652,1,coarse,e152007,coarse_fine,1.2.3
1,152.480626,2,coarse,e152007,coarse_fine,1.2.3
2,275.136678,3,coarse,e152007,coarse_fine,1.2.3
3,341.248664,4,coarse,e152007,coarse_fine,1.2.3
4,429.088706,5,coarse,e152007,coarse_fine,1.2.3
...,...,...,...,...,...,...
2807,509.424135,2,coarse,e152043,fine_coarse,3.1.3
2808,568.336149,3,coarse,e152043,fine_coarse,3.1.3
2809,190.118582,1,coarse,e152043,fine_coarse,2.4.1
2810,227.550553,2,coarse,e152043,fine_coarse,2.4.1


In [23]:
# sort each dataframe by sub and movie and boundary_id
segdf_long = segdf_long.sort_values(by=['sub', 'movie', 'boundary_id'])
segmentation_df_long = segmentation_df_long.sort_values(by=['sub', 'movie', 'boundary_id'])

In [25]:
segmentation_df_long

Unnamed: 0,boundaries,boundary_id,grain,sub,order,movie
2660,82.442450,1,fine,e152003,fine,1.2.3
2661,153.634449,2,fine,e152003,fine,1.2.3
2662,275.714474,3,fine,e152003,fine,1.2.3
2663,346.298503,4,fine,e152003,fine,1.2.3
2664,440.986492,5,fine,e152003,fine,1.2.3
...,...,...,...,...,...,...
2717,270.492366,2,fine,e152053,fine_coarse,6.3.9
2718,307.124340,3,fine,e152053,fine_coarse,6.3.9
2719,417.764381,4,fine,e152053,fine_coarse,6.3.9
2720,525.004402,5,fine,e152053,fine_coarse,6.3.9


In [26]:
segdf_long

Unnamed: 0,boundaries,boundary_id,grain,sub,order,movie
0,82.442450,1,fine,e152003,coarse_fine,1.2.3
1,153.634449,2,fine,e152003,coarse_fine,1.2.3
2,275.714474,3,fine,e152003,coarse_fine,1.2.3
3,346.298503,4,fine,e152003,coarse_fine,1.2.3
4,440.986492,5,fine,e152003,coarse_fine,1.2.3
...,...,...,...,...,...,...
2663,270.492366,2,fine,e152053,fine_coarse,6.3.9
2664,307.124340,3,fine,e152053,fine_coarse,6.3.9
2665,417.764381,4,fine,e152053,fine_coarse,6.3.9
2666,525.004402,5,fine,e152053,fine_coarse,6.3.9


In [31]:
assert sum(np.array(segdf_long['boundaries']) == np.array(segdf_long['boundaries'])) == len(segmentation_df_long)
sum(np.array(segdf_long['boundaries']) == np.array(segdf_long['boundaries'])), len(segdf_long)

(2812, 2812)