In [98]:
!pip install dropbox sentence_transformers



In [99]:
import re
from glob import glob
import os
from pathlib import Path
from tqdm import tqdm

import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
import seaborn as sns

import dropbox, getpass
from sentence_transformers import SentenceTransformer

In [100]:
# Lift pandas' display truncation for both columns and rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [101]:
# top_dir = '/scratch4/lisik3/emcmaho7/SIfMRI_modeling'
top_dir = './'

# Local data folders and the remote folder that holds the raw files.
raw_dir = f'{top_dir}/data/raw'
interim_dir = f'{top_dir}/data/interim'
remote_path = '/projects/SI_fmri/SIfMRI_modeling/data/raw'

# Where this notebook writes its tables and figures.
out_path = f'{interim_dir}/CaptionData'
figures_dir = f'{top_dir}/reports/figures/CaptionData'

# Make sure every local folder exists before anything is read or written.
for local_dir in (out_path, raw_dir, figures_dir):
    Path(local_dir).mkdir(exist_ok=True, parents=True)

In [102]:
# Catch-trial videos: their captions are used to screen participants and the
# videos themselves are removed from the final caption data.
catch_trials = [
    'flickr-0-5-7-5-4-0-7-0-2605754070_54.mp4',
    'yt-dfOVWymr76U_103.mp4',
]

In [None]:
def download_dropbox_folder(dbx, remote_folder, local_folder):
    """Download every file directly inside a Dropbox folder to a local folder.

    Parameters
    ----------
    dbx : dropbox.Dropbox
        Authenticated Dropbox client.
    remote_folder : str
        Dropbox path of the folder to list.
    local_folder : str
        Local directory the files are written to (created if missing).

    Notes
    -----
    Local file names are taken from the lower-cased Dropbox path, as before,
    so files already downloaded by earlier runs are overwritten rather than
    duplicated under a differently-cased name.
    """
    Path(local_folder).mkdir(exist_ok=True, parents=True)

    # files_list_folder returns one page of results at a time; keep following
    # the cursor while has_more is set so that large folders are not truncated.
    listing = dbx.files_list_folder(path=remote_folder)
    entries = list(listing.entries)
    while listing.has_more:
        listing = dbx.files_list_folder_continue(listing.cursor)
        entries.extend(listing.entries)

    # Only file entries can be downloaded; skip sub-folders and deleted entries.
    file_entries = [entry for entry in entries if isinstance(entry, dropbox.files.FileMetadata)]
    for entry in tqdm(file_entries, total=len(file_entries)):
        file = entry.path_lower.split('/')[-1]
        dbx.files_download_to_file(f'{local_folder}/{file}', entry.path_lower)


# The token is prompted for interactively so it is never stored in the notebook.
personal_access_token = getpass.getpass('Enter your Personal Access Token: ')
dbx = dropbox.Dropbox(personal_access_token)

# Toggle which remote sub-folders are fetched (videos are off by default).
folders_to_download = {'captions': True, 'videos': False, 'annotations': True}
for folder_name, enabled in folders_to_download.items():
    if enabled:
        download_dropbox_folder(dbx, f'{remote_path}/{folder_name}', f'{raw_dir}/{folder_name}')

Enter your Personal Access Token: ··········


 68%|██████▊   | 108/158 [00:50<00:21,  2.27it/s]

## Load and clean data

In [None]:
# Read every raw caption file into one long table. The file name encodes the
# subject id, the condition, and a trailing field stored as `date`.
filename_pattern = re.compile(r'sub-(.*?)_condition-(.*?)_(.*)\.csv')

all_sub_data = []
empty_files = 0
# Sort the paths: glob order is filesystem dependent, and both the caption
# numbering and the keep='last' de-duplication downstream depend on row order.
for path in tqdm(sorted(glob(f'{raw_dir}/captions/*.csv'))):
    if os.path.getsize(path) == 0:
        # Empty files carry no captions; just count them.
        empty_files += 1
        continue

    # Match on the file name only so directory names cannot interfere.
    match = filename_pattern.search(os.path.basename(path))
    if match is None:
        # Without this guard the metadata of the previously parsed file would
        # be silently attached to this file's rows.
        print(f'Skipping file with unexpected name: {path}')
        continue
    sub_id, condition, date = match.groups()

    df = pd.read_csv(path, header=None)
    df.columns = ['url', 'caption']
    # Keep only the https URL, then take its final path component as the video name.
    df['url'] = df['url'].str.extract("(https://[^']+)")
    df['video_name'] = df['url'].str.extract(r'/([^/]+\.mp4)')[0]
    df[['sub_id', 'condition', 'date']] = sub_id, condition, date
    all_sub_data.append(df)

if not all_sub_data:
    raise ValueError(f'No non-empty caption files found in {raw_dir}/captions')
all_sub_data = pd.concat(all_sub_data)

In [None]:
# Report how many caption files had a size of zero bytes.
empty_file_report = f'Number of empty files {empty_files}'
print(empty_file_report)

In [None]:
# Split subjects by how many distinct videos they captioned.
n_videos_expected = 12

# Count the distinct videos per subject once and broadcast the count to each
# row. nunique ignores missing video names, whereas np.unique cannot sort a
# mix of strings and NaN.
n_videos_per_row = all_sub_data.groupby('sub_id')['video_name'].transform('nunique').to_numpy()

incomplete_data = all_sub_data[n_videos_per_row < n_videos_expected]
data = all_sub_data[n_videos_per_row == n_videos_expected]
extra_data = all_sub_data[n_videos_per_row > n_videos_expected]

# For subjects with more videos than expected, keep only the last caption
# given for each video.
extra_data = extra_data.drop_duplicates(subset=['sub_id', 'video_name'], keep='last')

# drop=True: the old index only holds per-file row numbers, so it is not
# worth keeping as an `index` column.
data = pd.concat([data, extra_data]).reset_index(drop=True)

All the subjects below that have incomplete data returned the HIT (manually confirmed).

In [None]:
# Subjects whose data is incomplete.
returned_subs = list(incomplete_data.sub_id.unique())
n_returned_subs = len(returned_subs)
print(f'Number of incomplete subjects {n_returned_subs}')
print(returned_subs)

In [None]:
# Subjects that remain in `data`, and the per-condition counts with one row per subject.
all_sub_ids = list(data.sub_id.unique())
n_complete_subs = len(all_sub_ids)
print(f'Number of total complete subjects {n_complete_subs}')

one_row_per_subject = data.drop_duplicates(subset=['sub_id'])
one_row_per_subject.groupby(['condition']).count()

In [None]:
# Sentence-embedding model used to embed the catch-trial captions.
embedding_model_name = 'sentence-transformers/all-MiniLM-L12-v1'
llm = SentenceTransformer(embedding_model_name)

In [None]:
# Screen participants with the catch trials. For each catch video, embed every
# participant's caption, compute each caption's mean correlation distance to
# all other captions, and flag participants whose mean distance is more than
# 2.5 standard deviations above the average.

# One row per catch trial, two panels per row (distance matrix, histogram).
# squeeze=False keeps `axes` two-dimensional even with a single catch trial.
fig, axes = plt.subplots(len(catch_trials), 2, squeeze=False)
bad_subs = []
for ax, catch_trial in zip(axes, catch_trials):
    catch_data = data.loc[data.video_name == catch_trial].reset_index(drop=True)
    captions_list = catch_data.caption.to_list()
    embeddings = llm.encode(captions_list, normalize_embeddings=False)

    pairwise_dist = squareform(pdist(embeddings, metric='correlation'))
    ax[0].imshow(pairwise_dist)
    ax[0].set_title(catch_trial)

    # Exclude only the self-distances on the diagonal. Masking every zero
    # would also discard pairs of participants who wrote identical captions.
    np.fill_diagonal(pairwise_dist, np.nan)
    distance = np.nanmean(pairwise_dist, axis=0)
    dist_mean = distance.mean()
    dist_std = distance.std()
    dist_threshold = dist_mean + (2.5 * dist_std)
    ax[1].hist(distance)
    # axvline spans the full height of the panel whatever the bin counts are.
    ax[1].axvline(x=dist_threshold, color='red')
    ax[1].set_title(catch_trial)

    bad_subs = bad_subs + catch_data.loc[distance > dist_threshold, 'sub_id'].to_list()

# A participant can be flagged on more than one catch trial; list each once.
bad_subs = list(dict.fromkeys(bad_subs))
print(bad_subs)
print(len(bad_subs))
filtered_data = data[(~data['sub_id'].isin(bad_subs)) & (~data['video_name'].isin(catch_trials))].reset_index(drop=True)
print(sorted(filtered_data.condition.unique()))
fig.savefig(f'{figures_dir}/data_quality_viz.pdf')

In [None]:
# Participants that survived the catch-trial screen, and their per-condition counts.
n_good_participants = filtered_data.sub_id.nunique()
print(f'Number of good participants {n_good_participants}')

good_subject_rows = filtered_data.drop_duplicates(subset=['sub_id'])
good_subject_rows.groupby(['condition']).count()

#### Reorganize the captions and save

In [None]:
# Reshape to one row per video with columns caption01, caption02, ...
# .copy() makes caption_df independent of filtered_data, so adding a column
# below does not write into a slice (SettingWithCopyWarning).
caption_df = filtered_data[['video_name', 'caption']].copy()

# Number the captions within each video, starting at 1 and zero-padded to
# two digits so the resulting column names sort correctly.
caption_number = caption_df.groupby('video_name').cumcount() + 1
caption_df['n_caption'] = 'caption' + caption_number.astype('str').str.zfill(2)

captions = caption_df.pivot(columns='n_caption', index='video_name', values='caption')
captions.to_csv(f'{out_path}/captions.csv')

In [None]:
# Videos that ended up with fewer than five non-missing captions.
n_captions_per_video = captions.notna().to_numpy().sum(axis=1)
too_few_captions = n_captions_per_video < 5
missing_captions = captions.loc[too_few_captions].reset_index().video_name.to_list()
print(missing_captions)

#### Merge with the ratings

In [None]:
# Load the ratings, leaving out three columns that are not used here.
unused_ratings = ['cooperation', 'dominance', 'intimacy']
annotations = pd.read_csv(f'{raw_dir}/annotations/annotations.csv').drop(columns=unused_ratings)

# Attach the wide caption table, then collapse the captionNN columns into a
# single list-valued `captions` column (missing captions are dropped).
cap_annot = annotations.merge(captions.reset_index(), on='video_name')
caption_columns = [col for col in cap_annot.columns if col.startswith('caption')]
cap_annot['captions'] = cap_annot[caption_columns].apply(lambda row: row.dropna().tolist(), axis=1)
cap_annot = cap_annot.drop(columns=caption_columns)

# Prefix every rating column with 'rating-' and replace spaces by underscores;
# 'transitivity' is renamed to 'rating-object' instead.
rename_map = {}
for col in annotations.columns:
    if 'video_name' in col:
        continue
    rename_map[col] = 'rating-' + col.replace(' ', '_')
rename_map['transitivity'] = 'rating-object'
cap_annot = cap_annot.rename(columns=rename_map)

#### Calculate the noise ceiling of the ratings

In [None]:
def corr(x, y):
    """Pearson correlation between `x` and `y` over pairwise-complete values.

    A position is used only if both `x` and `y` are non-NaN there, so the
    means, the numerator, and the denominator are all computed over the same
    set of observations. Without NaNs this is the ordinary Pearson r.

    Parameters
    ----------
    x, y : array-like of float
        Values to correlate; must have the same shape (or be broadcastable).

    Returns
    -------
    float
        The correlation coefficient, or NaN when there are no complete pairs
        or when either input has zero variance over the complete pairs.
    """
    x, y = np.broadcast_arrays(np.asarray(x, dtype=float), np.asarray(y, dtype=float))

    # Keep only the positions where both values are present.
    valid = ~(np.isnan(x) | np.isnan(y))
    if not valid.any():
        return np.nan
    x_valid = x[valid]
    y_valid = y[valid]

    x_m = x_valid - x_valid.mean()
    y_m = y_valid - y_valid.mean()
    numer = np.sum(x_m * y_m)
    denom = np.sqrt(np.sum(x_m * x_m) * np.sum(y_m * y_m))
    if denom != 0:
        return numer / denom
    # Zero variance in at least one input: the correlation is undefined.
    return np.nan

def noise_ceiling(rows):
    """Split-half reliability of the ratings in `rows`.

    The mean `likert_response` per video is computed separately for the rows
    flagged `even` and for the remaining rows, and the two sets of per-video
    means are correlated with `corr`.

    Parameters
    ----------
    rows : pandas.DataFrame
        Must contain `video_name`, `likert_response`, and a boolean `even` column.

    Returns
    -------
    float
        Correlation between the even-half and odd-half per-video means.
    """
    even_mean = rows[rows.even].groupby('video_name').likert_response.mean()
    odd_mean = rows[~rows.even].groupby('video_name').likert_response.mean()

    # Align the two halves on video_name instead of relying on position: a
    # video that has ratings in only one half would otherwise shift every
    # later pair (or make the two arrays different lengths).
    halves = pd.concat([even_mean, odd_mean], axis=1, keys=['even', 'odd'], join='inner').sort_index()
    return corr(halves['even'].to_numpy(), halves['odd'].to_numpy())

In [None]:
# Load the ratings per subject and drop the questions that are not analysed.
individ_rating = pd.read_csv(f'{raw_dir}/annotations/individual_subject_ratings.csv')
excluded_questions = ['dominance', 'cooperation', 'relation']
is_excluded = individ_rating['question_name'].isin(excluded_questions)
individ_rating = individ_rating[~is_excluded]

# Default renaming: prefix with 'rating-' and replace spaces by underscores.
rename_map = {}
for q in individ_rating.question_name.unique():
    rename_map[q] = 'rating-' + q.replace(' ', '_')

# Manually edit some of the values so that they match the convention in the annotations file.
# NOTE(review): 'joint' -> agent_distance and 'distance' -> joint_action look
# crossed; confirm against the labelling in the raw ratings file before changing.
rename_map['joint'] = 'rating-agent_distance'
rename_map['distance'] = 'rating-joint_action'
rename_map['communicating'] = 'rating-communication'

# The replacement is applied to matching values in every column of the frame.
individ_rating = individ_rating.replace(rename_map)

In [None]:
# Number the ratings within each (question, video) pair, starting at 0, and
# flag the even-numbered ones so the ratings can be split into two halves.
rating_groups = individ_rating.groupby(['question_name', 'video_name'])
individ_rating['rating_num'] = rating_groups.cumcount()
individ_rating['even'] = (individ_rating.rating_num % 2) == 0
individ_rating.head()

In [None]:
# Load the train and test tables from the annotations folder.
annotations_dir = f'{raw_dir}/annotations'
train_data = pd.read_csv(f'{annotations_dir}/train.csv')
test_data = pd.read_csv(f'{annotations_dir}/test.csv')

In [None]:
# Noise ceiling per question, restricted to the videos listed in train_data.
train_videos = train_data.video_name.to_list()
train_ratings = individ_rating[individ_rating.video_name.isin(train_videos)]
train_nc = (
    train_ratings
    .groupby('question_name')
    .apply(noise_ceiling)
    .reset_index()
    .rename(columns={0: 'nc'})
)
train_nc

In [None]:
# Noise ceiling per question, restricted to the videos listed in test_data.
test_videos = test_data.video_name.to_list()
test_ratings = individ_rating[individ_rating.video_name.isin(test_videos)]
test_nc = (
    test_ratings
    .groupby('question_name')
    .apply(noise_ceiling)
    .reset_index()
    .rename(columns={0: 'nc'})
)
test_nc

#### Save the files

In [None]:
# Write one noise-ceiling table per split.
for split_name, nc_table in (('train', train_nc), ('test', test_nc)):
    nc_table.to_csv(f'{out_path}/{split_name}_rating_noise_ceiling.csv', index=False)

In [None]:
# Save the full caption + rating table.
stimulus_file = f'{out_path}/stimulus_data.csv'
cap_annot.to_csv(stimulus_file, index=False)

# Save the same table restricted to the train videos (inner merge on video_name).
train_merged = cap_annot.merge(train_data, on='video_name')
cap_train_only = train_merged.reset_index(drop=True)
cap_train_only.to_csv(f'{out_path}/stimulus_data_train.csv', index=False)

# Save the same table restricted to the test videos.
test_merged = cap_annot.merge(test_data, on='video_name')
cap_test_only = test_merged.reset_index(drop=True)
cap_test_only.to_csv(f'{out_path}/stimulus_data_test.csv', index=False)