In [5]:
# Standard Library Imports
import os
import sys
from subprocess import call, check_output
import json
import time

# Third-Party Library Imports
import numpy as np
import boto3
from botocore import UNSIGNED
from botocore.client import Config
import webdataset as wds
import nibabel as nib
import pickle as pkl
import h5py
from PIL import Image
import matplotlib.pyplot as plt

In [36]:
# Project tag; it is appended to the names of the two working folders below.
proj_name = "NSD"

# Both folders live directly under the current working directory.
temp_dir = f"{os.getcwd()}/temp{proj_name}"  # the folder where the AFNI container will do its work
mni_dir = f"{os.getcwd()}/MNIs{proj_name}"  # the folder where MNI outputs will go

# To start from scratch, wipe both folders first by uncommenting:
# call(f"rm -r {temp_dir}", shell=True)
# call(f"rm -r {mni_dir}", shell=True)

# Create the folders if they are missing (no error when they already exist).
for folder in (temp_dir, mni_dir):
    os.makedirs(folder, exist_ok=True)

# Show the resolved locations, one per line.
print(temp_dir, mni_dir, sep="\n")

/weka/proj-fmri/paulscotti/fMRI-foundation-model/dataset_creation/afni_conversion/tempNSD
/weka/proj-fmri/paulscotti/fMRI-foundation-model/dataset_creation/afni_conversion/MNIsNSD


In [37]:
# Unsigned S3 client: requests are sent without credentials.
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
bucket_name = 'natural-scenes-dataset'
prefix = 'nsddata_rawdata'

# list_objects_v2 is paginated, so walk every page and collect each object key.
# A page without a 'Contents' entry contributes nothing.
paginator = s3.get_paginator('list_objects_v2')
file_name_list = [
    entry['Key']
    for listing_page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
    for entry in listing_page.get('Contents', [])
]
print("len(file_name_list) =", len(file_name_list))

len(file_name_list) = 28922


In [None]:
# --- Tunables for the download / wait / upload loop --------------------------
MAX_DOWNLOAD_ATTEMPTS = 5   # wget attempts per file before the file is skipped
DOWNLOAD_RETRY_DELAY = 10   # seconds to pause between failed download attempts
POLL_INTERVAL = 5           # seconds between checks for the overlap file
MAX_WAIT = 180              # polling budget (seconds) for the overlap file
SETTLE_DELAY = 10           # seconds to let the overlap file finish being written
S3_DEST_PREFIX = "s3://proj-fmri/fmri_foundation_datasets/NSD_MNI"


def _run(args):
    """Run an external command (no shell) and return its exit status.

    Arguments are passed as a list, so object keys and paths are never
    interpreted by a shell. If the executable cannot be started, the error is
    reported and 127 is returned instead of raising.
    """
    try:
        return call(args)
    except OSError as err:
        print(f"could not run {args[0]}: {err}")
        return 127


def _remove_quietly(path):
    """Delete `path`, ignoring the case where it is already gone."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def _download(url, dest):
    """Fetch `url` into `dest` with wget; return True on success.

    `wget -O` creates/truncates `dest` before any data arrives, so the mere
    existence of `dest` says nothing about success. The exit status is checked
    instead, and a partial file is removed after every failed attempt.
    """
    for attempt in range(1, MAX_DOWNLOAD_ATTEMPTS + 1):
        status = _run(["wget", url, "-q", "-O", dest])
        if status == 0 and os.path.exists(dest):
            return True
        _remove_quietly(dest)  # never leave a truncated file behind
        if attempt < MAX_DOWNLOAD_ATTEMPTS:
            print(f"s3 download failed. trying again... {dest}")
            time.sleep(DOWNLOAD_RETRY_DELAY)
    return False


def _wait_for_file(path):
    """Poll until `path` exists; return False once the wait budget is spent.

    The file is checked one last time after the final sleep, so a result that
    appears during that sleep is still treated as a success.
    """
    waited = 0
    while not os.path.exists(path):
        if waited > MAX_WAIT:
            return False
        time.sleep(POLL_INTERVAL)
        waited += POLL_INTERVAL
    return True


def _read_overlap(path):
    """Return the first line of `path` as np.float32, or 0 if it is unreadable."""
    try:
        with open(path, 'r') as fh:
            return np.float32(fh.readline().strip())
    except (OSError, ValueError):
        print("overlap error!")
        return 0  # in case the overlap txt is empty or malformed, assume it is ok


print("starting...")
for file_name in file_name_list:
    # Only the functional BOLD runs are processed.
    if not file_name.endswith('_bold.nii.gz'):
        continue

    # Third component of the S3 key; used as the per-file subfolder name.
    dataset_id = file_name.split('/')[2]
    func_path = file_name.split('/')[-1]

    temp_file_path = temp_dir + '/' + dataset_id + '/' + func_path
    mni_file_path = mni_dir + '/' + dataset_id + '/' + func_path

    # Paths derived from the two above, computed once.
    lock_path = temp_file_path.split('.nii.gz')[0] + '.txt'
    overlap_path = mni_file_path.split('.nii.gz')[0] + '_overlap.txt'
    mni_name = func_path.split('.nii.gz')[0] + '_MNI.nii.gz'
    afni_filename = mni_dir + '/' + dataset_id + '/' + mni_name

    os.makedirs(temp_dir + '/' + dataset_id, exist_ok=True)
    os.makedirs(mni_dir + '/' + dataset_id, exist_ok=True)

    # Skip files that are already claimed (lock file) or finished (overlap file).
    if os.path.exists(lock_path) or os.path.exists(overlap_path):
        continue

    # download from s3
    print(f"downloading {temp_file_path}")
    url = f"https://{bucket_name}.s3.amazonaws.com/{file_name}"
    if not _download(url, temp_file_path):
        print(f"giving up on {file_name} after {MAX_DOWNLOAD_ATTEMPTS} attempts")
        continue

    # make temp lock file so other parallel jobs dont do duplicate work
    # NOTE(review): the lock is written only after the download, so two jobs can
    # still fetch the same file at the same time. It is left here in case the
    # AFNI side relies on this ordering -- confirm before moving it earlier.
    with open(lock_path, 'a'):
        os.utime(lock_path, None)

    # Wait for AFNI to be complete
    print('waiting...')
    if _wait_for_file(overlap_path):
        time.sleep(SETTLE_DELAY)  # wait to ensure txt file was fully created
        overlap = _read_overlap(overlap_path)

        # if overlap >20% is bad
        print("overlap:", overlap)

        # Delete the local MNI volume only when the upload really succeeded.
        destination = f"{S3_DEST_PREFIX}/{dataset_id}/{mni_name}"
        if _run(["aws", "s3", "cp", afni_filename, destination]) == 0:
            print(f"Removing _MNI file... {afni_filename}")
            _remove_quietly(afni_filename)
        else:
            print(f"upload failed; keeping {afni_filename}")

        time.sleep(5)
    else:
        print("waiting time exceeded...")

        # remove all (non-hidden) files in this temp subfolder, like `rm dir/*`
        session_dir = temp_dir + '/' + dataset_id
        for entry in os.listdir(session_dir):
            if entry.startswith('.'):
                continue
            leftover = session_dir + '/' + entry
            if os.path.isfile(leftover) or os.path.islink(leftover):
                _remove_quietly(leftover)

        # write placeholder txt overlap file
        with open(overlap_path, 'w') as file:
            file.write('-999')

starting...
downloading /weka/proj-fmri/paulscotti/fMRI-foundation-model/dataset_creation/afni_conversion/tempNSD/ses-nsd01/sub-01_ses-nsd01_task-nsdcore_run-01_bold.nii.gz
waiting...
overlap: 0.084
upload: MNIsNSD/ses-nsd01/sub-01_ses-nsd01_task-nsdcore_run-01_bold_MNI.nii.gz to s3://proj-fmri/fmri_foundation_datasets/NSD_MNI/ses-nsd01/sub-01_ses-nsd01_task-nsdcore_run-01_bold_MNI.nii.gz
Removing _MNI file... /weka/proj-fmri/paulscotti/fMRI-foundation-model/dataset_creation/afni_conversion/MNIsNSD/ses-nsd01/sub-01_ses-nsd01_task-nsdcore_run-01_bold_MNI.nii.gz
downloading /weka/proj-fmri/paulscotti/fMRI-foundation-model/dataset_creation/afni_conversion/tempNSD/ses-nsd01/sub-01_ses-nsd01_task-nsdcore_run-02_bold.nii.gz
waiting...
overlap: 0.05
upload: MNIsNSD/ses-nsd01/sub-01_ses-nsd01_task-nsdcore_run-02_bold_MNI.nii.gz to s3://proj-fmri/fmri_foundation_datasets/NSD_MNI/ses-nsd01/sub-01_ses-nsd01_task-nsdcore_run-02_bold_MNI.nii.gz
Removing _MNI file... /weka/proj-fmri/paulscotti/fMRI-