In [1]:
import pickle
import numpy as np
from config import get_config

In [2]:
from transformers import BertLMHeadModel, BartTokenizer
from data import ZuCo_dataset

In [3]:
task_name = "task1, task2, taskNRv2"

In [4]:
''' set up dataloader '''

whole_dataset_dicts = []

if 'task1' in task_name:
    dataset_path_task1 = r'I:\Science\CIS-YASHMOSH\niallmcguire\ZuCo\task1-SR\pickle\task1-SR-dataset.pickle'
    with open(dataset_path_task1, 'rb') as handle:
        whole_dataset_dicts.append(pickle.load(handle))

if 'task2' in task_name:
    dataset_path_task2 = r'I:\Science\CIS-YASHMOSH\niallmcguire\ZuCo\task2-NR\pickle\task2-NR-dataset.pickle'
    with open(dataset_path_task2, 'rb') as handle:
        whole_dataset_dicts.append(pickle.load(handle))

if 'task3' in task_name:
    dataset_path_task3 = r'I:\Science\CIS-YASHMOSH\niallmcguire\ZuCo\task3-TSR\pickle\task3-TSR-dataset.pickle'
    with open(dataset_path_task3, 'rb') as handle:
        whole_dataset_dicts.append(pickle.load(handle))

if 'taskNRv2' in task_name:
    dataset_path_taskNRv2 = r'I:\Science\CIS-YASHMOSH\niallmcguire\ZuCo\task2-NR-2.0\pickle\task2-NR-2.0-dataset.pickle'
    with open(dataset_path_taskNRv2, 'rb') as handle:
        whole_dataset_dicts.append(pickle.load(handle))


In [5]:
len(whole_dataset_dicts)

3

In [12]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

In [13]:
subject_choice = 'ALL'
eeg_type_choice = 'GD'
bands_choice = ['_t1','_t2','_a1','_a2','_b1','_b2','_g1','_g2']
dataset_setting = 'unique_sent'

In [14]:
Task_Dataset_List = whole_dataset_dicts
if not isinstance(whole_dataset_dicts,list):
    Task_Dataset_List = [whole_dataset_dicts]

In [15]:


def get_eeg_word_embedding(word, eeg_type = 'GD', bands = ['_t1','_t2','_a1','_a2','_b1','_b2','_g1','_g2']):
    EEG_frequency_features = []
    EEG_word_level_label = word['content']
    for band in bands:
        EEG_frequency_features.append(word['word_level_EEG'][eeg_type][eeg_type+band])
    word_eeg_embedding = np.concatenate(EEG_frequency_features)
    if len(word_eeg_embedding) != 105*len(bands):
        print(f'expect word eeg embedding dim to be {105*len(bands)}, but got {len(word_eeg_embedding)}, return None')
        word_eeg_embedding = None
    else:
        word_eeg_embedding = word_eeg_embedding.reshape(105, 8)

    return word_eeg_embedding, EEG_word_level_label





In [17]:
import torch
EEG_word_level_embeddings = []
EEG_word_level_labels = []
#Main loop, looping through each task
for Task_Dataset in Task_Dataset_List:
    subjects = list(Task_Dataset.keys())
    print('[INFO]using subjects: ', subjects)

    total_num_sentence = len(Task_Dataset[subjects[0]])

    train_divider = int(0.8*total_num_sentence)
    dev_divider = train_divider + int(0.1*total_num_sentence)

    print(f'train size = {train_divider}')
    print(f'dev size = {dev_divider}')


    print('[INFO]initializing a train set...')
    for key in subjects:
        print(f'key = {key}')
        for i in range(train_divider):
            if Task_Dataset[key][i] is not None:
                sentence_object = Task_Dataset[key][i]
                sentence = sentence_object['content']
                #print(sentence_object['content'])
                for word in sentence_object['word']:
                    word_eeg_embedding, EEG_word_level_label = get_eeg_word_embedding(word)
                    if word_eeg_embedding is not None and torch.isnan(torch.from_numpy(word_eeg_embedding)).any() == False:
                        EEG_word_level_embeddings.append(word_eeg_embedding)
                        EEG_word_level_labels.append(EEG_word_level_label)




[INFO]using subjects:  ['ZAB', 'ZDM', 'ZDN', 'ZGW', 'ZJM', 'ZJN', 'ZJS', 'ZKB', 'ZKH', 'ZMG', 'ZPH']
train size = 320
dev size = 360
[INFO]initializing a train set...
key = ZAB
Presents a good case while failing to provide a reason for us to care beyond the very basic dictums of human decency.
Beautifully crafted, engaging filmmaking that should attract upscale audiences hungry for quality and a nostalgic, twisty yarn that will keep them guessing.
Bread, My Sweet has so many flaws it would be easy for critics to shred it.
Slow, silly and unintentionally hilarious.
Ultimately feels emp11111ty and unsatisfying, like swallowing a Communion wafer without the wine.
Exudes the fizz of a Busby Berkeley musical and the visceral excitement of a sports extravaganza.
The film rehashes several old themes and is capped with pointless extremes -- it's insanely violent and very graphic.
Ryan Gosling is, in a word, brilliant as the conflicted Daniel.
If Deuces Wild had been tweaked up a notch it would