In [1]:
# Utilities
import itertools
import importlib
import sys
import os

In [2]:
# Make the project's src/ directory importable from this notebook.
# The location is resolved against the current working directory, so the
# result depends on where the kernel was started.
src_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'src'))
if src_path not in sys.path:
    # Append only once so re-running the cell does not grow sys.path.
    sys.path.append(src_path)
    
import prepare_data as ld
import indexer
import model_data

In [3]:
# Re-run this cell after editing the local modules on disk: each module is
# reloaded in turn so the kernel picks up the new source without a restart.
for _module in (indexer, ld, model_data):
    importlib.reload(_module)

# Re-import the class names after the reload so they point at the
# freshly loaded definitions.
from indexer import Indexer
from model_data import ModelDataset

In [4]:
from transformers import T5Tokenizer

In [5]:
## Preparing data

# Read the text pairs and their labels from the train and dev CSV files.
TRAIN_FIRST_TEXTS, TRAIN_SECOND_TEXTS, TRAIN_LABELS = ld.prepare_data("../data/train.csv", True)
DEV_FIRST_TEXTS, DEV_SECOND_TEXTS, DEV_LABELS = ld.prepare_data("../data/dev.csv", True)

# Split the loaded training data into a train part and a test part.
# Note that the TRAIN_* names are rebound to the train part returned by the split.
(
    TRAIN_FIRST_TEXTS, TRAIN_SECOND_TEXTS, TRAIN_LABELS,
    TEST_FIRST_TEXTS, TEST_SECOND_TEXTS, TEST_LABELS,
) = ld.split_data(TRAIN_FIRST_TEXTS, TRAIN_SECOND_TEXTS, TRAIN_LABELS)

# Report the size of each partition.
print(
    "Training data size: ", len(TRAIN_FIRST_TEXTS),
    "\nDev data size: ", len(DEV_FIRST_TEXTS),
    "\nTest data size: ", len(TEST_FIRST_TEXTS),
)

Prepared 30000 data points.
Prepared 6000 data points.
Training data size:  27000 
Dev data size:  6000 
Test data size:  3000


In [6]:
# Wrap the test split in a PrepDataset and run its feature extraction.
# The progress output of this cell reports POS, punctuation and "information"
# passes, each shown twice (presumably once per text column — confirm in prepare_data).
test = ld.PrepDataset(TEST_FIRST_TEXTS, TEST_SECOND_TEXTS, TEST_LABELS)
test.ExtractFeatures()

Extracting POS:   0%|          | 0/3000 [00:00<?, ?it/s]

Extracting POS:   0%|          | 0/3000 [00:00<?, ?it/s]

Extracting Punctuation:   0%|          | 0/3000 [00:00<?, ?it/s]

Extracting Punctuation:   0%|          | 0/3000 [00:00<?, ?it/s]

Extracting Information:   0%|          | 0/3000 [00:00<?, ?it/s]

Extracting Information:   0%|          | 0/3000 [00:00<?, ?it/s]

Cleaned up all data!


In [7]:
# Display the indexes the dataset has recorded as invalid.
# NOTE(review): presumably populated during ExtractFeatures() — confirm in prepare_data.
test.INVALID_INDEXES

[1492, 1394, 1170, 728, 82]

In [8]:
# Indexer built from the labels of the test dataset.
label_indexer = Indexer(values=test.LABELS)

# Flatten the POS sequences of both text columns into a single list of values,
# then build the POS indexer from it.
# NOTE(review): the meaning of `pre=indexer.POS_PRE` is defined in the indexer
# module (presumably pre-seeded entries) — confirm there.
POS_VALS = list(itertools.chain.from_iterable(test.FIRST_POS + test.SECOND_POS))
pos_indexer = Indexer(values=POS_VALS, pre=indexer.POS_PRE)

In [9]:
# Apply the POS and label indexers to the dataset.
# NOTE(review): the return value is discarded, so index_data presumably updates
# `test` in place — confirm in prepare_data before re-running this cell.
test.index_data(pos_indexer, label_indexer)

In [10]:
# Load the pretrained "google-t5/t5-small" tokenizer and wrap the indexed
# dataset together with it in a ModelDataset.
# NOTE(review): 200 is presumably the maximum sequence length — confirm against
# ModelDataset and consider naming it as a constant.
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
testset = ModelDataset(test, tokenizer, 200)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Spot-check a single item (index 10) of the model dataset.
testset[10]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


{'text': tensor([    3,    23,   114,    12,  3011,    12,    48,  2324,   116,     3,
            23,  1731,    12,   473, 18940,    42,  1385, 14718,    10,     3,
            23,    43,     3, 17010,    12,    34,   220,   648,   469,     5,
         26378,    71,  7938,  1171,     9,  7622,  1008,    31,    17,   320,
            44,   140,  2181,   239,    19,    78,  1627,   275,  8247,     6,
            34,    31,     7,   614,    12, 13418,   852,    11,   258,     6,
            27,   129,    16, 24875,  1029,    66,     8, 10393,     6,    27,
            31,    51,    78, 30119,    27,   183,   786,   150,  1052,   125,
            79,   497,  4467,     7,    54,    31,    17,   830,   140,   323,
            27,   183,   786,    16,   334,   712,   194,  2163,     6,     1,
         17600,     7, 10022,  1220, 29035,    30,  1244, 13017,    19,   352,
            66,    91,    28,     8, 27432,  8929, 16023,  2908,     5,    37,
         23470,    19,  2164,     6,    28, 