In [1]:
import torch
from fp16 import FP16_Module, FP16_Optimizer
from knight_utils import TASK_DICT, QADataset, MODEL_CLASS, TOKENIZER, DEVICE, SPECIAL_TOKEN_IDS, \
                        SPECIAL_TOKENS, get_gen_token, get_real_data

['Tesla V100-PCIE-32GB']


In [2]:
model_dir = "/data/model_runs/20230605T182935_yelagdbpamayah_LAMOL"
tasks = ["yelp", "ag", "dbpedia", "amazon", "yahoo"]

# Index into `tasks` of the task inspected by this notebook; `_tasks` is the
# single-element list of task names handed to the dataset code further down.
task_id = 1
_tasks = [tasks[task_id]]

# Register the generation token of every task up to and including the current
# one, in task order. With task_id = 1 this covers "yelp" and "ag", exactly the
# two tasks that used to be registered by hand; changing task_id no longer
# requires another copy of the four lines below.
for _task in tasks[:task_id + 1]:
    gen_token = get_gen_token(_task)
    TOKENIZER.add_tokens([gen_token])
    SPECIAL_TOKENS[_task] = gen_token
    SPECIAL_TOKEN_IDS[_task] = TOKENIZER.convert_tokens_to_ids(gen_token)

In [3]:
print("Initializing Model...")
net_cls = MODEL_CLASS

# Load the pretrained 'gpt2' weights, then move the network to the target device.
net = net_cls.from_pretrained('gpt2')
net = net.to(DEVICE)

# The tokenizer has gained extra tokens, so the embedding table is resized to
# the tokenizer's current length.
net.resize_token_embeddings(len(TOKENIZER))

# NOTE(review): FP16_Module presumably wraps the network for half-precision
# execution -- confirm against the `fp16` module.
net = FP16_Module(net)

Initializing Model...


In [4]:
def create_extra_data(task, prev_task, model, train_extra_data, model_dir):
    """Gather extra training data for ``task`` from real samples.

    This implementation simply delegates to ``get_real_data``; ``prev_task``
    and ``model`` are accepted for interface compatibility but are not used.

    Args:
        task: Name of the task being prepared.
        prev_task: Unused here.
        model: Unused here.
        train_extra_data: Passed straight through to ``get_real_data``.
        model_dir: Passed straight through to ``get_real_data``.

    Returns:
        Whatever ``get_real_data(task, train_extra_data, model_dir)`` returns.
    """
    # TODO(review): the real-sample path has no os.path.exists check -- is the
    # data regenerated on every call?
    print("using real data as extra data")
    real_data = get_real_data(task, train_extra_data, model_dir)
    return real_data

## `data[i]`

```python
{'paragraphs': [{'context': "I'll start by saying that Lee's is not my favorite liquor store. However, this location is convenient as I live nearby. \\n\\nI just don't like the atmosphere in any given Lee's. It's sterile and feels somewhat shady. I feel like I am a degenerate shopping here. \\n\\nAlthough they have a decent selection of what I need, they offer nothing extra. I shop here only out of convenience of the location.", 'qas': [{'question': 'Is this sentence very negative, negative, neutral, positive, or very positive?', 'answers': [{'text': 'neutral'}]}]}]}
```

## `d = parse_single_real_data(data[i],prev_task)`

```
__yelp__I'll start by saying that Lee's is not my favorite liquor store. However, this location is convenient as I live nearby. \n\nI just don't like the atmosphere in any given Lee's. It's sterile and feels somewhat shady. I feel like I am a degenerate shopping here. \n\nAlthough they have a decent selection of what I need, they offer nothing extra. I shop here only out of convenience of the location. Is this sentence very negative, negative, neutral, positive, or very positive?__ans__neutral<|endoftext|>
```

## `TOKENIZER.encode(d)`

```python
[50260, 314, 1183, 923, 416, 2282, 326, 5741, 338, 318, 407, 616, 4004, 20030, 3650, 13, 2102, 11, 428, 4067, 318, 11282, 355, 314, 2107, 6716, 13, 3467, 77, 59, 77, 40, 655, 836, 470, 588, 262, 8137, 287, 597, 1813, 5741, 338, 13, 632, 338, 38697, 290, 5300, 6454, 36135, 13, 314, 1254, 588, 314, 716, 257, 25419, 378, 9735, 994, 13, 3467, 77, 59, 77, 7003, 484, 423, 257, 7709, 6356, 286, 644, 314, 761, 11, 484, 2897, 2147, 3131, 13, 314, 6128, 994, 691, 503, 286, 15607, 286, 262, 4067, 13, 1148, 428, 6827, 845, 4633, 11, 4633, 11, 8500, 11, 3967, 11, 393, 845, 3967, 30, 50257, 8500, 50256]
```

## `[TOKENIZER.convert_ids_to_tokens(int(a)) for a in TOKENIZER.encode(d)]`

```python 
['__yelp__', 'ĠI', "'ll", 'Ġstart', 'Ġby', 'Ġsaying', 'Ġthat', 'ĠLee', "'s", 'Ġis', 'Ġnot', 'Ġmy', 'Ġfavorite', 'Ġliquor', 'Ġstore', '.', 'ĠHowever', ',', 'Ġthis', 'Ġlocation', 'Ġis', 'Ġconvenient', 'Ġas', 'ĠI', 'Ġlive', 'Ġnearby', '.', 'Ġ\\', 'n', '\\', 'n', 'I', 'Ġjust', 'Ġdon', "'t", 'Ġlike', 'Ġthe', 'Ġatmosphere', 'Ġin', 'Ġany', 'Ġgiven', 'ĠLee', "'s", '.', 'ĠIt', "'s", 'Ġsterile', 'Ġand', 'Ġfeels', 'Ġsomewhat', 'Ġshady', '.', 'ĠI', 'Ġfeel', 'Ġlike', 'ĠI', 'Ġam', 'Ġa', 'Ġdegener', 'ate', 'Ġshopping', 'Ġhere', '.', 'Ġ\\', 'n', '\\', 'n', 'Although', 'Ġthey', 'Ġhave', 'Ġa', 'Ġdecent', 'Ġselection', 'Ġof', 'Ġwhat', 'ĠI', 'Ġneed', ',', 'Ġthey', 'Ġoffer', 'Ġnothing', 'Ġextra', '.', 'ĠI', 'Ġshop', 'Ġhere', 'Ġonly', 'Ġout', 'Ġof', 'Ġconvenience', 'Ġof', 'Ġthe', 'Ġlocation', '.', 'ĠIs', 'Ġthis', 'Ġsentence', 'Ġvery', 'Ġnegative', ',', 'Ġnegative', ',', 'Ġneutral', ',', 'Ġpositive', ',', 'Ġor', 'Ġvery', 'Ġpositive', '?', '__ans__', 'Ġneutral', '<|endoftext|>']
```

In [5]:
# Extra data is only collected when an earlier task exists (task_id > 0).
train_extra_data = []
if task_id > 0:
    prev_task = tasks[task_id - 1]
    # The return value is discarded, so this relies on `train_extra_data`
    # being filled in place -- presumably by get_real_data; confirm there.
    with torch.no_grad():
        create_extra_data(
            task=_tasks[0],
            prev_task=prev_task,
            model=net,
            train_extra_data=train_extra_data,
            model_dir=model_dir,
        )
print('extra training data size: {}'.format(len(train_extra_data)))

using real data as extra data
Generating extra data! With gen_size 5750


Token indices sequence length is longer than the specified maximum sequence length for this model (1135 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1214 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1174 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1047 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1033 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence leng

writing extra data in /data/model_runs/20230605T182935_yelagdbpamayah_LAMOL/real-yelp.csv ...
extra training data size: 5750


# For AG

The extra training data size is 5750; after filtering, 5733 examples are usable. That is still okay for this task, but it may not be for others!  
We need to change how the extra data is extracted and prepared (ETL), or find a better method.

In [6]:
# One "train" entry from TASK_DICT per selected task.
train_dataset = [TASK_DICT[task_name]["train"] for task_name in _tasks]

# Build the QA dataset for the current task, passing along its special token
# id and the extra data collected earlier.
train_qadata = QADataset(
    train_dataset,
    "train",
    SPECIAL_TOKEN_IDS[_tasks[0]],
    train_extra_data,
)

an example with len 1133 is too long!
an example with len 1134 is too long!
an example with len 1134 is too long!
an example with len 1134 is too long!
an example with len 1212 is too long!
an example with len 1213 is too long!
an example with len 1213 is too long!
an example with len 1213 is too long!
an example with len 1172 is too long!
an example with len 1173 is too long!
an example with len 1173 is too long!
an example with len 1173 is too long!
an example with len 1045 is too long!
an example with len 1046 is too long!
an example with len 1046 is too long!
an example with len 1046 is too long!
an example with len 1030 is too long!
an example with len 1032 is too long!
an example with len 1032 is too long!
an example with len 1032 is too long!
an example with len 1040 is too long!
an example with len 1041 is too long!
an example with len 1041 is too long!
an example with len 1041 is too long!
an example with len 1265 is too long!
an example with len 1266 is too long!
an example w

Actual extra data: 5733


In [7]:
# Length of the first field of each of the last ten dataset entries.
[len(example[0]) for example in train_qadata.data[-10:]]

[281, 71, 318, 235, 107, 79, 130, 97, 109, 75]

In [8]:
# Display the ninth-from-last dataset entry in full.
train_qadata.data[-9]

([314,
  1053,
  3750,
  284,
  1583,
  13,
  18258,
  329,
  2048,
  1315,
  812,
  13,
  679,
  290,
  465,
  3085,
  389,
  257,
  1074,
  13,
  314,
  4313,
  683,
  284,
  2460,
  290,
  1641,
  13,
  1119,
  481,
  670,
  351,
  345,
  329,
  5096,
  3503,
  13,
  339,
  338,
  355,
  1969,
  355,
  314,
  1053,
  7891,
  284,
  19990,
  33983,
  7879,
  1016,
  284,
  262,
  38408,
  13,
  1148,
  428,
  6827,
  845,
  4633,
  11,
  4633,
  11,
  8500,
  11,
  3967,
  11,
  393,
  845,
  3967,
  30,
  50257],
 71,
 [314,
  1053,
  3750,
  284,
  1583,
  13,
  18258,
  329,
  2048,
  1315,
  812,
  13,
  679,
  290,
  465,
  3085,
  389,
  257,
  1074,
  13,
  314,
  4313,
  683,
  284,
  2460,
  290,
  1641,
  13,
  1119,
  481,
  670,
  351,
  345,
  329,
  5096,
  3503,
  13,
  339,
  338,
  355,
  1969,
  355,
  314,
  1053,
  7891,
  284,
  19990,
  33983,
  7879,
  1016,
  284,
  262,
  38408,
  13,
  1148,
  428,
  6827,
  845,
  4633,
  11,
  4633,
  11,
  8500,
  11,
  3