# T5 Triplet Augmentation 
使用 MCQ dataset訓練 並加入 Concept 的 Triplet 全部資料 (From ChatGPT Candidate)<br>
直接使用 trainer 訓練 <br>

### GPU

In [1]:
!nvidia-smi

Tue Aug 29 07:53:01 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA TITAN RTX                On | 00000000:01:00.0 Off |                  N/A |
| 41%   39C    P8               20W / 280W|      3MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA TITAN RTX                On | 00000000:02:00.0 Off |  

### Weight and Bias (Assisting Metrics, Optional)

In [2]:
!pip install wandb
!wandb login
project_name = "test on MCQ with T5-Triplet-Augmentation-without-Reranker"
import os

os.environ["WANDB_PROJECT"] = project_name

[34m[1mwandb[0m: Currently logged in as: [33mhankystyle[0m. Use [1m`wandb login --relogin`[0m to force relogin


### import

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

2023-08-29 07:53:06.820662: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-29 07:53:06.929028: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-08-29 07:53:07.399292: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvrtc.so.11.0: cannot open shared object file: No such file or directory
2023-08-29 07:53:07.399410: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvrtc.so.11.0: cannot open shared object file: No such file or direc

### Loading the dataset

In [4]:
import json
import os, sys
import fnmatch

In [5]:
def read_data(item):
    path = '../../../../../data/mcq/total_new_cleaned_{}.json'.format(item)
    with open(path) as f:
        data = json.load(f)
    return data

In [6]:
train = read_data('train')
test = read_data('test')

In [7]:
len(train), len(test)

(2321, 259)

In [8]:
train[0]

{'answer': 'gravity',
 'distractors': ['friction', 'erosion', 'magnetism'],
 'sentence': '**blank** causes rocks to roll downhill'}

Append triplet into dataset

In [9]:
def append_data(item):
    # Initalize 版本
    path = '../../../../../data/mcq/triplets/generate_lm/{}.triplet.json'.format(item)
    with open(path) as f:
        triplet = json.load(f)
        
    return triplet

In [10]:
train_triplet = append_data('train')
test_triplet = append_data('test')

In [11]:
len(train_triplet), len(test_triplet)

(2321, 259)

In [12]:
train_triplet[0]

[['isa', 'gravity', 'force', 8.0],
 ['relatedto', 'rocks', 'rock', 1.0],
 ['relatedto', 'force', 'gravity', 0.658],
 ['relatedto', 'wind', 'pressure', 0.146],
 ['relatedto', 'roll', 'rock', 4.629],
 ['relatedto', 'force', 'pressure', 0.911],
 ['relatedto', 'force', 'cause', 0.13],
 ['relatedto', 'rock', 'roll', 2.226],
 ['relatedto', 'force', 'magnetism', 0.284],
 ['isa', 'friction', 'force', 1.0],
 ['antonym', 'roll', 'rock', 0.238],
 ['relatedto', 'rock', 'water', 0.338],
 ['relatedto', 'pressure', 'force', 1.0],
 ['antonym', 'rock', 'water', 0.113],
 ['relatedto', 'wind', 'force', 1.123],
 ['relatedto', 'friction', 'force', 0.351],
 ['hassubevent', 'roll', 'rock', 2.0],
 ['relatedto', 'gravity', 'force', 1.67],
 ['relatedto', 'roll', 'downhill', 0.14],
 ['atlocation', 'rock', 'water', 1.0],
 ['isa', 'pressure', 'force', 2.0],
 ['relatedto', 'force', 'electricity', 0.125],
 ['relatedto', 'roll', 'wind', 2.0],
 ['relatedto', 'causes', 'cause', 1.0]]

Remove 重複的 triplet

In [13]:
for i in range(len(train_triplet)):
    train_triplet[i] = [list(t) for t in set(tuple(element) for element in train_triplet[i] )]
    
for i in range(len(test_triplet)):
    test_triplet[i] = [list(t) for t in set(tuple(element) for element in test_triplet[i] )]

In [14]:
len(train_triplet), len(test_triplet)

(2321, 259)

In [15]:
train_triplet[0]

[['relatedto', 'force', 'magnetism', 0.284],
 ['relatedto', 'roll', 'wind', 2.0],
 ['hassubevent', 'roll', 'rock', 2.0],
 ['relatedto', 'causes', 'cause', 1.0],
 ['relatedto', 'rock', 'roll', 2.226],
 ['relatedto', 'force', 'pressure', 0.911],
 ['relatedto', 'rocks', 'rock', 1.0],
 ['relatedto', 'force', 'cause', 0.13],
 ['relatedto', 'wind', 'force', 1.123],
 ['isa', 'pressure', 'force', 2.0],
 ['relatedto', 'rock', 'water', 0.338],
 ['relatedto', 'roll', 'downhill', 0.14],
 ['antonym', 'rock', 'water', 0.113],
 ['relatedto', 'roll', 'rock', 4.629],
 ['relatedto', 'gravity', 'force', 1.67],
 ['isa', 'friction', 'force', 1.0],
 ['relatedto', 'pressure', 'force', 1.0],
 ['relatedto', 'wind', 'pressure', 0.146],
 ['atlocation', 'rock', 'water', 1.0],
 ['isa', 'gravity', 'force', 8.0],
 ['relatedto', 'force', 'gravity', 0.658],
 ['relatedto', 'force', 'electricity', 0.125],
 ['relatedto', 'friction', 'force', 0.351],
 ['antonym', 'roll', 'rock', 0.238]]

Combine triplet into train and test data

In [19]:
def combine_data(data, triplet):
    res = []
    for i in range(len(data)):
        d = data[i]
        sentence = d['sentence']
        distractors = d['distractors']
        answer = d['answer']
        triplets = triplet[i]

        triplets = sorted(triplets, key = lambda x: x[3],reverse=True)
        res.append({'sentence':sentence, 'distractors':distractors, 'answer':answer, 'triplets':triplets})
    return res

In [20]:
train = combine_data(train, train_triplet)
test = combine_data(test, test_triplet)

In [21]:
len(train), len(test)

(2321, 259)

In [22]:
train[0]

{'sentence': '**blank** causes rocks to roll downhill',
 'distractors': ['friction', 'erosion', 'magnetism'],
 'answer': 'gravity',
 'triplets': [['isa', 'gravity', 'force', 8.0],
  ['relatedto', 'roll', 'rock', 4.629],
  ['relatedto', 'rock', 'roll', 2.226],
  ['relatedto', 'roll', 'wind', 2.0],
  ['hassubevent', 'roll', 'rock', 2.0],
  ['isa', 'pressure', 'force', 2.0],
  ['relatedto', 'gravity', 'force', 1.67],
  ['relatedto', 'wind', 'force', 1.123],
  ['relatedto', 'causes', 'cause', 1.0],
  ['relatedto', 'rocks', 'rock', 1.0],
  ['isa', 'friction', 'force', 1.0],
  ['relatedto', 'pressure', 'force', 1.0],
  ['atlocation', 'rock', 'water', 1.0],
  ['relatedto', 'force', 'pressure', 0.911],
  ['relatedto', 'force', 'gravity', 0.658],
  ['relatedto', 'friction', 'force', 0.351],
  ['relatedto', 'rock', 'water', 0.338],
  ['relatedto', 'force', 'magnetism', 0.284],
  ['antonym', 'roll', 'rock', 0.238],
  ['relatedto', 'wind', 'pressure', 0.146],
  ['relatedto', 'roll', 'downhill', 0.

### Prepare data

In [23]:
from sklearn.model_selection import train_test_split
import random
random.seed(0)

train, valid = train_test_split(train, random_state=777, train_size=0.9)
len(train), len(valid)

(2088, 233)

In [21]:
def processData(data, task_prefix):
    
    sentences = []
    labels = []
    answers = []
    for d in data:
        sentence = d['sentence']
        distractors = d['distractors']
        triplets = d['triplets']
        answer = d['answer']
        
        triplet_set = []
        for each_triplet in triplets:
            rel, source, target, weight = each_triplet
            triplet_set.append('{} {} {}'.format(source,rel,target))

        sentence = sentence.replace('**blank**', '_')
        # 避免dataset的label有空白
        distractors = [dis.strip() for dis in distractors]
        
        random.shuffle(triplet_set)

        sentences.append(task_prefix + sentence + '</s></s>' + answer + '</s></s>' + ', '.join(triplet_set))
        labels.append('_ of distractors are ' + ', '.join(distractors) + '</s>')
        answers.append(answer)
        
    return sentences, answers, labels

In [22]:
task_prefix = 'distractor generation: '
train_sent, train_answer, train_label = processData(train, task_prefix)
valid_sent, valid_answer, valid_label = processData(valid, task_prefix)
test_sent, test_answer, test_label = processData(test, task_prefix)

In [23]:
len(train_sent), len(train_answer), len(train_label)

(2088, 2088, 2088)

In [24]:
for idx in range(2):
    print(train_sent[idx])
    print(train_answer[idx])
    print(train_label[idx])
    print()

distractor generation: In _ reinforcement, the reinforcer follows every correct response.</s></s>continuous</s></s>continuous relatedto sustained, follows relatedto follow, intermittent relatedto sporadic, sporadic relatedto intermittent, sustained relatedto continuous
continuous
_ of distractors are intermittent, partial, negative</s>

distractor generation: _ of glucose units is found in plants and serves a structural purpose</s></s>cellulose</s></s>cellulose partof plants, found relatedto find, pectin isa cellulose, galactose isa sucrose, cellulose relatedto unit, serves relatedto serve, cellulose relatedto plant, cellulose relatedto glucose, find relatedto found, glycogen relatedto glucose, gluconeogenesis relatedto glucose, plant relatedto cellulose, sucrose relatedto fructose, plants relatedto plant, units relatedto unit, sucrose relatedto glucose, serve relatedto purpose
cellulose
_ of distractors are frucose, carbonate, sucrose</s>



### Tokenization

In [25]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [26]:
train_encodings = tokenizer(train_sent, truncation=True, padding=True)
valid_encodings = tokenizer(valid_sent, truncation=True, padding=True)
test_encodings = tokenizer(test_sent, truncation=True, padding=True)



In [27]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [28]:
len(train_encodings.input_ids)

2088

In [29]:
def add_labels(encodings, distractors):
    
    distractors_encodings = tokenizer(distractors, padding=True)
    labels = []
    for i in range(len(distractors_encodings.input_ids)):
        labels.append(distractors_encodings.input_ids[i])
    
    encodings["labels"] = labels
    return encodings

In [30]:
train_encodings = add_labels(train_encodings, train_label)
valid_encodings = add_labels(valid_encodings, valid_label)
test_encodings = add_labels(test_encodings, test_label)

In [31]:
len(train_encodings.input_ids)

2088

In [32]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

### Dataset

In [33]:
class MCQDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = MCQDataset(train_encodings)
valid_dataset = MCQDataset(valid_encodings)
test_dataset = MCQDataset(test_encodings)

In [34]:
len(train_dataset), len(valid_dataset), len(test_dataset)

(2088, 233, 259)

### Fine-tuning

In [35]:
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

model = T5ForConditionalGeneration.from_pretrained("t5-base")

In [36]:
batch_size = 16
args = Seq2SeqTrainingArguments(
    output_dir = "./results",
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="F1@3",
    weight_decay=0.01,
    predict_with_generate=True,
    gradient_accumulation_steps = 2,
    eval_accumulation_steps = 1,
    report_to="wandb" if os.getenv("WANDB_PROJECT") else "none"
)

In [37]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [38]:
import numpy as np
def compute_metrics(p):
    predictions, labels = p
    
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # store all sentences
    predicted = []
    true_label = []
    
    for k in range(len(decoded_labels)):
        pred = decoded_preds[k]
        label = decoded_labels[k]

        pred_list = pred.split(', ')
        label_list = label.split(', ')
        
        pred_list[0] = pred_list[0].split(' ')[-1]
        label_list[0] = label_list[0].split(' ')[-1]

        predicted.append(pred_list)
        true_label.append(label_list)

    # evaluation metrics
    p1 = 0
    p3 = 0
    r3 = 0
    f3 = 0
    for idx in range(len(true_label)):
        distractors = predicted[idx]
        labels = true_label[idx]

        act_set = set(labels)
        pred1_set = set(distractors[:1])
        pred3_set = set(distractors[:3])

        p_1 = len(act_set & pred1_set) / float(1)
        p_3 = len(act_set & pred3_set) / float(3)
        r_3 = len(act_set & pred3_set) / float(len(act_set))

        if p_3 == 0 and r_3 == 0:
            f1_3 = 0
        else:
            f1_3 = 2 * (p_3 * r_3 / (p_3 + r_3))

        p1+=p_1
        p3+=p_3
        r3+=r_3
        f3+=f1_3

    avg_p1 = p1 / len(true_label)
    avg_p3 = p3 / len(true_label)
    avg_r3 = r3 / len(true_label)
    avg_f3 = f3 / len(true_label)

    result = {'P@1': avg_p1,
              'P@3': avg_p3,
              'R@3': avg_r3,
              'F1@3': avg_f3}
    
    return result

In [39]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [40]:
trainer.train()

***** Running training *****
  Num examples = 2088
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 2
  Total optimization steps = 1650
  Number of trainable parameters = 222903552
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhankystyle[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,P@1,P@3,R@3,F1@3
1,No log,0.633659,0.077253,0.04721,0.044349,0.045575
2,No log,0.572865,0.175966,0.09299,0.092632,0.091866
3,No log,0.556032,0.197425,0.108727,0.108727,0.107807
4,No log,0.547541,0.201717,0.107296,0.106581,0.105968
5,No log,0.540085,0.188841,0.114449,0.113019,0.112712
6,No log,0.542391,0.2103,0.11731,0.115522,0.115369
7,No log,0.537978,0.240343,0.133047,0.127682,0.129982
8,No log,0.534939,0.240343,0.1402,0.134478,0.13693
9,No log,0.537491,0.23176,0.1402,0.135193,0.137339
10,No log,0.541245,0.248927,0.157368,0.151288,0.153893


***** Running Evaluation *****
  Num examples = 233
  Batch size = 32
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_to

TrainOutput(global_step=1650, training_loss=0.3062617573593602, metrics={'train_runtime': 3529.8563, 'train_samples_per_second': 29.576, 'train_steps_per_second': 0.467, 'total_flos': 6.3575203774464e+16, 'train_loss': 0.3062617573593602, 'epoch': 50.0})

In [41]:
trainer.save_model('../../../../../saved_models/KAG/mcq/t5/t5-triplet-augmentation-without-reranker')

Saving model checkpoint to ./saved_models/t5-triplet-augmentation-without-reranker
Configuration saved in ./saved_models/t5-triplet-augmentation-without-reranker/config.json
Configuration saved in ./saved_models/t5-triplet-augmentation-without-reranker/generation_config.json
Model weights saved in ./saved_models/t5-triplet-augmentation-without-reranker/pytorch_model.bin
tokenizer config file saved in ./saved_models/t5-triplet-augmentation-without-reranker/tokenizer_config.json
Special tokens file saved in ./saved_models/t5-triplet-augmentation-without-reranker/special_tokens_map.json


In [42]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 233
  Batch size = 32
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_

{'eval_loss': 0.6807897090911865,
 'eval_P@1': 0.26180257510729615,
 'eval_P@3': 0.2131616595135908,
 'eval_R@3': 0.20350500715307576,
 'eval_F1@3': 0.20764357245043927,
 'eval_runtime': 11.0505,
 'eval_samples_per_second': 21.085,
 'eval_steps_per_second': 0.724,
 'epoch': 50.0}

In [43]:
predictions, labels, metrics = trainer.predict(test_dataset)
print('test: ')
metrics

***** Running Prediction *****
  Num examples = 259
  Batch size = 32
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_to

test: 


{'test_loss': 1.1292715072631836,
 'test_P@1': 0.17374517374517376,
 'test_P@3': 0.126126126126126,
 'test_R@3': 0.126126126126126,
 'test_F1@3': 0.126126126126126,
 'test_runtime': 10.3861,
 'test_samples_per_second': 24.937,
 'test_steps_per_second': 0.867}

### Save Distractor Data

In [45]:
# from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
# import torch


# model = T5ForConditionalGeneration.from_pretrained('../rs')

loading file spiece.model
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ../../../../saved_models/t5/t5-triplet-chatgpt-qa-k=20/config.json
Model config T5Config {
  "_name_or_path": "/user_data/DG-via-Text2Text-with-Triplet-Augmentation/saved_models/t5/t5-text2text-df",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
   

In [45]:
batch_size = 16
args = Seq2SeqTrainingArguments(
    output_dir = "./results",
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="F1@3",
    gradient_accumulation_steps = 2,
    weight_decay=0.01,
    predict_with_generate=True,
    eval_accumulation_steps = 1,
    report_to="wandb" if os.getenv("WANDB_PROJECT") else "none"
)

PyTorch: setting up devices


In [46]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [47]:
import numpy as np
def compute_metrics(p):
    predictions, labels = p
    
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # store all sentences
    predicted = []
    true_label = []
    
    for k in range(len(decoded_labels)):
        pred = decoded_preds[k]
        label = decoded_labels[k]

        pred_list = pred.split(', ')
        label_list = label.split(', ')
        
        pred_list[0] = pred_list[0].split(' ')[-1]
        label_list[0] = label_list[0].split(' ')[-1]

        predicted.append(pred_list)
        true_label.append(label_list)

    # evaluation metrics
    p1 = 0
    p3 = 0
    r3 = 0
    f3 = 0
    for idx in range(len(true_label)):
        distractors = predicted[idx]
        labels = true_label[idx]

        act_set = set(labels)
        pred1_set = set(distractors[:1])
        pred3_set = set(distractors[:3])

        p_1 = len(act_set & pred1_set) / float(1)
        p_3 = len(act_set & pred3_set) / float(3)
        r_3 = len(act_set & pred3_set) / float(len(act_set))

        if p_3 == 0 and r_3 == 0:
            f1_3 = 0
        else:
            f1_3 = 2 * (p_3 * r_3 / (p_3 + r_3))

        p1+=p_1
        p3+=p_3
        r3+=r_3
        f3+=f1_3

    avg_p1 = p1 / len(true_label)
    avg_p3 = p3 / len(true_label)
    avg_r3 = r3 / len(true_label)
    avg_f3 = f3 / len(true_label)

    result = {'P@1': avg_p1,
              'P@3': avg_p3,
              'R@3': avg_r3,
              'F1@3': avg_f3}
    
    return result

In [48]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [54]:
test_predictions, test_labels, test_metrics = trainer.predict(test_dataset)
test_metrics

***** Running Prediction *****
  Num examples = 259
  Batch size = 32
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_

{'test_loss': 1.1292715072631836,
 'test_P@1': 0.17374517374517376,
 'test_P@3': 0.126126126126126,
 'test_R@3': 0.126126126126126,
 'test_F1@3': 0.126126126126126,
 'test_runtime': 10.3772,
 'test_samples_per_second': 24.959,
 'test_steps_per_second': 0.867}

In [51]:
import json
def write_json(data, path):
    
    jsonString = json.dumps(data)
    jsonFile = open(path, "w")
    jsonFile.write(jsonString)
    jsonFile.close()

In [52]:
def save_data(data, predictions, labels, file_name):

    

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # store all sentences
    predicted = []
    true_label = []
    
    for k in range(len(decoded_labels)):
        pred = decoded_preds[k]
        label = decoded_labels[k]

        pred_list = pred.split(', ')
        label_list = label.split(', ')
        
        pred_list[0] = pred_list[0].split(' ')[-1]
        label_list[0] = label_list[0].split(' ')[-1]

        predicted.append(pred_list)
        true_label.append(label_list)
    
    # evaluation metrics
    for idx in range(len(true_label)):
        
        distractors = predicted[idx]
        labels = true_label[idx]

        act_set = set(labels)
        pred1_set = set(distractors[:1])
        pred3_set = set(distractors[:3])

        p1 = len(act_set & pred1_set) / float(1)
        p3 = len(act_set & pred3_set) / float(3)
        r3 = len(act_set & pred3_set) / float(len(act_set))

        if p3 == 0 and r3 == 0:
            f1_3 = 0
        else:
            f1_3 = 2 * (p3 * r3 / (p3 + r3))
        
        data[idx]['pred_distractors'] = distractors
        data[idx]['metric'] = {'P@1': p1, 'P@3': p3, 'R@3': r3, 'F1@3': f1_3}
        if 'triplets' in data[idx]:

            del data[idx]['triplets']
    
    write_json(data, file_name)
    print(file_name + ' is saved :)')

In [57]:
file_path = '../../../../../predictions/mcq/t5-tripleat-augmentation(without-reranker).json'
save_data(test, test_predictions, test_labels, file_path)

./predictions/t5-tripleat-augmentation(without-reranker).json is saved :)
