# BART Triplet Augmentation
使用 MCQ Dataset 並加入 Triplet 資料 20 進行 Triplet Augmentation

Candidate Set 是用 Generate LM 生成 (From ChatGPT)
<br>
直接使用 trainer 訓練 <br>

### GPU

In [1]:
!nvidia-smi

Sat Aug 26 14:42:45 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA TITAN RTX                On | 00000000:01:00.0 Off |                  N/A |
| 41%   40C    P8               20W / 280W|      1MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA TITAN RTX                On | 00000000:02:00.0 Off |  

### Weight and Bias (Assisting Metrics, Optional)

In [2]:
!pip install wandb
!wandb login
project_name = "test on MCQ with BART-Triplet-Augmentation-ChatGPT"
import os

os.environ["WANDB_PROJECT"] = project_name

[34m[1mwandb[0m: Currently logged in as: [33mhankystyle[0m. Use [1m`wandb login --relogin`[0m to force relogin


### import

In [3]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

2023-08-26 14:42:52.281586: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-26 14:42:55.611102: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-08-26 14:43:00.294251: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvrtc.so.11.0: cannot open shared object file: No such file or directory
2023-08-26 14:43:00.312456: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvrtc.so.11.0: cannot open shared object file: No such file or direc

### Loading the dataset

In [4]:
import json
import os, sys
import fnmatch

In [5]:
def read_data(item):
    path = '../../../../../data/mcq/total_new_cleaned_{}.json'.format(item)
    with open(path) as f:
        data = json.load(f)
    return data

In [6]:
train = read_data('train')
test = read_data('test')

In [7]:
len(train), len(test)

(2321, 259)

In [8]:
train[0]

{'answer': 'gravity',
 'distractors': ['friction', 'erosion', 'magnetism'],
 'sentence': '**blank** causes rocks to roll downhill'}

Append triplet into dataset

In [11]:
def append_data(item):
    # Initalize 版本
    path = '../../../../../data/mcq/triplets/generate_lm/{}.triplet.json'.format(item)
    with open(path) as f:
        triplet = json.load(f)
        
    return triplet

In [12]:
train_triplet = append_data('train')
test_triplet = append_data('test')

In [13]:
len(train_triplet), len(test_triplet)

(2321, 259)

In [14]:
train_triplet[0]

[['isa', 'gravity', 'force', 8.0],
 ['relatedto', 'rocks', 'rock', 1.0],
 ['relatedto', 'force', 'gravity', 0.658],
 ['relatedto', 'wind', 'pressure', 0.146],
 ['relatedto', 'roll', 'rock', 4.629],
 ['relatedto', 'force', 'pressure', 0.911],
 ['relatedto', 'force', 'cause', 0.13],
 ['relatedto', 'rock', 'roll', 2.226],
 ['relatedto', 'force', 'magnetism', 0.284],
 ['isa', 'friction', 'force', 1.0],
 ['antonym', 'roll', 'rock', 0.238],
 ['relatedto', 'rock', 'water', 0.338],
 ['relatedto', 'pressure', 'force', 1.0],
 ['antonym', 'rock', 'water', 0.113],
 ['relatedto', 'wind', 'force', 1.123],
 ['relatedto', 'friction', 'force', 0.351],
 ['hassubevent', 'roll', 'rock', 2.0],
 ['relatedto', 'gravity', 'force', 1.67],
 ['relatedto', 'roll', 'downhill', 0.14],
 ['atlocation', 'rock', 'water', 1.0],
 ['isa', 'pressure', 'force', 2.0],
 ['relatedto', 'force', 'electricity', 0.125],
 ['relatedto', 'roll', 'wind', 2.0],
 ['relatedto', 'causes', 'cause', 1.0]]

In [15]:
train_triplet[0]

[['isa', 'gravity', 'force', 8.0],
 ['relatedto', 'rocks', 'rock', 1.0],
 ['relatedto', 'force', 'gravity', 0.658],
 ['relatedto', 'wind', 'pressure', 0.146],
 ['relatedto', 'roll', 'rock', 4.629],
 ['relatedto', 'force', 'pressure', 0.911],
 ['relatedto', 'force', 'cause', 0.13],
 ['relatedto', 'rock', 'roll', 2.226],
 ['relatedto', 'force', 'magnetism', 0.284],
 ['isa', 'friction', 'force', 1.0],
 ['antonym', 'roll', 'rock', 0.238],
 ['relatedto', 'rock', 'water', 0.338],
 ['relatedto', 'pressure', 'force', 1.0],
 ['antonym', 'rock', 'water', 0.113],
 ['relatedto', 'wind', 'force', 1.123],
 ['relatedto', 'friction', 'force', 0.351],
 ['hassubevent', 'roll', 'rock', 2.0],
 ['relatedto', 'gravity', 'force', 1.67],
 ['relatedto', 'roll', 'downhill', 0.14],
 ['atlocation', 'rock', 'water', 1.0],
 ['isa', 'pressure', 'force', 2.0],
 ['relatedto', 'force', 'electricity', 0.125],
 ['relatedto', 'roll', 'wind', 2.0],
 ['relatedto', 'causes', 'cause', 1.0]]

Remove 重複的 triplet

In [16]:
for i in range(len(train_triplet)):
    train_triplet[i] = [list(t) for t in set(tuple(element) for element in train_triplet[i] )]
    
for i in range(len(test_triplet)):
    test_triplet[i] = [list(t) for t in set(tuple(element) for element in test_triplet[i] )]

In [17]:
len(train_triplet), len(test_triplet)

(2321, 259)

In [18]:
train_triplet[0]

[['relatedto', 'friction', 'force', 0.351],
 ['atlocation', 'rock', 'water', 1.0],
 ['relatedto', 'force', 'electricity', 0.125],
 ['relatedto', 'roll', 'wind', 2.0],
 ['relatedto', 'rock', 'water', 0.338],
 ['relatedto', 'force', 'pressure', 0.911],
 ['relatedto', 'causes', 'cause', 1.0],
 ['relatedto', 'gravity', 'force', 1.67],
 ['relatedto', 'roll', 'downhill', 0.14],
 ['isa', 'pressure', 'force', 2.0],
 ['relatedto', 'roll', 'rock', 4.629],
 ['antonym', 'roll', 'rock', 0.238],
 ['relatedto', 'wind', 'force', 1.123],
 ['relatedto', 'rocks', 'rock', 1.0],
 ['isa', 'gravity', 'force', 8.0],
 ['relatedto', 'wind', 'pressure', 0.146],
 ['relatedto', 'rock', 'roll', 2.226],
 ['relatedto', 'force', 'gravity', 0.658],
 ['isa', 'friction', 'force', 1.0],
 ['relatedto', 'force', 'cause', 0.13],
 ['hassubevent', 'roll', 'rock', 2.0],
 ['relatedto', 'force', 'magnetism', 0.284],
 ['antonym', 'rock', 'water', 0.113],
 ['relatedto', 'pressure', 'force', 1.0]]

Combine triplet into train and test data

In [20]:
def combine_data(data, triplet):
    res = []
    for i in range(len(data)):
        d = data[i]
        sentence = d['sentence']
        distractors = d['distractors']
        answer = d['answer']

        triplets = triplet[i]
        triplets = sorted(triplets, key = lambda x: x[3],reverse=True)

        res.append({'sentence':sentence, 'distractors':distractors, 'answer':answer, 'triplets':triplets})
    return res

In [21]:
train = combine_data(train, train_triplet)
test = combine_data(test, test_triplet)

In [22]:
len(train), len(test)

(2321, 259)

In [23]:
train[0]

{'sentence': '**blank** causes rocks to roll downhill',
 'distractors': ['friction', 'erosion', 'magnetism'],
 'answer': 'gravity',
 'triplets': [['isa', 'gravity', 'force', 8.0],
  ['relatedto', 'roll', 'rock', 4.629],
  ['relatedto', 'rock', 'roll', 2.226],
  ['relatedto', 'roll', 'wind', 2.0],
  ['isa', 'pressure', 'force', 2.0],
  ['hassubevent', 'roll', 'rock', 2.0],
  ['relatedto', 'gravity', 'force', 1.67],
  ['relatedto', 'wind', 'force', 1.123],
  ['atlocation', 'rock', 'water', 1.0],
  ['relatedto', 'causes', 'cause', 1.0],
  ['relatedto', 'rocks', 'rock', 1.0],
  ['isa', 'friction', 'force', 1.0],
  ['relatedto', 'pressure', 'force', 1.0],
  ['relatedto', 'force', 'pressure', 0.911],
  ['relatedto', 'force', 'gravity', 0.658],
  ['relatedto', 'friction', 'force', 0.351],
  ['relatedto', 'rock', 'water', 0.338],
  ['relatedto', 'force', 'magnetism', 0.284],
  ['antonym', 'roll', 'rock', 0.238],
  ['relatedto', 'wind', 'pressure', 0.146],
  ['relatedto', 'roll', 'downhill', 0.

### Prepare data

In [24]:
from sklearn.model_selection import train_test_split

train, valid = train_test_split(train, random_state=777, train_size=0.9)
len(train), len(valid)

(2088, 233)

In [25]:
def processData(data):
    
    sentences = []
    labels = []
    answers = []
    for d in data:
        sentence = d['sentence']
        distractors = d['distractors']
        triplets = d['triplets']
        answer = d['answer']
        
        triplet = ""
        for each_triplet in triplets[:20]:
            rel, source, target, weight = each_triplet
            triplet += '{} {} {} </s></s>'.format(source,rel,target)

        sentence = sentence.replace('**blank**', '_')
        # 避免dataset的label有空白
        distractors = [dis.strip() for dis in distractors]
        
        sentences.append(sentence + '</s></s>' + answer + '</s></s>' + triplet)
        labels.append('_ of distractors are ' + ', '.join(distractors))
        answers.append(answer)
        
    return sentences, answers, labels

In [26]:
def get_blank(lst):
    for k in range(len(lst)):
        if '**blank**' in lst[k]:
            return k
    return -1

In [27]:
train_sent, train_answer, train_label = processData(train)
valid_sent, valid_answer, valid_label = processData(valid)
test_sent, test_answer, test_label = processData(test)

In [28]:
for idx in range(2):
    print(train_sent[idx])
    print(train_answer[idx])
    print(train_label[idx])
    print()

In _ reinforcement, the reinforcer follows every correct response.</s></s>continuous</s></s>intermittent relatedto sporadic </s></s>continuous relatedto sustained </s></s>sporadic relatedto intermittent </s></s>sustained relatedto continuous </s></s>follows relatedto follow </s></s>
continuous
_ of distractors are intermittent, partial, negative

_ of glucose units is found in plants and serves a structural purpose</s></s>cellulose</s></s>pectin isa cellulose </s></s>galactose isa sucrose </s></s>plants relatedto plant </s></s>cellulose relatedto glucose </s></s>sucrose relatedto glucose </s></s>cellulose relatedto plant </s></s>cellulose partof plants </s></s>sucrose relatedto fructose </s></s>serve relatedto purpose </s></s>glycogen relatedto glucose </s></s>found relatedto find </s></s>gluconeogenesis relatedto glucose </s></s>cellulose relatedto unit </s></s>serves relatedto serve </s></s>units relatedto unit </s></s>find relatedto found </s></s>plant relatedto cellulose </s></s>
c

In [29]:
len(train_sent), len(train_answer), len(train_label)

(2088, 2088, 2088)

In [30]:
len(valid_sent), len(valid_answer), len(valid_label)

(233, 233, 233)

In [31]:
len(test_sent), len(test_answer), len(test_label)

(259, 259, 259)

### Tokenization

In [32]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

In [33]:
train_encodings = tokenizer(train_sent, train_answer, truncation=True, padding=True)
valid_encodings = tokenizer(valid_sent, valid_answer, truncation=True, padding=True)
test_encodings = tokenizer(test_sent, test_answer, truncation=True, padding=True)

In [34]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [35]:
len(train_encodings.input_ids)

2088

In [36]:
def add_labels(encodings, distractors):
    
    distractors_encodings = tokenizer(distractors, padding=True)
    labels = []
    for i in range(len(distractors_encodings.input_ids)):
        labels.append(distractors_encodings.input_ids[i])
    
    encodings["labels"] = labels
    return encodings

In [37]:
train_encodings = add_labels(train_encodings, train_label)
valid_encodings = add_labels(valid_encodings, valid_label)
test_encodings = add_labels(test_encodings, test_label)

In [38]:
len(train_encodings.input_ids)

2088

In [39]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

### Dataset

In [40]:
class MCQDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = MCQDataset(train_encodings)
valid_dataset = MCQDataset(valid_encodings)
test_dataset = MCQDataset(test_encodings)

In [41]:
len(train_dataset), len(valid_dataset), len(test_dataset)

(2088, 233, 259)

### Fine-tuning

In [42]:
from transformers import BartForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

In [40]:
batch_size = 32
args = Seq2SeqTrainingArguments(
    output_dir = "./results",
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="P@1",
    weight_decay=0.01,
    predict_with_generate=True,
    eval_accumulation_steps = 1,
    report_to="wandb" if os.getenv("WANDB_PROJECT") else "none"
)

In [41]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [42]:
import numpy as np
def compute_metrics(p):
    predictions, labels = p
    
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # store all sentences
    predicted = []
    true_label = []
    
    for k in range(len(decoded_labels)):
        pred = decoded_preds[k]
        label = decoded_labels[k]

        pred_list = pred.split(', ')
        label_list = label.split(', ')
        
        pred_list[0] = pred_list[0].split(' ')[-1]
        label_list[0] = label_list[0].split(' ')[-1]

        predicted.append(pred_list)
        true_label.append(label_list)

    # evaluation metrics
    p1 = 0
    p3 = 0
    r3 = 0
    f3 = 0
    for idx in range(len(true_label)):
        distractors = predicted[idx]
        labels = true_label[idx]

        act_set = set(labels)
        pred1_set = set(distractors[:1])
        pred3_set = set(distractors[:3])

        p_1 = len(act_set & pred1_set) / float(1)
        p_3 = len(act_set & pred3_set) / float(3)
        r_3 = len(act_set & pred3_set) / float(len(act_set))

        if p_3 == 0 and r_3 == 0:
            f1_3 = 0
        else:
            f1_3 = 2 * (p_3 * r_3 / (p_3 + r_3))

        p1+=p_1
        p3+=p_3
        r3+=r_3
        f3+=f1_3

    avg_p1 = p1 / len(true_label)
    avg_p3 = p3 / len(true_label)
    avg_r3 = r3 / len(true_label)
    avg_f3 = f3 / len(true_label)

    result = {'P@1': avg_p1,
              'P@3': avg_p3,
              'R@3': avg_r3,
              'F1@3': avg_f3}
    
    return result

In [43]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [44]:
trainer.train()

***** Running training *****
  Num examples = 2088
  Num Epochs = 50
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1650
  Number of trainable parameters = 139420416
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhankystyle[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,P@1,P@3,R@3,F1@3
1,No log,3.775666,0.051502,0.025751,0.02432,0.024934
2,No log,2.451977,0.072961,0.042918,0.04113,0.041897
3,No log,1.588924,0.124464,0.078684,0.075465,0.076844
4,No log,1.014286,0.150215,0.098712,0.094063,0.096056
5,No log,0.74763,0.158798,0.107296,0.102647,0.104639
6,No log,0.663754,0.158798,0.114449,0.108727,0.111179
7,No log,0.648716,0.193133,0.121602,0.116237,0.118414
8,No log,0.650367,0.193133,0.133047,0.127682,0.129859
9,No log,0.647255,0.214592,0.150215,0.144492,0.146822
10,No log,0.654068,0.214592,0.148784,0.143419,0.145596


***** Running Evaluation *****
  Num examples = 233
  Batch size = 64
  Num examples = 233
  Batch size = 64
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Gen

TrainOutput(global_step=1650, training_loss=0.5170198186238607, metrics={'train_runtime': 1137.917, 'train_samples_per_second': 91.747, 'train_steps_per_second': 1.45, 'total_flos': 1.8152044904448e+16, 'train_loss': 0.5170198186238607, 'epoch': 50.0})

In [45]:
predictions, labels, metrics = trainer.predict(test_dataset)
print('test: ')
metrics

***** Running Prediction *****
  Num examples = 259
  Batch size = 64
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,

test: 


{'test_loss': 1.2526915073394775,
 'test_P@1': 0.1891891891891892,
 'test_P@3': 0.13513513513513506,
 'test_R@3': 0.13513513513513506,
 'test_F1@3': 0.13513513513513506,
 'test_runtime': 5.1987,
 'test_samples_per_second': 49.82,
 'test_steps_per_second': 0.962}

In [46]:
trainer.save_model('../../../../../saved_models/KAG/mcq/bart/triplet-augmentation-without-reranker')

Saving model checkpoint to ../../../../saved_models/bart/bart-triplet-chatgpt-scibert1
Configuration saved in ../../../../saved_models/bart/bart-triplet-chatgpt-scibert1/config.json
Configuration saved in ../../../../saved_models/bart/bart-triplet-chatgpt-scibert1/generation_config.json
Model weights saved in ../../../../saved_models/bart/bart-triplet-chatgpt-scibert1/pytorch_model.bin
tokenizer config file saved in ../../../../saved_models/bart/bart-triplet-chatgpt-scibert1/tokenizer_config.json
Special tokens file saved in ../../../../saved_models/bart/bart-triplet-chatgpt-scibert1/special_tokens_map.json


In [47]:
predictions, labels, metrics = trainer.predict(test_dataset)
print('test: ')
metrics

***** Running Prediction *****
  Num examples = 259
  Batch size = 64
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,

test: 


{'test_loss': 1.2526915073394775,
 'test_P@1': 0.1891891891891892,
 'test_P@3': 0.13513513513513506,
 'test_R@3': 0.13513513513513506,
 'test_F1@3': 0.13513513513513506,
 'test_runtime': 5.0835,
 'test_samples_per_second': 50.949,
 'test_steps_per_second': 0.984}

In [48]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 233
  Batch size = 64
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,

{'eval_loss': 0.8171960115432739,
 'eval_P@1': 0.26180257510729615,
 'eval_P@3': 0.2145922746781117,
 'eval_R@3': 0.2060085836909872,
 'eval_F1@3': 0.2095646842427959,
 'eval_runtime': 5.4638,
 'eval_samples_per_second': 42.645,
 'eval_steps_per_second': 0.732,
 'epoch': 50.0}

### Save Distractor Data

In [49]:
from transformers import BartTokenizer, BartForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

tokenizer = BartTokenizer.from_pretrained("../../../../../saved_models/KAG/mcq/bart/triplet-augmentation-without-reranker")
model = BartForConditionalGeneration.from_pretrained("../../../../../saved_models/KAG/mcq/bart/triplet-augmentation-without-reranker")

loading file vocab.json
loading file merges.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ../../../../saved_models/bart/bart-triplet-chatgpt-scibert/config.json
Model config BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "g

In [50]:
batch_size = 32
args = Seq2SeqTrainingArguments(
    output_dir = "./results",
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="P@1",
    weight_decay=0.01,
    predict_with_generate=True,
    eval_accumulation_steps = 1,
    report_to="wandb" if os.getenv("WANDB_PROJECT") else "none"
)

PyTorch: setting up devices


In [51]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [52]:
import numpy as np
def compute_metrics(p):
    predictions, labels = p
    
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # store all sentences
    predicted = []
    true_label = []
    
    for k in range(len(decoded_labels)):
        pred = decoded_preds[k]
        label = decoded_labels[k]

        pred_list = pred.split(', ')
        label_list = label.split(', ')
        
        pred_list[0] = pred_list[0].split(' ')[-1]
        label_list[0] = label_list[0].split(' ')[-1]

        predicted.append(pred_list)
        true_label.append(label_list)

    # evaluation metrics
    p1 = 0
    p3 = 0
    r3 = 0
    f3 = 0
    for idx in range(len(true_label)):
        distractors = predicted[idx]
        labels = true_label[idx]

        act_set = set(labels)
        pred1_set = set(distractors[:1])
        pred3_set = set(distractors[:3])

        p_1 = len(act_set & pred1_set) / float(1)
        p_3 = len(act_set & pred3_set) / float(3)
        r_3 = len(act_set & pred3_set) / float(len(act_set))

        if p_3 == 0 and r_3 == 0:
            f1_3 = 0
        else:
            f1_3 = 2 * (p_3 * r_3 / (p_3 + r_3))

        p1+=p_1
        p3+=p_3
        r3+=r_3
        f3+=f1_3

    avg_p1 = p1 / len(true_label)
    avg_p3 = p3 / len(true_label)
    avg_r3 = r3 / len(true_label)
    avg_f3 = f3 / len(true_label)

    result = {'P@1': avg_p1,
              'P@3': avg_p3,
              'R@3': avg_r3,
              'F1@3': avg_f3}
    
    return result

In [53]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [54]:
valid_predictions, valid_labels, valid_metrics = trainer.predict(valid_dataset)
valid_metrics

***** Running Prediction *****
  Num examples = 233
  Batch size = 64
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,

{'test_loss': 0.8195326924324036,
 'test_P@1': 0.2703862660944206,
 'test_P@3': 0.2288984263233192,
 'test_R@3': 0.2174535050071532,
 'test_F1@3': 0.22223584712855107,
 'test_runtime': 5.5648,
 'test_samples_per_second': 41.87,
 'test_steps_per_second': 0.719}

In [55]:
test_predictions, test_labels, test_metrics = trainer.predict(test_dataset)
test_metrics

***** Running Prediction *****
  Num examples = 259
  Batch size = 64
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,

{'test_loss': 1.2668520212173462,
 'test_P@1': 0.1583011583011583,
 'test_P@3': 0.12998712998712986,
 'test_R@3': 0.12998712998712986,
 'test_F1@3': 0.12998712998712986,
 'test_runtime': 5.1331,
 'test_samples_per_second': 50.457,
 'test_steps_per_second': 0.974}

In [56]:
import json
def write_json(data, path):
    
    jsonString = json.dumps(data)
    jsonFile = open(path, "w")
    jsonFile.write(jsonString)
    jsonFile.close()

In [64]:
def save_data(data, predictions, labels, file_name):

    

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # store all sentences
    predicted = []
    true_label = []
    
    for k in range(len(decoded_labels)):
        pred = decoded_preds[k]
        label = decoded_labels[k]

        pred_list = pred.split(', ')
        label_list = label.split(', ')
        
        pred_list[0] = pred_list[0].split(' ')[-1]
        label_list[0] = label_list[0].split(' ')[-1]

        predicted.append(pred_list)
        true_label.append(label_list)
    
    # evaluation metrics
    for idx in range(len(true_label)):
        
        distractors = predicted[idx]
        labels = true_label[idx]

        act_set = set(labels)
        pred1_set = set(distractors[:1])
        pred3_set = set(distractors[:3])

        p1 = len(act_set & pred1_set) / float(1)
        p3 = len(act_set & pred3_set) / float(3)
        r3 = len(act_set & pred3_set) / float(len(act_set))

        if p3 == 0 and r3 == 0:
            f1_3 = 0
        else:
            f1_3 = 2 * (p3 * r3 / (p3 + r3))
        
        data[idx]['pred_distractors'] = distractors
        data[idx]['metric'] = {'P@1': p1, 'P@3': p3, 'R@3': r3, 'F1@3': f1_3}
        del data[idx]['triplets']
    
    write_json(data, file_name)
    print(file_name + ' is saved :)')

In [65]:
save_data(test, test_predictions, test_labels, '../../../../../predictions/mcq/bart/bart-triplet-augmentation-without-reranker.json')

../../../../predictions/bart/bart-triplet-chatgpt-scibert.json is saved :)
