# BART Triplet Augmentation
使用 Sciq dataset訓練 並加入 Concept 的 Triplet 資料 20 筆(From ChatGPT Candidate)<br>
直接使用 trainer 訓練 <br>

### GPU

In [1]:
!nvidia-smi

Mon Jul 10 14:30:21 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.141.03   Driver Version: 470.141.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA TITAN RTX    Off  | 00000000:01:00.0 Off |                  N/A |
| 39%   52C    P0    67W / 280W |      0MiB / 24220MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA TITAN RTX    Off  | 00000000:02:00.0 Off |                  N/A |
| 34%   43C    P0    57W / 280W |      0MiB / 24220MiB |      0%      Default |
|       

### Weight and Bias (Assisting Metrics, Optional)

In [2]:
!pip install wandb
!wandb login
project_name = "test on Sciq with BART-Triplet-Augmentation-with-Finetuned-Sent-Reranker"
import os

os.environ["WANDB_PROJECT"] = project_name

[34m[1mwandb[0m: Currently logged in as: [33mhankystyle[0m. Use [1m`wandb login --relogin`[0m to force relogin


### import

In [3]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

2023-07-10 14:30:29.550568: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-10 14:30:29.656848: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-07-10 14:30:30.126284: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvrtc.so.11.0: cannot open shared object file: No such file or directory
2023-07-10 14:30:30.126401: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvrtc.so.11.0: cannot open shared object file: No such file or direc

### Loading the dataset

In [4]:
import json
import os, sys
import fnmatch

In [5]:
def read_data(item):
    path = '../../../../data/sciq(all)/{}(fine-tune-sentence-transformer).json'.format(item)
    with open(path) as f:
        data = json.load(f)
    return data

In [6]:
train = read_data('train')
valid = read_data('valid')
test = read_data('test')

In [7]:
len(train), len(valid), len(test)

(11679, 1000, 1000)

In [8]:
train[0]

{'sentence': 'What type of organism is commonly used in preparation of foods such as cheese and yogurt?',
 'distractors': ['protozoa', 'gymnosperms', 'viruses'],
 'answer': 'mesophilic organisms',
 'triplets': [['isa', 'yogurt', 'food', 1.5036563873291016],
  ['relatedto', 'cheese', 'yogurt', 1.5036563873291016],
  ['relatedto', 'organisms', 'organism', 1.4846434891223907],
  ['antonym', 'thermophilic', 'psychrophilic', 1.3640008568763733],
  ['isa', 'cheese', 'food', 1.3373385071754456],
  ['relatedto', 'preparation', 'food', 1.3373385071754456],
  ['relatedto', 'cheese', 'food', 1.3373385071754456],
  ['relatedto', 'foods', 'food', 1.3373385071754456],
  ['relatedto', 'food', 'foods', 1.3373385071754456],
  ['antonym', 'anaerobic', 'aerobic', 1.2871821820735931],
  ['antonym', 'aerobic', 'anaerobic', 1.2871821820735931],
  ['relatedto', 'anaerobic', 'organism', 1.2871821820735931],
  ['relatedto', 'used', 'use', 1.1459934711456299]]}

### Rerank

In [9]:
for i in range(len(train)):
    d = train[i]
    sentence = d['sentence']
    answer = d['answer']
    distractors = d['distractors']
    distractors = [x.lower() for x in distractors]
    triplets = d['triplets']

    if triplets != []:
        max_score = triplets[0][-1]
        for each_triplet in triplets:
            rel, source, target, weight = each_triplet
            if source == answer or target == answer or source in distractors or target in distractors:
                each_triplet[3] = max_score + 1.0

        d['triplets'] = sorted(triplets, key = lambda x: x[3],reverse=True)

In [10]:
for i in range(len(test)):
    d = test[i]
    sentence = d['sentence']
    answer = d['answer']
    distractors = d['distractors']
    distractors = [x.lower() for x in distractors]
    triplets = d['triplets']

    d['triplets'] = sorted(triplets, key = lambda x: x[3],reverse=True)

### Prepare data

In [11]:
import random
random.seed(0)
k = 25

In [12]:
def processData(data, k):
    
    sentences = []
    labels = []
    answers = []
    for d in data:
        sentence = d['sentence']
        distractors = d['distractors']
        triplets = d['triplets']
        answer = d['answer']
        
        triplet_set = []
        for each_triplet in triplets[:k]:
            rel, source, target, weight = each_triplet
            
            triplet_set.append('{} {} {}'.format(source,rel,target))


        distractors = [dis.strip() for dis in distractors]

        random.shuffle(triplet_set)
        sentences.append(sentence + '</s></s>' + answer + '</s></s>' + ', '.join(triplet_set))
        labels.append(', '.join(distractors))
        answers.append(answer)
        
    return sentences, answers, labels

In [13]:
train_sent, train_answer, train_label = processData(train,k)
valid_sent, valid_answer, valid_label = processData(valid,k)
test_sent, test_answer, test_label = processData(test,k)

In [14]:
for idx in range(2):
    print(train_sent[idx])
    print(train_answer[idx])
    print(train_label[idx])
    print()

What type of organism is commonly used in preparation of foods such as cheese and yogurt?</s></s>mesophilic organisms</s></s>cheese relatedto yogurt, aerobic antonym anaerobic, anaerobic antonym aerobic, preparation relatedto food, anaerobic relatedto organism, organisms relatedto organism, thermophilic antonym psychrophilic, foods relatedto food, food relatedto foods, cheese isa food, yogurt isa food, used relatedto use, cheese relatedto food
mesophilic organisms
protozoa, gymnosperms, viruses

What phenomenon makes global winds blow northeast to southwest or the reverse in the northern hemisphere and northwest to southeast or the reverse in the southern hemisphere?</s></s>coriolis effect</s></s>southern_hemisphere relatedto earth, wind relatedto atmospheric, moon relatedto ocean, wind relatedto earth, hemisphere relatedto northern_hemisphere, rotation relatedto earth, southern_hemisphere relatedto hemisphere, wind relatedto currents, plate relatedto tectonic, wind relatedto pressure,

In [15]:
len(train_sent), len(train_answer), len(train_label)

(11679, 11679, 11679)

In [16]:
len(valid_sent), len(valid_answer), len(valid_label)

(1000, 1000, 1000)

In [17]:
len(test_sent), len(test_answer), len(test_label)

(1000, 1000, 1000)

### Tokenization

In [18]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

In [19]:
train_encodings = tokenizer(train_sent, truncation=True, padding=True)
valid_encodings = tokenizer(valid_sent, truncation=True, padding=True)
test_encodings = tokenizer(test_sent, truncation=True, padding=True)

In [20]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [21]:
len(train_encodings.input_ids)

11679

In [22]:
def add_labels(encodings, distractors):
    
    distractors_encodings = tokenizer(distractors, padding=True)
    labels = []
    for i in range(len(distractors_encodings.input_ids)):
        labels.append(distractors_encodings.input_ids[i])
    
    encodings["labels"] = labels
    return encodings

In [23]:
train_encodings = add_labels(train_encodings, train_label)
valid_encodings = add_labels(valid_encodings, valid_label)
test_encodings = add_labels(test_encodings, test_label)

In [24]:
len(train_encodings.input_ids)

11679

In [25]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [26]:
tokenizer.decode(train_encodings['input_ids'][0])

'<s>What type of organism is commonly used in preparation of foods such as cheese and yogurt?</s></s>mesophilic organisms</s></s>cheese relatedto yogurt, aerobic antonym anaerobic, anaerobic antonym aerobic, preparation relatedto food, anaerobic relatedto organism, organisms relatedto organism, thermophilic antonym psychrophilic, foods relatedto food, food relatedto foods, cheese isa food, yogurt isa food, used relatedto use, cheese relatedto food</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

### Dataset

In [27]:
import torch
class SciqDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SciqDataset(train_encodings)
valid_dataset = SciqDataset(valid_encodings)
test_dataset = SciqDataset(test_encodings)

In [28]:
len(train_dataset), len(valid_dataset), len(test_dataset)

(11679, 1000, 1000)

### Fine-tuning

In [29]:
from transformers import BartForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

In [30]:
batch_size = 32
args = Seq2SeqTrainingArguments(
    output_dir = "./results",
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="P@1",
    weight_decay=0.01,
    predict_with_generate=True,
    eval_accumulation_steps = 1,
    report_to="wandb" if os.getenv("WANDB_PROJECT") else "none"
)

In [31]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [32]:
import numpy as np
def compute_metrics(p):
    predictions, labels = p
    
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # store all sentences
    predicted = []
    true_label = []
    
    for k in range(len(decoded_labels)):
        pred = decoded_preds[k]
        label = decoded_labels[k]

        pred_list = pred.split(', ')
        label_list = label.split(', ')
        

        predicted.append(pred_list)
        true_label.append(label_list)

    # evaluation metrics
    p1 = 0
    p3 = 0
    r3 = 0
    f3 = 0
    r1 = 0
    for idx in range(len(true_label)):
        distractors = predicted[idx]
        labels = true_label[idx]

        act_set = set(labels)
        pred1_set = set(distractors[:1])
        pred3_set = set(distractors[:3])

        r_1 = len(act_set & pred1_set) / float(len(act_set))
        p_1 = len(act_set & pred1_set) / float(1)
        p_3 = len(act_set & pred3_set) / float(3)
        r_3 = len(act_set & pred3_set) / float(len(act_set))

        if p_3 == 0 and r_3 == 0:
            f1_3 = 0
        else:
            f1_3 = 2 * (p_3 * r_3 / (p_3 + r_3))

        r1+=r_1
        p1+=p_1
        p3+=p_3
        r3+=r_3
        f3+=f1_3

    avg_r1 = r1 / len(true_label)
    avg_p1 = p1 / len(true_label)
    avg_p3 = p3 / len(true_label)
    avg_r3 = r3 / len(true_label)
    avg_f3 = f3 / len(true_label)

    result = {'P@1': avg_p1,
              'R@1': avg_r1,
              'P@3': avg_p3,
              'R@3': avg_r3,
              'F1@3': avg_f3}
    
    return result

In [33]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [34]:
trainer.train()

***** Running training *****
  Num examples = 11679
  Num Epochs = 50
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 9150
  Number of trainable parameters = 139420416
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhankystyle[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,P@1,R@1,P@3,R@3,F1@3
1,No log,0.849218,0.06,0.020167,0.058333,0.058143,0.058111
2,No log,0.767431,0.1,0.033333,0.082333,0.081952,0.081978
3,1.597600,0.753572,0.111,0.037,0.096667,0.096286,0.096311
4,1.597600,0.751855,0.109,0.0365,0.101333,0.100952,0.100978
5,1.597600,0.751813,0.133,0.044333,0.111667,0.111286,0.111311
6,0.493100,0.753133,0.134,0.044833,0.112333,0.111952,0.111978
7,0.493100,0.756205,0.131,0.043667,0.116,0.115619,0.115644
8,0.493100,0.758586,0.12,0.040167,0.115333,0.115119,0.115089
9,0.442000,0.765282,0.134,0.044667,0.117,0.116786,0.116667
10,0.442000,0.771433,0.13,0.0435,0.118,0.117786,0.117756


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos

TrainOutput(global_step=9150, training_loss=0.334680312526682, metrics={'train_runtime': 6331.9928, 'train_samples_per_second': 92.222, 'train_steps_per_second': 1.445, 'total_flos': 1.13353639805952e+17, 'train_loss': 0.334680312526682, 'epoch': 50.0})

In [36]:
predictions, labels, metrics = trainer.predict(test_dataset)
print('test: ')
metrics

***** Running Prediction *****
  Num examples = 1000
  Batch size = 64
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,

test: 


{'test_loss': 0.9976295232772827,
 'test_P@1': 0.171,
 'test_R@1': 0.05697619047619061,
 'test_P@3': 0.14633333333333334,
 'test_R@3': 0.14625396825396828,
 'test_F1@3': 0.1461666666666667,
 'test_runtime': 18.0174,
 'test_samples_per_second': 55.502,
 'test_steps_per_second': 0.888}

In [40]:
predictions, labels, metrics = trainer.predict(test_dataset, no_repeat_ngram_size = 2)
print('test: ')
metrics

***** Running Prediction *****
  Num examples = 1000
  Batch size = 64
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,

test: 


{'test_loss': 0.9976295232772827,
 'test_P@1': 0.166,
 'test_R@1': 0.05530952380952393,
 'test_P@3': 0.14299999999999988,
 'test_R@3': 0.14314285714285704,
 'test_F1@3': 0.1429999999999999,
 'test_runtime': 18.5008,
 'test_samples_per_second': 54.052,
 'test_steps_per_second': 0.865}

In [41]:
trainer.save_model('./saved_models/bart/bart-triplet-augmentation-finetuned-sent-transformer-on-sciq')

Saving model checkpoint to ./saved_models/bart/bart-triplet-augmentation-finetuned-sent-transformer-on-sciq
Configuration saved in ./saved_models/bart/bart-triplet-augmentation-finetuned-sent-transformer-on-sciq/config.json
Configuration saved in ./saved_models/bart/bart-triplet-augmentation-finetuned-sent-transformer-on-sciq/generation_config.json
Model weights saved in ./saved_models/bart/bart-triplet-augmentation-finetuned-sent-transformer-on-sciq/pytorch_model.bin
tokenizer config file saved in ./saved_models/bart/bart-triplet-augmentation-finetuned-sent-transformer-on-sciq/tokenizer_config.json
Special tokens file saved in ./saved_models/bart/bart-triplet-augmentation-finetuned-sent-transformer-on-sciq/special_tokens_map.json


In [51]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 994
  Batch size = 64
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,

{'eval_loss': 0.8254396915435791,
 'eval_P@1': 0.1529175050301811,
 'eval_R@1': 0.05114017437961109,
 'eval_P@3': 0.1344735077129441,
 'eval_R@3': 0.13425792852352186,
 'eval_F1@3': 0.13422758774871427,
 'eval_runtime': 18.7659,
 'eval_samples_per_second': 52.968,
 'eval_steps_per_second': 0.853,
 'epoch': 50.0}

### Save Distractor Data

In [42]:
from transformers import BartTokenizer, BartForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

model = BartForConditionalGeneration.from_pretrained('./results/checkpoint-9150/')

loading configuration file ./results/checkpoint-9150/config.json
Model config BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LA

In [43]:
batch_size = 32
args = Seq2SeqTrainingArguments(
    output_dir = "./results",
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="F1@3",
    weight_decay=0.01,
    predict_with_generate=True,
    eval_accumulation_steps = 1,
    report_to="wandb" if os.getenv("WANDB_PROJECT") else "none"
)

PyTorch: setting up devices


In [44]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [45]:
import numpy as np
def compute_metrics(p):
    predictions, labels = p
    
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # store all sentences
    predicted = []
    true_label = []
    
    for k in range(len(decoded_labels)):
        pred = decoded_preds[k]
        label = decoded_labels[k]

        pred_list = pred.split(', ')
        label_list = label.split(', ')
        

        predicted.append(pred_list)
        true_label.append(label_list)

    # evaluation metrics
    p1 = 0
    p3 = 0
    r3 = 0
    f3 = 0
    r1 = 0
    for idx in range(len(true_label)):
        distractors = predicted[idx]
        labels = true_label[idx]

        act_set = set(labels)
        pred1_set = set(distractors[:1])
        pred3_set = set(distractors[:3])

        r_1 = len(act_set & pred1_set) / float(len(act_set))
        p_1 = len(act_set & pred1_set) / float(1)
        p_3 = len(act_set & pred3_set) / float(3)
        r_3 = len(act_set & pred3_set) / float(len(act_set))

        if p_3 == 0 and r_3 == 0:
            f1_3 = 0
        else:
            f1_3 = 2 * (p_3 * r_3 / (p_3 + r_3))

        r1+=r_1
        p1+=p_1
        p3+=p_3
        r3+=r_3
        f3+=f1_3

    avg_r1 = r1 / len(true_label)
    avg_p1 = p1 / len(true_label)
    avg_p3 = p3 / len(true_label)
    avg_r3 = r3 / len(true_label)
    avg_f3 = f3 / len(true_label)

    result = {'P@1': avg_p1,
              'R@1': avg_r1,
              'P@3': avg_p3,
              'R@3': avg_r3,
              'F1@3': avg_f3}
    
    return result

In [46]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

### Last Epoch Result

In [47]:
test_predictions, test_labels, test_metrics = trainer.predict(test_dataset)
test_metrics

***** Running Prediction *****
  Num examples = 1000
  Batch size = 64
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,

{'test_loss': 1.2076491117477417,
 'test_P@1': 0.147,
 'test_R@1': 0.04916666666666674,
 'test_P@3': 0.13266666666666652,
 'test_R@3': 0.13261111111111099,
 'test_F1@3': 0.13256666666666653,
 'test_runtime': 18.3625,
 'test_samples_per_second': 54.459,
 'test_steps_per_second': 0.871}

In [67]:
test_predictions, test_labels, test_metrics = trainer.predict(test_dataset, no_repeat_ngram_size = 2, num_beams = 6)
test_metrics

***** Running Prediction *****
  Num examples = 997
  Batch size = 64
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,

{'test_loss': 1.0127898454666138,
 'test_P@1': 0.14744232698094284,
 'test_R@1': 0.04931461049816123,
 'test_P@3': 0.14008692744901358,
 'test_R@3': 0.140421263791374,
 'test_F1@3': 0.14022066198595778,
 'test_runtime': 23.3364,
 'test_samples_per_second': 42.723,
 'test_steps_per_second': 0.686}

## Save File

In [37]:
import json
def write_json(data, path):
    
    jsonString = json.dumps(data)
    jsonFile = open(path, "w")
    jsonFile.write(jsonString)
    jsonFile.close()

In [38]:
def save_data(data, predictions, labels, file_name):

    

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # store all sentences
    predicted = []
    true_label = []
    
    for k in range(len(decoded_labels)):
        pred = decoded_preds[k]
        label = decoded_labels[k]

        pred_list = pred.split(', ')
        label_list = label.split(', ')

        predicted.append(pred_list)
        true_label.append(label_list)
    
    # evaluation metrics
    for idx in range(len(true_label)):
        
        distractors = predicted[idx]
        labels = true_label[idx]

        act_set = set(labels)
        pred1_set = set(distractors[:1])
        pred3_set = set(distractors[:3])

        p1 = len(act_set & pred1_set) / float(1)
        p3 = len(act_set & pred3_set) / float(3)
        r3 = len(act_set & pred3_set) / float(len(act_set))

        if p3 == 0 and r3 == 0:
            f1_3 = 0
        else:
            f1_3 = 2 * (p3 * r3 / (p3 + r3))
        
        data[idx]['pred_distractors'] = distractors
        data[idx]['metric'] = {'P@1': p1, 'P@3': p3, 'R@3': r3, 'F1@3': f1_3}

        if 'triplets' in data[idx]:
            del data[idx]['triplets']
    
    write_json(data, file_name)
    print(file_name + ' is saved :)')

In [39]:
save_data(test, predictions, labels, './predictions/bart-triplet-augmentation-with-finetuned-sentence-transformer.json')

./predictions/bart-triplet-augmentation-with-finetuned-sentence-transformer.json is saved :)


In [72]:
save_data(test, test_predictions, test_labels, '../../../../predictions/bart/sciq/bart-triplet-augmentation-with-finetuned-sentence-transformer')

../../../../predictions/bart/sciq/bart-triplet-augmentation-with-finetuned-sentence-transformer is saved :)


In [50]:
decoded_preds = tokenizer.batch_decode(test_predictions, skip_special_tokens=True)

In [51]:
same_count = 0
for i in range(len(test)):
    pred = decoded_preds[i]
    pred_list = pred.split(', ')
    

    if test[i]['answer'] in pred_list:
        same_count += 1
print('答案與 distractor 相同的筆數 = ', same_count)
print('答案與 distractor 相同的比例 = ',same_count/len(test))

答案與 distractor 相同的筆數 =  54
答案與 distractor 相同的比例 =  0.05416248746238716


In [52]:
decoded_preds = tokenizer.batch_decode(test_predictions, skip_special_tokens=True)
repeat_count = 0
for k in range(len(decoded_preds)):
    pred = decoded_preds[k]
    pred_list = pred.split(', ')
    
    duplicates = set([x for x in pred_list if pred_list.count(x) > 1])
    if duplicates:
        repeat_count += 1
print('重複 distractor 筆數 = ', repeat_count)
print('重複 distractor 的比例 = ',repeat_count/len(test))

重複 distractor 筆數 =  41
重複 distractor 的比例 =  0.041123370110330994
