# T5 Triplet Augmentation
使用 Sciq dataset訓練 並加入 Concept 的 Triplet 資料 20 筆(From ChatGPT Candidate)<br>
直接使用 trainer 訓練 <br>

### GPU

In [1]:
!nvidia-smi

Sat Jul  8 18:13:11 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.141.03   Driver Version: 470.141.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA TITAN RTX    Off  | 00000000:01:00.0 Off |                  N/A |
| 66%   84C    P0    93W / 280W |      0MiB / 24220MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA TITAN RTX    Off  | 00000000:02:00.0 Off |                  N/A |
| 68%   76C    P0    71W / 280W |      0MiB / 24220MiB |      0%      Default |
|       

### Weight and Bias (Assisting Metrics, Optional)

In [2]:
!pip install wandb
!wandb login
project_name = "test on Sciq with T5-Triplet-Augmentation-with-Finetuned-Sent-Reranker"
import os

os.environ["WANDB_PROJECT"] = project_name

[34m[1mwandb[0m: Currently logged in as: [33mhankystyle[0m. Use [1m`wandb login --relogin`[0m to force relogin


### import

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

2023-07-08 18:13:17.079153: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-08 18:13:18.036130: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-07-08 18:13:18.500349: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvrtc.so.11.0: cannot open shared object file: No such file or directory
2023-07-08 18:13:18.500462: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvrtc.so.11.0: cannot open shared object file: No such file or direc

### Loading the dataset

In [2]:
import json
import os, sys
import fnmatch

In [4]:
def read_data(item):
    path = '../../../../../data/sciq/{}.with.reranker.triplet.json'.format(item)
    with open(path) as f:
        data = json.load(f)
    return data

In [5]:
train = read_data('train')
valid = read_data('valid')
test = read_data('test')

In [6]:
len(train), len(valid), len(test)

(11679, 1000, 1000)

In [8]:
train[0]

{'sentence': 'What type of organism is commonly used in preparation of foods such as cheese and yogurt?',
 'distractors': ['protozoa', 'gymnosperms', 'viruses'],
 'answer': 'mesophilic organisms',
 'triplets': [['isa', 'yogurt', 'food', 1.5036563873291016],
  ['relatedto', 'cheese', 'yogurt', 1.5036563873291016],
  ['relatedto', 'organisms', 'organism', 1.4846434891223907],
  ['antonym', 'thermophilic', 'psychrophilic', 1.3640008568763733],
  ['isa', 'cheese', 'food', 1.3373385071754456],
  ['relatedto', 'preparation', 'food', 1.3373385071754456],
  ['relatedto', 'cheese', 'food', 1.3373385071754456],
  ['relatedto', 'foods', 'food', 1.3373385071754456],
  ['relatedto', 'food', 'foods', 1.3373385071754456],
  ['antonym', 'anaerobic', 'aerobic', 1.2871821820735931],
  ['antonym', 'aerobic', 'anaerobic', 1.2871821820735931],
  ['relatedto', 'anaerobic', 'organism', 1.2871821820735931],
  ['relatedto', 'used', 'use', 1.1459934711456299]]}

### Rerank

In [9]:
for i in range(len(train)):
    d = train[i]
    sentence = d['sentence']
    answer = d['answer']
    distractors = d['distractors']
    distractors = [x.lower() for x in distractors]
    triplets = d['triplets']

    if triplets != []:
        max_score = triplets[0][-1]
        for each_triplet in triplets:
            rel, source, target, weight = each_triplet
            if source == answer or target == answer or source in distractors or target in distractors:
                each_triplet[3] = max_score + 1.0

        d['triplets'] = sorted(triplets, key = lambda x: x[3],reverse=True)

In [10]:
for i in range(len(test)):
    d = test[i]
    sentence = d['sentence']
    answer = d['answer']
    distractors = d['distractors']
    distractors = [x.lower() for x in distractors]
    triplets = d['triplets']

    d['triplets'] = sorted(triplets, key = lambda x: x[3],reverse=True)

### Prepare data

In [11]:
import random
random.seed(0)
k = 25

In [12]:
def processData(data, k, task_prefix):
    
    sentences = []
    labels = []
    answers = []
    for d in data:
        sentence = d['sentence']
        distractors = d['distractors']
        triplets = d['triplets']
        answer = d['answer']
        
        triplet_set = []
        for each_triplet in triplets[:k]:
            rel, source, target, weight = each_triplet
            triplet_set.append('{} {} {}'.format(source,rel,target))

        distractors = [dis.strip() for dis in distractors]

        random.shuffle(triplet_set)
        sentences.append(task_prefix + sentence + '</s></s>' + answer  + '</s></s>' + ', '.join(triplet_set))
        labels.append(', '.join(distractors))
        answers.append(answer)
        
    return sentences, answers, labels

In [13]:
task_prefix = 'distractor generation: '
train_sent, train_answer, train_label = processData(train, k, task_prefix)
valid_sent, valid_answer, valid_label = processData(valid, k, task_prefix)
test_sent, test_answer, test_label = processData(test, k, task_prefix)

In [14]:
for idx in range(2):
    print(train_sent[idx])
    print(train_answer[idx])
    print(train_label[idx])
    print()

distractor generation: What type of organism is commonly used in preparation of foods such as cheese and yogurt?</s></s>mesophilic organisms</s></s>cheese relatedto yogurt, aerobic antonym anaerobic, anaerobic antonym aerobic, preparation relatedto food, anaerobic relatedto organism, organisms relatedto organism, thermophilic antonym psychrophilic, foods relatedto food, food relatedto foods, cheese isa food, yogurt isa food, used relatedto use, cheese relatedto food
mesophilic organisms
protozoa, gymnosperms, viruses

distractor generation: What phenomenon makes global winds blow northeast to southwest or the reverse in the northern hemisphere and northwest to southeast or the reverse in the southern hemisphere?</s></s>coriolis effect</s></s>southern_hemisphere relatedto earth, wind relatedto atmospheric, moon relatedto ocean, wind relatedto earth, hemisphere relatedto northern_hemisphere, rotation relatedto earth, southern_hemisphere relatedto hemisphere, wind relatedto currents, plat

In [15]:
len(train_sent), len(train_answer), len(train_label)

(11679, 11679, 11679)

In [16]:
len(valid_sent), len(valid_answer), len(valid_label)

(1000, 1000, 1000)

In [17]:
len(test_sent), len(test_answer), len(test_label)

(1000, 1000, 1000)

### Tokenization

In [18]:
from transformers import T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-base')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [19]:
train_encodings = tokenizer(train_sent, truncation=True, padding=True)
valid_encodings = tokenizer(valid_sent, truncation=True, padding=True)
test_encodings = tokenizer(test_sent, truncation=True, padding=True)



In [20]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [21]:
len(train_encodings.input_ids)

11679

In [22]:
def add_labels(encodings, distractors):
    
    distractors_encodings = tokenizer(distractors, padding=True)
    labels = []
    for i in range(len(distractors_encodings.input_ids)):
        labels.append(distractors_encodings.input_ids[i])
    
    encodings["labels"] = labels
    return encodings

In [23]:
train_encodings = add_labels(train_encodings, train_label)
valid_encodings = add_labels(valid_encodings, valid_label)
test_encodings = add_labels(test_encodings, test_label)

In [24]:
len(train_encodings.input_ids)

11679

In [25]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [26]:
tokenizer.decode(train_encodings['input_ids'][0])

'distractor generation: What type of organism is commonly used in preparation of foods such as cheese and yogurt?</s></s> mesophilic organisms</s></s> cheese relatedto yogurt, aerobic antonym anaerobic, anaerobic antonym aerobic, preparation relatedto food, anaerobic relatedto organism, organisms relatedto organism, thermophilic antonym psychrophilic, foods relatedto food, food relatedto foods, cheese isa food, yogurt isa food, used relatedto use, cheese relatedto food</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pa

### Dataset

In [27]:
import torch
class SciqDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SciqDataset(train_encodings)
valid_dataset = SciqDataset(valid_encodings)
test_dataset = SciqDataset(test_encodings)

In [28]:
len(train_dataset), len(valid_dataset), len(test_dataset)

(11679, 1000, 1000)

### Fine-tuning

In [29]:
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = T5ForConditionalGeneration.from_pretrained("t5-base")

In [30]:
batch_size = 16
args = Seq2SeqTrainingArguments(
    output_dir = "./results",
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="F1@3",
    weight_decay=0.01,
    gradient_accumulation_steps = 2,
    predict_with_generate=True,
    eval_accumulation_steps = 1,
    report_to="wandb" if os.getenv("WANDB_PROJECT") else "none"
)

In [31]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [32]:
import numpy as np
def compute_metrics(p):
    predictions, labels = p
    
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # store all sentences
    predicted = []
    true_label = []
    
    for k in range(len(decoded_labels)):
        pred = decoded_preds[k]
        label = decoded_labels[k]

        pred_list = pred.split(', ')
        label_list = label.split(', ')
        

        predicted.append(pred_list)
        true_label.append(label_list)

    # evaluation metrics
    p1 = 0
    p3 = 0
    r3 = 0
    f3 = 0
    r1 = 0
    for idx in range(len(true_label)):
        distractors = predicted[idx]
        labels = true_label[idx]

        act_set = set(labels)
        pred1_set = set(distractors[:1])
        pred3_set = set(distractors[:3])

        r_1 = len(act_set & pred1_set) / float(len(act_set))
        p_1 = len(act_set & pred1_set) / float(1)
        p_3 = len(act_set & pred3_set) / float(3)
        r_3 = len(act_set & pred3_set) / float(len(act_set))

        if p_3 == 0 and r_3 == 0:
            f1_3 = 0
        else:
            f1_3 = 2 * (p_3 * r_3 / (p_3 + r_3))

        r1+=r_1
        p1+=p_1
        p3+=p_3
        r3+=r_3
        f3+=f1_3

    avg_r1 = r1 / len(true_label)
    avg_p1 = p1 / len(true_label)
    avg_p3 = p3 / len(true_label)
    avg_r3 = r3 / len(true_label)
    avg_f3 = f3 / len(true_label)

    result = {'P@1': avg_p1,
              'R@1': avg_r1,
              'P@3': avg_p3,
              'R@3': avg_r3,
              'F1@3': avg_f3}
    
    return result

In [33]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [34]:
trainer.train()

***** Running training *****
  Num examples = 11679
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 2
  Total optimization steps = 9100
  Number of trainable parameters = 222903552
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhankystyle[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,P@1,R@1,P@3,R@3,F1@3
0,No log,0.605512,0.095,0.031833,0.048333,0.0485,0.0484
1,No log,0.587024,0.133,0.0445,0.068,0.068167,0.068067
2,0.586600,0.578135,0.142,0.0475,0.073667,0.073833,0.073733
3,0.586600,0.573541,0.148,0.0495,0.081667,0.081833,0.081689
4,0.586600,0.57265,0.159,0.053167,0.09,0.0905,0.0902
5,0.403100,0.57426,0.163,0.0545,0.093,0.093333,0.093089
6,0.403100,0.574793,0.173,0.057833,0.095333,0.095667,0.095467
7,0.403100,0.578667,0.178,0.0595,0.105,0.105143,0.104956
8,0.368100,0.581272,0.172,0.0575,0.105,0.105143,0.104956
9,0.368100,0.587093,0.192,0.064167,0.115333,0.115476,0.115289


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_t

TrainOutput(global_step=9100, training_loss=0.26432660909799427, metrics={'train_runtime': 16581.3857, 'train_samples_per_second': 35.217, 'train_steps_per_second': 0.549, 'total_flos': 3.0349486338200064e+17, 'train_loss': 0.26432660909799427, 'epoch': 50.0})

In [37]:
predictions, labels, metrics = trainer.predict(test_dataset)
print('test: ')
metrics

***** Running Prediction *****
  Num examples = 1000
  Batch size = 32
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_

test: 


{'test_loss': 0.8654527068138123,
 'test_P@1': 0.227,
 'test_R@1': 0.07542063492063494,
 'test_P@3': 0.15466666666666676,
 'test_R@3': 0.15475396825396834,
 'test_F1@3': 0.15456666666666677,
 'test_runtime': 30.5735,
 'test_samples_per_second': 32.708,
 'test_steps_per_second': 1.047}

In [36]:
predictions, labels, metrics = trainer.predict(test_dataset, no_repeat_ngram_size = 2)
print('test: ')
metrics

***** Running Prediction *****
  Num examples = 1000
  Batch size = 32
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_

test: 


{'test_loss': 0.8654527068138123,
 'test_P@1': 0.227,
 'test_R@1': 0.07542063492063494,
 'test_P@3': 0.15100000000000002,
 'test_R@3': 0.15108730158730163,
 'test_F1@3': 0.15090000000000003,
 'test_runtime': 33.0761,
 'test_samples_per_second': 30.233,
 'test_steps_per_second': 0.967}

### Save Model

In [42]:
trainer.save_model('../../../../../saved_models/KAG/sciq/t5/t5-triplet-augmentation-k=25-epoch=50')

Saving model checkpoint to ./saved_models/t5-triplet-augmentation-k=25-epoch=50
Configuration saved in ./saved_models/t5-triplet-augmentation-k=25-epoch=50/config.json
Configuration saved in ./saved_models/t5-triplet-augmentation-k=25-epoch=50/generation_config.json
Model weights saved in ./saved_models/t5-triplet-augmentation-k=25-epoch=50/pytorch_model.bin
tokenizer config file saved in ./saved_models/t5-triplet-augmentation-k=25-epoch=50/tokenizer_config.json
Special tokens file saved in ./saved_models/t5-triplet-augmentation-k=25-epoch=50/special_tokens_map.json


In [None]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 994
  Batch size = 32
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_

{'eval_loss': 0.6440611481666565,
 'eval_P@1': 0.19416498993963782,
 'eval_R@1': 0.06472166331321277,
 'eval_P@3': 0.13380281690140833,
 'eval_R@3': 0.133922583117754,
 'eval_F1@3': 0.13373574782025474,
 'eval_runtime': 33.0508,
 'eval_samples_per_second': 30.075,
 'eval_steps_per_second': 0.968,
 'epoch': 30.0}

### Save Distractor Data

In [43]:
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, T5Tokenizer
import torch

model = T5ForConditionalGeneration.from_pretrained('./results/checkpoint-9100')

loading configuration file ./results/checkpoint-9100/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "ea

In [44]:
batch_size = 32
args = Seq2SeqTrainingArguments(
    output_dir = "./results",
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="F1@3",
    weight_decay=0.01,
    predict_with_generate=True,
    eval_accumulation_steps = 1,
    report_to="wandb" if os.getenv("WANDB_PROJECT") else "none"
)

PyTorch: setting up devices


In [45]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [46]:
import numpy as np
def compute_metrics(p):
    predictions, labels = p
    
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # store all sentences
    predicted = []
    true_label = []
    
    for k in range(len(decoded_labels)):
        pred = decoded_preds[k]
        label = decoded_labels[k]

        pred_list = pred.split(', ')
        label_list = label.split(', ')
        

        predicted.append(pred_list)
        true_label.append(label_list)

    # evaluation metrics
    p1 = 0
    p3 = 0
    r3 = 0
    f3 = 0
    r1 = 0
    for idx in range(len(true_label)):
        distractors = predicted[idx]
        labels = true_label[idx]

        act_set = set(labels)
        pred1_set = set(distractors[:1])
        pred3_set = set(distractors[:3])

        r_1 = len(act_set & pred1_set) / float(len(act_set))
        p_1 = len(act_set & pred1_set) / float(1)
        p_3 = len(act_set & pred3_set) / float(3)
        r_3 = len(act_set & pred3_set) / float(len(act_set))

        if p_3 == 0 and r_3 == 0:
            f1_3 = 0
        else:
            f1_3 = 2 * (p_3 * r_3 / (p_3 + r_3))

        r1+=r_1
        p1+=p_1
        p3+=p_3
        r3+=r_3
        f3+=f1_3

    avg_r1 = r1 / len(true_label)
    avg_p1 = p1 / len(true_label)
    avg_p3 = p3 / len(true_label)
    avg_r3 = r3 / len(true_label)
    avg_f3 = f3 / len(true_label)

    result = {'P@1': avg_p1,
              'R@1': avg_r1,
              'P@3': avg_p3,
              'R@3': avg_r3,
              'F1@3': avg_f3}
    
    return result

In [47]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

### Last Epoch Result

In [48]:
test_predictions, test_labels, test_metrics = trainer.predict(test_dataset)
test_metrics

***** Running Prediction *****
  Num examples = 1000
  Batch size = 64
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_

{'test_loss': 0.9534540772438049,
 'test_P@1': 0.204,
 'test_R@1': 0.06794444444444457,
 'test_P@3': 0.15166666666666667,
 'test_R@3': 0.15194444444444447,
 'test_F1@3': 0.15170000000000003,
 'test_runtime': 27.8614,
 'test_samples_per_second': 35.892,
 'test_steps_per_second': 0.574}

In [49]:
test_predictions, test_labels, test_metrics = trainer.predict(test_dataset, no_repeat_ngram_size = 2)
test_metrics

***** Running Prediction *****
  Num examples = 1000
  Batch size = 64
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_

{'test_loss': 0.9534540772438049,
 'test_P@1': 0.204,
 'test_R@1': 0.06794444444444457,
 'test_P@3': 0.1476666666666666,
 'test_R@3': 0.1479444444444444,
 'test_F1@3': 0.14769999999999997,
 'test_runtime': 29.1363,
 'test_samples_per_second': 34.321,
 'test_steps_per_second': 0.549}

In [50]:
test_predictions, test_labels, test_metrics = trainer.predict(test_dataset, no_repeat_ngram_size = 2, num_beams = 6)
test_metrics

***** Running Prediction *****
  Num examples = 1000
  Batch size = 64
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_

{'test_loss': 0.9534540772438049,
 'test_P@1': 0.186,
 'test_R@1': 0.06194444444444461,
 'test_P@3': 0.14966666666666656,
 'test_R@3': 0.14994444444444438,
 'test_F1@3': 0.14969999999999994,
 'test_runtime': 62.5115,
 'test_samples_per_second': 15.997,
 'test_steps_per_second': 0.256}

In [38]:
import json
def write_json(data, path):
    
    jsonString = json.dumps(data)
    jsonFile = open(path, "w")
    jsonFile.write(jsonString)
    jsonFile.close()

In [39]:
def save_data(data, predictions, labels, file_name):

    

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # store all sentences
    predicted = []
    true_label = []
    
    for k in range(len(decoded_labels)):
        pred = decoded_preds[k]
        label = decoded_labels[k]

        pred_list = pred.split(', ')
        label_list = label.split(', ')

        predicted.append(pred_list)
        true_label.append(label_list)
    
    # evaluation metrics
    for idx in range(len(true_label)):
        
        distractors = predicted[idx]
        labels = true_label[idx]

        act_set = set(labels)
        pred1_set = set(distractors[:1])
        pred3_set = set(distractors[:3])

        p1 = len(act_set & pred1_set) / float(1)
        p3 = len(act_set & pred3_set) / float(3)
        r3 = len(act_set & pred3_set) / float(len(act_set))

        if p3 == 0 and r3 == 0:
            f1_3 = 0
        else:
            f1_3 = 2 * (p3 * r3 / (p3 + r3))
        
        data[idx]['pred_distractors'] = distractors
        data[idx]['metric'] = {'P@1': p1, 'P@3': p3, 'R@3': r3, 'F1@3': f1_3}

        if 'triplets' in data[idx]:
            del data[idx]['triplets']
    
    write_json(data, file_name)
    print(file_name + ' is saved :)')

In [41]:
save_data(test, test_predictions, test_labels, '../../../../../predictions/sciq/t5/t5-triplet-augmentation-sciq')

./predictions/t5-triplet-augmentation-sciq is saved :)


In [50]:
decoded_preds = tokenizer.batch_decode(test_predictions, skip_special_tokens=True)

In [51]:
same_count = 0
for i in range(len(test)):
    pred = decoded_preds[i]
    pred_list = pred.split(', ')
    

    if test[i]['answer'] in pred_list:
        same_count += 1
print('答案與 distractor 相同的筆數 = ', same_count)
print('答案與 distractor 相同的比例 = ',same_count/len(test))

答案與 distractor 相同的筆數 =  54
答案與 distractor 相同的比例 =  0.05416248746238716


In [52]:
decoded_preds = tokenizer.batch_decode(test_predictions, skip_special_tokens=True)
repeat_count = 0
for k in range(len(decoded_preds)):
    pred = decoded_preds[k]
    pred_list = pred.split(', ')
    
    duplicates = set([x for x in pred_list if pred_list.count(x) > 1])
    if duplicates:
        repeat_count += 1
print('重複 distractor 筆數 = ', repeat_count)
print('重複 distractor 的比例 = ',repeat_count/len(test))

重複 distractor 筆數 =  41
重複 distractor 的比例 =  0.041123370110330994
