# 使用 fine-tune 過後的 Sentence Transformer 來 rerank triplet

### GPU

In [1]:
!nvidia-smi

Tue Jun  6 09:15:15 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.141.03   Driver Version: 470.141.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA TITAN RTX    Off  | 00000000:01:00.0 Off |                  N/A |
| 40%   44C    P0    66W / 280W |      0MiB / 24220MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA TITAN RTX    Off  | 00000000:02:00.0 Off |                  N/A |
| 35%   39C    P0    55W / 280W |      0MiB / 24220MiB |      0%      Default |
|       

## Prepare Data

In [2]:
import json
import os, sys
import fnmatch

In [1]:
def read_data(item):
    path = '../../../../data/sciq/{}.with.triplet.json'.format(item)
    with open(path) as f:
        data = json.load(f)
    return data

In [3]:
train = read_data('train')
valid = read_data('valid')
test = read_data('test')

In [4]:
print(test[0]['sentence'])
print(test[0]['distractors'])
print(test[0]['answer'])

Compounds that are capable of accepting electrons, such as o 2 or f2, are called what?
['antioxidants', 'Oxygen', 'residues']
oxidants


In [6]:
test[0]['triplets'][:10]

[['relatedto', 'oxidants', 'oxidant', 0.6359457969665527],
 ['relatedto', 'electrons', 'electron', 0.46036529541015625],
 ['relatedto', 'delocalized', 'electron', 0.43334996700286865],
 ['relatedto', 'redox', 'electron', 0.43334996700286865],
 ['relatedto', 'proton', 'electron', 0.43334996700286865],
 ['relatedto', 'electron', 'proton', 0.43334996700286865],
 ['relatedto', 'delocalized', 'compound', 0.32832592725753784],
 ['relatedto', 'called', 'call', -0.034704722464084625]]

Relation Name Dict

In [7]:
Relation_Dict ={
 'antonym': 'is the antonym of',
 'atlocation' : 'is at location of',
 'capableof': 'is capable of',
 'causes' : 'causes',
 'createdby': 'is created by',
 'desires': 'desires',
 'hasproperty': 'has property',
 'hassubevent': 'has subevent',
 'isa':'is a kind of',
 'madeof':'is made of',
 'notcapableof':'has not capable of',
 'notdesires': "does not desires",
 'partof':'is part of',
 'receivesaction':'is',
 'relatedto':'is related to',
 'usedfor':'is used for'
}

In [8]:
def processData(data):
    
    sentences = []
    triplets = []
    labels = []
    answers = []
    for d in data:
        sentence = d['sentence']
        distractors = d['distractors']
        answer = d['answer']
        triplet = d['triplets']


        distractors = [dis.strip() for dis in distractors]
        sentence = sentence + ' ' +answer

        
        for each_triplet in triplet:
            rel, source, target, weight = each_triplet

            sentences.append(sentence)
            triplets.append('{} {} {}'.format(source,Relation_Dict[rel],target))

            if source == answer or target == answer or source in distractors or target in distractors:
                labels.append(0)
            else:
                labels.append(1)
            
    return sentences, triplets, labels

In [9]:
train_sent, train_triplet, train_label = processData(train)
valid_sent, valid_triplet, valid_label = processData(valid)
test_sent, test_triplet, test_label = processData(test)

In [10]:
len(train_sent), len(valid_sent), len(test_sent)

(260474, 22596, 22403)

統計訓練與測試資料分布

In [11]:
print('Train 資料分布 : Positive 有 {} 筆, Negative 有 {} 筆，大約有 {:.2f}% 為 Positive。'.format(train_label.count(0),train_label.count(1),train_label.count(0)/len(train_sent)*100))
print('Valid 資料分布 : Positive 有 {} 筆, Negative 有 {} 筆，大約有 {:.2f}% 為 Positive。'.format(valid_label.count(0),valid_label.count(1),valid_label.count(0)/len(valid_sent)*100))
print('Test 資料分布 : Positive 有 {} 筆, Negative 有 {} 筆，大約有 {:.2f}% 為 Positive。'.format(test_label.count(0),test_label.count(1),test_label.count(0)/len(test_sent)*100))

Train 資料分布 : Positive 有 39688 筆, Negative 有 220786 筆，大約有 15.24% 為 Positive。
Valid 資料分布 : Positive 有 3147 筆, Negative 有 19449 筆，大約有 13.93% 為 Positive。
Test 資料分布 : Positive 有 3499 筆, Negative 有 18904 筆，大約有 15.62% 為 Positive。


## Tokenization

In [12]:
from transformers import AutoTokenizer

2023-07-08 15:28:50.095521: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-08 15:28:50.200589: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-07-08 15:28:50.750973: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvrtc.so.11.0: cannot open shared object file: No such file or directory
2023-07-08 15:28:50.751124: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvrtc.so.11.0: cannot open shared object file: No such file or direc

In [13]:
tokenizer = AutoTokenizer.from_pretrained('./saved_models/sentence-transformer-sciq-all')

In [14]:
train_encodings = tokenizer(train_sent, train_triplet, truncation=True, padding=True)
valid_encodings = tokenizer(valid_sent, valid_triplet, truncation=True, padding=True)
test_encodings = tokenizer(test_sent, test_triplet, truncation=True, padding=True)

In [15]:
def add_labels(encodings, label):
    encodings.update({'labels': label})

In [16]:
add_labels(train_encodings, train_label)
add_labels(valid_encodings, valid_label)
add_labels(test_encodings, test_label)

## 定義 Dataset，並轉換成 tensor 格式

In [17]:
from torch.utils import data
import torch

class Dataset(torch.utils.data.Dataset):
  def __init__(self, encodings):
    self.encodings = encodings

  def __getitem__(self, idx):
    return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

  def __len__(self):
    return len(self.encodings.input_ids)

train_dataset = Dataset(train_encodings)
valid_dataset = Dataset(valid_encodings)
test_dataset = Dataset(test_encodings)

## Load Model

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [19]:
from transformers import BertConfig, BertForSequenceClassification
config  = BertConfig.from_pretrained('./saved_models/sentence-transformer-sciq-all/', num_labels = 2) # num_labels 設定類別數
model = BertForSequenceClassification.from_pretrained('./saved_models/sentence-transformer-sciq-all/', config=config).to(device)

You are using a model of type mpnet to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Some weights of the model checkpoint at ./saved_models/sentence-transformer-sciq-all/ were not used when initializing BertForSequenceClassification: ['mpnet.encoder.layer.10.output.dense.weight', 'mpnet.encoder.layer.7.attention.attn.q.weight', 'mpnet.encoder.layer.8.attention.LayerNorm.weight', 'mpnet.encoder.layer.5.intermediate.dense.weight', 'mpnet.encoder.layer.11.attention.attn.v.weight', 'mpnet.encoder.layer.4.attention.LayerNorm.weight', 'mpnet.encoder.layer.10.attention.attn.k.weight', 'mpnet.encoder.layer.9.output.dense.bias', 'mpnet.encoder.layer.5.attention.attn.v.weight', 'mpnet.encoder.layer.4.output.dense.bias', 'mpnet.encoder.layer.3.attention.attn.k.bias', 'mpnet.encoder.layer.5.output.LayerNorm.bias', 'mpnet.encoder.layer.7.output.dense.weight', 'mpnet.encoder.layer.2.attention.LayerNorm.bias', 'mpnet.encoder.layer.3.attention.attn.k.weight', 'mpnet.encoder.layer.6.output.LayerNorm.bias', 'mpnet.encoder.layer.6.attention.attn.q.weight', 'mpnet.encoder.layer.10.attenti

In [20]:
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

In [21]:
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
   
    results = metric.compute(predictions=predictions, references=labels)

    return {'accuracy': results['accuracy']}

In [22]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [24]:
batch_size = 32
args = TrainingArguments(
    output_dir = "./results",
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    weight_decay=0.01,
    eval_accumulation_steps = 1,
    report_to="wandb" if os.getenv("WANDB_PROJECT") else "none"
)

In [25]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [26]:
predictions, labels, metrics = trainer.predict(test_dataset)
print('test: ')
metrics

***** Running Prediction *****
  Num examples = 22403
  Batch size = 64
You're using a MPNetTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


test: 


{'test_loss': 0.7386144399642944,
 'test_accuracy': 0.15636298709994198,
 'test_runtime': 23.5301,
 'test_samples_per_second': 952.101,
 'test_steps_per_second': 14.917}

In [31]:
labels

array([0, 1, 1, ..., 1, 1, 1])

In [32]:
predictions[4]

array([-0.06555603, -0.13582139], dtype=float32)

In [33]:
np.argmax(predictions, axis=-1)[4]

0

In [34]:
encoded_input = tokenizer(test_sent[0],test_triplet[0], truncation=True, padding=True, return_tensors="pt").to(device)

In [35]:
# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

In [36]:
np.argmax(model_output, axis=-1)

0

## Rerank Sciq Triplet

In [27]:
def rerank_data(data, predictions):
    idx = 0
    predictions = np.argmax(predictions, axis=-1)

    for d in data:
        for each_triplet in d['triplets']:
            rel, source, target, weight = each_triplet

            if predictions[idx] == 0:
                each_triplet[3] += 1.0

            idx += 1

In [28]:
predictions

array([[ 0.03046609, -0.21285811],
       [ 0.0301279 , -0.2142671 ],
       [ 0.01066719, -0.20922351],
       ...,
       [-0.03168179, -0.22601952],
       [-0.04911616, -0.24205561],
       [-0.04201736, -0.24933326]], dtype=float32)

In [29]:
len(predictions)

22403

In [30]:
rerank_data(test,predictions)

In [31]:
file_path = '../../../../data/sciq/test.with.reranker.triplet.json'
jsonString = json.dumps(test)
jsonFile = open(file_path, "w")
jsonFile.write(jsonString)
jsonFile.close()

In [32]:
predictions, labels, metrics = trainer.predict(train_dataset)
print('train: ')
metrics

***** Running Prediction *****
  Num examples = 260474
  Batch size = 64


train: 


{'test_loss': 0.7391087412834167,
 'test_accuracy': 0.1530671007470995,
 'test_runtime': 387.8461,
 'test_samples_per_second': 671.591,
 'test_steps_per_second': 10.494}

In [33]:
rerank_data(train,predictions)

In [34]:
file_path = '../../../../data/sciq/train.with.reranker.triplet.json'
jsonString = json.dumps(train)
jsonFile = open(file_path, "w")
jsonFile.write(jsonString)
jsonFile.close()

In [35]:
predictions, labels, metrics = trainer.predict(valid_dataset)
print('valid: ')
metrics

***** Running Prediction *****
  Num examples = 22596
  Batch size = 64




valid: 


{'test_loss': 0.7401852607727051,
 'test_accuracy': 0.14099840679766332,
 'test_runtime': 24.8018,
 'test_samples_per_second': 911.063,
 'test_steps_per_second': 14.273}

In [36]:
rerank_data(valid,predictions)

In [37]:
file_path = '../../../../data/sciq/valid.with.reranker.triplet.json'
jsonString = json.dumps(valid)
jsonFile = open(file_path, "w")
jsonFile.write(jsonString)
jsonFile.close()