# 使用 fine-tune 過後的 Sentence Transfomrer 來 rerank triplet

### GPU

In [3]:
!nvidia-smi

Tue Aug 29 02:55:50 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA TITAN RTX                On | 00000000:01:00.0 Off |                  N/A |
| 40%   38C    P8               20W / 280W|      3MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA TITAN RTX                On | 00000000:02:00.0 Off |  

## Prepare Data

In [4]:
import json
import os, sys
import fnmatch

In [5]:
def read_data(item):
    path = '../../../../data/mcq/{}.with.triplet.json'.format(item)
    with open(path) as f:
        data = json.load(f)
    return data

In [6]:
train = read_data('train')
test = read_data('test')

In [7]:
test[0].keys()

dict_keys(['sentence', 'distractors', 'answer', 'triplets'])

In [8]:
print(test[0]['sentence'])
print(test[0]['distractors'])
print(test[0]['answer'])

**blank** is used to describe a chemical released by an animal that affects the behavior or physiology of animals of the same species
['enzyme', 'isolate', 'amino']
pheromone


In [10]:
# inital score of each triplet
test[0]['triplets'][:10]

[['isa', 'protease', 'enzyme', 2.0],
 ['relatedto', 'used', 'use', 2.0],
 ['isa', 'enzyme', 'catalyst', 2.0],
 ['relatedto', 'catalyst', 'enzyme', 1.0],
 ['relatedto', 'carbohydrate', 'animal', 1.0],
 ['relatedto', 'pheromone', 'chemical', 1.0],
 ['relatedto', 'protease', 'enzyme', 1.0],
 ['relatedto', 'animal', 'animals', 1.0],
 ['relatedto', 'affects', 'affect', 1.0],
 ['relatedto', 'pheromone', 'species', 1.0]]

Relation Name Dict

In [11]:
Relation_Dict ={
 'antonym': 'is the antonym of',
 'atlocation' : 'is at location of',
 'capableof': 'is capable of',
 'causes' : 'causes',
 'createdby': 'is created by',
 'desires': 'desires',
 'hasproperty': 'has property',
 'hassubevent': 'has subevent',
 'isa':'is a kind of',
 'madeof':'is made of',
 'notcapableof':'has not capable of',
 'notdesires': "does not desires",
 'partof':'is part of',
 'receivesaction':'is',
 'relatedto':'is related to',
 'usedfor':'is used for'
}

## 用 Sentence Transformer 來先初始化每一筆 Triplet 的 Inital Score 

In [15]:
from sentence_transformers import SentenceTransformer, util
sentences = ["I'm happy", "I'm full of happiness"]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

2023-08-29 03:00:08.836315: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-29 03:00:08.950781: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-08-29 03:00:09.397266: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvrtc.so.11.0: cannot open shared object file: No such file or directory
2023-08-29 03:00:09.397424: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvrtc.so.11.0: cannot open shared object file: No such file or direc

In [23]:
def sort_triplet_with_sentence_transformer(data):
    for idx in range(len(data)):

        # 先取 sentence entity
        sentence_entity = set(data[idx]['sentence'].lower().split(' '))

        answer = data[idx]['answer']
        source_sentence = data[idx]['sentence'].replace('**blank**', answer)
        sent_embedding = model.encode(source_sentence, convert_to_tensor=True)

        
        for each_triplet in data[idx]['triplets']:
            rel, source, target, weight = each_triplet

            # 初始化 weight 為 0
            each_triplet[3] = 0.0

            # Case 1
            if source not in sentence_entity and target not in sentence_entity:
                source_embedding = model.encode(source, convert_to_tensor=True)
                target_embedding = model.encode(target, convert_to_tensor=True)
                score = max(util.pytorch_cos_sim(sent_embedding, source_embedding).item(), util.pytorch_cos_sim(sent_embedding, target_embedding).item())
                each_triplet[3] = score

            # Case 2
            elif source not in sentence_entity:
                source_embedding = model.encode(source, convert_to_tensor=True)
                score = util.pytorch_cos_sim(sent_embedding, source_embedding).item()
                each_triplet[3] = score

            # Case 2
            elif target not in sentence_entity:
                target_embedding = model.encode(target, convert_to_tensor=True)
                score = util.pytorch_cos_sim(sent_embedding, target_embedding).item()
                each_triplet[3] = score
    return data


In [24]:
train = sort_triplet_with_sentence_transformer(train)
test = sort_triplet_with_sentence_transformer(train)

In [25]:
# the score of each triplet calcuate by cos similarity
test[0]['triplets'][:10]

[['isa', 'gravity', 'force', 0.4813329875469208],
 ['relatedto', 'roll', 'rock', 0.3557915687561035],
 ['relatedto', 'rock', 'roll', 0.3557915687561035],
 ['hassubevent', 'roll', 'rock', 0.3557915687561035],
 ['isa', 'pressure', 'force', 0.11507400125265121],
 ['relatedto', 'roll', 'wind', 0.1665383279323578],
 ['relatedto', 'gravity', 'force', 0.4813329875469208],
 ['relatedto', 'wind', 'force', 0.1665383279323578],
 ['isa', 'friction', 'force', 0.3244278132915497],
 ['relatedto', 'causes', 'cause', 0.07075150310993195]]

In [9]:
def processData(data):
    
    sentences = []
    triplets = []
    labels = []
    answers = []
    for d in data:
        sentence = d['sentence']
        distractors = d['distractors']
        answer = d['answer']
        triplet = d['triplets']


        distractors = [dis.strip() for dis in distractors]
        sentence = sentence.replace('**blank**', answer)

        
        for each_triplet in triplet:
            rel, source, target, weight = each_triplet

            sentences.append(sentence)
            triplets.append('{} {} {}'.format(source,Relation_Dict[rel],target))

            if source == answer or target == answer or source in distractors or target in distractors:
                labels.append(0)
            else:
                labels.append(1)
            
    return sentences, triplets, labels

In [10]:
train_sent, train_triplet, train_label = processData(train)
test_sent, test_triplet, test_label = processData(test)

In [11]:
len(train_sent), len(test_sent)

(41854, 4390)

統計訓練與測試資料分布

In [12]:
print('Train 資料分布 : Positive 有 {} 筆, Negative 有 {} 筆，大約有 {:.2f}% 為 Positive。'.format(train_label.count(0),train_label.count(1),train_label.count(0)/len(train_sent)*100))
print('Test 資料分布 : Positive 有 {} 筆, Negative 有 {} 筆，大約有 {:.2f}% 為 Positive。'.format(test_label.count(0),test_label.count(1),test_label.count(0)/len(test_sent)*100))

Train 資料分布 : Positive 有 9539 筆, Negative 有 32315 筆，大約有 22.79% 為 Positive。
Test 資料分布 : Positive 有 1346 筆, Negative 有 3044 筆，大約有 30.66% 為 Positive。


## Tokenization

In [13]:
from transformers import AutoTokenizer

2023-08-29 02:43:33.036994: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-29 02:43:33.145511: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-08-29 02:43:33.636481: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvrtc.so.11.0: cannot open shared object file: No such file or directory
2023-08-29 02:43:33.636599: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvrtc.so.11.0: cannot open shared object file: No such file or direc

In [14]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')

In [15]:
train_encodings = tokenizer(train_sent, train_triplet, truncation=True, padding=True)
test_encodings = tokenizer(test_sent, test_triplet, truncation=True, padding=True)

In [16]:
def add_labels(encodings, label):
    encodings.update({'labels': label})

In [17]:
add_labels(train_encodings, train_label)
add_labels(test_encodings, test_label)

## 定義 Dataset，並轉換成 tensor 格式

In [18]:
from torch.utils import data
import torch

class Dataset(torch.utils.data.Dataset):
  def __init__(self, encodings):
    self.encodings = encodings

  def __getitem__(self, idx):
    return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

  def __len__(self):
    return len(self.encodings.input_ids)

train_dataset = Dataset(train_encodings)
test_dataset = Dataset(test_encodings)

## 用 FineTune 後的 Sentence Transformer 再調整一次每一筆的 Triplet 的 Score

## Load Model

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [20]:
from transformers import MPNetConfig, MPNetForSequenceClassification
config  = MPNetConfig.from_pretrained('../../../../saved_models/Reranker/sentence-transformer-for-mcq', num_labels = 2)
model = MPNetForSequenceClassification.from_pretrained('../../../../saved_models/Reranker/sentence-transformer-for-mcq', config=config).to(device)

In [21]:
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

In [22]:
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
   
    results = metric.compute(predictions=predictions, references=labels)

    return {'accuracy': results['accuracy']}

In [23]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [24]:
batch_size = 32
args = TrainingArguments(
    output_dir = "./results",
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    weight_decay=0.01,
    eval_accumulation_steps = 1,
    report_to="wandb" if os.getenv("WANDB_PROJECT") else "none"
)

In [25]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [26]:
predictions, labels, metrics = trainer.predict(test_dataset)
print('test: ')
metrics

***** Running Prediction *****
  Num examples = 8339
  Batch size = 64
You're using a MPNetTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


test: 


{'test_loss': 0.3614237308502197,
 'test_accuracy': 0.8732461925890395,
 'test_runtime': 7.1131,
 'test_samples_per_second': 1172.346,
 'test_steps_per_second': 18.417}

In [27]:
labels

array([0, 0, 0, ..., 1, 1, 1])

In [28]:
predictions[4]

array([-1.820901 ,  1.9065006], dtype=float32)

In [29]:
np.argmax(predictions, axis=-1)[4]

1

In [30]:
encoded_input = tokenizer(test_sent[0],test_triplet[0], truncation=True, padding=True, return_tensors="pt").to(device)

In [31]:
# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

In [32]:
np.argmax(model_output, axis=-1)

0

## Caluate Sentence Similarity between sentence and triplet

In [33]:
test[0]['triplets'][0]

['relatedto', 'pheromone', 'species', 0.6779286861419678]

In [34]:
import torch.nn.functional as F
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [35]:
test_sent[0], test_triplet[0]

('pheromone is used to describe a chemical released by an animal that affects the behavior or physiology of animals of the same species',
 'pheromone is related to species')

In [36]:
test_sent[0], test_triplet[4]

('pheromone is used to describe a chemical released by an animal that affects the behavior or physiology of animals of the same species',
 'predator is related to animal')

In [37]:
# Sentences we want sentence embeddings for
sentences = [test_sent[0], test_triplet[4]]

In [38]:
from transformers import AutoTokenizer, AutoModel

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('../../../../saved_models/Reranker/sentence-transformer-for-mcq')
model = AutoModel.from_pretrained('../../../../saved_models/Reranker/sentence-transformer-for-mcq')

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./saved_models/sentence-transformer-mcq/config.json
Model config MPNetConfig {
  "_name_or_path": "./saved_models/sentence-transformer-mcq",
  "architectures": [
    "MPNetForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "mpnet",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "problem_type": "single_label_classification",
  "relative_attention_num_buckets": 32,
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "vocab_size": 30527
}

loading weights file ./saved_models/sentence-transformer-mc

In [39]:
from sentence_transformers import SentenceTransformer, util

In [40]:
# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)

util.pytorch_cos_sim(sentence_embeddings[0],sentence_embeddings[1])

Sentence embeddings:
tensor([[-0.0101, -0.1124,  0.0024,  ..., -0.0232,  0.0373,  0.0102],
        [ 0.0096, -0.0648, -0.0040,  ..., -0.0429,  0.0158,  0.0072]])


tensor([[0.8587]])

## Rerank MCQ Triplet

In [41]:
def rerank_data(data, predictions):
    idx = 0
    predictions = np.argmax(predictions, axis=-1)

    for d in data:
        for each_triplet in d['triplets']:
            rel, source, target, weight = each_triplet

            if predictions[idx] == 0:
                each_triplet[3] += 1.0

            idx += 1

In [42]:
predictions

array([[ 1.5062765 , -1.5388696 ],
       [ 1.6252131 , -1.6455723 ],
       [ 1.6963547 , -1.7159475 ],
       ...,
       [-1.0103647 ,  1.0678096 ],
       [-0.92447764,  0.9635637 ],
       [-1.8176643 ,  1.8996884 ]], dtype=float32)

In [43]:
len(predictions)

8339

In [44]:
rerank_data(test,predictions)

In [47]:
file_path = '../../../../data/mcq/test.kag.sentence.transformer.json'
jsonString = json.dumps(test)
jsonFile = open(file_path, "w")
jsonFile.write(jsonString)
jsonFile.close()

In [43]:
predictions, labels, metrics = trainer.predict(train_dataset)
print('test: ')
metrics

***** Running Prediction *****
  Num examples = 77061
  Batch size = 64


test: 


{'test_loss': 0.19605468213558197,
 'test_accuracy': 0.9277715056902973,
 'test_runtime': 239.4281,
 'test_samples_per_second': 321.854,
 'test_steps_per_second': 5.033}

In [44]:
rerank_data(train,predictions)

In [45]:
file_path = '../../../../data/mcq/train.kag.sentence.transformer.json'
jsonString = json.dumps(train)
jsonFile = open(file_path, "w")
jsonFile.write(jsonString)
jsonFile.close()