In [1]:
!nvidia-smi

Tue Apr  4 09:54:54 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:1B:00.0 Off |                  Off |
|  0%   33C    P8    27W / 460W |  24253MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:1C:00.0 Off |                  N/A |
| 28%   45C    P8    15W / 260W |   9065MiB / 11264MiB |      0%      Default |
|       

In [2]:
!pip install torch
!pip install transformers

Collecting torch
  Downloading torch-2.0.0-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.7.99
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting nvidia-cufft-cu11==10.9.0.58
  Downloading nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux1_x86_64.whl (168.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.4/168.4 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting triton==2.0.0
  Downloading triton-2.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (63.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.3/63.3 MB[0m [31m25.0 MB/s[0

In [3]:
import numpy as np
import torch

from transformers import T5ForConditionalGeneration, AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig


from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
from transformers.file_utils import cached_property
from typing import Tuple
from sklearn.model_selection import train_test_split
import gc
from tqdm.auto import tqdm, trange


import pandas as pd
from sklearn.utils import shuffle

from collections import Counter
from tqdm import tqdm

# model

In [4]:
from get_cuda_device import get_cuda_command

get_cuda_command('cuda:3')

'os.environ["CUDA_VISIBLE_DEVICES"]="1"'

In [5]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [6]:
if torch.cuda.is_available():
    device='cuda'
    print('GPU')
else:
    device='cpu'
    print('CPU')
    
    
SEED = 0
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

GPU


In [7]:
from transformers import TrainingArguments, Trainer

model_checkpoint = "t5-large"
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [8]:
class PairsDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __getitem__(self, idx):
        assert idx <= len(self.x['input_ids']), (idx, len(self.x['input_ids']))
        item = {key: val[idx] for key, val in self.x.items()}
        item['decoder_attention_mask'] = self.y['attention_mask'][idx]
        item['labels'] = self.y['input_ids'][idx]
        return item
    
    @property
    def n(self):
        return len(self.x['input_ids'])

    def __len__(self):
        return self.n

In [9]:
from typing import List, Dict, Union

class DataCollatorWithPadding:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        batch = self.tokenizer.pad(
            features,
            padding=True,
        )
        ybatch = self.tokenizer.pad(
            {'input_ids': batch['labels'], 'attention_mask': batch['decoder_attention_mask']},
            padding=True,
        ) 
        batch['labels'] = ybatch['input_ids']
        batch['decoder_attention_mask'] = ybatch['attention_mask']
        
        return {k: torch.tensor(v) for k, v in batch.items()}

In [10]:
def cleanup():
    gc.collect()
    torch.cuda.empty_cache()
    
cleanup()

In [11]:
def evaluate_model(model, test_dataloader):
    num = 0
    den = 0

    for batch in test_dataloader:
        with torch.no_grad():
            loss = model(**{k: v.to(model.device) for k, v in batch.items()}).loss
            num += len(batch) * loss.item()
            den += len(batch)
    val_loss = num / den
    return val_loss

# **Read data SemEval2018-Task9**

In [12]:
path = os.getcwd().replace('hearst_patterns', '')

In [13]:
path_data_en = path+"SemEval2018-Task9/training/data/1A.english.training.data.txt"
path_gold_en = path+"SemEval2018-Task9/training/gold/1A.english.training.gold.txt"

train_data_en_data = pd.read_csv(path_data_en, header=None, sep="\t", names=['term', 'relation'])
train_gold_en_data = pd.read_csv(path_gold_en, header=None, names=['hypernym'])

train_data_en_data.head()

Unnamed: 0,term,relation
0,blackfly,Concept
1,Turonian,Entity
2,abhorrence,Concept
3,tropical storm,Concept
4,militarization,Concept


In [14]:
path_test_data_en = path+"SemEval2018-Task9/test/data/1A.english.test.data.txt"
path_test_gold_en = path+"SemEval2018-Task9/test/gold/1A.english.test.gold.txt"

test_data_en_data =pd.read_csv(path_test_data_en, header=None, sep="\t", names=['term', 'relation'])
test_gold_en_data = pd.read_csv(path_test_gold_en, header=None, names=['hypernym'])

In [15]:
def hearest_preprocessing(train_features, train_target, test_features, test_target, hearst_pattern='My favorite [PARENT] is'):
    hearst_pattern = hearst_pattern.replace('[PARENT]', '<extra_id_0>')
    
    prefix=''
        
    train_data_en = train_features.copy()
    train_data_en = prefix + hearst_pattern + ' ' + train_data_en_data.term 
    # +  ' </s>'
    print(train_data_en.head())

    train_gold_en = train_target.copy()
    train_gold_en = train_gold_en.hypernym.str.split('\t').str.join(', ')
    print(train_gold_en.head())
    
    test_data_en = test_features.copy()
    test_data_en = prefix + hearst_pattern + ' ' + test_data_en.term
    # +  ' </s>'
    print(test_data_en.head())

    test_gold_en = test_target.copy()
    test_gold_en = test_gold_en.hypernym.str.split('\t').str.join(', ')
    print(test_gold_en.head())
    
    return train_data_en, train_gold_en, test_data_en, test_gold_en


In [17]:
train_data_en, train_gold_en, test_data_en, test_gold_en = hearest_preprocessing(train_data_en_data, 
                                                                                 train_gold_en_data, 
                                                                                 test_data_en_data, 
                                                                                 test_gold_en_data)

0          My favorite <extra_id_0> is blackfly
1          My favorite <extra_id_0> is Turonian
2        My favorite <extra_id_0> is abhorrence
3    My favorite <extra_id_0> is tropical storm
4    My favorite <extra_id_0> is militarization
Name: term, dtype: object
0                           homopterous insect, insect
1    technical specification, geologic timescale, p...
2                      distaste, hatred, hate, disgust
3    atmosphere, windstorm, violent storm, air curr...
4                                       social control
Name: hypernym, dtype: object
0    My favorite <extra_id_0> is maliciousness
1          My favorite <extra_id_0> is buckler
2        My favorite <extra_id_0> is spelunker
3     My favorite <extra_id_0> is quo warranto
4     My favorite <extra_id_0> is Jeff Francis
Name: term, dtype: object
0       malevolence, distaste, hatred, hate, malignity
1                                           body armor
2                    exploration, adventurer, explorer
3  

# Model training

In [18]:
train_dataset = PairsDataset(tokenizer(train_data_en.tolist()), tokenizer(train_gold_en.tolist()))
test_dataset = PairsDataset(tokenizer(test_data_en.tolist()), tokenizer(test_gold_en.tolist()))
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

args = TrainingArguments(output_dir="t5-finetuned-large", 
                         num_train_epochs=15, 
                         per_device_train_batch_size=16, save_steps=10000)

trainer = Trainer(
    model = model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator
)

In [19]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,1.0109
1000,0.6647


TrainOutput(global_step=1410, training_loss=0.7692533885333555, metrics={'train_runtime': 462.1099, 'train_samples_per_second': 48.69, 'train_steps_per_second': 3.051, 'total_flos': 1067740434432000.0, 'train_loss': 0.7692533885333555, 'epoch': 15.0})

In [20]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tue Apr  4 10:34:05 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:1B:00.0 Off |                  Off |
|  0%   34C    P8    27W / 460W |  24253MiB / 24564MiB |      0%      Default |
|                               |            

# EVALUATION 

In [21]:
test_pred_en=[]
for i_2, j_2 in tqdm(zip(test_data_en.tolist(), test_gold_en.tolist())):
        input_ids = tokenizer.encode(i_2, return_tensors="pt")
        output_batch = model.generate(input_ids.cuda(), 
                                      no_repeat_ngram_size=2, 
                                      max_new_tokens=2048, 
                                      num_return_sequences=50, num_beams=50, early_stopping=True, 
                                      num_beam_groups=5, 
                                      diversity_penalty=1.0)
        decoded_list = []
        for outputs in output_batch:
            decoded = tokenizer.decode(outputs, skip_special_tokens=True).split(", ")
            decoded_list.extend(decoded)

        sorted_predicted_answer = [i[0] for i in Counter(decoded_list).most_common()]
        
        test_pred_en.append(sorted_predicted_answer)
        

1500it [09:44,  2.56it/s]


In [22]:
name  = 'ft_hearst_no_s.txt'

test_pred_en_df = []
for i in test_pred_en:
    test_pred_en_df.append('\t'.join(i))


test_pred_en_df = pd.DataFrame(test_pred_en_df)
test_pred_en_df.to_csv(name, header=None, index=None)

In [27]:
!python scorer.py /home/jovyan/work/SemEval2018-Task9/test/gold/1A.english.test.gold.txt ft_hearst.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
MRR: 0.407305897805898
MAP: 0.2558429445862783
P@1: 0.32
P@3: 0.23633333333333315
P@5: 0.23217777777777596
P@15: 0.27171738446738397


In [23]:
!python scorer.py /home/jovyan/work/SemEval2018-Task9/test/gold/1A.english.test.gold.txt ft_hearst_no_s.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
MRR: 0.4046078847078844
MAP: 0.2517983607750273
P@1: 0.32066666666666666
P@3: 0.22888888888888842
P@5: 0.2240999999999988
P@15: 0.27106337736337677


In [24]:
pd.read_csv('/home/jovyan/work/prefix/metrics_table.csv')

Unnamed: 0,prefix,MRR,MAP,P@1,P@3,P@5,P@15
0,"find hypernyms for hyponym: [CHILD] , target: ...",0.41024,0.2557,0.32267,0.22956,0.22899,0.27718
1,hyponym: [CHILD] | hypernyms: [PARENTS],0.39409,0.25061,0.30867,0.22044,0.21683,0.28208
2,what are hypernyms for hyponym [CHILD] ? [PARE...,0.37119,0.24918,0.28267,0.21067,0.21613,0.29124
3,question: what are hypernyms for hyponym [CHIL...,0.41452,0.25872,0.33333,0.23033,0.23152,0.281
4,question: what are hypernyms for hyponym [CHIL...,0.4063,0.26081,0.32267,0.22644,0.23028,0.28664
5,question: what are hypernyms for hyponym [CHIL...,0.40687,0.26098,0.32333,0.22667,0.23041,0.28675


## predict strategy 2

In [30]:
import re
regex = "[a-zA-Z]+"


def _filter(output, end_token='<extra_id_1>'):
        # The first token is <unk> (inidex at 0) and the second token is <extra_id_0> (indexed at 32099)
        _txt = tokenizer.decode(output[2:], skip_special_tokens=False, clean_up_tokenization_spaces=False)
        if end_token in _txt:
            _end_token_index = _txt.index(end_token)
            return _txt[:_end_token_index]
        else:
            return _txt

        
def predict_token(text):
    
    encoded = tokenizer.encode_plus(text, add_special_tokens=False, return_tensors='pt')
    input_ids = encoded['input_ids'].to(device)

    # Generaing 20 sequences with maximum length set to 10
    outputs = model.generate(input_ids=input_ids.cuda(), 
                              num_beams=20, num_return_sequences=20,
                              max_length=10)

    _0_index = text.index('<extra_id_0>')
    _result_prefix = text[:_0_index]
    _result_suffix = text[_0_index+12:]  # 12 is the length of <extra_id_0>
    
    results = list(map(_filter, outputs))
    results = [test_string.replace(',' , '')for test_string in results]
    
    return results




def predict(test_data_en, test_gold_en):
    
#   make predictions for each hyponyms
    test_pred_en=[]
    for text in tqdm(test_data_en.tolist()):
        pred_masked_token = predict_token(text)
        test_pred_en.append('\t'.join(pred_masked_token))
            
#   make txt format
    name  = 'ft2_hearst.txt'

    test_pred_en_df = pd.DataFrame(test_pred_en)
    test_pred_en_df.to_csv(name, header=None, index=None)

In [31]:
predict(test_data_en, test_gold_en)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [08:40<00:00,  2.88it/s]


In [32]:
!python scorer.py /home/jovyan/work/SemEval2018-Task9/test/gold/1A.english.test.gold.txt ft2_hearst.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
MRR: 0.002888888888888889
MAP: 0.0007845833179166514
P@1: 0.0026666666666666666
P@3: 0.0011111111111111111
P@5: 0.0006666666666666666
P@15: 0.00046666666666666666
