In [1]:
!nvidia-smi

Fri Mar 31 17:31:01 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:1B:00.0 Off |                  Off |
|  0%   39C    P8    26W / 460W |      8MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:1C:00.0 Off |                  N/A |
| 27%   34C    P8    14W / 260W |   9065MiB / 11264MiB |      0%      Default |
|       

In [2]:
!pip install torch
!pip install transformers

Collecting torch
  Downloading torch-2.0.0-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting nvidia-cublas-cu11==11.10.3.66
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting nvidia-cusolver-cu11==11.4.0.1
  Downloading nvidia_cusolver_cu11-11.4.0.1-2-py3-none-manylinux1_x86_64.whl (102.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.6/102.6 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting triton==2.0.0
  Downloading triton-2.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (63.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.3/63.3 MB[0m [31m16.7 MB/s

In [3]:
import numpy as np
import torch

from transformers import T5ForConditionalGeneration, AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig


from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
from transformers.file_utils import cached_property
from typing import Tuple
from sklearn.model_selection import train_test_split
import gc
from tqdm.auto import tqdm, trange


import pandas as pd
from sklearn.utils import shuffle

from collections import Counter
from tqdm import tqdm

# model

In [4]:
from get_cuda_device import get_cuda_command

get_cuda_command('cuda:0')

'os.environ["CUDA_VISIBLE_DEVICES"]="0"'

In [5]:
import os 
os.environ["CUDA_VISIBLE_DEVICES"]="0"


if torch.cuda.is_available():
    device='cuda'
    print('GPU')
else:
    device='cpu'
    print('CPU')
    
    
SEED = 0
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

GPU


In [6]:
from transformers import TrainingArguments, Trainer

model_checkpoint = "t5-large"
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [7]:
class PairsDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __getitem__(self, idx):
        assert idx <= len(self.x['input_ids']), (idx, len(self.x['input_ids']))
        item = {key: val[idx] for key, val in self.x.items()}
        item['decoder_attention_mask'] = self.y['attention_mask'][idx]
        item['labels'] = self.y['input_ids'][idx]
        return item
    
    @property
    def n(self):
        return len(self.x['input_ids'])

    def __len__(self):
        return self.n

In [8]:
from typing import List, Dict, Union

class DataCollatorWithPadding:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        batch = self.tokenizer.pad(
            features,
            padding=True,
        )
        ybatch = self.tokenizer.pad(
            {'input_ids': batch['labels'], 'attention_mask': batch['decoder_attention_mask']},
            padding=True,
        ) 
        batch['labels'] = ybatch['input_ids']
        batch['decoder_attention_mask'] = ybatch['attention_mask']
        
        return {k: torch.tensor(v) for k, v in batch.items()}

In [9]:
def cleanup():
    gc.collect()
    torch.cuda.empty_cache()
    
cleanup()

In [10]:
def evaluate_model(model, test_dataloader):
    num = 0
    den = 0

    for batch in test_dataloader:
        with torch.no_grad():
            loss = model(**{k: v.to(model.device) for k, v in batch.items()}).loss
            num += len(batch) * loss.item()
            den += len(batch)
    val_loss = num / den
    return val_loss

# **Read data SemEval2018-Task9**

In [11]:
path = os.getcwd().replace('prefix', '')

In [18]:
path_data_en = path+"SemEval2018-Task9/training/data/1A.english.training.data.txt"
path_gold_en = path+"SemEval2018-Task9/training/gold/1A.english.training.gold.txt"

train_data_en_data = pd.read_csv(path_data_en, header=None, sep="\t", names=['term', 'relation'])
train_gold_en_data = pd.read_csv(path_gold_en, header=None, names=['hypernym'])

train_data_en_data.head()

Unnamed: 0,term,relation
0,blackfly,Concept
1,Turonian,Entity
2,abhorrence,Concept
3,tropical storm,Concept
4,militarization,Concept


In [19]:
path_test_data_en = path+"SemEval2018-Task9/test/data/1A.english.test.data.txt"
path_test_gold_en = path+"SemEval2018-Task9/test/gold/1A.english.test.gold.txt"

test_data_en_data = pd.read_csv(path_test_data_en, header=None, sep="\t", names=['term', 'relation'])
test_gold_en_data = pd.read_csv(path_test_gold_en, header=None, names=['hypernym'])

In [20]:
def embading_find_hyponyms_uppercase(train_features, train_target, test_features, test_target):
    
    train_data_en = train_features.copy()
    train_data_en = 'find hypernyms for hyponym: ' + train_data_en.term + ' , target:'
    print(train_data_en.head())

    train_gold_en = train_target.copy()
    train_gold_en = train_gold_en.hypernym.str.split('\t').str.join(', ')
    print(train_gold_en.head())

    test_data_en = test_features.copy()
    test_data_en = 'find hypernyms for hyponym: ' + test_data_en.term + ' , target:'

    test_gold_en = test_target.copy()
    test_gold_en = test_gold_en.hypernym.str.split('\t').str.join(', ')
    
    return train_data_en, train_gold_en, test_data_en, test_gold_en

In [44]:
train_data_en, train_gold_en, test_data_en, test_gold_en = embading_find_hyponyms_uppercase(train_data_en_data, 
                                                                                 train_gold_en_data, 
                                                                                 test_data_en_data, 
                                                                                 test_gold_en_data)

0       find hypernyms for hyponym: blackfly , target:
1       find hypernyms for hyponym: Turonian , target:
2     find hypernyms for hyponym: abhorrence , target:
3    find hypernyms for hyponym: tropical storm , t...
4    find hypernyms for hyponym: militarization , t...
Name: term, dtype: object
0                           homopterous insect, insect
1    technical specification, geologic timescale, p...
2                      distaste, hatred, hate, disgust
3    atmosphere, windstorm, violent storm, air curr...
4                                       social control
Name: hypernym, dtype: object


# Model training embading_find_hyponyms_uppercase

In [45]:
train_dataset = PairsDataset(tokenizer(train_data_en.tolist()), tokenizer(train_gold_en.tolist()))
test_dataset = PairsDataset(tokenizer(test_data_en.tolist()), tokenizer(test_gold_en.tolist()))
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

args = TrainingArguments(output_dir="t5-finetuned-large", 
                         num_train_epochs=16, 
                         per_device_train_batch_size=16, save_steps=10000)

trainer = Trainer(
    model = model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator
)

In [46]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.9678
1000,0.6436
1500,0.5745


TrainOutput(global_step=1504, training_loss=0.7283922998194999, metrics={'train_runtime': 518.6368, 'train_samples_per_second': 46.275, 'train_steps_per_second': 2.9, 'total_flos': 2255860881408000.0, 'train_loss': 0.7283922998194999, 'epoch': 16.0})

In [47]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fri Mar 31 15:35:55 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:1B:00.0 Off |                  Off |
|100%   63C    P2   312W / 460W |  21307MiB / 24564MiB |     14%      Default |
|                               |            

# EVALUATION 

In [25]:
def predict(test_data_en, test_gold_en):
    test_pred_en=[]
    for i_2, j_2 in tqdm(zip(test_data_en.tolist(), test_gold_en.tolist())):
            input_ids = tokenizer.encode(i_2, return_tensors="pt")
            output_batch = model.generate(input_ids.cuda(), no_repeat_ngram_size=2, max_new_tokens=2048, 
                                          num_return_sequences=50, num_beams=50, early_stopping=True, 
                                          num_beam_groups=5, 
                                          diversity_penalty=1.0)
            decoded_list = []
            for outputs in output_batch:
                decoded = tokenizer.decode(outputs, skip_special_tokens=True).split(", ")
                decoded_list.extend(decoded)

            sorted_predicted_answer = [i[0] for i in Counter(decoded_list).most_common()]

            test_pred_en.append(sorted_predicted_answer)
    return test_pred_en
        

In [49]:
test_pred_en = predict(test_data_en, test_gold_en)

1500it [09:21,  2.67it/s]


In [58]:
name  = 'find_pred.txt'

find_pred = []
for i in test_pred_en:
    find_pred.append('\t'.join(i))


test_pred_en_df = pd.DataFrame(find_pred)
test_pred_en_df.to_csv(name, header=None, index=None)

In [12]:
def answers(str_ans):
    columns_name = []
    values = []
    for ind, metrics in enumerate(str_ans.split('\n')):
        if ind == 6:
            _name = 'prefix'
            number = metrics
        else:
            _name, number = metrics.split(' ')
            number = round(float(number), 5)
            _name = _name[:-1]
        
        columns_name.append(_name)
        values.append([number])
        
    
        
    df = pd.DataFrame(values).T
    df.columns = columns_name
    df.set_index('prefix', inplace=True)
    return df

In [13]:
from contextlib import redirect_stdout
import io
from tqdm import tqdm
from IPython.display import clear_output

f = io.StringIO()
with redirect_stdout(f):
    !python scorer.py /home/jovyan/work/SemEval2018-Task9/test/gold/1A.english.test.gold.txt find_pred.txt


_std_out = f.getvalue()
_std_out = _std_out  + 'find hypernyms for hyponym: [CHILD] , target: [PARENTS]'
embading_find_hyponyms_uppercase_table = answers(_std_out)
embading_find_hyponyms_uppercase_table

Unnamed: 0_level_0,MRR,MAP,P@1,P@3,P@5,P@15
prefix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"find hypernyms for hyponym: [CHILD] , target: [PARENTS]",0.41024,0.2557,0.32267,0.22956,0.22899,0.27718


In [25]:
# name  = 'test_pred_en_2v_14_02.txt'

# test_pred_en_df = []
# for i in test_pred_en:
#     test_pred_en_df.append('\t'.join(i))


# test_pred_en_df = pd.DataFrame(test_pred_en_df)
# test_pred_en_df.to_csv(name, header=None, index=None)

In [79]:
cleanup()

In [80]:
del trainer

In [81]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fri Mar 31 15:57:28 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:1B:00.0 Off |                  Off |
|  0%   41C    P2   115W / 460W |   9665MiB / 24564MiB |      0%      Default |
|                               |            

# Standard emabdding

In [89]:
def standard_preprocessing(train_features, train_target, test_features, test_target):
    
    train_data_en = train_features.copy()
    train_data_en = 'hyponym: ' + train_data_en.term + ' | hypernyms:'
    print(train_data_en.head())

    train_gold_en = train_target.copy()
    train_gold_en = train_gold_en.hypernym.str.split('\t').str.join(', ')
    print(train_gold_en.head())
    
    test_data_en = test_features.copy()
    test_data_en = 'hyponym: ' + test_data_en.term + ' | hypernyms:'
    print(test_data_en.head())

    test_gold_en = test_target.copy()
    test_gold_en = test_gold_en.hypernym.str.split('\t').str.join(', ')
    print(test_gold_en.head())
    
    return train_data_en, train_gold_en, test_data_en, test_gold_en

train_data_en, train_gold_en, test_data_en, test_gold_en = standard_preprocessing(train_data_en_data, 
                                                                                 train_gold_en_data, 
                                                                                 test_data_en_data, 
                                                                                 test_gold_en_data)

0          hyponym: blackfly | hypernyms:
1          hyponym: Turonian | hypernyms:
2        hyponym: abhorrence | hypernyms:
3    hyponym: tropical storm | hypernyms:
4    hyponym: militarization | hypernyms:
Name: term, dtype: object
0                           homopterous insect, insect
1    technical specification, geologic timescale, p...
2                      distaste, hatred, hate, disgust
3    atmosphere, windstorm, violent storm, air curr...
4                                       social control
Name: hypernym, dtype: object
0    hyponym: maliciousness | hypernyms:
1          hyponym: buckler | hypernyms:
2        hyponym: spelunker | hypernyms:
3     hyponym: quo warranto | hypernyms:
4     hyponym: Jeff Francis | hypernyms:
Name: term, dtype: object
0       malevolence, distaste, hatred, hate, malignity
1                                           body armor
2                    exploration, adventurer, explorer
3    proceedings, legal proceedings, proceeding, du...
4       

In [None]:
train_dataset = PairsDataset(tokenizer(train_data_en.tolist()), tokenizer(train_gold_en.tolist()))
test_dataset = PairsDataset(tokenizer(test_data_en.tolist()), tokenizer(test_gold_en.tolist()))
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

args = TrainingArguments(output_dir="t5-finetuned-large", 
                         num_train_epochs=16, 
                         per_device_train_batch_size=16, save_steps=10000)

trainer = Trainer(
    model = model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator
)

In [91]:
trainer.train()



Step,Training Loss
500,0.5425
1000,0.463
1500,0.4169


TrainOutput(global_step=1504, training_loss=0.4741376611463567, metrics={'train_runtime': 514.9895, 'train_samples_per_second': 46.603, 'train_steps_per_second': 2.92, 'total_flos': 1849914513408000.0, 'train_loss': 0.4741376611463567, 'epoch': 16.0})

In [92]:
test_pred_en_standard = predict(test_data_en, test_gold_en)

1500it [14:04,  1.78it/s]


In [93]:
name  = 'test_pred_en_standard.txt'

test_pred_en_df = []
for i in test_pred_en_standard:
    test_pred_en_df.append('\t'.join(i))


test_pred_en_df = pd.DataFrame(test_pred_en_df)
test_pred_en_df.to_csv(name, header=None, index=None)

In [94]:
!python scorer.py /home/jovyan/work/SemEval2018-Task9/test/gold/1A.english.test.gold.txt test_pred_en_standard.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
MRR: 0.3940902930402933
MAP: 0.2506139995806665
P@1: 0.30866666666666664
P@3: 0.22044444444444422
P@5: 0.21683333333333213
P@15: 0.2820797202797196


In [14]:
f = io.StringIO()
with redirect_stdout(f):
    !python scorer.py /home/jovyan/work/SemEval2018-Task9/test/gold/1A.english.test.gold.txt test_pred_en_standard.txt


_std_out = f.getvalue()
_std_out = _std_out  + 'hyponym: [CHILD] | hypernyms: [PARENTS]'
embading_standard_table = answers(_std_out)
embading_standard_table

Unnamed: 0_level_0,MRR,MAP,P@1,P@3,P@5,P@15
prefix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
hyponym: [CHILD] | hypernyms: [PARENTS],0.39409,0.25061,0.30867,0.22044,0.21683,0.28208


In [98]:
cleanup()
del trainer

In [99]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fri Mar 31 16:48:46 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:1B:00.0 Off |                  Off |
|  0%   37C    P2    66W / 460W |   9609MiB / 24564MiB |     15%      Default |
|                               |            

# QandA_preprocessing

In [100]:
def QandA_preprocessing(train_features, train_target, test_features, test_target):
    
    train_data_en = train_features.copy()
    train_data_en = 'what are hypernyms for hyponym ' + train_data_en.term + ' ?'
    print(train_data_en.head())

    train_gold_en = train_target.copy()
    train_gold_en = train_gold_en.hypernym.str.split('\t').str.join(', ')
    print(train_gold_en.head())
    
    test_data_en = test_features.copy()
    test_data_en = 'what are hypernyms for hyponym '  + test_data_en.term + ' ?'
    print(test_data_en.head())

    test_gold_en = test_target.copy()
    test_gold_en = test_gold_en.hypernym.str.split('\t').str.join(', ')
    print(test_gold_en.head())
    
    return train_data_en, train_gold_en, test_data_en, test_gold_en

train_data_en, train_gold_en, test_data_en, test_gold_en = QandA_preprocessing(train_data_en_data, 
                                                                                 train_gold_en_data, 
                                                                                 test_data_en_data, 
                                                                                 test_gold_en_data)

0          what are hypernyms for hyponym blackfly ?
1          what are hypernyms for hyponym Turonian ?
2        what are hypernyms for hyponym abhorrence ?
3    what are hypernyms for hyponym tropical storm ?
4    what are hypernyms for hyponym militarization ?
Name: term, dtype: object
0                           homopterous insect, insect
1    technical specification, geologic timescale, p...
2                      distaste, hatred, hate, disgust
3    atmosphere, windstorm, violent storm, air curr...
4                                       social control
Name: hypernym, dtype: object
0    what are hypernyms for hyponym maliciousness ?
1          what are hypernyms for hyponym buckler ?
2        what are hypernyms for hyponym spelunker ?
3     what are hypernyms for hyponym quo warranto ?
4     what are hypernyms for hyponym Jeff Francis ?
Name: term, dtype: object
0       malevolence, distaste, hatred, hate, malignity
1                                           body armor
2       

In [101]:
train_dataset = PairsDataset(tokenizer(train_data_en.tolist()), tokenizer(train_gold_en.tolist()))
test_dataset = PairsDataset(tokenizer(test_data_en.tolist()), tokenizer(test_gold_en.tolist()))
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

args = TrainingArguments(output_dir="t5-finetuned-large", 
                         num_train_epochs=16, 
                         per_device_train_batch_size=16, save_steps=10000)

trainer = Trainer(
    model = model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator
)

In [102]:
trainer.train()



Step,Training Loss
500,0.402
1000,0.3409
1500,0.3022


TrainOutput(global_step=1504, training_loss=0.34835820320121785, metrics={'train_runtime': 515.1554, 'train_samples_per_second': 46.588, 'train_steps_per_second': 2.92, 'total_flos': 2052887697408000.0, 'train_loss': 0.34835820320121785, 'epoch': 16.0})

In [103]:
test_pred_en_QA = predict(test_data_en, test_gold_en)

1500it [18:46,  1.33it/s]


In [104]:
name  = 'test_pred_en_QA.txt'

test_pred_en_df = []
for i in test_pred_en_QA:
    test_pred_en_df.append('\t'.join(i))


test_pred_en_df = pd.DataFrame(test_pred_en_df)
test_pred_en_df.to_csv(name, header=None, index=None)

In [15]:
!python scorer.py /home/jovyan/work/SemEval2018-Task9/test/gold/1A.english.test.gold.txt test_pred_en_QA.txt

MRR: 0.3711945535945533
MAP: 0.24918376426043087
P@1: 0.2826666666666667
P@3: 0.21066666666666667
P@5: 0.21613333333333243
P@15: 0.29123732563732563


In [16]:
f = io.StringIO()
with redirect_stdout(f):
    !python scorer.py /home/jovyan/work/SemEval2018-Task9/test/gold/1A.english.test.gold.txt test_pred_en_QA.txt


_std_out = f.getvalue()
_std_out = _std_out  + 'what are hypernyms for hyponym [CHILD] ? [PARENTS]'
QA_table = answers(_std_out)
QA_table

Unnamed: 0_level_0,MRR,MAP,P@1,P@3,P@5,P@15
prefix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
what are hypernyms for hyponym [CHILD] ? [PARENTS],0.37119,0.24918,0.28267,0.21067,0.21613,0.29124


In [109]:
cleanup()

del trainer

!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fri Mar 31 17:19:28 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:1B:00.0 Off |                  Off |
| 30%   47C    P2    59W / 460W |   9537MiB / 24564MiB |      0%      Default |
|                               |            

# question: what are what are hypernyms for hyponym [hyponym] ? | answer: [hypernyms]


In [21]:
def QandA_preprocessing(train_features, train_target, test_features, test_target):
    
    train_data_en = train_features.copy()
    train_data_en = 'question: what are hypernyms for hyponym ' + train_data_en.term + ' ? | answer: '
    print(train_data_en.head())

    train_gold_en = train_target.copy()
    train_gold_en = train_gold_en.hypernym.str.split('\t').str.join(', ')
    print(train_gold_en.head())
    
    test_data_en = test_features.copy()
    test_data_en = 'question: what are hypernyms for hyponym '  + test_data_en.term + ' ? | answer: '
    print(test_data_en.head())

    test_gold_en = test_target.copy()
    test_gold_en = test_gold_en.hypernym.str.split('\t').str.join(', ')
    print(test_gold_en.head())
    
    return train_data_en, train_gold_en, test_data_en, test_gold_en

train_data_en, train_gold_en, test_data_en, test_gold_en = QandA_preprocessing(train_data_en_data, 
                                                                                 train_gold_en_data, 
                                                                                 test_data_en_data, 
                                                                                 test_gold_en_data)

0    question: what are hypernyms for hyponym black...
1    question: what are hypernyms for hyponym Turon...
2    question: what are hypernyms for hyponym abhor...
3    question: what are hypernyms for hyponym tropi...
4    question: what are hypernyms for hyponym milit...
Name: term, dtype: object
0                           homopterous insect, insect
1    technical specification, geologic timescale, p...
2                      distaste, hatred, hate, disgust
3    atmosphere, windstorm, violent storm, air curr...
4                                       social control
Name: hypernym, dtype: object
0    question: what are hypernyms for hyponym malic...
1    question: what are hypernyms for hyponym buckl...
2    question: what are hypernyms for hyponym spelu...
3    question: what are hypernyms for hyponym quo w...
4    question: what are hypernyms for hyponym Jeff ...
Name: term, dtype: object
0       malevolence, distaste, hatred, hate, malignity
1                                     

In [22]:
train_dataset = PairsDataset(tokenizer(train_data_en.tolist()), tokenizer(train_gold_en.tolist()))
test_dataset = PairsDataset(tokenizer(test_data_en.tolist()), tokenizer(test_gold_en.tolist()))
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

args = TrainingArguments(output_dir="t5-finetuned-large", 
                         num_train_epochs=16, 
                         per_device_train_batch_size=16, save_steps=10000)

trainer = Trainer(
    model = model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator
)

In [23]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.9285
1000,0.6414
1500,0.5715


TrainOutput(global_step=1504, training_loss=0.7136728069249619, metrics={'train_runtime': 519.467, 'train_samples_per_second': 46.201, 'train_steps_per_second': 2.895, 'total_flos': 2560320657408000.0, 'train_loss': 0.7136728069249619, 'epoch': 16.0})

In [26]:
test_pred_en_QA2 = predict(test_data_en, test_gold_en)

1500it [09:52,  2.53it/s]


In [27]:
name  = 'test_pred_en_QA2.txt'

test_pred_en_df = []
for i in test_pred_en_QA2:
    test_pred_en_df.append('\t'.join(i))


test_pred_en_df = pd.DataFrame(test_pred_en_df)
test_pred_en_df.to_csv(name, header=None, index=None)

In [28]:
f = io.StringIO()
with redirect_stdout(f):
    !python scorer.py /home/jovyan/work/SemEval2018-Task9/test/gold/1A.english.test.gold.txt test_pred_en_QA2.txt


_std_out = f.getvalue()
_std_out = _std_out  + 'question: what are hypernyms for hyponym [CHILD] ? | answer: [PARENTS]'
QA2_table = answers(_std_out)
QA2_table

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Unnamed: 0_level_0,MRR,MAP,P@1,P@3,P@5,P@15
prefix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
question: what are hypernyms for hyponym [CHILD] ? | answer: [PARENTS],0.4063,0.26081,0.32267,0.22644,0.23028,0.28664


In [30]:
test_pred_en_df[0].str.split('\t').apply(lambda x: x[:20]).str.join('\t').to_csv('pruned.txt', header=None, index=None)

In [31]:
!python scorer.py /home/jovyan/work/SemEval2018-Task9/test/gold/1A.english.test.gold.txt pruned.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
MRR: 0.40687333037333034
MAP: 0.26098189970523306
P@1: 0.3233333333333333
P@3: 0.22666666666666643
P@5: 0.2304111111111097
P@15: 0.2867542346542343


In [32]:
f = io.StringIO()
with redirect_stdout(f):
    !python scorer.py /home/jovyan/work/SemEval2018-Task9/test/gold/1A.english.test.gold.txt pruned.txt


_std_out = f.getvalue()
_std_out = _std_out  + 'question: what are hypernyms for hyponym [CHILD] ? | answer: [PARENTS] + select top20'
QA2_pruned = answers(_std_out)
QA2_pruned

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Unnamed: 0_level_0,MRR,MAP,P@1,P@3,P@5,P@15
prefix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
question: what are hypernyms for hyponym [CHILD] ? | answer: [PARENTS] + select top20,0.40687,0.26098,0.32333,0.22667,0.23041,0.28675


In [35]:
cleanup()

del trainer

!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fri Mar 31 17:57:34 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:1B:00.0 Off |                  Off |
| 30%   50C    P2    66W / 460W |   9681MiB / 24564MiB |      0%      Default |
|                               |            

# Result 

In [33]:
import numpy as np

In [34]:
def highlight_max(s, props=''):
    return np.where(s == np.nanmax(s.values), props, '')

def highlight_min(s, props=''):
    return np.where(s == np.nanmin(s.values), props, '')




metrics_table = pd.concat([embading_find_hyponyms_uppercase_table, 
                           embading_standard_table,
                           QA_table, 
                           QA2_table, 
                           QA2_pruned])


metrics_table.to_csv('metrics_table.csv')
metrics_table = metrics_table.style.apply(highlight_max, 
                                          props='color:white; background-color:#1FC29D', axis=0).apply(highlight_min, props='color:white; background-color:#FF5555', axis=0)
metrics_table


Unnamed: 0_level_0,MRR,MAP,P@1,P@3,P@5,P@15
prefix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"find hypernyms for hyponym: [CHILD] , target: [PARENTS]",0.41024,0.2557,0.32267,0.22956,0.22899,0.27718
hyponym: [CHILD] | hypernyms: [PARENTS],0.39409,0.25061,0.30867,0.22044,0.21683,0.28208
what are hypernyms for hyponym [CHILD] ? [PARENTS],0.37119,0.24918,0.28267,0.21067,0.21613,0.29124
question: what are hypernyms for hyponym [CHILD] ? | answer: [PARENTS],0.4063,0.26081,0.32267,0.22644,0.23028,0.28664
question: what are hypernyms for hyponym [CHILD] ? | answer: [PARENTS] + select top20,0.40687,0.26098,0.32333,0.22667,0.23041,0.28675
