In [1]:
!nvidia-smi

Mon Feb 27 10:11:35 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.86.01    Driver Version: 515.86.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:E1:00.0 Off |                  N/A |
|  0%   25C    P8    25W / 370W |      2MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install torch
!pip install transformers

Collecting torch
  Downloading torch-1.13.1-cp310-cp310-manylinux1_x86_64.whl (887.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.7.99
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cublas-cu11==11.10.3.66
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-cudnn-cu11==8.5.0.96
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl (557.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.1 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00

In [3]:
import numpy as np
import torch

from transformers import T5ForConditionalGeneration, AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig


from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
from transformers.file_utils import cached_property
from typing import Tuple
from sklearn.model_selection import train_test_split
import gc
from tqdm.auto import tqdm, trange


import pandas as pd
from sklearn.utils import shuffle

from collections import Counter
from tqdm import tqdm

# model

In [4]:
if torch.cuda.is_available():
    device='cuda:0'
    print('GPU')
else:
    device='cpu'
    print('CPU')
    
    
SEED = 0
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

GPU


In [5]:
from transformers import TrainingArguments, Trainer

model_checkpoint = "t5-large"
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [6]:
class PairsDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __getitem__(self, idx):
        assert idx <= len(self.x['input_ids']), (idx, len(self.x['input_ids']))
        item = {key: val[idx] for key, val in self.x.items()}
        item['decoder_attention_mask'] = self.y['attention_mask'][idx]
        item['labels'] = self.y['input_ids'][idx]
        return item
    
    @property
    def n(self):
        return len(self.x['input_ids'])

    def __len__(self):
        return self.n

In [7]:
from typing import List, Dict, Union

class DataCollatorWithPadding:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        batch = self.tokenizer.pad(
            features,
            padding=True,
        )
        ybatch = self.tokenizer.pad(
            {'input_ids': batch['labels'], 'attention_mask': batch['decoder_attention_mask']},
            padding=True,
        ) 
        batch['labels'] = ybatch['input_ids']
        batch['decoder_attention_mask'] = ybatch['attention_mask']
        
        return {k: torch.tensor(v) for k, v in batch.items()}

In [8]:
def cleanup():
    gc.collect()
    torch.cuda.empty_cache()
    
cleanup()

In [9]:
def evaluate_model(model, test_dataloader):
    num = 0
    den = 0

    for batch in test_dataloader:
        with torch.no_grad():
            loss = model(**{k: v.to(model.device) for k, v in batch.items()}).loss
            num += len(batch) * loss.item()
            den += len(batch)
    val_loss = num / den
    return val_loss

# **Read data SemEval2018-Task9**

In [10]:
path_data_en = "SemEval2018-Task9/training/data/1A.english.training.data.txt"
path_gold_en = "SemEval2018-Task9/training/gold/1A.english.training.gold.txt"

train_data_en_data = pd.read_csv(path_data_en, header=None, sep="\t", names=['term', 'relation'])
train_gold_en_data = pd.read_csv(path_gold_en, header=None, names=['hypernym'])

train_data_en_data.head()

Unnamed: 0,term,relation
0,blackfly,Concept
1,Turonian,Entity
2,abhorrence,Concept
3,tropical storm,Concept
4,militarization,Concept


In [11]:
path_test_data_en = "SemEval2018-Task9/test/data/1A.english.test.data.txt"
path_test_gold_en = "SemEval2018-Task9/test/gold/1A.english.test.gold.txt"

test_data_en_data = pd.read_csv(path_test_data_en, header=None, sep="\t", names=['term', 'relation'])
test_gold_en_data = pd.read_csv(path_test_gold_en, header=None, names=['hypernym'])

In [13]:
def embading_find_hyponyms_uppercase(train_features, train_target, test_features, test_target):
    
    train_data_en = train_features.copy()
    train_data_en = 'find hyponyms for hyperonym: ' + train_data_en.term + ' , target:'
    print(train_data_en.head())

    train_gold_en = train_target.copy()
    train_gold_en = train_gold_en.hypernym.str.split('\t').str.join(', ')
    print(train_gold_en.head())
    
    test_data_en = test_features.copy()
    test_data_en = 'find hyponyms for hyperonym: ' + test_data_en.term + ' , target:'
    print(test_data_en.head())

    test_gold_en = test_target.copy()
    test_gold_en = test_gold_en.hypernym.str.split('\t').str.join(', ')
    print(test_gold_en.head())
    
    return train_data_en, train_gold_en, test_data_en, test_gold_en

In [13]:

train_data_en, train_gold_en, test_data_en, test_gold_en = embading_find_hyponyms_uppercase(train_data_en_data, 
                                                                                 train_gold_en_data, 
                                                                                 test_data_en_data, 
                                                                                 test_gold_en_data)

# Model training embading_find_hyponyms_uppercase

In [12]:
# use only 1 time per session
model_checkpoint = "t5-large"
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, max_length=100, block_size=64)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [16]:
train_dataset = PairsDataset(tokenizer(train_data_en.tolist()), tokenizer(train_gold_en.tolist()))
test_dataset = PairsDataset(tokenizer(test_data_en.tolist()), tokenizer(test_gold_en.tolist()))
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

args = TrainingArguments(output_dir="t5-finetuned-large", 
                         num_train_epochs=16, 
                         per_device_train_batch_size=16, save_steps=10000)

trainer = Trainer(
    model = model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator
)

In [17]:
trainer.train()

***** Running training *****
  Num examples = 1500
  Num Epochs = 16
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1504
  Number of trainable parameters = 737668096
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.9522
1000,0.6498
1500,0.5794




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1504, training_loss=0.7269695131702626, metrics={'train_runtime': 576.4958, 'train_samples_per_second': 41.631, 'train_steps_per_second': 2.609, 'total_flos': 2052887697408000.0, 'train_loss': 0.7269695131702626, 'epoch': 16.0})

In [18]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Sun Feb 26 17:54:18 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.86.01    Driver Version: 515.86.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:E1:00.0 Off |                  N/A |
| 74%   53C    P2   179W / 370W |  24213MiB / 24576MiB |      0%      Default |
|                               |            

# EVALUATION 

In [13]:
def predict(test_data_en, test_gold_en):
    test_pred_en=[]
    for i_2, j_2 in tqdm(zip(test_data_en.tolist(), test_gold_en.tolist())):
            input_ids = tokenizer.encode(i_2, return_tensors="pt")
            output_batch = model.generate(input_ids.cuda(), no_repeat_ngram_size=2, max_new_tokens=2048, 
                                          num_return_sequences=50, num_beams=50, early_stopping=True, 
                                          num_beam_groups=5, 
                                          diversity_penalty=1.0)
            decoded_list = []
            for outputs in output_batch:
                decoded = tokenizer.decode(outputs, skip_special_tokens=True).split(", ")
                decoded_list.extend(decoded)

            sorted_predicted_answer = [i[0] for i in Counter(decoded_list).most_common()]

            test_pred_en.append(sorted_predicted_answer)
    return test_pred_en
        

In [24]:
test_pred_en = predict(test_data_en, test_gold_en)

0it [00:00, ?it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

1it [00:00,  2.42it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

2it [00:00,  3.06it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

3it [00:01,  2.53it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

4it [00:01,  2.84it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

5it [00:01,  3.01it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"

In [25]:
name  = 'test_pred_en_2v_14_02.txt'

test_pred_en_df = []
for i in test_pred_en:
    test_pred_en_df.append('\t'.join(i))


test_pred_en_df = pd.DataFrame(test_pred_en_df)
test_pred_en_df.to_csv(name, header=None, index=None)

In [13]:
!python /home/jovyan/work/debuged_task9-scorer.py /home/jovyan/work/SemEval2018-Task9/test/gold/1A.english.test.gold.txt test_pred_en_2v_14_02.txt

MRR: 0.4065315517815516
MAP: 0.25253802592469293
P@1: 0.32133333333333336
P@3: 0.22666666666666618
P@5: 0.22343333333333187
P@15: 0.27160429200429154


In [17]:
embading_find_hyponyms_uppercase = """
MRR: 0.4065315517815516
MAP: 0.25253802592469293
P@1: 0.32133333333333336
P@3: 0.22666666666666618
P@5: 0.22343333333333187
P@15: 0.27160429200429154
find hyponyms for hyperonym: ... , target:
"""

def answers(str_ans):
    columns_name = []
    values = []
    for ind, metrics in enumerate(str_ans.split('\n')[1:-1]):
        if ind == 6:
            _name = 'prefix'
            number = metrics
        else:
            _name, number = metrics.split(' ')
            number = round(float(number), 5)
            _name = _name[:-1]
        
        columns_name.append(_name)
        values.append([number])
        
    
        
    df = pd.DataFrame(values).T
    df.columns = columns_name
    df.set_index('prefix', inplace=True)
    return df
        
        

In [18]:
embading_find_hyponyms_uppercase_table = answers(embading_find_hyponyms_uppercase)
embading_find_hyponyms_uppercase_table

Unnamed: 0_level_0,MRR,MAP,P@1,P@3,P@5,P@15
prefix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"find hyponyms for hyperonym: ... , target:",0.40653,0.25254,0.32133,0.22667,0.22343,0.2716


In [137]:
cleanup()

In [141]:
del trainer

NameError: name 'trainer' is not defined

In [142]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Sun Feb 26 18:30:33 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.86.01    Driver Version: 515.86.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:E1:00.0 Off |                  N/A |
|  0%   24C    P8    24W / 370W |  12791MiB / 24576MiB |      0%      Default |
|                               |            

# Embaing find hponyms for hypernym + lower case

In [153]:
def embading_find_hyponyms_lowercase(train_features, train_target, test_features, test_target):
    
    train_data_en = train_features.copy()
    train_data_en = 'find hyponyms for hyperonym: ' + train_data_en.term.str.lower() + ' , target:'
    print(train_data_en.head())

    train_gold_en = train_target.copy()
    train_gold_en = train_gold_en.hypernym.str.split('\t').str.join(', ').str.lower() 
    print(train_gold_en.head())
    
    test_data_en = test_features.copy()
    test_data_en = 'find hyponyms for hyperonym: ' + test_data_en.term.str.lower() + ' , target:'
    print(test_data_en.head())

    test_gold_en = test_target.copy()
    test_gold_en = test_gold_en.hypernym.str.split('\t').str.join(', ').str.lower() 
    print(test_gold_en.head())
    
    return train_data_en, train_gold_en, test_data_en, test_gold_en

In [154]:
train_data_en, train_gold_en, test_data_en, test_gold_en = embading_find_hyponyms_lowercase(train_data_en_data, 
                                                                                 train_gold_en_data, 
                                                                                 test_data_en_data, 
                                                                                 test_gold_en_data)

0      find hyponyms for hyperonym: blackfly , target:
1      find hyponyms for hyperonym: turonian , target:
2    find hyponyms for hyperonym: abhorrence , target:
3    find hyponyms for hyperonym: tropical storm , ...
4    find hyponyms for hyperonym: militarization , ...
Name: term, dtype: object
0                           homopterous insect, insect
1    technical specification, geologic timescale, p...
2                      distaste, hatred, hate, disgust
3    atmosphere, windstorm, violent storm, air curr...
4                                       social control
Name: hypernym, dtype: object
0    find hyponyms for hyperonym: maliciousness , t...
1       find hyponyms for hyperonym: buckler , target:
2     find hyponyms for hyperonym: spelunker , target:
3    find hyponyms for hyperonym: quo warranto , ta...
4    find hyponyms for hyperonym: jeff francis , ta...
Name: term, dtype: object
0       malevolence, distaste, hatred, hate, malignity
1                                     

In [155]:
train_dataset = PairsDataset(tokenizer(train_data_en.tolist()), tokenizer(train_gold_en.tolist()))
test_dataset = PairsDataset(tokenizer(test_data_en.tolist()), tokenizer(test_gold_en.tolist()))
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

args = TrainingArguments(output_dir="t5-finetuned-large", 
                         num_train_epochs=16, 
                         per_device_train_batch_size=16, save_steps=10000)

trainer = Trainer(
    model = model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [156]:
trainer.train()

***** Running training *****
  Num examples = 1500
  Num Epochs = 16
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1504
  Number of trainable parameters = 737668096


Step,Training Loss
500,0.5518
1000,0.474
1500,0.4269




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1504, training_loss=0.48421862214169603, metrics={'train_runtime': 560.0755, 'train_samples_per_second': 42.851, 'train_steps_per_second': 2.685, 'total_flos': 2195544016896000.0, 'train_loss': 0.48421862214169603, 'epoch': 16.0})

In [159]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Sun Feb 26 18:45:17 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.86.01    Driver Version: 515.86.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:E1:00.0 Off |                  N/A |
|  0%   32C    P8    24W / 370W |  24251MiB / 24576MiB |      0%      Default |
|                               |            

In [160]:
test_pred_en_lower = predict(test_data_en, test_gold_en)

0it [00:00, ?it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

1it [00:00,  1.87it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

2it [00:00,  2.30it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

3it [00:01,  2.46it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

4it [00:01,  2.78it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

5it [00:01,  2.68it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"

In [None]:
name  = 'test_pred_en_lower.txt'

test_pred_en_df = []
for i in test_pred_en_lower:
    test_pred_en_df.append('\t'.join(i))


test_pred_en_df = pd.DataFrame(test_pred_en_df)
test_pred_en_df.to_csv(name, header=None, index=None)

In [30]:
!python /home/jovyan/work/debuged_task9-scorer.py /home/jovyan/work/SemEval2018-Task9/test/gold/1A.english.test.gold.txt test_pred_en_lower.txt

MRR: 0.38439761534761535
MAP: 0.24742700657367342
P@1: 0.298
P@3: 0.21433333333333338
P@5: 0.21696666666666548
P@15: 0.27681979686979663


In [20]:
embading_find_hyponyms_lowercase = """
MRR: 0.38439761534761535
MAP: 0.24742700657367342
P@1: 0.298
P@3: 0.21433333333333338
P@5: 0.21696666666666548
P@15: 0.27681979686979663
find hyponyms for hyperonym: ..., target: + all words lowercase'
""" 


embading_find_hyponyms_lowercase_table = answers(embading_find_hyponyms_lowercase)
embading_find_hyponyms_lowercase_table

Unnamed: 0_level_0,MRR,MAP,P@1,P@3,P@5,P@15
prefix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"find hyponyms for hyperonym: ..., target: + all words lowercase'",0.3844,0.24743,0.298,0.21433,0.21697,0.27682


In [168]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Sun Feb 26 19:00:18 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.86.01    Driver Version: 515.86.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:E1:00.0 Off |                  N/A |
|  0%   24C    P8    25W / 370W |  24251MiB / 24576MiB |      0%      Default |
|                               |            

In [169]:
cleanup()
del trainer

In [170]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Sun Feb 26 19:00:38 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.86.01    Driver Version: 515.86.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:E1:00.0 Off |                  N/A |
|  0%   28C    P3   133W / 370W |  12925MiB / 24576MiB |      0%      Default |
|                               |            

# Standard emabdding

In [192]:
def standard_preprocessing(train_features, train_target, test_features, test_target):
    
    train_data_en = train_features.copy()
    train_data_en = 'hyperonym: ' + train_data_en.term + ' | hyponyms:'
    print(train_data_en.head())

    train_gold_en = train_target.copy()
    train_gold_en = train_gold_en.hypernym.str.split('\t').str.join(', ')
    print(train_gold_en.head())
    
    test_data_en = test_features.copy()
    test_data_en = 'hyperonym: ' + test_data_en.term + ' | hyponyms:'
    print(test_data_en.head())

    test_gold_en = test_target.copy()
    test_gold_en = test_gold_en.hypernym.str.split('\t').str.join(', ')
    print(test_gold_en.head())
    
    return train_data_en, train_gold_en, test_data_en, test_gold_en

train_data_en, train_gold_en, test_data_en, test_gold_en = standard_preprocessing(train_data_en_data, 
                                                                                 train_gold_en_data, 
                                                                                 test_data_en_data, 
                                                                                 test_gold_en_data)

0          hyperonym: blackfly | hyponyms:
1          hyperonym: Turonian | hyponyms:
2        hyperonym: abhorrence | hyponyms:
3    hyperonym: tropical storm | hyponyms:
4    hyperonym: militarization | hyponyms:
Name: term, dtype: object
0                           homopterous insect, insect
1    technical specification, geologic timescale, p...
2                      distaste, hatred, hate, disgust
3    atmosphere, windstorm, violent storm, air curr...
4                                       social control
Name: hypernym, dtype: object
0    hyperonym: maliciousness | hyponyms:
1          hyperonym: buckler | hyponyms:
2        hyperonym: spelunker | hyponyms:
3     hyperonym: quo warranto | hyponyms:
4     hyperonym: Jeff Francis | hyponyms:
Name: term, dtype: object
0       malevolence, distaste, hatred, hate, malignity
1                                           body armor
2                    exploration, adventurer, explorer
3    proceedings, legal proceedings, proceeding, du..

In [193]:
train_dataset = PairsDataset(tokenizer(train_data_en.tolist()), tokenizer(train_gold_en.tolist()))
test_dataset = PairsDataset(tokenizer(test_data_en.tolist()), tokenizer(test_gold_en.tolist()))
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

args = TrainingArguments(output_dir="t5-finetuned-large", 
                         num_train_epochs=16, 
                         per_device_train_batch_size=16, save_steps=10000)

trainer = Trainer(
    model = model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [194]:
trainer.train()

***** Running training *****
  Num examples = 1500
  Num Epochs = 16
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1504
  Number of trainable parameters = 737668096


Step,Training Loss
500,0.3049
1000,0.2529
1500,0.2206




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1504, training_loss=0.2594152844332634, metrics={'train_runtime': 547.8558, 'train_samples_per_second': 43.807, 'train_steps_per_second': 2.745, 'total_flos': 1646941329408000.0, 'train_loss': 0.2594152844332634, 'epoch': 16.0})

In [195]:
test_pred_en_standard = predict(test_data_en, test_gold_en)

0it [00:00, ?it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

1it [00:00,  1.23it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

2it [00:01,  1.34it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

3it [00:02,  1.48it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

4it [00:02,  1.72it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

5it [00:03,  1.74it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"

In [21]:
name  = 'test_pred_en_standard.txt'

test_pred_en_df = []
for i in test_pred_en_standard:
    test_pred_en_df.append('\t'.join(i))


test_pred_en_df = pd.DataFrame(test_pred_en_df)
test_pred_en_df.to_csv(name, header=None, index=None)

In [29]:
!python /home/jovyan/work/debuged_task9-scorer.py /home/jovyan/work/SemEval2018-Task9/test/gold/1A.english.test.gold.txt test_pred_en_standard.txt

MRR: 0.36780024420024393
MAP: 0.2419887540854206
P@1: 0.2846666666666667
P@3: 0.20277777777777792
P@5: 0.20804444444444353
P@15: 0.2814029766529765


In [22]:
embading_standard = """
MRR: 0.36780024420024393
MAP: 0.2419887540854206
P@1: 0.2846666666666667
P@3: 0.20277777777777792
P@5: 0.20804444444444353
P@15: 0.2814029766529765
hyperonym: ... | hyponyms:
""" 


embading_standard_table = answers(embading_standard)
embading_standard_table

Unnamed: 0_level_0,MRR,MAP,P@1,P@3,P@5,P@15
prefix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
hyperonym: ... | hyponyms:,0.3678,0.24199,0.28467,0.20278,0.20804,0.2814


In [None]:
cleanup()
del trainer

In [199]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Sun Feb 26 20:02:54 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.86.01    Driver Version: 515.86.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:E1:00.0 Off |                  N/A |
|  0%   33C    P2   125W / 370W |  12925MiB / 24576MiB |      0%      Default |
|                               |            

# Embadding with term 

In [216]:
def emadding_with_term_and_find_prefix(train_features, train_target, test_features, test_target):
    
    train_data_en = train_features.copy()
    train_data_en = 'find hyponyms for hyperonym ' + train_data_en.term + \
    ' with relationship ' + train_data_en.relation.str.lower() + ' , target:'
    print(train_data_en.head())

    train_gold_en = train_target.copy()
    train_gold_en = train_gold_en.hypernym.str.split('\t').str.join(', ')
    print(train_gold_en.head())
    
    test_data_en = test_features.copy()
    test_data_en = 'find hyponyms for hyperonym ' + test_data_en.term + \
    ' with relationship ' + test_data_en.relation.str.lower() + ' , target:'
    print(test_data_en.head())

    test_gold_en = test_target.copy()
    test_gold_en = test_gold_en.hypernym.str.split('\t').str.join(', ')
    print(test_gold_en.head())
    
    return train_data_en, train_gold_en, test_data_en, test_gold_en

train_data_en, train_gold_en, test_data_en, test_gold_en = emadding_with_term_and_find_prefix(train_data_en_data, 
                                                                                 train_gold_en_data, 
                                                                                 test_data_en_data, 
                                                                                 test_gold_en_data)

0    find hyponyms for hyperonym blackfly with rela...
1    find hyponyms for hyperonym Turonian with rela...
2    find hyponyms for hyperonym abhorrence with re...
3    find hyponyms for hyperonym tropical storm wit...
4    find hyponyms for hyperonym militarization wit...
dtype: object
0                           homopterous insect, insect
1    technical specification, geologic timescale, p...
2                      distaste, hatred, hate, disgust
3    atmosphere, windstorm, violent storm, air curr...
4                                       social control
Name: hypernym, dtype: object
0    find hyponyms for hyperonym maliciousness with...
1    find hyponyms for hyperonym buckler with relat...
2    find hyponyms for hyperonym spelunker with rel...
3    find hyponyms for hyperonym quo warranto with ...
4    find hyponyms for hyperonym Jeff Francis with ...
dtype: object
0       malevolence, distaste, hatred, hate, malignity
1                                           body armor
2      

In [217]:
train_dataset = PairsDataset(tokenizer(train_data_en.tolist()), tokenizer(train_gold_en.tolist()))
test_dataset = PairsDataset(tokenizer(test_data_en.tolist()), tokenizer(test_gold_en.tolist()))
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

args = TrainingArguments(output_dir="t5-finetuned-large", 
                         num_train_epochs=16, 
                         per_device_train_batch_size=16, save_steps=10000)

trainer = Trainer(
    model = model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [218]:
trainer.train()

***** Running training *****
  Num examples = 1500
  Num Epochs = 16
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1504
  Number of trainable parameters = 737668096


Step,Training Loss
500,0.2177
1000,0.1727
1500,0.1459




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1504, training_loss=0.17872325755021673, metrics={'train_runtime': 565.7534, 'train_samples_per_second': 42.421, 'train_steps_per_second': 2.658, 'total_flos': 2255860881408000.0, 'train_loss': 0.17872325755021673, 'epoch': 16.0})

In [219]:
test_pred_en_with_term_find = predict(test_data_en, test_gold_en)

0it [00:00, ?it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

1it [00:00,  1.15it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

2it [00:01,  1.15it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

3it [00:02,  1.11it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

4it [00:03,  1.14it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

5it [00:04,  1.27it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"

In [221]:
name  = 'test_pred_en_with_term_find.txt'

test_pred_en_df = []
for i in test_pred_en_with_term_find:
    test_pred_en_df.append('\t'.join(i))


test_pred_en_df = pd.DataFrame(test_pred_en_df)
test_pred_en_df.to_csv(name, header=None, index=None)

In [28]:
!python /home/jovyan/work/debuged_task9-scorer.py /home/jovyan/work/SemEval2018-Task9/test/gold/1A.english.test.gold.txt test_pred_en_with_term_find.txt

MRR: 0.3551455766455761
MAP: 0.23321399933399914
P@1: 0.258
P@3: 0.19966666666666685
P@5: 0.2013222222222211
P@15: 0.27276278721278713


In [25]:
embading_with_term_find = """
MRR: 0.3551455766455761
MAP: 0.23321399933399914
P@1: 0.258
P@3: 0.19966666666666685
P@5: 0.2013222222222211
P@15: 0.27276278721278713
find hyponyms for hyperonym ... with relationship [Cncept, Entity], target:
""" 


embading_with_term_find_table = answers(embading_with_term_find)
embading_with_term_find_table

Unnamed: 0_level_0,MRR,MAP,P@1,P@3,P@5,P@15
prefix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"find hyponyms for hyperonym ... with relationship [Cncept, Entity], target:",0.35515,0.23321,0.258,0.19967,0.20132,0.27276


# Increase the size of batch with standart embadding

In [19]:
!nvidia-smi

Mon Feb 27 10:24:15 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.86.01    Driver Version: 515.86.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:E1:00.0 Off |                  N/A |
|  0%   25C    P8    25W / 370W |   6393MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [62]:
def QandA_preprocessing(train_features, train_target, test_features, test_target):
    
    train_data_en = train_features.copy()
    train_data_en = 'question: what are hyponyms for hyperonym ' + train_data_en.term + ' ?' + ' | answer: _20_'
    print(train_data_en.head())

    train_gold_en = train_target.copy()
    train_gold_en = train_gold_en.hypernym.str.split('\t').str.join(', ')
    print(train_gold_en.head())
    
    test_data_en = test_features.copy()
    test_data_en = 'question: what are hyponyms for hyperonym ' + test_data_en.term + ' ?' + ' | answer: _20_' 
    print(test_data_en.head())

    test_gold_en = test_target.copy()
    test_gold_en = test_gold_en.hypernym.str.split('\t').str.join(', ')
    print(test_gold_en.head())
    
    return train_data_en, train_gold_en, test_data_en, test_gold_en

train_data_en, train_gold_en, test_data_en, test_gold_en = QandA_preprocessing(train_data_en_data, 
                                                                                 train_gold_en_data, 
                                                                                 test_data_en_data, 
                                                                                 test_gold_en_data)

0    question: what are hyponyms for hyperonym blac...
1    question: what are hyponyms for hyperonym Turo...
2    question: what are hyponyms for hyperonym abho...
3    question: what are hyponyms for hyperonym trop...
4    question: what are hyponyms for hyperonym mili...
Name: term, dtype: object
0                           homopterous insect, insect
1    technical specification, geologic timescale, p...
2                      distaste, hatred, hate, disgust
3    atmosphere, windstorm, violent storm, air curr...
4                                       social control
Name: hypernym, dtype: object
0    question: what are hyponyms for hyperonym mali...
1    question: what are hyponyms for hyperonym buck...
2    question: what are hyponyms for hyperonym spel...
3    question: what are hyponyms for hyperonym quo ...
4    question: what are hyponyms for hyperonym Jeff...
Name: term, dtype: object
0       malevolence, distaste, hatred, hate, malignity
1                                     

In [63]:
train_dataset = PairsDataset(tokenizer(train_data_en.tolist()), tokenizer(train_gold_en.tolist()))
test_dataset = PairsDataset(tokenizer(test_data_en.tolist()), tokenizer(test_gold_en.tolist()))
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

args = TrainingArguments(output_dir="t5-finetuned-large", 
                         num_train_epochs=16, 
                         per_device_train_batch_size=16, save_steps=10000)

trainer = Trainer(
    model = model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [64]:
trainer.train()

***** Running training *****
  Num examples = 1500
  Num Epochs = 16
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1504
  Number of trainable parameters = 737668096


Step,Training Loss
500,0.3988
1000,0.3383
1500,0.2994




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1504, training_loss=0.34549561824570313, metrics={'train_runtime': 565.5631, 'train_samples_per_second': 42.436, 'train_steps_per_second': 2.659, 'total_flos': 2763293841408000.0, 'train_loss': 0.34549561824570313, 'epoch': 16.0})

In [68]:
test_pred_en_QA = predict(test_data_en, test_gold_en)

0it [00:00, ?it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

1it [00:00,  1.76it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

2it [00:01,  1.78it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

3it [00:02,  1.40it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

4it [00:02,  1.77it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

5it [00:02,  1.70it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"

In [26]:
name  = 'test_pred_en_QA.txt'

test_pred_en_df = []
for i in test_pred_en_QA:
    test_pred_en_df.append('\t'.join(i))


test_pred_en_df = pd.DataFrame(test_pred_en_df)
test_pred_en_df.to_csv(name, header=None, index=None)

In [27]:
!python /home/jovyan/work/debuged_task9-scorer.py /home/jovyan/work/SemEval2018-Task9/test/gold/1A.english.test.gold.txt test_pred_en_QA.txt

MRR: 0.39486768601768607
MAP: 0.25371332383665746
P@1: 0.31
P@3: 0.21866666666666673
P@5: 0.21817777777777636
P@15: 0.29226473341473297


In [117]:
test_pred_en_df[0].str.split('\t').apply(lambda x: x[:20]).str.join('\t').to_csv('pruned.txt', header=None, index=None)

In [118]:
!python debuged_task9-scorer.py SemEval2018-Task9/test/gold/1A.english.test.gold.txt pruned.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
MRR: 0.3987962574462576
MAP: 0.25522200108533477
P@1: 0.31466666666666665
P@3: 0.2202222222222222
P@5: 0.21941111111110967
P@15: 0.293409971509971


In [72]:
test_pred_en_QA = """
MRR: 0.39486768601768607
MAP: 0.25371332383665746
P@1: 0.31
P@3: 0.21866666666666673
P@5: 0.21817777777777636
P@15: 0.29226473341473297
Q&A
""" 


standart_embading_with_batch_size_32_table = answers(test_pred_en_QA)
standart_embading_with_batch_size_32_table

Unnamed: 0_level_0,MRR,MAP,P@1,P@3,P@5,P@15
embadding,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Q&A,0.39487,0.25371,0.31,0.21867,0.21818,0.29226


In [119]:
test_pred_en_QA_pruned = """
MRR: 0.3987962574462576
MAP: 0.25522200108533477
P@1: 0.31466666666666665
P@3: 0.2202222222222222
P@5: 0.21941111111110967
P@15: 0.293409971509971
Q&A_Pruned
""" 


standart_embading_QA_pruned = answers(test_pred_en_QA_pruned)
standart_embading_QA_pruned

Unnamed: 0_level_0,MRR,MAP,P@1,P@3,P@5,P@15
embadding,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Q&A_Pruned,0.3988,0.25522,0.31467,0.22022,0.21941,0.29341


In [73]:
cleanup()

del trainer

!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Mon Feb 27 12:50:41 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.86.01    Driver Version: 515.86.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:E1:00.0 Off |                  N/A |
|  0%   29C    P2    70W / 370W |  12895MiB / 24576MiB |      0%      Default |
|                               |            

# Pruning embadding

In [148]:
def QA_with_closing_s(train_features, train_target, test_features, test_target):
    
    train_data_en = train_features.copy()
    train_data_en = 'answer question: what are hyponyms for hyperonym: ' + train_data_en.term + ' </s>"' + ' , target:'
    print(train_data_en.head())

    train_gold_en = train_target.copy()
    train_gold_en = train_gold_en.hypernym.str.split('\t').str.join(', ')
    print(train_gold_en.head())
    
    test_data_en = test_features.copy()
    test_data_en = 'answer question: what are hyponyms for hyperonym: ' + test_features.term + ' </s>"' + ' , target:'
    print(test_data_en.head())

    test_gold_en = test_target.copy()
    test_gold_en = test_gold_en.hypernym.str.split('\t').str.join(', ')
    print(test_gold_en.head())
    
    return train_data_en, train_gold_en, test_data_en, test_gold_en

In [149]:
train_data_en, train_gold_en, test_data_en, test_gold_en = QA_with_closing_s(train_data_en_data, 
                                                                                 train_gold_en_data, 
                                                                                 test_data_en_data, 
                                                                                 test_gold_en_data)

0    answer question: what are hyponyms for hyperon...
1    answer question: what are hyponyms for hyperon...
2    answer question: what are hyponyms for hyperon...
3    answer question: what are hyponyms for hyperon...
4    answer question: what are hyponyms for hyperon...
Name: term, dtype: object
0                           homopterous insect, insect
1    technical specification, geologic timescale, p...
2                      distaste, hatred, hate, disgust
3    atmosphere, windstorm, violent storm, air curr...
4                                       social control
Name: hypernym, dtype: object
0    answer question: what are hyponyms for hyperon...
1    answer question: what are hyponyms for hyperon...
2    answer question: what are hyponyms for hyperon...
3    answer question: what are hyponyms for hyperon...
4    answer question: what are hyponyms for hyperon...
Name: term, dtype: object
0       malevolence, distaste, hatred, hate, malignity
1                                     

In [150]:
train_dataset = PairsDataset(tokenizer(train_data_en.tolist()), tokenizer(train_gold_en.tolist()))
test_dataset = PairsDataset(tokenizer(test_data_en.tolist()), tokenizer(test_gold_en.tolist()))
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

args = TrainingArguments(output_dir="t5-finetuned-large", 
                         num_train_epochs=16, 
                         per_device_train_batch_size=16, save_steps=10000)

trainer = Trainer(
    model = model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [151]:
trainer.train()

***** Running training *****
  Num examples = 1500
  Num Epochs = 16
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1504
  Number of trainable parameters = 737668096


Step,Training Loss
500,0.2039
1000,0.1649
1500,0.1387




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1504, training_loss=0.16911852993863694, metrics={'train_runtime': 565.9416, 'train_samples_per_second': 42.407, 'train_steps_per_second': 2.658, 'total_flos': 2661807249408000.0, 'train_loss': 0.16911852993863694, 'epoch': 16.0})

In [152]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Mon Feb 27 13:54:49 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.86.01    Driver Version: 515.86.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:E1:00.0 Off |                  N/A |
| 74%   53C    P2   255W / 370W |  24243MiB / 24576MiB |      0%      Default |
|                               |            

In [154]:
test_pred_en_QA_with_closing_s = predict(test_data_en, test_gold_en)

0it [00:00, ?it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

1it [00:00,  1.43it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

2it [00:01,  1.56it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

3it [00:01,  1.52it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

4it [00:02,  1.67it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

5it [00:03,  1.52it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"

In [155]:
name  = 'QA_with_closing_s.txt'

test_pred_en_df = []
for i in test_pred_en_QA_with_closing_s:
    test_pred_en_df.append('\t'.join(i))


test_pred_en_df = pd.DataFrame(test_pred_en_df)
test_pred_en_df.to_csv(name, header=None, index=None)


!python debuged_task9-scorer.py SemEval2018-Task9/test/gold/1A.english.test.gold.txt QA_with_closing_s.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
MRR: 0.37523532208532173
MAP: 0.2431501325834664
P@1: 0.2826666666666667
P@3: 0.2108888888888888
P@5: 0.20878888888888772
P@15: 0.28244743404743383


In [156]:
QA_with_closing_s = """
MRR: 0.37523532208532173
MAP: 0.2431501325834664
P@1: 0.2826666666666667
P@3: 0.2108888888888888
P@5: 0.20878888888888772
P@15: 0.28244743404743383
QA_with_closing_s
""" 


QA_with_closing_s_table = answers(QA_with_closing_s)
QA_with_closing_s_table

Unnamed: 0_level_0,MRR,MAP,P@1,P@3,P@5,P@15
embadding,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
QA_with_closing_s,0.37524,0.24315,0.28267,0.21089,0.20879,0.28245


In [157]:
cleanup()

del trainer

!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Mon Feb 27 14:16:58 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.86.01    Driver Version: 515.86.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:E1:00.0 Off |                  N/A |
|  0%   35C    P2    96W / 370W |  13219MiB / 24576MiB |      0%      Default |
|                               |            

# Result 

In [186]:
def highlight_max(s, props=''):
    return np.where(s == np.nanmax(s.values), props, '')

def highlight_min(s, props=''):
    return np.where(s == np.nanmin(s.values), props, '')




metrics_table = pd.concat([embading_standard_table,
                           embading_find_hyponyms_uppercase_table, 
                           embading_find_hyponyms_lowercase_table,            
                           embading_with_term_find_table, 
                           standart_embading_with_batch_size_32_table, 
                           standart_embading_QA_pruned, 
                           test_pred_en_QA_single_table, 
                           QA_with_closing_s_table])



metrics_table = metrics_table.style.apply(highlight_max, 
                                          props='color:white; background-color:#1FC29D', axis=0).apply(highlight_min, props='color:white; background-color:#FF5555', axis=0)
metrics_table


Unnamed: 0_level_0,MRR,MAP,P@1,P@3,P@5,P@15
embadding,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
embading_standard,0.3678,0.24199,0.28467,0.20278,0.20804,0.2814
embading_find_hyponyms_uppercase,0.40653,0.25254,0.32133,0.22667,0.22343,0.2716
embading_find_hyponyms_lowercase,0.3844,0.24743,0.298,0.21433,0.21697,0.27682
emadding_with_term_and_find_prefix,0.35515,0.23321,0.258,0.19967,0.20132,0.27276
Q&A,0.39487,0.25371,0.31,0.21867,0.21818,0.29226
Q&A_Pruned,0.3988,0.25522,0.31467,0.22022,0.21941,0.29341
Q&A_single,0.37801,0.24353,0.292,0.20333,0.20931,0.2836
QA_with_closing_s,0.37524,0.24315,0.28267,0.21089,0.20879,0.28245
