In [1]:
!nvidia-smi

Mon Feb 27 15:24:49 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:1B:00.0 Off |                  Off |
|  0%   35C    P8    25W / 460W |  20619MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:1C:00.0 Off |                  N/A |
| 52%   51C    P8    18W / 260W |   7823MiB / 11264MiB |      0%      Default |
|       

In [31]:
!pip install torch
!pip install transformers

In [25]:
import numpy as np
import torch

from transformers import T5ForConditionalGeneration, AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig


from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
from transformers.file_utils import cached_property
from typing import Tuple
from sklearn.model_selection import train_test_split
import gc
from tqdm.auto import tqdm, trange


import pandas as pd
from sklearn.utils import shuffle

from collections import Counter
from tqdm import tqdm

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# model

In [4]:
!nvidia-smi

Mon Feb 27 15:27:10 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:1B:00.0 Off |                  Off |
|  0%   35C    P8    25W / 460W |  20619MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:1C:00.0 Off |                  N/A |
| 45%   48C    P8    17W / 260W |   7823MiB / 11264MiB |      0%      Default |
|       

In [5]:
import os 
os.environ["CUDA_VISIBLE_DEVICES"]='3'

In [6]:
if torch.cuda.is_available():
    device='cuda'
    print('GPU')
else:
    device='cpu'
    print('CPU')
    
    
SEED = 0
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

GPU


In [7]:
from transformers import TrainingArguments, Trainer

model_checkpoint = "t5-large"
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [8]:
class PairsDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __getitem__(self, idx):
        assert idx <= len(self.x['input_ids']), (idx, len(self.x['input_ids']))
        item = {key: val[idx] for key, val in self.x.items()}
        item['decoder_attention_mask'] = self.y['attention_mask'][idx]
        item['labels'] = self.y['input_ids'][idx]
        return item
    
    @property
    def n(self):
        return len(self.x['input_ids'])

    def __len__(self):
        return self.n

In [9]:
from typing import List, Dict, Union

class DataCollatorWithPadding:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        batch = self.tokenizer.pad(
            features,
            padding=True,
        )
        ybatch = self.tokenizer.pad(
            {'input_ids': batch['labels'], 'attention_mask': batch['decoder_attention_mask']},
            padding=True,
        ) 
        batch['labels'] = ybatch['input_ids']
        batch['decoder_attention_mask'] = ybatch['attention_mask']
        
        return {k: torch.tensor(v) for k, v in batch.items()}

In [10]:
def cleanup():
    gc.collect()
    torch.cuda.empty_cache()
    
cleanup()

In [11]:
def evaluate_model(model, test_dataloader):
    num = 0
    den = 0

    for batch in test_dataloader:
        with torch.no_grad():
            loss = model(**{k: v.to(model.device) for k, v in batch.items()}).loss
            num += len(batch) * loss.item()
            den += len(batch)
    val_loss = num / den
    return val_loss

# **Read data SemEval2018-Task9**

In [12]:
path_data_en = "SemEval2018-Task9/training/data/1A.english.training.data.txt"
path_gold_en = "SemEval2018-Task9/training/gold/1A.english.training.gold.txt"

train_data_en_data = pd.read_csv(path_data_en, header=None, sep="\t", names=['term', 'relation'])
train_gold_en_data = pd.read_csv(path_gold_en, header=None, names=['hypernym'])

train_data_en_data.head()

Unnamed: 0,term,relation
0,blackfly,Concept
1,Turonian,Entity
2,abhorrence,Concept
3,tropical storm,Concept
4,militarization,Concept


In [13]:
path_test_data_en = "SemEval2018-Task9/test/data/1A.english.test.data.txt"
path_test_gold_en = "SemEval2018-Task9/test/gold/1A.english.test.gold.txt"

test_data_en_data = pd.read_csv(path_test_data_en, header=None, sep="\t", names=['term', 'relation'])
test_gold_en_data = pd.read_csv(path_test_gold_en, header=None, names=['hypernym'])

In [38]:
hearst_patterns = """
There are a lot of [PARENT] such as

There were a lot of [PARENT] such as

There are a lot of [PARENT] here such as

Other [PARENT] such as

My favorite [PARENT] is either

There were a lot of [PARENT] here such as

which includes various [PARENT] like

Other [PARENT] especially

which includes various [PARENT] such as

My favorite [PARENT] is

I know such types of [PARENT] as

I know such kinds of [PARENT] as

[PARENT] such as

I know many kinds of [PARENT] for example

Other [PARENT] for example

[PARENT] ndmely

I know many types of [PARENT] for example

[PARENT] including

There are a lot of [PARENT] for example

which includes various [PARENT] for example

There are a lot of [PARENT] here for example

[PARENT] e.g.

[PARENT] like

[PARENT] especially

[PARENT] for example

[PARENT] for instance
"""

hearst_patterns = [pattern for ind, pattern in enumerate(hearst_patterns.split('\n')) if ind % 2 != 0]
hearst_patterns

['There are a lot of [PARENT] such as',
 'There were a lot of [PARENT] such as',
 'There are a lot of [PARENT] here such as',
 'Other [PARENT] such as',
 'My favorite [PARENT] is either',
 'There were a lot of [PARENT] here such as',
 'which includes various [PARENT] like',
 'Other [PARENT] especially',
 'which includes various [PARENT] such as',
 'My favorite [PARENT] is',
 'I know such types of [PARENT] as',
 'I know such kinds of [PARENT] as',
 '[PARENT] such as',
 'I know many kinds of [PARENT] for example',
 'Other [PARENT] for example',
 '[PARENT] ndmely',
 'I know many types of [PARENT] for example',
 '[PARENT] including',
 'There are a lot of [PARENT] for example',
 'which includes various [PARENT] for example',
 'There are a lot of [PARENT] here for example',
 '[PARENT] e.g.',
 '[PARENT] like',
 '[PARENT] especially',
 '[PARENT] for example',
 '[PARENT] for instance']

In [49]:
def hearest_preprocessing(train_features, train_target, test_features, test_target, hearst_pattern):
    
    stage_1, stage_2 = hearst_patterns[0].split('[PARENT]')
    
    train_data_en = train_features.copy()
    train_data_en = stage_1 + train_data_en.term + stage_2
    print(train_data_en.head())

    train_gold_en = train_target.copy()
    train_gold_en = train_gold_en.hypernym.str.split('\t').str.join(', ')
    print(train_gold_en.head())
    
    test_data_en = test_features.copy()
    test_data_en = stage_1 + test_data_en.term + stage_2
    print(test_data_en.head())

    test_gold_en = test_target.copy()
    test_gold_en = test_gold_en.hypernym.str.split('\t').str.join(', ')
    print(test_gold_en.head())
    
    return train_data_en, train_gold_en, test_data_en, test_gold_en


In [50]:
train_data_en, train_gold_en, test_data_en, test_gold_en = hearest_preprocessing(train_data_en_data, 
                                                                                 train_gold_en_data, 
                                                                                 test_data_en_data, 
                                                                                 test_gold_en_data, 
                                                                                 hearst_patterns[0])

0          There are a lot of blackfly such as
1          There are a lot of Turonian such as
2        There are a lot of abhorrence such as
3    There are a lot of tropical storm such as
4    There are a lot of militarization such as
Name: term, dtype: object
0                           homopterous insect, insect
1    technical specification, geologic timescale, p...
2                      distaste, hatred, hate, disgust
3    atmosphere, windstorm, violent storm, air curr...
4                                       social control
Name: hypernym, dtype: object
0    There are a lot of maliciousness such as
1          There are a lot of buckler such as
2        There are a lot of spelunker such as
3     There are a lot of quo warranto such as
4     There are a lot of Jeff Francis such as
Name: term, dtype: object
0       malevolence, distaste, hatred, hate, malignity
1                                           body armor
2                    exploration, adventurer, explorer
3    proceedi

# Model training

In [51]:
# use only 1 time per session
model_checkpoint = "t5-large"
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, max_length=100, block_size=64)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [19]:
# train_dataset = PairsDataset(tokenizer(train_data_en.tolist()), tokenizer(train_gold_en.tolist()))
# test_dataset = PairsDataset(tokenizer(test_data_en.tolist()), tokenizer(test_gold_en.tolist()))
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# args = TrainingArguments(output_dir="t5-finetuned-large", 
#                          num_train_epochs=16, 
#                          per_device_train_batch_size=16, save_steps=10000)

# trainer = Trainer(
#     model = model,
#     args = args,
#     train_dataset = train_dataset,
#     eval_dataset = test_dataset,
#     tokenizer = tokenizer,
#     data_collator = data_collator
# )

In [52]:
!nvidia-smi

Mon Feb 27 16:41:57 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:1B:00.0 Off |                  Off |
|  0%   35C    P8    25W / 460W |  20619MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:1C:00.0 Off |                  N/A |
| 45%   55C    P8    18W / 260W |   7823MiB / 11264MiB |      0%      Default |
|       

# EVALUATION 

In [56]:
example = test_data_en.tolist()[0]

In [57]:
input_ids = tokenizer.encode(example, return_tensors="pt")
output_batch = model.generate(input_ids.cuda(), no_repeat_ngram_size=2, max_new_tokens=2048, 
                              num_return_sequences=50, num_beams=50, early_stopping=True, 
                              num_beam_groups=5, 
                              diversity_penalty=1.0)
decoded_list = []
for outputs in output_batch:
    decoded = tokenizer.decode(outputs, skip_special_tokens=True).split(", ")
    decoded_list.extend(decoded)
    
sorted_predicted_answer = [i[0] for i in Counter(decoded_list).most_common()]
sorted_predicted_answer

['type of phishing that can be found in the internet nowadays. as cybercrime',
 'malware and so on which can harm your computer or phone. as a lot of spamming which are very harmful. as spam',
 'virus',
 'a lot of maliciousness. as phishing. as malware. as virus',
 'malware',
 'as hacking and a lot of maliciousness. as virus',
 'and so on. as spam. as as scam.ness such as malicious. as hacking... as.. There are so many malicious... Then there are malicious. Such as........... as viruses',
 'worms',
 'trojan horses',
 'etc.',
 'trojans etc. It',
 'trojans etc. ',
 'trojans etc. In',
 'and so on. as spam. as as scam.ness such as malicious. as hacking... as.. There are so many malicious... Then there are malicious. Such as........... as bots',
 'viruses',
 'trojans etc. It is',
 'trojans etc.,',
 'trojans etc. These',
 'trojans etc. which',
 'trojans etc. They',
 'trojan',
 'etc. It',
 'trojan etc. as phishing. as malware. as cybercrime. as as spam.ness such as as scam... as.. There are s

In [59]:
%%time 
test_pred_en=[]
for i_2 in tqdm(test_data_en.tolist()):
        input_ids = tokenizer.encode(i_2, return_tensors="pt")
        output_batch = model.generate(input_ids.cuda(), no_repeat_ngram_size=2, max_new_tokens=2048, 
                                      num_return_sequences=50, num_beams=50, early_stopping=True, 
                                      num_beam_groups=5, 
                                      diversity_penalty=1.0)
        decoded_list = []
        for outputs in output_batch:
            decoded = tokenizer.decode(outputs, skip_special_tokens=True).split(", ")
            decoded_list.extend(decoded)

        sorted_predicted_answer = [i[0] for i in Counter(decoded_list).most_common()]
        
        test_pred_en.append(sorted_predicted_answer)
        

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [56:21<00:00,  2.25s/it]

CPU times: user 56min 14s, sys: 9.19 s, total: 56min 23s
Wall time: 56min 21s





In [60]:
name  = 'test_pred_hearst_pattern.txt'

test_pred_en_df = []
for i in test_pred_en:
    test_pred_en_df.append('\t'.join(i))


test_pred_en_df = pd.DataFrame(test_pred_en_df)
test_pred_en_df.to_csv(name, header=None, index=None)

In [61]:
!python debuged_task9-scorer.py SemEval2018-Task9/test/gold/1A.english.test.gold.txt test_pred_hearst_pattern.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
MRR: 0.0025351851851851855
MAP: 0.002115307531974199
P@1: 0.0
P@3: 0.001888888888888889
P@5: 0.002333333333333334
P@15: 0.0024685185185185188


In [124]:
from contextlib import redirect_stdout
import io

f = io.StringIO()
with redirect_stdout(f):
    !python debuged_task9-scorer.py SemEval2018-Task9/test/gold/1A.english.test.gold.txt test_pred_hearst_pattern.txt
_std_out = f.getvalue()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [125]:
# _std_out = _std_out.replace('\r', '')
_std_out = _std_out  + 'name_of_pattern'
_std_out

'MRR: 0.0025351851851851855\r\nMAP: 0.002115307531974199\r\nP@1: 0.0\r\nP@3: 0.001888888888888889\r\nP@5: 0.002333333333333334\r\nP@15: 0.0024685185185185188\r\nname_of_pattern'

In [126]:
_std_out.split('\r\n')

['MRR: 0.0025351851851851855',
 'MAP: 0.002115307531974199',
 'P@1: 0.0',
 'P@3: 0.001888888888888889',
 'P@5: 0.002333333333333334',
 'P@15: 0.0024685185185185188',
 'name_of_pattern']

In [137]:
def answers(str_ans, name_of_pattern):
    str_ans += name_of_pattern
    columns_name = []
    values = []
    for ind, metrics in enumerate(_std_out.split('\r\n')):
        print(metrics)
        if ind == 6:
            _name = 'pattern'
            number = metrics
        else:
            _name, number = metrics.split(' ')
            number = round(float(number), 5)
            _name = _name[:-1]
        
        columns_name.append(_name)
        values.append([number])
        
    
        
    df = pd.DataFrame(values).T
    df.columns = columns_name
    df.set_index('pattern', inplace=True)
    return df

In [138]:
answers(ans, 'name_of_pattern')

MRR: 0.0025351851851851855
MAP: 0.002115307531974199
P@1: 0.0
P@3: 0.001888888888888889
P@5: 0.002333333333333334
P@15: 0.0024685185185185188
name_of_pattern


Unnamed: 0_level_0,MRR,MAP,P@1,P@3,P@5,P@15
pattern,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
name_of_pattern,0.00254,0.00212,0.0,0.00189,0.00233,0.00247
