In [1]:
import os
import json

from transformers import AutoTokenizer, BertTokenizerFast, DataCollatorForTokenClassification
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset, DatasetDict
import matplotlib.pyplot as plt
from sklearn import preprocessing
import pandas as pd
from utils.dataset import LemmaDataSet



In [2]:
import torch
import random
import numpy as np

RANDOM_STATE = 42

random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

<torch._C.Generator at 0x7f78093b39b0>

# Tokenizer

In [3]:
TOKENIZER_PATH = os.path.join('models', 'tokenizers')
TOKENIZED_DATASET_PATH = os.path.join('dataset','tokenized_pos')
NAME = 'gnlp-lemma-tokenizer'

In [4]:
MAX_LEN = 30
UNK_TOKEN = '<UNK>'
PAD_TOKEN = '<PAD>'
CLS_TOKEN = '<START>'
SEP_TOKEN = '<END>'
POS_TOKEN = '<POS>'
MASK_TOKEN = '<MASK>'
SPECIAL_TOKENS = [UNK_TOKEN, PAD_TOKEN, CLS_TOKEN, SEP_TOKEN, MASK_TOKEN, POS_TOKEN]

In [5]:
train_df = pd.read_csv(os.path.join('csv','train.csv'))
MAX_LEN = max(train_df.word.str.len().max() ,train_df.lemma.str.len().max())
POS_TAGS = list(set(train_df.pos_tag))
POS_TAGS, MAX_LEN

(['A', 'Interj', 'Other', 'Adv', 'V', 'Pron', 'Num', 'Pp', 'Cj', 'N'], 36)

In [6]:
train_df[train_df.word == 'მხცოვანია']

Unnamed: 0.1,Unnamed: 0,word,lemma,pos_tag,freq,word_length,lemma_length,ratio,lemma_length_category,word_length_category,ratio_category,reported_speech,freq_category
10,478849,მხცოვანია,მხცოვანი,N,6,9,8,1.125,medium,low,greater,False,medium


In [7]:
tokenizer = Tokenizer.from_file(os.path.join(TOKENIZER_PATH, 'wordpiece_2000.json'))
# print(help(tokenizer))
tokenizer.add_tokens([*POS_TAGS, POS_TOKEN])

pos_token_id = tokenizer.token_to_id(POS_TOKEN)
cls_token_id = tokenizer.token_to_id(CLS_TOKEN)
sep_token_id = tokenizer.token_to_id(SEP_TOKEN)


tokenizer.post_processor = processors.TemplateProcessing(
          single=f"{CLS_TOKEN}:0 $A:0 {SEP_TOKEN}:0",
          pair=f"{CLS_TOKEN}:0 $A:0 {POS_TOKEN}:0 $B:1 {SEP_TOKEN}:1",
          special_tokens=[(CLS_TOKEN, cls_token_id), (SEP_TOKEN, sep_token_id), (POS_TOKEN, pos_token_id)],
          )

tokenizer = BertTokenizerFast(tokenizer_object=tokenizer, unk_token=UNK_TOKEN,
    pad_token=PAD_TOKEN, cls_token=CLS_TOKEN, sep_token=SEP_TOKEN, 
                              model_input_names=['input_ids', 'attention_mask', 'decoder_input_ids'],
                              additional_special_tokens=[*POS_TAGS, POS_TOKEN])



In [8]:
tokenizer.additional_special_tokens

['A', 'Interj', 'Other', 'Adv', 'V', 'Pron', 'Num', 'Pp', 'Cj', 'N', '<POS>']

In [9]:
word, pos_tag, lemma = train_df.iloc[0].word, train_df.iloc[0].pos_tag, train_df.iloc[0].lemma
word, pos_tag, lemma 

('მონოზონისა', 'N', 'მონოზონი')

In [10]:
tokenizer(word, text_pair=pos_tag, text_target=pos_tag).tokens()

['<START>', 'მონ', '##ო', '##ზო', '##ნის', '##ა', '<POS>', 'N', '<END>']

# Dataset

In [14]:
train_df = pd.read_csv(os.path.join('csv','train.csv'))
train_df = train_df[~train_df.lemma.str.startswith('*')]
test_df = pd.read_csv(os.path.join('csv','test.csv'))
test_df = test_df[~test_df.lemma.str.startswith('*')]

In [15]:
dataset = DatasetDict({'train': Dataset.from_pandas(train_df), 'test': Dataset.from_pandas(test_df)})

In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'word', 'lemma', 'pos_tag', 'freq', 'word_length', 'lemma_length', 'ratio', 'lemma_length_category', 'word_length_category', 'ratio_category', 'reported_speech', 'freq_category', '__index_level_0__'],
        num_rows: 921949
    })
    test: Dataset({
        features: ['Unnamed: 0', 'word', 'lemma', 'pos_tag', 'freq', 'word_length', 'lemma_length', 'ratio', 'lemma_length_category', 'word_length_category', 'ratio_category', 'reported_speech', 'freq_category', '__index_level_0__'],
        num_rows: 395215
    })
})

In [17]:
dataset['train'][11]

{'Unnamed: 0': 621372,
 'word': 'უქადიდნენ',
 'lemma': 'ქადება',
 'pos_tag': 'V',
 'freq': 1,
 'word_length': 9,
 'lemma_length': 6,
 'ratio': 1.5,
 'lemma_length_category': 'low',
 'word_length_category': 'low',
 'ratio_category': 'greater',
 'reported_speech': False,
 'freq_category': 'low',
 '__index_level_0__': 11}

In [18]:
def tokenize(examples):
    model_inputs = tokenizer(
        examples["word"], text_pair=examples['pos_tag'], text_target=examples["lemma"], max_length=MAX_LEN, truncation=True
    )
    return model_inputs


In [19]:
tokenized_datasets = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset['train'].column_names,
)

Map:   0%|          | 0/921949 [00:00<?, ? examples/s]

Map:   0%|          | 0/395215 [00:00<?, ? examples/s]

In [20]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 921949
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 395215
    })
})

In [21]:
train_set = tokenized_datasets['train'].train_test_split(train_size=0.9, seed=RANDOM_STATE)
tokenized_datasets["validation"] = train_set.pop("test")
tokenized_datasets['train'] = train_set['train']
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 829754
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 395215
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 92195
    })
})

In [22]:
tokenized_datasets.save_to_disk(TOKENIZED_DATASET_PATH)

Saving the dataset (0/1 shards):   0%|          | 0/829754 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/395215 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/92195 [00:00<?, ? examples/s]

# Model

In [23]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, BartConfig, BartForConditionalGeneration
from datasets import load_from_disk
import numpy as np
import evaluate


In [24]:
tokenized_datasets = load_from_disk(TOKENIZED_DATASET_PATH)

In [25]:
bleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")

def compute_metrics(eval_preds):

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    # Replace -100s in the labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    chrf_res = chrf.compute(predictions=decoded_preds, references=decoded_labels)
    
    # For BLEU score
    decoded_preds = [' '.join(pred.strip()) for pred in decoded_preds]
    decoded_labels = [[' '.join(label.strip())] for label in decoded_labels]

    bleu_res = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    return {'bleu': bleu_res['score'], 'chrf': chrf_res['score']}

In [26]:
config = BartConfig(
    vocab_size=len(tokenizer),  
    d_model=128,       
    encoder_layers=1, 
    decoder_layers=1,  
    encoder_attention_heads=1,  
    decoder_attention_heads=1,  
    encoder_ffn_dim=128,  
    decoder_ffn_dim=128,  
    pad_token_id=tokenizer.pad_token_id, 
    bos_token_id=tokenizer.cls_token_id,
    decoder_start_token_id=tokenizer.cls_token_id,
    eos_token_id=tokenizer.sep_token_id,
    forced_eos_token_id=tokenizer.sep_token_id
)
config

BartConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "classifier_dropout": 0.0,
  "d_model": 128,
  "decoder_attention_heads": 1,
  "decoder_ffn_dim": 128,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 1,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 1,
  "encoder_ffn_dim": 128,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 1,
  "eos_token_id": 3,
  "forced_eos_token_id": 3,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 1024,
  "model_type": "bart",
  "num_hidden_layers": 1,
  "pad_token_id": 1,
  "scale_embedding": false,
  "transformers_version": "4.31.0",
  "use_cache": true,
  "vocab_size": 2011
}

In [27]:
model = BartForConditionalGeneration(config=config)
# model = BartForConditionalGeneration.from_pretrained('models/pos_transformer/checkpoint-12630')
print('Number of parameters: ', sum(p.numel() for p in model.parameters()))

Number of parameters:  786048


In [28]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [29]:
args = Seq2SeqTrainingArguments(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1.2e-3,
    per_device_train_batch_size=1024,
    per_device_eval_batch_size=1024,
    weight_decay=0.01,
    num_train_epochs=50,
    predict_with_generate=True,
    output_dir='./models/pos_transformer',
    save_safetensors=True, 
    save_total_limit=3,
    logging_steps=3
)



In [31]:
model.to('xla')
model.model.decoder.embed_tokens = model.model.shared
model.model.encoder.embed_tokens = model.model.shared


In [32]:
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


# Evaluation

In [24]:
trainer.predict(tokenized_datasets["test"])


KeyboardInterrupt



In [30]:
tokenizer.batch_decode(tokenized_datasets["train"][0:100]['input_ids'])

['<START> გურულებმა <POS> N <END>',
 '<START> აღსაშფოთებელია <POS> A <END>',
 '<START> ქონდრისკაციც <POS> N <END>',
 '<START> ამომასუნთქე <POS> V <END>',
 '<START> გეცინებათ <POS> V <END>',
 '<START> ლოცვებშიც <POS> N <END>',
 '<START> უბრალოისა <POS> A <END>',
 '<START> ხვანჯი <POS> N <END>',
 '<START> ყოველისფერისა <POS> A <END>',
 '<START> მოსკოვთანო <POS> N <END>',
 '<START> უცხადესს <POS> A <END>',
 '<START> ვმუსაიფობთ <POS> V <END>',
 '<START> მოხნავდი <POS> V <END>',
 '<START> ტარანტულივით <POS> N <END>',
 '<START> ასიმილირებისათვის <POS> N <END>',
 '<START> შარვლიანისთვის <POS> A <END>',
 '<START> მოსთხოვა <POS> V <END>',
 '<START> სიტის <POS> N <END>',
 '<START> ამიერკავკასიელთა <POS> N <END>',
 '<START> აგიძგერდება <POS> V <END>',
 '<START> ენებზე <POS> N <END>',
 '<START> ფოსოების <POS> N <END>',
 '<START> მოსტოვოისთვის <POS> N <END>',
 '<START> ხევსურებსა <POS> N <END>',
 '<START> დაებეზღებინათ <POS> V <END>',
 '<START> კურსორისთვის <POS> N <END>',
 '<START> ეადვილებოდა <PO

In [34]:
from transformers import Text2TextGenerationPipeline
recognizer = Text2TextGenerationPipeline(model=model.to('cpu'), tokenizer=tokenizer)

In [36]:
tokenizer.batch_decode(model.generate(torch.tensor(tokenizer('ამოასუნთქე', text_pair='V').input_ids).unsqueeze(0)))[0].split()[2]

'ამოასუნთქე'

In [98]:
tokenizer.batch_decode(model(torch.tensor(tokenizer('ვიყენებთ', text_pair='N').input_ids).unsqueeze(0))[0].argmax(-1))[0].split()

['<START>', 'ყყენებაებაბა', 'A', 'V']

In [70]:
train_df = pd.read_csv(os.path.join('csv','train.csv'))
incomplete_df = train_df[train_df.lemma.str.startswith('*')]

In [71]:
incomplete_df.head(30)

Unnamed: 0.1,Unnamed: 0,word,lemma,pos_tag,freq,word_length,lemma_length,ratio,lemma_length_category,word_length_category,ratio_category,reported_speech,freq_category
13,932306,სდეს,*დება,V,1,4,5,0.8,low,low,less,False,low
61,382597,ვაკურთხევთ,*კურთხევა,V,7,10,9,1.111111,medium,medium,greater,False,medium
73,103890,სწურავდა,*წურვა,V,9,8,6,1.333333,low,low,greater,False,medium
77,405866,გაბნევთ,*ბნევა,V,7,7,6,1.166667,low,low,greater,False,medium
81,1159757,მიგზავნიდაო,*გზავნა,V,1,11,7,1.571429,low,medium,greater,True,low
98,901995,აქებსო,*ქება,V,5,6,5,1.2,low,low,greater,True,medium
99,433274,გვეპყრობიან,*პყრობა,V,28,11,7,1.571429,low,medium,greater,False,medium
101,471321,მახურებს,*ხურება,V,2,8,7,1.142857,low,low,greater,False,low
103,581105,წყვეტ-მეთქი,*წყვეტა,V,1,11,7,1.571429,low,medium,greater,True,low
121,203372,გვცემს,*ცემა,V,88,6,5,1.2,low,low,greater,False,high


In [78]:
# incomplete_df['new_lemma'] = incomplete_df.apply(lambda row: tokenizer.batch_decode(model.generate(torch.tensor(tokenizer(row.word, text_pair='V').input_ids).unsqueeze(0)))[0].split()[2], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incomplete_df['new_lemma'] = incomplete_df.apply(lambda row: tokenizer.batch_decode(model.generate(torch.tensor(tokenizer(row.word, text_pair='V').input_ids).unsqueeze(0)))[0].split()[2], axis=1)


In [81]:
# incomplete_df[~incomplete_df.new_lemma.str.startswith('*')]

Unnamed: 0.1,Unnamed: 0,word,lemma,pos_tag,freq,word_length,lemma_length,ratio,lemma_length_category,word_length_category,ratio_category,reported_speech,freq_category,new_lemma
61,382597,ვაკურთხევთ,*კურთხევა,V,7,10,9,1.111111,medium,medium,greater,False,medium,კურთხევა
77,405866,გაბნევთ,*ბნევა,V,7,7,6,1.166667,low,low,greater,False,medium,გაბნევა
81,1159757,მიგზავნიდაო,*გზავნა,V,1,11,7,1.571429,low,medium,greater,True,low,მიგზავნა
121,203372,გვცემს,*ცემა,V,88,6,5,1.200000,low,low,greater,False,high,ცემა
294,633236,ვკადრებთ,*კადრება,V,5,8,8,1.000000,medium,low,equal,False,medium,კადრულობა
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
956211,193018,ემსგავსები,*მსგავსება,V,36,10,10,1.000000,high,medium,equal,False,medium,მსგავსება
956296,354000,იყაო,*ყოფა,V,3,4,5,0.800000,low,low,less,False,medium,ყოფნა
956479,421640,ხტებაო,*ხტომა,V,2,6,6,1.000000,low,low,equal,True,low,ხტომა
956582,23608,ეწეოდა,*წევნა,V,2289,6,6,1.000000,low,low,equal,False,high,წოლა


In [101]:
model.save_pretrained(os.path.join('models', 'pos_transformer', 'lemma-transformer'))