In [None]:
# An example of POS tagging of the Kazakh training set. The test and validation corpora are tagged in the same way.
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained("Justice0893/xlm-roberta-base-kazakh-pos-tagging")
model = AutoModelForTokenClassification.from_pretrained("Justice0893/xlm-roberta-base-kazakh-pos-tagging")
model.to(device)

with open("kk_train_shuffled.txt-filtered.kk", 'r', encoding='utf-8') as file:
    content = file.readlines()

with open("RoBERTa_KK_POS_train", 'w', encoding='utf-8') as output_file:

    for line in content:
        line = line.strip()

        tokens = tokenizer(line, return_tensors="pt", truncation=True, padding=True)
        inputs = {k: v.to(device) for k, v in tokens.items()}
        outputs = model(**inputs).logits

        predictions = torch.argmax(outputs, dim=2)[0]
        tokens = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])
        word_ids = tokenizer(line).word_ids()

        previous_word_idx = None
        word_pos_dict = {}

        for idx, word_idx in enumerate(word_ids):
            if word_idx is None:
                continue
            token = tokens[idx]
            if token.startswith("##"):
                token = token[2:]
                word_pos_dict[word_idx] = (word_pos_dict[word_idx][0] + token, word_pos_dict[word_idx][1])
            else:
                tag = model.config.id2label[predictions[idx].item()]
                word_pos_dict[word_idx] = (token, tag)
            previous_word_idx = word_idx

        word_pos_tags = [f"{tag}" for word, tag in word_pos_dict.values()]
        tagged_sentence = " ".join(word_pos_tags)

        output_file.write(tagged_sentence + "\n")

In [None]:
# kk vocab creation
!onmt-build-vocab --sentencepiece model_type=bpe --size 32000 --save_vocab kk_vocab kk_train_shuffled.txt-filtered.kk

In [None]:
def find_unique_words(input_filepath, output_filepath):
    unique_words = set() 

    with open(input_filepath, 'r', encoding='utf-8') as file:
        for line in file:

            words = line.split()

            unique_words.update(word.lower() for word in words)


    with open(output_filepath, 'w', encoding='utf-8') as file:
        for word in sorted(unique_words):
            file.write(word + '\n')

    print(f"Total unique words: {len(unique_words)}")

input_file = "RoBERTa_KK_POS_train"  
output_file = "RoBERTa_KK_unique_pos" 
find_unique_words(input_file, output_file)

In [None]:
# en vocab creation
!onmt-build-vocab --sentencepiece model_type=bpe --size 32000 --save_vocab tgt_1_en_vocab en_train_shuffled.txt-filtered.en

In [7]:
# Kazakh subwording and subword units POS tags
import re
import sentencepiece as spm

def preprocess_text(text, pos_tags):

    text = re.sub(r"([.,!?;:()\"'])", r" \1 ", text)
    text = re.sub(r"\s{2,}", " ", text).strip()
    
    words = text.split()
    
    new_pos_tags = []
    i = 0  # pos_tags index'i
    for word in words:
        #if re.match(r"[.,!?;:()\"']", word):
            #pass
        #else:
            if i < len(pos_tags):
                new_pos_tags.append(pos_tags[i]) 
                i += 1

    return words, new_pos_tags

def tokenize_and_label_separate_files(input_text_path, input_pos_path, output_token_path, output_pos_path, model_path):

    sp = spm.SentencePieceProcessor()
    sp.Load(model_path)
    
    with open(input_text_path, 'r', encoding='utf-8') as text_file, \
         open(input_pos_path, 'r', encoding='utf-8') as pos_file, \
         open(output_token_path, 'w', encoding='utf-8') as token_file, \
         open(output_pos_path, 'w', encoding='utf-8') as pos_file_out:
        
        for text_line, pos_line in zip(text_file, pos_file):
            pos_tags = pos_line.strip().split()
            preprocessed_words, adjusted_pos_tags = preprocess_text(text_line, pos_tags)
            
            tokenized_text = []
            tokenized_tags = []
            
            for word, tag in zip(preprocessed_words, adjusted_pos_tags):
                tokens = sp.EncodeAsPieces(word)
                tokenized_text.extend(tokens)
                tokenized_tags.extend([tag] * len(tokens))
            
            token_file.write(" ".join(tokenized_text) + "\n")
            pos_file_out.write(" ".join(tokenized_tags) + "\n")


input_text_path = "kk_train_shuffled.txt-filtered.kk" 
input_pos_path = "RoBERTa_KK_POS_train" 
output_token_path = "tokens_train.txt" 
output_pos_path = "pos_tags_train.txt" 
model_path = "kk_vocab.model" 

tokenize_and_label_separate_files(input_text_path, input_pos_path, output_token_path, output_pos_path, model_path)


In [8]:
input_text_path = "kk_test_shuffled.txt-filtered.kk" 
input_pos_path = "RoBERTa_KK_POS_test" 
output_token_path = "tokens_test.txt" 
output_pos_path = "pos_tags_test.txt"
model_path = "kk_vocab.model" 

tokenize_and_label_separate_files(input_text_path, input_pos_path, output_token_path, output_pos_path, model_path)

input_text_path = "kk_valid_shuffled.txt-filtered.kk" 
input_pos_path = "RoBERTa_KK_POS_valid" 
output_token_path = "tokens_dev.txt" 
output_pos_path = "pos_tags_dev.txt" 
model_path = "kk_vocab.model" 

tokenize_and_label_separate_files(input_text_path, input_pos_path, output_token_path, output_pos_path, model_path)

In [9]:
# English subwording
import re
import sentencepiece as spm

sp_model = spm.SentencePieceProcessor()
sp_model.Load("tgt_1_en_vocab.model")

def preprocess(text):

    text = re.sub(r'([.,!?;:()])', r' \1 ', text)

    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def tokenize_and_save(input_file, model, output_file):
    with open(input_file, 'r', encoding='utf-8') as f, open(output_file, 'w', encoding='utf-8') as fo:
        for line in f:
            
            preprocessed_line = preprocess(line.strip())
            tokens = model.encode_as_pieces(preprocessed_line)
            fo.write(" ".join(tokens) + "\n")

tokenize_and_save("en_train_shuffled.txt-filtered.en", sp_model, "train_target_tokens.txt")
tokenize_and_save("en_valid_shuffled.txt-filtered.en", sp_model, "dev_target_tokens.txt")
tokenize_and_save("en_test_shuffled.txt-filtered.en", sp_model, "test_target_tokens.txt")

In [1]:
# Kk-En (POS Tags)
!onmt-main --model modelim.py --config data.yml --auto_config train --with_eval --num_gpus 2

2024-06-13 15:42:06.953477: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-13 15:42:07.697775: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-06-13 15:42:07.697844: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of

In [2]:
!CUDA_VISIBLE_DEVICES=0 onmt-main --config data.yml --auto_config --checkpoint_path RoBERTa_POS-KK-EN/ckpt-100000 infer --features_file tokens_test.txt pos_tags_test.txt --predictions_file output_kk_en.txt

2024-06-14 13:36:54.811532: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-14 13:36:55.674074: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-06-14 13:36:55.674180: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of

In [None]:
!python3 MT-Preparation/subwording/3-desubword.py tgt_1_en_vocab.model output_kk_en.txt

In [1]:
# BLEU and chrF scores
!python3 compute-bleu.py en_test_shuffled.txt-filtered.en output_kk_en.txt.desubword

Reference first sentence: In the developed world, this figure is 35 25%
Translated first sentence: In the developed countries of the world , this figure is 35 25%
That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.
BLEU:  BLEU = 50.02 71.5/54.3/44.0/36.6 (BP = 1.000 ratio = 1.093 hyp_len = 452895 ref_len = 414303)
CHRF:  chrF2 = 73.89


In [6]:
# Average METEOR score (Ortalama METEOR Puanı)
import nltk
from nltk.translate.meteor_score import meteor_score

def read_and_tokenize_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    return [nltk.word_tokenize(line.strip()) for line in lines]

def calculate_meteor(reference_file, hypothesis_file):
    references = read_and_tokenize_file(reference_file)
    hypotheses = read_and_tokenize_file(hypothesis_file)
    
    if len(references) != len(hypotheses):
        raise ValueError("Dosyaların satır sayıları eşleşmiyor")

    total_meteor_score = 0.0
    for ref, hyp in zip(references, hypotheses):
        total_meteor_score += meteor_score([ref], hyp)

    average_meteor_score = total_meteor_score / len(references)
    return average_meteor_score

reference_file = 'en_test_shuffled.txt-filtered.en'
hypothesis_file = 'output_kk_en.txt.desubword'

score = calculate_meteor(reference_file, hypothesis_file)
print(f"Ortalama METEOR Puanı: {score}") #Average METEOR score


Ortalama METEOR Puanı: 0.7377232733926027


In [14]:
# Kk-En (base model)
!onmt-main --model kk-standart-modelim.py --config data-transformer.yml --auto_config train --with_eval --num_gpus 2

2024-06-16 11:50:11.725490: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-16 11:50:12.480913: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-06-16 11:50:12.480996: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of

In [15]:
!CUDA_VISIBLE_DEVICES=0 onmt-main --config data-transformer.yml --auto_config --checkpoint_path KK-EN-Standard-Transformer/ckpt-100000 infer --features_file tokens_test.txt --predictions_file output_kk_en_standart.txt

2024-06-17 13:04:24.111462: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-17 13:04:24.877466: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-06-17 13:04:24.877548: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of

In [None]:
!python3 MT-Preparation/subwording/3-desubword.py tgt_1_en_vocab.model output_kk_en_standart.txt

In [2]:
# BLEU and chrF scores
!python3 compute-bleu.py en_test_shuffled.txt-filtered.en output_kk_en_standart.txt.desubword

Reference first sentence: In the developed world, this figure is 35 25%
Translated first sentence: In developed countries of the world , this figure is 35%
That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.
BLEU:  BLEU = 49.30 70.9/53.6/43.3/35.9 (BP = 1.000 ratio = 1.098 hyp_len = 454860 ref_len = 414303)
CHRF:  chrF2 = 73.67


In [20]:
# Average METEOR score (Ortalama METEOR Puanı)
import nltk
from nltk.translate.meteor_score import meteor_score

def read_and_tokenize_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    return [nltk.word_tokenize(line.strip()) for line in lines]

def calculate_meteor(reference_file, hypothesis_file):
    references = read_and_tokenize_file(reference_file)
    hypotheses = read_and_tokenize_file(hypothesis_file)
    
    if len(references) != len(hypotheses):
        raise ValueError("Dosyaların satır sayıları eşleşmiyor")

    total_meteor_score = 0.0
    for ref, hyp in zip(references, hypotheses):
        total_meteor_score += meteor_score([ref], hyp)

    average_meteor_score = total_meteor_score / len(references)
    return average_meteor_score

reference_file = 'en_test_shuffled.txt-filtered.en'
hypothesis_file = 'output_kk_en_standart.txt.desubword'

score = calculate_meteor(reference_file, hypothesis_file)
print(f"Ortalama METEOR Puanı: {score}") #Average METEOR score


Ortalama METEOR Puanı: 0.7329952068579672
