In [9]:
#Turkish POS Tagging
import re
from zemberek.morphology import TurkishMorphology
from zemberek.tokenization import TurkishTokenizer

def pos_tagger(input, output):
    tokenizer = TurkishTokenizer.DEFAULT
    morphology = TurkishMorphology.create_with_defaults()
    
    input_file_path = input
    output_file_path = output
    
    with open(output_file_path, "w") as output_file:
        with open(input_file_path, "r") as input_file:
            line_count = 0
            error_count = 0
            
            for line in input_file:
                line_count += 1
                tags = []
                data = line.strip()
                tokens = tokenizer.tokenize(data)
                
                for token in tokens:
                    analysis = morphology.analyze(token.normalized)
                    if analysis.analysis_results:
                        match = re.search(r"\[(.*?):(.*?)(?=[\]\.,;!?]|$)", str(analysis.analysis_results[0]))
                        if match:
                            tag = match.group(2)
                            tag = re.sub(r'[^\w\s]', '', str(tag))
                            tags.append(str(tag) + " ")
                    else:
                        tags.append("X ")
                
                if len(tokens) != len(tags):
                    error_count += 1
                    print(f"Error in line {line_count}: Token count ({len(tokens)}) does not match tag count ({len(tags)})")
    
                output_file.write("".join(tags) + "\n")
    
            print(f"Total lines processed: {line_count}")
            print(f"Total errors found: {error_count}")


In [None]:
pos_tagger("Tatoeba.en-tr.tr-filtered.tr.train","Tatoeba_pos_tags_train")
pos_tagger("Tatoeba.en-tr.tr-filtered.tr.dev","Tatoeba_pos_tags_dev")
pos_tagger("Tatoeba.en-tr.tr-filtered.tr.test","Tatoeba_pos_tags_test")

In [None]:
#Tatoeba tr_vocab creation
!onmt-build-vocab --sentencepiece model_type=bpe --size 32000 --save_vocab tr_vocab Tatoeba.en-tr.tr-filtered.tr.train

In [None]:
#Tatoeba en_vocab creation
!onmt-build-vocab --sentencepiece model_type=bpe --size 32000 --save_vocab en_vocab Tatoeba.en-tr.en-filtered.en.train

In [15]:
#Tatoeba Tr side shared_vocab subwording and subword units POS tags
import re
import sentencepiece as spm

def preprocess_text(text, pos_tags):

    text = re.sub(r"([.,!?;:()\"'])", r" \1 ", text)
    text = re.sub(r"\s{2,}", " ", text).strip()
    
    words = text.split()
    
    new_pos_tags = []
    i = 0  # pos_tags index'i
    for word in words:
        #if re.match(r"[.,!?;:()\"']", word):
            #new_pos_tags.append('Punc') 
            #pass
        #else:
            if i < len(pos_tags):
                new_pos_tags.append(pos_tags[i]) 
                i += 1

    return words, new_pos_tags

def tokenize_and_label_separate_files(input_text_path, input_pos_path, output_token_path, output_pos_path, model_path):

    sp = spm.SentencePieceProcessor()
    sp.Load(model_path)
    
    with open(input_text_path, 'r', encoding='utf-8') as text_file, \
         open(input_pos_path, 'r', encoding='utf-8') as pos_file, \
         open(output_token_path, 'w', encoding='utf-8') as token_file, \
         open(output_pos_path, 'w', encoding='utf-8') as pos_file_out:
        
        for text_line, pos_line in zip(text_file, pos_file):
            pos_tags = pos_line.strip().split()
            preprocessed_words, adjusted_pos_tags = preprocess_text(text_line, pos_tags)
            
            tokenized_text = []
            tokenized_tags = []
            
            for word, tag in zip(preprocessed_words, adjusted_pos_tags):
                tokens = sp.EncodeAsPieces(word)
                tokenized_text.extend(tokens)
                tokenized_tags.extend([tag] * len(tokens))
            
            token_file.write(" ".join(tokenized_text) + "\n")
            pos_file_out.write(" ".join(tokenized_tags) + "\n")


input_text_path = "Tatoeba.en-tr.tr-filtered.tr.train" 
input_pos_path = "Tatoeba_pos_tags_train" 
output_token_path = "Tatoeba_tokens_train_shared"
output_pos_path = "Tatoeba_pos_tags_train_shared.txt" 
model_path = "kk_tr_shared_vocab.model" 

tokenize_and_label_separate_files(input_text_path, input_pos_path, output_token_path, output_pos_path, model_path)


In [17]:
input_text_path = "Tatoeba.en-tr.tr-filtered.tr.dev"
input_pos_path = "Tatoeba_pos_tags_dev" 
output_token_path = "Tatoeba_tokens_dev_shared" 
output_pos_path = "Tatoeba_pos_tags_dev_shared.txt" 
model_path = "kk_tr_shared_vocab.model" 

tokenize_and_label_separate_files(input_text_path, input_pos_path, output_token_path, output_pos_path, model_path)

input_text_path = "Tatoeba.en-tr.tr-filtered.tr.test"
input_pos_path = "Tatoeba_pos_tags_test" 
output_token_path = "Tatoeba_tokens_test_shared" 
output_pos_path = "Tatoeba_pos_tags_test_shared.txt" 
model_path = "kk_tr_shared_vocab.model" 

tokenize_and_label_separate_files(input_text_path, input_pos_path, output_token_path, output_pos_path, model_path)

In [None]:
# Tatoeba tr_vocab subwording and subword POS tags
input_text_path = "Tatoeba.en-tr.tr-filtered.tr.train" 
input_pos_path = "Tatoeba_pos_tags_train" 
output_token_path = "Tatoeba_tokens_train" 
output_pos_path = "Tatoeba_pos_tags_train.txt" 
model_path = "kk_tr_shared_vocab.model" 

tokenize_and_label_separate_files(input_text_path, input_pos_path, output_token_path, output_pos_path, model_path)

input_text_path = "Tatoeba.en-tr.tr-filtered.tr.dev" 
input_pos_path = "Tatoeba_pos_tags_dev" 
output_token_path = "Tatoeba_tokens_dev" 
output_pos_path = "Tatoeba_pos_tags_dev.txt" 
model_path = "kk_tr_shared_vocab.model"

tokenize_and_label_separate_files(input_text_path, input_pos_path, output_token_path, output_pos_path, model_path)

input_text_path = "Tatoeba.en-tr.tr-filtered.tr.test"
input_pos_path = "Tatoeba_pos_tags_test" 
output_token_path = "Tatoeba_tokens_test" 
output_pos_path = "Tatoeba_pos_tags_test.txt" 
model_path = "kk_tr_shared_vocab.model" 

tokenize_and_label_separate_files(input_text_path, input_pos_path, output_token_path, output_pos_path, model_path)

In [19]:
#Kazakh shared_vocab subwording and subword units POS tags
import re
import sentencepiece as spm

def preprocess_text(text, pos_tags):
    
    text = re.sub(r"([.,!?;:()\"'])", r" \1 ", text)
    text = re.sub(r"\s{2,}", " ", text).strip()
    
    words = text.split()
    
    new_pos_tags = []
    i = 0  # pos_tags index'i
    for word in words:
        #if re.match(r"[.,!?;:()\"']", word):
            #new_pos_tags.append('Punc') 
            #pass
        #else:
            if i < len(pos_tags):
                new_pos_tags.append(pos_tags[i]) 
                i += 1

    return words, new_pos_tags

def tokenize_and_label_separate_files(input_text_path, input_pos_path, output_token_path, output_pos_path, model_path):

    sp = spm.SentencePieceProcessor()
    sp.Load(model_path)
    
    with open(input_text_path, 'r', encoding='utf-8') as text_file, \
         open(input_pos_path, 'r', encoding='utf-8') as pos_file, \
         open(output_token_path, 'w', encoding='utf-8') as token_file, \
         open(output_pos_path, 'w', encoding='utf-8') as pos_file_out:
        
        for text_line, pos_line in zip(text_file, pos_file):
            pos_tags = pos_line.strip().split()
            preprocessed_words, adjusted_pos_tags = preprocess_text(text_line, pos_tags)
            
            tokenized_text = []
            tokenized_tags = []
            
            for word, tag in zip(preprocessed_words, adjusted_pos_tags):
                tokens = sp.EncodeAsPieces(word)
                tokenized_text.extend(tokens)
                tokenized_tags.extend([tag] * len(tokens))
            
            token_file.write(" ".join(tokenized_text) + "\n")
            pos_file_out.write(" ".join(tokenized_tags) + "\n")


input_text_path = "kk_train_shuffled.txt-filtered.kk"
input_pos_path = "RoBERTa_KK_POS_train" 
output_token_path = "KK_tokens_train_shared"
output_pos_path = "KK_pos_tags_train_shared.txt" 
model_path = "kk_tr_shared_vocab.model"

tokenize_and_label_separate_files(input_text_path, input_pos_path, output_token_path, output_pos_path, model_path)


In [21]:
input_text_path = "kk_valid_shuffled.txt-filtered.kk"
input_pos_path = "RoBERTa_KK_POS_valid" 
output_token_path = "KK_tokens_valid_shared"
output_pos_path = "KK_pos_tags_valid_shared.txt" 
model_path = "kk_tr_shared_vocab.model" 

tokenize_and_label_separate_files(input_text_path, input_pos_path, output_token_path, output_pos_path, model_path)

input_text_path = "kk_test_shuffled.txt-filtered.kk" 
input_pos_path = "RoBERTa_KK_POS_test" 
output_token_path = "KK_tokens_test_shared" 
output_pos_path = "KK_pos_tags_test_shared.txt" 
model_path = "kk_tr_shared_vocab.model" 

tokenize_and_label_separate_files(input_text_path, input_pos_path, output_token_path, output_pos_path, model_path)

In [22]:
#Tatoeba corpus English side subwording
import re
import sentencepiece as spm

sp_model = spm.SentencePieceProcessor()
sp_model.Load("en_vocab.model")

def preprocess(text):

    text = re.sub(r'([.,!?;:()])', r' \1 ', text)

    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def tokenize_and_save(input_file, model, output_file):
    with open(input_file, 'r', encoding='utf-8') as f, open(output_file, 'w', encoding='utf-8') as fo:
        for line in f:
            preprocessed_line = preprocess(line.strip())
            tokens = model.encode_as_pieces(preprocessed_line)
            fo.write(" ".join(tokens) + "\n")

tokenize_and_save("Tatoeba.en-tr.en-filtered.en.train", sp_model, "Tatoeba_train_target_tokens.txt")
tokenize_and_save("Tatoeba.en-tr.en-filtered.en.dev", sp_model, "Tatoeba_dev_target_tokens.txt")
tokenize_and_save("Tatoeba.en-tr.en-filtered.en.test", sp_model, "Tatoeba_test_target_tokens.txt")

In [23]:
#Tatoeba corpus English side shared vocab subwording
import re
import sentencepiece as spm

sp_model = spm.SentencePieceProcessor()
sp_model.Load("en_shared_vocab.model")

def preprocess(text):

    text = re.sub(r'([.,!?;:()])', r' \1 ', text)

    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def tokenize_and_save(input_file, model, output_file):
    with open(input_file, 'r', encoding='utf-8') as f, open(output_file, 'w', encoding='utf-8') as fo:
        for line in f:
            preprocessed_line = preprocess(line.strip())
            tokens = model.encode_as_pieces(preprocessed_line)
            fo.write(" ".join(tokens) + "\n")

tokenize_and_save("Tatoeba.en-tr.en-filtered.en.train", sp_model, "Tatoeba_train_target_tokens_shared.txt")
tokenize_and_save("Tatoeba.en-tr.en-filtered.en.dev", sp_model, "Tatoeba_dev_target_tokens_shared.txt")
tokenize_and_save("Tatoeba.en-tr.en-filtered.en.test", sp_model, "Tatoeba_test_target_tokens_shared.txt")

In [24]:
#Kazakh corpus English side shared vocab subwording
import re
import sentencepiece as spm

sp_model = spm.SentencePieceProcessor()
sp_model.Load("en_shared_vocab.model")

def preprocess(text):

    text = re.sub(r'([.,!?;:()])', r' \1 ', text)

    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def tokenize_and_save(input_file, model, output_file):
    with open(input_file, 'r', encoding='utf-8') as f, open(output_file, 'w', encoding='utf-8') as fo:
        for line in f:
            preprocessed_line = preprocess(line.strip())
            tokens = model.encode_as_pieces(preprocessed_line)
            fo.write(" ".join(tokens) + "\n")

tokenize_and_save("en_train_shuffled.txt-filtered.en", sp_model, "KK_train_target_tokens_shared.txt")
tokenize_and_save("en_valid_shuffled.txt-filtered.en", sp_model, "KK_valid_target_tokens_shared.txt")
tokenize_and_save("en_test_shuffled.txt-filtered.en", sp_model, "KK_test_target_tokens_shared.txt")

In [25]:
# Tr-En(Tatoeba)(POS Tags) -> Kk-En(POS Tags)
!onmt-main --model kk-tr-en-modelim.py --config kk-tr-en-pos.yml --auto_config train --with_eval --num_gpus 2

2024-11-28 13:42:38.609568: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-28 13:42:39.408065: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-11-28 13:42:39.408129: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of

In [26]:
# Tr-En(Tatoeba)(POS Tags) -> Kk-En(POS Tags)
!onmt-main --model kk-tr-en-modelim.py --config kk-tr-en-pos-2.yml --auto_config --checkpoint_path POS_TR_KK_EN/ckpt-100000 train --with_eval --num_gpus 2

2024-11-29 08:24:55.914844: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-29 08:24:56.706744: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-11-29 08:24:56.706810: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of

In [27]:
!CUDA_VISIBLE_DEVICES=0 onmt-main --config kk-tr-en-pos-2.yml --auto_config --checkpoint_path POS_TR_KK_EN-2/ckpt-100000 infer --features_file KK_tokens_test_shared KK_pos_tags_test_shared.txt --predictions_file output_tr_kk_en_pos.txt

2024-11-30 03:05:27.557556: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-30 03:05:28.341958: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-11-30 03:05:28.342028: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of

In [None]:
!python3 MT-Preparation/subwording/3-desubword.py en_shared_vocab.model output_tr_kk_en_pos.txt

In [1]:
# BLEU and chrF scores
!python3 compute-bleu.py en_test_shuffled.txt-filtered.en output_tr_kk_en_pos.txt.desubword

Reference first sentence: In the developed world, this figure is 35 25%
Translated first sentence: In the developed countries of the world , this figure is 355%
That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.
BLEU:  BLEU = 52.47 73.0/56.8/46.6/39.2 (BP = 1.000 ratio = 1.106 hyp_len = 458384 ref_len = 414303)
CHRF:  chrF2 = 76.48


In [30]:
# Average METEOR score (Ortalama METEOR Puanı)
import nltk
from nltk.translate.meteor_score import meteor_score

def read_and_tokenize_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    return [nltk.word_tokenize(line.strip()) for line in lines]

def calculate_meteor(reference_file, hypothesis_file):
    references = read_and_tokenize_file(reference_file)
    hypotheses = read_and_tokenize_file(hypothesis_file)
    
    if len(references) != len(hypotheses):
        raise ValueError("Dosyaların satır sayıları eşleşmiyor")

    total_meteor_score = 0.0
    for ref, hyp in zip(references, hypotheses):
        total_meteor_score += meteor_score([ref], hyp)

    average_meteor_score = total_meteor_score / len(references)
    return average_meteor_score

reference_file = 'en_test_shuffled.txt-filtered.en'
hypothesis_file = 'output_tr_kk_en_pos.txt.desubword'

score = calculate_meteor(reference_file, hypothesis_file)
print(f"Ortalama METEOR Puanı: {score}") #Average METEOR score

Ortalama METEOR Puanı: 0.7632754220252569


In [31]:
#Kk-En (POS Tags) -> Tr-En (Tatoeba)(POS Tags)
!onmt-main --model kk-tr-en-modelim.py --config kk-tr-en-pos-asıl.yml --auto_config train --with_eval --num_gpus 2

2024-11-30 03:11:08.073872: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-30 03:11:08.827506: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-11-30 03:11:08.827586: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of

In [32]:
#Kk-En (POS Tags) -> Tr-En (Tatoeba)(POS Tags)
!onmt-main --model kk-tr-en-modelim.py --config kk-tr-en-pos-asıl-2.yml --auto_config --checkpoint_path POS_KK_TR_EN/ckpt-100000 train --with_eval --num_gpus 2

2024-11-30 22:15:22.840054: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-30 22:15:23.628115: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-11-30 22:15:23.628190: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of

In [33]:
!CUDA_VISIBLE_DEVICES=0 onmt-main --config kk-tr-en-pos-asıl-2.yml --auto_config --checkpoint_path POS_KK_TR_EN_2/ckpt-100000 infer --features_file Tatoeba_tokens_test_shared Tatoeba_pos_tags_test_shared.txt --predictions_file output_kk_tr_en_pos.txt

2024-12-01 15:47:25.160438: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-12-01 15:47:25.977042: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-12-01 15:47:25.977116: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of

In [None]:
!python3 MT-Preparation/subwording/3-desubword.py en_shared_vocab.model output_kk_tr_en_pos.txt

In [2]:
#BLEU and chrF scores
!python3 compute-bleu.py Tatoeba.en-tr.en-filtered.en.test output_kk_tr_en_pos.txt.desubword

Reference first sentence: I won't stay there very long.
Translated first sentence: I won't stay there very long .
That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.
BLEU:  BLEU = 60.01 80.3/64.6/54.3/46.1 (BP = 1.000 ratio = 1.018 hyp_len = 79015 ref_len = 77587)
CHRF:  chrF2 = 74.12


In [36]:
# Average METEOR score (Ortalama METEOR Puanı)
import nltk
from nltk.translate.meteor_score import meteor_score

def read_and_tokenize_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    return [nltk.word_tokenize(line.strip()) for line in lines]

def calculate_meteor(reference_file, hypothesis_file):
    references = read_and_tokenize_file(reference_file)
    hypotheses = read_and_tokenize_file(hypothesis_file)
    
    if len(references) != len(hypotheses):
        raise ValueError("Dosyaların satır sayıları eşleşmiyor")

    total_meteor_score = 0.0
    for ref, hyp in zip(references, hypotheses):
        total_meteor_score += meteor_score([ref], hyp)

    average_meteor_score = total_meteor_score / len(references)
    return average_meteor_score

reference_file = 'Tatoeba.en-tr.en-filtered.en.test'
hypothesis_file = 'output_kk_tr_en_pos.txt.desubword'

score = calculate_meteor(reference_file, hypothesis_file)
print(f"Ortalama METEOR Puanı: {score}") #Average METEOR score

Ortalama METEOR Puanı: 0.816043893864375


In [1]:
# Tr-En base model
!onmt-main --model kk-tr-en-shared.py --config tr-en.yml --auto_config train --with_eval --num_gpus 2

2024-12-05 09:43:44.127767: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-12-05 09:43:44.899212: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-12-05 09:43:44.899275: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of

In [2]:
!CUDA_VISIBLE_DEVICES=0 onmt-main --config tr-en.yml --auto_config --checkpoint_path TR-EN_std/ckpt-100000 infer --features_file Tatoeba_tokens_test --predictions_file output_tr_en_std.txt

2024-12-06 08:18:28.973271: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-12-06 08:18:29.782963: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-12-06 08:18:29.783034: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of

In [None]:
!python3 MT-Preparation/subwording/3-desubword.py en_vocab.model output_tr_en_std.txt

In [3]:
# BLEU and chrF scores
!python3 compute-bleu.py Tatoeba.en-tr.en-filtered.en.test output_tr_en_std.txt.desubword

Reference first sentence: I won't stay there very long.
Translated first sentence: I won't stay there for too long .
That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.
BLEU:  BLEU = 58.14 79.2/62.9/52.2/43.9 (BP = 1.000 ratio = 1.018 hyp_len = 78947 ref_len = 77587)
CHRF:  chrF2 = 72.67


In [6]:
# Average METEOR score (Ortalama METEOR Puanı)
import nltk
from nltk.translate.meteor_score import meteor_score

def read_and_tokenize_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    return [nltk.word_tokenize(line.strip()) for line in lines]

def calculate_meteor(reference_file, hypothesis_file):
    references = read_and_tokenize_file(reference_file)
    hypotheses = read_and_tokenize_file(hypothesis_file)
    
    if len(references) != len(hypotheses):
        raise ValueError("Dosyaların satır sayıları eşleşmiyor")

    total_meteor_score = 0.0
    for ref, hyp in zip(references, hypotheses):
        total_meteor_score += meteor_score([ref], hyp)

    average_meteor_score = total_meteor_score / len(references)
    return average_meteor_score

reference_file = 'Tatoeba.en-tr.en-filtered.en.test'
hypothesis_file = 'output_tr_en_std.txt.desubword'

score = calculate_meteor(reference_file, hypothesis_file)
print(f"Ortalama METEOR Puanı: {score}") #Average METEOR score

Ortalama METEOR Puanı: 0.8038217751678343


In [9]:
# Tr-En (POS Tags)
!onmt-main --model kk-tr-en-modelim.py --config tr-en-pos.yml --auto_config train --with_eval --num_gpus 2

2024-12-07 02:44:57.213104: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-12-07 02:44:58.011251: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-12-07 02:44:58.011331: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of

In [10]:
!CUDA_VISIBLE_DEVICES=0 onmt-main --config tr-en-pos.yml --auto_config --checkpoint_path POS_TR_EN/ckpt-100000 infer --features_file Tatoeba_tokens_test Tatoeba_pos_tags_test.txt --predictions_file output_tr_en_pos.txt

2024-12-07 20:00:00.786407: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-12-07 20:00:01.569765: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-12-07 20:00:01.569832: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of

In [None]:
!python3 MT-Preparation/subwording/3-desubword.py en_vocab.model output_tr_en_pos.txt

In [4]:
# BLEU and chrF scores
!python3 compute-bleu.py Tatoeba.en-tr.en-filtered.en.test output_tr_en_pos.txt.desubword

Reference first sentence: I won't stay there very long.
Translated first sentence: I won't stay there very long .
That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.
BLEU:  BLEU = 59.70 80.5/64.4/53.8/45.6 (BP = 1.000 ratio = 1.006 hyp_len = 78064 ref_len = 77587)
CHRF:  chrF2 = 73.20


In [13]:
# Average METEOR score (Ortalama METEOR Puanı)
import nltk
from nltk.translate.meteor_score import meteor_score

# İki dosyayı oku ve tokenleştir
def read_and_tokenize_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    return [nltk.word_tokenize(line.strip()) for line in lines]

def calculate_meteor(reference_file, hypothesis_file):
    references = read_and_tokenize_file(reference_file)
    hypotheses = read_and_tokenize_file(hypothesis_file)
    
    if len(references) != len(hypotheses):
        raise ValueError("Dosyaların satır sayıları eşleşmiyor")

    total_meteor_score = 0.0
    for ref, hyp in zip(references, hypotheses):
        total_meteor_score += meteor_score([ref], hyp)

    average_meteor_score = total_meteor_score / len(references)
    return average_meteor_score

reference_file = 'Tatoeba.en-tr.en-filtered.en.test'
hypothesis_file = 'output_tr_en_pos.txt.desubword'

score = calculate_meteor(reference_file, hypothesis_file)
print(f"Ortalama METEOR Puanı: {score}") #Average METEOR score

Ortalama METEOR Puanı: 0.8074712168919849
