In [None]:
# Select the GPU: expose only device 0 through CUDA_VISIBLE_DEVICES

import os
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})

In [None]:
# Load the libraries

from transformers import AutoTokenizer
from datasets import concatenate_datasets, Dataset
import pandas as pd

In [None]:
# A function for text vectorization (e.g. KK->RU)

def preprocessing(examples):
    """Tokenize a batch of sentence pairs in the source -> target direction.

    Parameters
    ----------
    examples : mapping
        Batch holding the texts to encode under "source_lang" and the texts
        passed to the tokenizer as ``text_target`` under "target_lang".

    Returns
    -------
    The object returned by the module-level ``tokenizer`` call (PyTorch
    tensors, truncated to at most 256 tokens and padded within the batch).

    Notes
    -----
    Uses the global ``tokenizer``, which the notebook rebinds before each
    translation direction, so the active language pair is whatever tokenizer
    was loaded last.
    """
    inputs = examples["source_lang"]
    targets = examples["target_lang"]
    # The tensors are intentionally left on the host. This function is used as a
    # `Dataset.map` callback, and the mapped values are stored back in the dataset,
    # so moving every batch to the GPU first only added a device round trip and
    # made tokenization depend on a CUDA device being available.
    # NOTE(review): padding=True pads each mapped batch to its longest sequence and
    # pads the targets with the tokenizer's pad token - confirm the training code
    # masks those positions in the loss.
    model_inputs = tokenizer(inputs, text_target=targets, return_tensors="pt", truncation=True,
                             padding=True, max_length=256)
    return model_inputs

In [None]:
# A function for text vectorization for the same data, but in the opposite direction (e.g. RU->KK)

def preprocessing_inverce(examples):
    """Tokenize a batch of sentence pairs in the target -> source direction.

    Parameters
    ----------
    examples : mapping
        Batch holding the texts to encode under "target_lang" and the texts
        passed to the tokenizer as ``text_target`` under "source_lang".

    Returns
    -------
    The object returned by the module-level ``tokenizer`` call (PyTorch
    tensors, truncated to at most 256 tokens and padded within the batch).

    Notes
    -----
    Uses the global ``tokenizer``, which the notebook rebinds before each
    translation direction, so the active language pair is whatever tokenizer
    was loaded last.
    """
    inputs = examples["target_lang"]
    targets = examples["source_lang"]
    # The tensors are intentionally left on the host. This function is used as a
    # `Dataset.map` callback, and the mapped values are stored back in the dataset,
    # so moving every batch to the GPU first only added a device round trip and
    # made tokenization depend on a CUDA device being available.
    # NOTE(review): padding=True pads each mapped batch to its longest sequence and
    # pads the targets with the tokenizer's pad token - confirm the training code
    # masks those positions in the loss.
    model_inputs = tokenizer(inputs, text_target=targets, return_tensors="pt", truncation=True,
                             padding=True, max_length=256)
    return model_inputs

In [None]:
# Read the synthetic EN-TR train and validation CSV files

en_tr_train_synt, en_tr_dev_synt = (
    pd.read_csv('./data/18_sync_train_en_tr.csv'),
    pd.read_csv('./data/24_sync_valid_en_tr.csv'),
)

In [None]:
# Read the KazParC EN-TR train and validation CSV files

en_tr_train_kazparc, en_tr_dev_kazparc = (
    pd.read_csv('./data/04_kazparc_train_en_tr.csv'),
    pd.read_csv('./data/10_kazparc_valid_en_tr.csv'),
)

In [None]:
# Rename the language columns to the generic names read by the tokenization functions

en_tr_column_names = {'en': 'source_lang', 'tr': 'target_lang'}
en_tr_train_kazparc = en_tr_train_kazparc.rename(columns=en_tr_column_names)
en_tr_dev_kazparc = en_tr_dev_kazparc.rename(columns=en_tr_column_names)
en_tr_train_synt = en_tr_train_synt.rename(columns=en_tr_column_names)
en_tr_dev_synt = en_tr_dev_synt.rename(columns=en_tr_column_names)

In [None]:
# Drop the columns that are not used in training to reduce the datasets' size.
# The frames are rebound instead of mutated in place, and errors='ignore' keeps the
# cell re-runnable while these variables are still DataFrames (the in-place version
# raised KeyError on a second run because the columns were already gone).

en_tr_train_synt = en_tr_train_synt.drop(columns=['id'], errors='ignore')
en_tr_dev_synt = en_tr_dev_synt.drop(columns=['id'], errors='ignore')
en_tr_train_kazparc = en_tr_train_kazparc.drop(columns=['id', 'domain'], errors='ignore')
en_tr_dev_kazparc = en_tr_dev_kazparc.drop(columns=['id', 'domain'], errors='ignore')

In [None]:
# Turn the four EN-TR pandas frames into Hugging Face Dataset objects (same variable names)

en_tr_train_synt, en_tr_dev_synt, en_tr_train_kazparc, en_tr_dev_kazparc = (
    Dataset.from_pandas(frame)
    for frame in (en_tr_train_synt, en_tr_dev_synt, en_tr_train_kazparc, en_tr_dev_kazparc)
)

In [None]:
# Merge synthetic and KazParC EN-TR data, separately for train and validation

en_tr_train_all, en_tr_dev_all = (
    concatenate_datasets([en_tr_train_synt, en_tr_train_kazparc]),
    concatenate_datasets([en_tr_dev_synt, en_tr_dev_kazparc]),
)

In [None]:
# Load the NLLB tokenizer configured for EN->TR (source: English, target: Turkish)

tokenizer = AutoTokenizer.from_pretrained(
    'facebook/nllb-200-distilled-1.3B',
    src_lang='eng_Latn',
    tgt_lang='tur_Latn',
)

In [None]:
# Tokenize the English -> Turkish direction for the train and validation splits

en_tr_tokenized_data_train, en_tr_tokenized_data_dev = (
    split.map(preprocessing, batched=True)
    for split in (en_tr_train_all, en_tr_dev_all)
)

In [None]:
# Load the NLLB tokenizer configured for TR->EN (source: Turkish, target: English)

tokenizer = AutoTokenizer.from_pretrained(
    'facebook/nllb-200-distilled-1.3B',
    src_lang='tur_Latn',
    tgt_lang='eng_Latn',
)

In [None]:
# Tokenize the Turkish -> English direction for the train and validation splits

en_tr_tokenized_data_train2, en_tr_tokenized_data_dev2 = (
    split.map(preprocessing_inverce, batched=True)
    for split in (en_tr_train_all, en_tr_dev_all)
)

In [None]:
# Start the combined train/dev datasets from the EN->TR and TR->EN tokenized data

dataset_train, dataset_dev = (
    concatenate_datasets([en_tr_tokenized_data_train, en_tr_tokenized_data_train2]),
    concatenate_datasets([en_tr_tokenized_data_dev, en_tr_tokenized_data_dev2]),
)

In [None]:
# Read the synthetic EN-RU train and validation CSV files

en_ru_train_synt, en_ru_dev_synt = (
    pd.read_csv('./data/17_sync_train_en_ru.csv'),
    pd.read_csv('./data/23_sync_valid_en_ru.csv'),
)

In [None]:
# Read the KazParC EN-RU train and validation CSV files

en_ru_train_kazparc, en_ru_dev_kazparc = (
    pd.read_csv('./data/03_kazparc_train_en_ru.csv'),
    pd.read_csv('./data/09_kazparc_valid_en_ru.csv'),
)

In [None]:
# Rename the language columns to the generic names read by the tokenization functions

en_ru_column_names = {'en': 'source_lang', 'ru': 'target_lang'}
en_ru_train_kazparc = en_ru_train_kazparc.rename(columns=en_ru_column_names)
en_ru_dev_kazparc = en_ru_dev_kazparc.rename(columns=en_ru_column_names)
en_ru_train_synt = en_ru_train_synt.rename(columns=en_ru_column_names)
en_ru_dev_synt = en_ru_dev_synt.rename(columns=en_ru_column_names)

In [None]:
# Drop the columns that are not used in training to reduce the datasets' size.
# The frames are rebound instead of mutated in place, and errors='ignore' keeps the
# cell re-runnable while these variables are still DataFrames (the in-place version
# raised KeyError on a second run because the columns were already gone).

en_ru_train_synt = en_ru_train_synt.drop(columns=['id'], errors='ignore')
en_ru_dev_synt = en_ru_dev_synt.drop(columns=['id'], errors='ignore')
en_ru_train_kazparc = en_ru_train_kazparc.drop(columns=['id', 'domain'], errors='ignore')
en_ru_dev_kazparc = en_ru_dev_kazparc.drop(columns=['id', 'domain'], errors='ignore')

In [None]:
# Turn the four EN-RU pandas frames into Hugging Face Dataset objects (same variable names)

en_ru_train_synt, en_ru_dev_synt, en_ru_train_kazparc, en_ru_dev_kazparc = (
    Dataset.from_pandas(frame)
    for frame in (en_ru_train_synt, en_ru_dev_synt, en_ru_train_kazparc, en_ru_dev_kazparc)
)

In [None]:
# Merge synthetic and KazParC EN-RU data, separately for train and validation

en_ru_train_all, en_ru_dev_all = (
    concatenate_datasets([en_ru_train_synt, en_ru_train_kazparc]),
    concatenate_datasets([en_ru_dev_synt, en_ru_dev_kazparc]),
)

In [None]:
# Load the NLLB tokenizer configured for EN->RU (source: English, target: Russian)

tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-1.3B",
    src_lang='eng_Latn',
    tgt_lang='rus_Cyrl',
)

In [None]:
# Tokenize the English -> Russian direction for the train and validation splits

en_ru_tokenized_data_train, en_ru_tokenized_data_dev = (
    split.map(preprocessing, batched=True)
    for split in (en_ru_train_all, en_ru_dev_all)
)

In [None]:
# Put the EN->RU tokenized data in front of the combined datasets.
# NOTE: this cell extends dataset_dev / dataset_train from their previous value,
# so running it more than once adds the EN->RU rows again.

dataset_dev, dataset_train = (
    concatenate_datasets([en_ru_tokenized_data_dev, dataset_dev]),
    concatenate_datasets([en_ru_tokenized_data_train, dataset_train]),
)

In [None]:
# Load the NLLB tokenizer configured for RU->EN (source: Russian, target: English)

tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-1.3B",
    src_lang='rus_Cyrl',
    tgt_lang='eng_Latn',
)

In [None]:
# Tokenize the Russian -> English direction for the train and validation splits

en_ru_tokenized_data_train2, en_ru_tokenized_data_dev2 = (
    split.map(preprocessing_inverce, batched=True)
    for split in (en_ru_train_all, en_ru_dev_all)
)

In [None]:
# Put the RU->EN tokenized data in front of the combined datasets.
# NOTE: this cell extends dataset_train / dataset_dev from their previous value,
# so running it more than once adds the RU->EN rows again.

dataset_train, dataset_dev = (
    concatenate_datasets([en_ru_tokenized_data_train2, dataset_train]),
    concatenate_datasets([en_ru_tokenized_data_dev2, dataset_dev]),
)

In [None]:
# Read the synthetic EN-KK train and validation CSV files

kk_en_train_synt, kk_en_dev_synt = (
    pd.read_csv('./data/16_sync_train_en_kk.csv'),
    pd.read_csv('./data/22_sync_valid_en_kk.csv'),
)

In [None]:
# Read the KazParC EN-KK train and validation CSV files

kk_en_train_kazparc, kk_en_dev_kazparc = (
    pd.read_csv('./data/02_kazparc_train_en_kk.csv'),
    pd.read_csv('./data/08_kazparc_valid_en_kk.csv'),
)

In [None]:
# Rename the language columns to the generic names read by the tokenization functions
# (Kazakh is the source column here, English the target)

kk_en_column_names = {'kk': 'source_lang', 'en': 'target_lang'}
kk_en_train_kazparc = kk_en_train_kazparc.rename(columns=kk_en_column_names)
kk_en_dev_kazparc = kk_en_dev_kazparc.rename(columns=kk_en_column_names)
kk_en_train_synt = kk_en_train_synt.rename(columns=kk_en_column_names)
kk_en_dev_synt = kk_en_dev_synt.rename(columns=kk_en_column_names)

In [None]:
# Drop the columns that are not used in training to reduce the datasets' size.
# The frames are rebound instead of mutated in place, and errors='ignore' keeps the
# cell re-runnable while these variables are still DataFrames (the in-place version
# raised KeyError on a second run because the columns were already gone).

kk_en_train_synt = kk_en_train_synt.drop(columns=['id'], errors='ignore')
kk_en_dev_synt = kk_en_dev_synt.drop(columns=['id'], errors='ignore')
kk_en_train_kazparc = kk_en_train_kazparc.drop(columns=['id', 'domain'], errors='ignore')
kk_en_dev_kazparc = kk_en_dev_kazparc.drop(columns=['id', 'domain'], errors='ignore')

In [None]:
# Turn the four KK-EN pandas frames into Hugging Face Dataset objects (same variable names)

kk_en_train_synt, kk_en_dev_synt, kk_en_train_kazparc, kk_en_dev_kazparc = (
    Dataset.from_pandas(frame)
    for frame in (kk_en_train_synt, kk_en_dev_synt, kk_en_train_kazparc, kk_en_dev_kazparc)
)

In [None]:
# Merge synthetic and KazParC KK-EN data, separately for train and validation

kk_en_train_all, kk_en_dev_all = (
    concatenate_datasets([kk_en_train_synt, kk_en_train_kazparc]),
    concatenate_datasets([kk_en_dev_synt, kk_en_dev_kazparc]),
)

In [None]:
# Load the NLLB tokenizer configured for KK->EN (source: Kazakh, target: English)

tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-1.3B",
    src_lang='kaz_Cyrl',
    tgt_lang='eng_Latn',
)

In [None]:
# Tokenize the Kazakh -> English direction for the train and validation splits

kk_en_tokenized_data_train, kk_en_tokenized_data_dev = (
    split.map(preprocessing, batched=True)
    for split in (kk_en_train_all, kk_en_dev_all)
)

In [None]:
# Append the KK->EN tokenized data to the combined datasets.
# NOTE: this cell extends dataset_train / dataset_dev from their previous value,
# so running it more than once adds the KK->EN rows again.

dataset_train, dataset_dev = (
    concatenate_datasets([dataset_train, kk_en_tokenized_data_train]),
    concatenate_datasets([dataset_dev, kk_en_tokenized_data_dev]),
)

In [None]:
# Load the NLLB tokenizer configured for EN->KK (source: English, target: Kazakh)

tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-1.3B",
    src_lang='eng_Latn',
    tgt_lang='kaz_Cyrl',
)

In [None]:
# Tokenize the English -> Kazakh direction for the train and validation splits

kk_en_tokenized_data_train2, kk_en_tokenized_data_dev2 = (
    split.map(preprocessing_inverce, batched=True)
    for split in (kk_en_train_all, kk_en_dev_all)
)

In [None]:
# Append the EN->KK tokenized data to the combined datasets.
# NOTE: this cell extends dataset_train / dataset_dev from their previous value,
# so running it more than once adds the EN->KK rows again.

dataset_train, dataset_dev = (
    concatenate_datasets([dataset_train, kk_en_tokenized_data_train2]),
    concatenate_datasets([dataset_dev, kk_en_tokenized_data_dev2]),
)

In [None]:
# Read the synthetic KK-RU train and validation CSV files

kk_ru_train_synt, kk_ru_dev_synt = (
    pd.read_csv('./data/19_sync_train_kk_ru.csv'),
    pd.read_csv('./data/25_sync_valid_kk_ru.csv'),
)

In [None]:
# Read the KazParC KK-RU train and validation CSV files

kk_ru_train_kazparc, kk_ru_dev_kazparc = (
    pd.read_csv('./data/05_kazparc_train_kk_ru.csv'),
    pd.read_csv('./data/11_kazparc_valid_kk_ru.csv'),
)

In [None]:
# Rename the language columns to the generic names read by the tokenization functions

kk_ru_column_names = {'kk': 'source_lang', 'ru': 'target_lang'}
kk_ru_train_kazparc = kk_ru_train_kazparc.rename(columns=kk_ru_column_names)
kk_ru_dev_kazparc = kk_ru_dev_kazparc.rename(columns=kk_ru_column_names)
kk_ru_train_synt = kk_ru_train_synt.rename(columns=kk_ru_column_names)
kk_ru_dev_synt = kk_ru_dev_synt.rename(columns=kk_ru_column_names)

In [None]:
# Drop the columns that are not used in training to reduce the datasets' size.
# The frames are rebound instead of mutated in place, and errors='ignore' keeps the
# cell re-runnable while these variables are still DataFrames (the in-place version
# raised KeyError on a second run because the columns were already gone).

kk_ru_train_synt = kk_ru_train_synt.drop(columns=['id'], errors='ignore')
kk_ru_dev_synt = kk_ru_dev_synt.drop(columns=['id'], errors='ignore')
kk_ru_train_kazparc = kk_ru_train_kazparc.drop(columns=['id', 'domain'], errors='ignore')
kk_ru_dev_kazparc = kk_ru_dev_kazparc.drop(columns=['id', 'domain'], errors='ignore')

In [None]:
# Turn the four KK-RU pandas frames into Hugging Face Dataset objects (same variable names)

kk_ru_train_synt, kk_ru_dev_synt, kk_ru_train_kazparc, kk_ru_dev_kazparc = (
    Dataset.from_pandas(frame)
    for frame in (kk_ru_train_synt, kk_ru_dev_synt, kk_ru_train_kazparc, kk_ru_dev_kazparc)
)

In [None]:
# Merge synthetic and KazParC KK-RU data, separately for train and validation

kk_ru_train_all, kk_ru_dev_all = (
    concatenate_datasets([kk_ru_train_synt, kk_ru_train_kazparc]),
    concatenate_datasets([kk_ru_dev_synt, kk_ru_dev_kazparc]),
)

In [None]:
# Load the NLLB tokenizer configured for KK->RU (source: Kazakh, target: Russian)

tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-1.3B",
    src_lang='kaz_Cyrl',
    tgt_lang='rus_Cyrl',
)

In [None]:
# Tokenize the Kazakh -> Russian direction for the train and validation splits

kk_ru_tokenized_data_train, kk_ru_tokenized_data_dev = (
    split.map(preprocessing, batched=True)
    for split in (kk_ru_train_all, kk_ru_dev_all)
)

In [None]:
# Append the KK->RU tokenized data to the combined datasets.
# NOTE: this cell extends dataset_train / dataset_dev from their previous value,
# so running it more than once adds the KK->RU rows again.

dataset_train, dataset_dev = (
    concatenate_datasets([dataset_train, kk_ru_tokenized_data_train]),
    concatenate_datasets([dataset_dev, kk_ru_tokenized_data_dev]),
)

In [None]:
# Load the NLLB tokenizer configured for RU->KK (source: Russian, target: Kazakh)

tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-1.3B",
    src_lang='rus_Cyrl',
    tgt_lang='kaz_Cyrl',
)

In [None]:
# Tokenize the Russian -> Kazakh direction for the train and validation splits

kk_ru_tokenized_data_train2, kk_ru_tokenized_data_dev2 = (
    split.map(preprocessing_inverce, batched=True)
    for split in (kk_ru_train_all, kk_ru_dev_all)
)

In [None]:
# Append the RU->KK tokenized data to the combined datasets.
# NOTE: this cell extends dataset_train / dataset_dev from their previous value,
# so running it more than once adds the RU->KK rows again.

dataset_train, dataset_dev = (
    concatenate_datasets([dataset_train, kk_ru_tokenized_data_train2]),
    concatenate_datasets([dataset_dev, kk_ru_tokenized_data_dev2]),
)

In [None]:
# Read the synthetic KK-TR train and validation CSV files

kk_tr_train_synt, kk_tr_dev_synt = (
    pd.read_csv('./data/20_sync_train_kk_tr.csv'),
    pd.read_csv('./data/26_sync_valid_kk_tr.csv'),
)

In [None]:
# Read the KazParC KK-TR train and validation CSV files

kk_tr_train_kazparc, kk_tr_dev_kazparc = (
    pd.read_csv('./data/06_kazparc_train_kk_tr.csv'),
    pd.read_csv('./data/12_kazparc_valid_kk_tr.csv'),
)

In [None]:
# Rename the language columns to the generic names read by the tokenization functions

kk_tr_column_names = {'kk': 'source_lang', 'tr': 'target_lang'}
kk_tr_train_kazparc = kk_tr_train_kazparc.rename(columns=kk_tr_column_names)
kk_tr_dev_kazparc = kk_tr_dev_kazparc.rename(columns=kk_tr_column_names)
kk_tr_train_synt = kk_tr_train_synt.rename(columns=kk_tr_column_names)
kk_tr_dev_synt = kk_tr_dev_synt.rename(columns=kk_tr_column_names)

In [None]:
# Drop the columns that are not used in training to reduce the datasets' size.
# The frames are rebound instead of mutated in place, and errors='ignore' keeps the
# cell re-runnable while these variables are still DataFrames (the in-place version
# raised KeyError on a second run because the columns were already gone).

kk_tr_train_synt = kk_tr_train_synt.drop(columns=['id'], errors='ignore')
kk_tr_dev_synt = kk_tr_dev_synt.drop(columns=['id'], errors='ignore')
kk_tr_train_kazparc = kk_tr_train_kazparc.drop(columns=['id', 'domain'], errors='ignore')
kk_tr_dev_kazparc = kk_tr_dev_kazparc.drop(columns=['id', 'domain'], errors='ignore')

In [None]:
# Turn the four KK-TR pandas frames into Hugging Face Dataset objects (same variable names)

kk_tr_train_synt, kk_tr_dev_synt, kk_tr_train_kazparc, kk_tr_dev_kazparc = (
    Dataset.from_pandas(frame)
    for frame in (kk_tr_train_synt, kk_tr_dev_synt, kk_tr_train_kazparc, kk_tr_dev_kazparc)
)

In [None]:
# Merge synthetic and KazParC KK-TR data, separately for train and validation

kk_tr_train_all, kk_tr_dev_all = (
    concatenate_datasets([kk_tr_train_synt, kk_tr_train_kazparc]),
    concatenate_datasets([kk_tr_dev_synt, kk_tr_dev_kazparc]),
)

In [None]:
# Load the NLLB tokenizer configured for KK->TR (source: Kazakh, target: Turkish)

tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-1.3B",
    src_lang='kaz_Cyrl',
    tgt_lang='tur_Latn',
)

In [None]:
# Tokenize the Kazakh -> Turkish direction for the train and validation splits

kk_tr_tokenized_data_train, kk_tr_tokenized_data_dev = (
    split.map(preprocessing, batched=True)
    for split in (kk_tr_train_all, kk_tr_dev_all)
)

In [None]:
# Append the KK->TR tokenized data to the combined datasets.
# NOTE: this cell extends dataset_train / dataset_dev from their previous value,
# so running it more than once adds the KK->TR rows again.

dataset_train, dataset_dev = (
    concatenate_datasets([dataset_train, kk_tr_tokenized_data_train]),
    concatenate_datasets([dataset_dev, kk_tr_tokenized_data_dev]),
)

In [None]:
# Load the NLLB tokenizer configured for TR->KK (source: Turkish, target: Kazakh)

tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-1.3B",
    src_lang='tur_Latn',
    tgt_lang='kaz_Cyrl',
)

In [None]:
# Tokenize the Turkish -> Kazakh direction for the train and validation splits

kk_tr_tokenized_data_train2, kk_tr_tokenized_data_dev2 = (
    split.map(preprocessing_inverce, batched=True)
    for split in (kk_tr_train_all, kk_tr_dev_all)
)

In [None]:
# Append the TR->KK tokenized data to the combined datasets.
# NOTE: this cell extends dataset_train / dataset_dev from their previous value,
# so running it more than once adds the TR->KK rows again.

dataset_train, dataset_dev = (
    concatenate_datasets([dataset_train, kk_tr_tokenized_data_train2]),
    concatenate_datasets([dataset_dev, kk_tr_tokenized_data_dev2]),
)

In [None]:
# Read the synthetic RU-TR train and validation CSV files

ru_tr_train_synt, ru_tr_dev_synt = (
    pd.read_csv('./data/21_sync_train_ru_tr.csv'),
    pd.read_csv('./data/27_sync_valid_ru_tr.csv'),
)

In [None]:
# Read the KazParC RU-TR train and validation CSV files

ru_tr_train_kazparc, ru_tr_dev_kazparc = (
    pd.read_csv('./data/07_kazparc_train_ru_tr.csv'),
    pd.read_csv('./data/13_kazparc_valid_ru_tr.csv'),
)

In [None]:
# Rename the language columns to the generic names read by the tokenization functions

ru_tr_column_names = {'ru': 'source_lang', 'tr': 'target_lang'}
ru_tr_train_kazparc = ru_tr_train_kazparc.rename(columns=ru_tr_column_names)
ru_tr_dev_kazparc = ru_tr_dev_kazparc.rename(columns=ru_tr_column_names)
ru_tr_train_synt = ru_tr_train_synt.rename(columns=ru_tr_column_names)
ru_tr_dev_synt = ru_tr_dev_synt.rename(columns=ru_tr_column_names)

In [None]:
# Drop the columns that are not used in training to reduce the datasets' size.
# The frames are rebound instead of mutated in place, and errors='ignore' keeps the
# cell re-runnable while these variables are still DataFrames (the in-place version
# raised KeyError on a second run because the columns were already gone).

ru_tr_train_synt = ru_tr_train_synt.drop(columns=['id'], errors='ignore')
ru_tr_dev_synt = ru_tr_dev_synt.drop(columns=['id'], errors='ignore')
ru_tr_train_kazparc = ru_tr_train_kazparc.drop(columns=['id', 'domain'], errors='ignore')
ru_tr_dev_kazparc = ru_tr_dev_kazparc.drop(columns=['id', 'domain'], errors='ignore')

In [None]:
# Turn the four RU-TR pandas frames into Hugging Face Dataset objects (same variable names)

ru_tr_train_synt, ru_tr_dev_synt, ru_tr_train_kazparc, ru_tr_dev_kazparc = (
    Dataset.from_pandas(frame)
    for frame in (ru_tr_train_synt, ru_tr_dev_synt, ru_tr_train_kazparc, ru_tr_dev_kazparc)
)

In [None]:
# Merge synthetic and KazParC RU-TR data, separately for train and validation

ru_tr_train_all, ru_tr_dev_all = (
    concatenate_datasets([ru_tr_train_synt, ru_tr_train_kazparc]),
    concatenate_datasets([ru_tr_dev_synt, ru_tr_dev_kazparc]),
)

In [None]:
# Load the NLLB tokenizer configured for RU->TR (source: Russian, target: Turkish)

tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-1.3B",
    src_lang='rus_Cyrl',
    tgt_lang='tur_Latn',
)

In [None]:
# Tokenize the Russian -> Turkish direction for the train and validation splits

ru_tr_tokenized_data_train, ru_tr_tokenized_data_dev = (
    split.map(preprocessing, batched=True)
    for split in (ru_tr_train_all, ru_tr_dev_all)
)

In [None]:
# Append the RU->TR tokenized data to the combined datasets.
# NOTE: this cell extends dataset_train / dataset_dev from their previous value,
# so running it more than once adds the RU->TR rows again.

dataset_train, dataset_dev = (
    concatenate_datasets([dataset_train, ru_tr_tokenized_data_train]),
    concatenate_datasets([dataset_dev, ru_tr_tokenized_data_dev]),
)

In [None]:
# Load the NLLB tokenizer configured for TR->RU (source: Turkish, target: Russian)

tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-1.3B",
    src_lang='tur_Latn',
    tgt_lang='rus_Cyrl',
)

In [None]:
# Tokenize the Turkish -> Russian direction for the train and validation splits

ru_tr_tokenized_data_train2, ru_tr_tokenized_data_dev2 = (
    split.map(preprocessing_inverce, batched=True)
    for split in (ru_tr_train_all, ru_tr_dev_all)
)

In [None]:
# Append the TR->RU tokenized data to the combined datasets.
# NOTE: this cell extends dataset_train / dataset_dev from their previous value,
# so running it more than once adds the TR->RU rows again.

dataset_train, dataset_dev = (
    concatenate_datasets([dataset_train, ru_tr_tokenized_data_train2]),
    concatenate_datasets([dataset_dev, ru_tr_tokenized_data_dev2]),
)

In [None]:
# Write the combined train and dev datasets to local directories

train_output_dir, dev_output_dir = './kazparc_train/', './kazparc_dev/'
dataset_train.save_to_disk(train_output_dir)
dataset_dev.save_to_disk(dev_output_dir)