# Run in kaggle T4 * 2 gpu

In [1]:
%pip install transformers
%pip install datasets
%pip install accelerate

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import warnings
warnings.filterwarnings("ignore")
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModel
import torch
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
from accelerate import Accelerator

In [3]:
if torch.cuda.is_available():
    # Get the number of GPUs available
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")
    
    # Print out the name of each GPU and memory details
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"Total memory: {torch.cuda.get_device_properties(i).total_memory / 1e9} GB")
    
    # If multiple GPUs are available, use DataParallel for multi-GPU
    if num_gpus > 1:
        print("Using DataParallel for multi-GPU training.")
        # Example: Wrap your model with DataParallel
    else:
        print("Only one GPU available, using single GPU mode.")
else:
    print("No GPUs available, using CPU.")
    
def setup_translation_pipeline(source_lang, target_lang):
    """
    Set up a translation pipeline using a pretrained model from Helsinki-NLP, optimized for multi-GPU with Accelerate.

    Args:
        source_lang (str): The source language code (e.g., "en" for English).
        target_lang (str): The target language code (e.g., "fr" for French).

    Returns:
        transformers.Pipeline: A translation pipeline configured for the given language pair.
    """
    # Define the model name based on the source and target languages
    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    
    # Load the tokenizer for the specified model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Load the translation model with memory optimization and half-precision support
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        return_dict=True,  # Use a dictionary for the model's output for easier access
        low_cpu_mem_usage=True,  # Optimize for low memory usage during model loading
        torch_dtype=torch.float16,  # Use half-precision for faster computation (requires supported GPU)
        trust_remote_code=True,  # Trust the model's remote configuration if specified
    )
    
    # Use the Accelerate library for better multi-GPU support
    accelerator = Accelerator()

    # Move the model to the appropriate device using Accelerator (handles multi-GPU)
    model = accelerator.prepare(model)

    # Set up and return the translation pipeline
    translation_pipeline = pipeline(
        "translation",  # Specify the task as translation
        model=model,  # Provide the loaded model
        tokenizer=tokenizer,  # Provide the loaded tokenizer
        torch_dtype=torch.float16,  # Use half-precision for the pipeline
        max_length=512,
        truncation=True,
        device=accelerator.device  # Use the device from Accelerator (handles multi-GPU)
    )

    return translation_pipeline


Number of GPUs available: 2
GPU 0: Tesla T4
Total memory: 15.828320256 GB
GPU 1: Tesla T4
Total memory: 15.828320256 GB
Using DataParallel for multi-GPU training.


In [4]:
# Function to translate and retranslate for a given target language
def translate_retranslate(ds, source_lang, target_lang, batch_size=32):
    """
    Translate the dataset's 'comment_sentence' column to a target language and back to the source language.
    
    Args:
        ds (Dataset): Hugging Face dataset containing a 'comment_sentence' column.
        source_lang (str): Source language code (e.g., "en" for English).
        target_lang (str): Target language code to translate to and back.
        batch_size (int): Number of rows to process in a single batch.

    Returns:
        Dataset: Hugging Face dataset with a new column for retranslated texts.
    """
    # Set up translation pipelines
    pipe_to_target = setup_translation_pipeline(source_lang, target_lang)  # English to target language
    pipe_to_source = setup_translation_pipeline(target_lang, source_lang)  # Target language back to English

    retranslated_texts = []

    # Process the dataset in batches to optimize memory usage and speed
    for i in tqdm(range(0, len(ds), batch_size), desc=f"Translating to {target_lang} and back"):
        # Extract a batch of 'comment_sentence' without converting to pandas
        batch = ds[i: i + batch_size]["comment_sentence"]

        # Step 1: Translate from source language to target language
        target_translations = pipe_to_target(batch, max_length=512, do_sample=False)
        target_texts = [result["translation_text"] for result in target_translations]

        # Step 2: Translate back to the source language (English)
        english_translations = pipe_to_source(target_texts, max_length=512, do_sample=False)
        retranslated_batch = [result["translation_text"].strip() for result in english_translations]

        # Append the retranslated texts to the result list
        retranslated_texts.extend(retranslated_batch)
                # Clear CUDA cache to prevent memory overload
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Add the retranslated texts as a new column in the dataset
    ds = ds.add_column(f"retranslated_text_english_from_{target_lang}", retranslated_texts)
    return ds

# Load the dataset (e.g., the first 10 rows from the Python training split)
df = load_dataset("NLBSE/nlbse25-code-comment-classification", split="pharo_train")

# List of target languages to process for retranslation
languages = ["de", "fr", "zh", "hi", "es", "ru","ar","nl","it","ca","gl","grk","trk","uk","ur","vi","roa","sv"]

# Loop through each language and retranslate
for lang in tqdm(languages, desc="Processing languages"):
    df = translate_retranslate(df, "en", lang)


README.md:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

java_train-00000-of-00001.parquet:   0%|          | 0.00/680k [00:00<?, ?B/s]

java_test-00000-of-00001.parquet:   0%|          | 0.00/174k [00:00<?, ?B/s]

python_train-00000-of-00001.parquet:   0%|          | 0.00/126k [00:00<?, ?B/s]

python_test-00000-of-00001.parquet:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

pharo_train-00000-of-00001.parquet:   0%|          | 0.00/113k [00:00<?, ?B/s]

pharo_test-00000-of-00001.parquet:   0%|          | 0.00/30.6k [00:00<?, ?B/s]

Generating java_train split:   0%|          | 0/7614 [00:00<?, ? examples/s]

Generating java_test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Generating python_train split:   0%|          | 0/1884 [00:00<?, ? examples/s]

Generating python_test split:   0%|          | 0/406 [00:00<?, ? examples/s]

Generating pharo_train split:   0%|          | 0/1298 [00:00<?, ? examples/s]

Generating pharo_test split:   0%|          | 0/289 [00:00<?, ? examples/s]

Processing languages:   0%|          | 0/18 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to de and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to de and back:   2%|▏         | 1/41 [00:11<07:28, 11.22s/it][A
Translating to de and back:   5%|▍         | 2/41 [00:21<06:51, 10.56s/it][A
Translating to de and back:   7%|▋         | 3/41 [00:33<07:04, 11.17s/it][A
Translating to de and back:  10%|▉         | 4/41 [00:47<07:34, 12.27s/it][A
Translating to de and back:  12%|█▏        | 5/41 [01:02<08:04, 13.46s/it][A
Translating to de and back:  15%|█▍        | 6/41 [01:14<07:31, 12.90s/it][A
Translating to de and back:  17%|█▋        | 7/41 [01:26<07:12, 12.71s/it][A
Translating to de and back:  20%|█▉        | 8/41 [01:37<06:39, 12.10s/it][A
Translating to de and back:  22%|██▏       | 9/41 [01:46<05:55, 11.12s/it][A
Translating to de and back:  24%|██▍       | 10/41 [02:02<06:32, 12.65s/it][AYou seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset

Translating to de and back:  27%|██▋   

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to fr and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to fr and back:   2%|▏         | 1/41 [00:10<06:46, 10.16s/it][A
Translating to fr and back:   5%|▍         | 2/41 [00:20<06:34, 10.10s/it][A
Translating to fr and back:   7%|▋         | 3/41 [00:32<06:55, 10.94s/it][A
Translating to fr and back:  10%|▉         | 4/41 [00:46<07:29, 12.14s/it][A
Translating to fr and back:  12%|█▏        | 5/41 [00:55<06:46, 11.29s/it][A
Translating to fr and back:  15%|█▍        | 6/41 [01:07<06:40, 11.45s/it][A
Translating to fr and back:  17%|█▋        | 7/41 [01:19<06:37, 11.70s/it][A
Translating to fr and back:  20%|█▉        | 8/41 [01:30<06:16, 11.39s/it][A
Translating to fr and back:  22%|██▏       | 9/41 [01:40<05:44, 10.76s/it][A
Translating to fr and back:  24%|██▍       | 10/41 [01:48<05:09,  9.97s/it][A
Translating to fr and back:  27%|██▋       | 11/41 [02:02<05:41, 11.37s/it][A
Translating to fr and back:  29%|██▉       | 12/41 [02:19<06:15, 12.9

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/806k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to zh and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to zh and back:   2%|▏         | 1/41 [00:09<06:10,  9.25s/it][A
Translating to zh and back:   5%|▍         | 2/41 [00:19<06:23,  9.83s/it][A
Translating to zh and back:   7%|▋         | 3/41 [00:30<06:42, 10.59s/it][A
Translating to zh and back:  10%|▉         | 4/41 [00:45<07:22, 11.95s/it][A
Translating to zh and back:  12%|█▏        | 5/41 [00:54<06:43, 11.22s/it][A
Translating to zh and back:  15%|█▍        | 6/41 [01:07<06:44, 11.56s/it][A
Translating to zh and back:  17%|█▋        | 7/41 [01:23<07:29, 13.22s/it][A
Translating to zh and back:  20%|█▉        | 8/41 [01:35<07:05, 12.88s/it][A
Translating to zh and back:  22%|██▏       | 9/41 [01:45<06:17, 11.79s/it][A
Translating to zh and back:  24%|██▍       | 10/41 [01:54<05:44, 11.11s/it][AYour input_length: 510 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_lengt

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/304M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to hi and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to hi and back:   2%|▏         | 1/41 [00:09<06:28,  9.72s/it][A
Translating to hi and back:   5%|▍         | 2/41 [00:19<06:14,  9.59s/it][A
Translating to hi and back:   7%|▋         | 3/41 [00:29<06:22, 10.06s/it][A
Translating to hi and back:  10%|▉         | 4/41 [00:43<06:59, 11.34s/it][A
Translating to hi and back:  12%|█▏        | 5/41 [00:52<06:19, 10.56s/it][A
Translating to hi and back:  15%|█▍        | 6/41 [01:02<06:05, 10.43s/it][A
Translating to hi and back:  17%|█▋        | 7/41 [01:13<05:59, 10.58s/it][A
Translating to hi and back:  20%|█▉        | 8/41 [01:23<05:39, 10.30s/it][A
Translating to hi and back:  22%|██▏       | 9/41 [01:31<05:06,  9.58s/it][A
Translating to hi and back:  24%|██▍       | 10/41 [01:38<04:38,  8.97s/it][A
Translating to hi and back:  27%|██▋       | 11/41 [01:53<05:20, 10.69s/it][A
Translating to hi and back:  29%|██▉       | 12/41 [02:09<06:00, 12.4

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to es and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to es and back:   2%|▏         | 1/41 [00:09<06:33,  9.83s/it][A
Translating to es and back:   5%|▍         | 2/41 [00:25<08:44, 13.46s/it][A
Translating to es and back:   7%|▋         | 3/41 [00:37<08:00, 12.66s/it][A
Translating to es and back:  10%|▉         | 4/41 [00:51<08:02, 13.03s/it][A
Translating to es and back:  12%|█▏        | 5/41 [01:00<07:05, 11.82s/it][A
Translating to es and back:  15%|█▍        | 6/41 [01:12<06:52, 11.78s/it][A
Translating to es and back:  17%|█▋        | 7/41 [01:24<06:45, 11.93s/it][A
Translating to es and back:  20%|█▉        | 8/41 [01:35<06:26, 11.71s/it][A
Translating to es and back:  22%|██▏       | 9/41 [01:45<05:50, 10.94s/it][A
Translating to es and back:  24%|██▍       | 10/41 [01:52<05:08,  9.95s/it][A
Translating to es and back:  27%|██▋       | 11/41 [02:07<05:41, 11.40s/it][A
Translating to es and back:  29%|██▉       | 12/41 [02:24<06:19, 13.0

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to ru and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to ru and back:   2%|▏         | 1/41 [00:10<07:16, 10.92s/it][A
Translating to ru and back:   5%|▍         | 2/41 [00:21<07:04, 10.88s/it][A
Translating to ru and back:   7%|▋         | 3/41 [00:33<07:12, 11.39s/it][A
Translating to ru and back:  10%|▉         | 4/41 [00:48<07:44, 12.56s/it][A
Translating to ru and back:  12%|█▏        | 5/41 [00:57<06:54, 11.52s/it][A
Translating to ru and back:  15%|█▍        | 6/41 [01:09<06:42, 11.49s/it][A
Translating to ru and back:  17%|█▋        | 7/41 [01:22<06:46, 11.96s/it][A
Translating to ru and back:  20%|█▉        | 8/41 [01:34<06:39, 12.11s/it][A
Translating to ru and back:  22%|██▏       | 9/41 [01:45<06:13, 11.68s/it][A
Translating to ru and back:  24%|██▍       | 10/41 [01:56<05:55, 11.48s/it][A
Translating to ru and back:  27%|██▋       | 11/41 [02:11<06:18, 12.62s/it][A
Translating to ru and back:  29%|██▉       | 12/41 [02:29<06:54, 14.3

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to ar and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to ar and back:   2%|▏         | 1/41 [00:16<10:55, 16.38s/it][AYour input_length: 512 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)

Translating to ar and back:   5%|▍         | 2/41 [00:45<15:24, 23.71s/it][A
Translating to ar and back:   7%|▋         | 3/41 [01:00<12:28, 19.70s/it][A
Translating to ar and back:  10%|▉         | 4/41 [01:13<10:38, 17.27s/it][A
Translating to ar and back:  12%|█▏        | 5/41 [01:23<08:38, 14.40s/it][A
Translating to ar and back:  15%|█▍        | 6/41 [01:38<08:33, 14.69s/it][A
Translating to ar and back:  17%|█▋        | 7/41 [02:04<10:24, 18.37s/it][A
Translating to ar and back:  20%|█▉        | 8/41 [02:51<15:07, 27.49s/it][AYour input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/316M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/316M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to nl and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to nl and back:   2%|▏         | 1/41 [00:10<07:06, 10.66s/it][A
Translating to nl and back:   5%|▍         | 2/41 [00:20<06:35, 10.15s/it][A
Translating to nl and back:   7%|▋         | 3/41 [00:31<06:48, 10.74s/it][A
Translating to nl and back:  10%|▉         | 4/41 [00:46<07:31, 12.19s/it][A
Translating to nl and back:  12%|█▏        | 5/41 [00:58<07:23, 12.32s/it][A
Translating to nl and back:  15%|█▍        | 6/41 [01:10<07:01, 12.05s/it][A
Translating to nl and back:  17%|█▋        | 7/41 [01:22<06:54, 12.18s/it][A
Translating to nl and back:  20%|█▉        | 8/41 [01:33<06:28, 11.77s/it][A
Translating to nl and back:  22%|██▏       | 9/41 [01:43<05:53, 11.04s/it][A
Translating to nl and back:  24%|██▍       | 10/41 [01:51<05:17, 10.25s/it][A
Translating to nl and back:  27%|██▋       | 11/41 [02:06<05:48, 11.61s/it][A
Translating to nl and back:  29%|██▉       | 12/41 [02:22<06:17, 13.0

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/789k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/343M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.37M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/344M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to it and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to it and back:   2%|▏         | 1/41 [00:11<07:51, 11.79s/it][A
Translating to it and back:   5%|▍         | 2/41 [00:22<07:09, 11.02s/it][A
Translating to it and back:   7%|▋         | 3/41 [00:34<07:26, 11.74s/it][A
Translating to it and back:  10%|▉         | 4/41 [00:49<07:57, 12.91s/it][A
Translating to it and back:  12%|█▏        | 5/41 [01:00<07:15, 12.10s/it][A
Translating to it and back:  15%|█▍        | 6/41 [01:12<07:04, 12.13s/it][A
Translating to it and back:  17%|█▋        | 7/41 [01:25<07:02, 12.43s/it][A
Translating to it and back:  20%|█▉        | 8/41 [01:37<06:47, 12.36s/it][A
Translating to it and back:  22%|██▏       | 9/41 [01:47<06:13, 11.67s/it][A
Translating to it and back:  24%|██▍       | 10/41 [01:56<05:32, 10.71s/it][A
Translating to it and back:  27%|██▋       | 11/41 [02:11<06:04, 12.16s/it][A
Translating to it and back:  29%|██▉       | 12/41 [02:29<06:41, 13.8

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/815k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/292M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/815k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/292M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to ca and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to ca and back:   2%|▏         | 1/41 [00:09<06:35,  9.88s/it][A
Translating to ca and back:   5%|▍         | 2/41 [00:20<06:35, 10.15s/it][A
Translating to ca and back:   7%|▋         | 3/41 [00:31<06:45, 10.66s/it][A
Translating to ca and back:  10%|▉         | 4/41 [00:45<07:20, 11.90s/it][A
Translating to ca and back:  12%|█▏        | 5/41 [00:55<06:43, 11.20s/it][A
Translating to ca and back:  15%|█▍        | 6/41 [01:06<06:34, 11.28s/it][A
Translating to ca and back:  17%|█▋        | 7/41 [01:18<06:30, 11.49s/it][A
Translating to ca and back:  20%|█▉        | 8/41 [01:29<06:08, 11.17s/it][A
Translating to ca and back:  22%|██▏       | 9/41 [01:37<05:31, 10.36s/it][A
Translating to ca and back:  24%|██▍       | 10/41 [01:45<04:59,  9.67s/it][A
Translating to ca and back:  27%|██▋       | 11/41 [01:59<05:31, 11.03s/it][A
Translating to ca and back:  29%|██▉       | 12/41 [02:14<05:54, 12.2

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/800k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/820k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/293M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/450k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/443k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/472k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/222M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to gl and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to gl and back:   2%|▏         | 1/41 [00:09<06:24,  9.60s/it][A
Translating to gl and back:   5%|▍         | 2/41 [00:18<06:01,  9.28s/it][A
Translating to gl and back:   7%|▋         | 3/41 [00:29<06:11,  9.78s/it][A
Translating to gl and back:  10%|▉         | 4/41 [00:41<06:44, 10.93s/it][A
Translating to gl and back:  12%|█▏        | 5/41 [00:50<06:11, 10.31s/it][A
Translating to gl and back:  15%|█▍        | 6/41 [01:01<06:06, 10.46s/it][A
Translating to gl and back:  17%|█▋        | 7/41 [01:12<06:03, 10.70s/it][A
Translating to gl and back:  20%|█▉        | 8/41 [01:22<05:44, 10.44s/it][A
Translating to gl and back:  22%|██▏       | 9/41 [01:31<05:14,  9.83s/it][A
Translating to gl and back:  24%|██▍       | 10/41 [01:38<04:37,  8.96s/it][A
Translating to gl and back:  27%|██▋       | 11/41 [01:51<05:03, 10.12s/it][A
Translating to gl and back:  29%|██▉       | 12/41 [02:05<05:30, 11.4

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/441k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/507k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/802k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/225M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/507k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/441k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/802k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/225M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to grk and back:   0%|          | 0/41 [00:00<?, ?it/s][AYour input_length: 510 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)

Translating to grk and back:   2%|▏         | 1/41 [00:36<24:37, 36.94s/it][A
Translating to grk and back:   5%|▍         | 2/41 [00:58<18:11, 27.99s/it][AYour input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)

Translating to grk and back:   7%|▋         | 3/41 [02:01<27:56, 44.11s/it][AYour input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 507 is bigger than 0.9 * max_length: 512. You mig

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/804k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/908k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.97M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/305M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/922k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/799k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/305M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to trk and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to trk and back:   2%|▏         | 1/41 [00:13<09:00, 13.50s/it][A
Translating to trk and back:   5%|▍         | 2/41 [00:27<08:55, 13.74s/it][A
Translating to trk and back:   7%|▋         | 3/41 [00:40<08:35, 13.57s/it][A
Translating to trk and back:  10%|▉         | 4/41 [00:57<09:14, 14.99s/it][A
Translating to trk and back:  12%|█▏        | 5/41 [01:10<08:30, 14.19s/it][A
Translating to trk and back:  15%|█▍        | 6/41 [01:24<08:06, 13.89s/it][A
Translating to trk and back:  17%|█▋        | 7/41 [01:36<07:41, 13.57s/it][A
Translating to trk and back:  20%|█▉        | 8/41 [01:49<07:17, 13.24s/it][A
Translating to trk and back:  22%|██▏       | 9/41 [01:59<06:31, 12.24s/it][A
Translating to trk and back:  24%|██▍       | 10/41 [02:15<06:57, 13.47s/it][A
Translating to trk and back:  27%|██▋       | 11/41 [02:32<07:16, 14.55s/it][A
Translating to trk and back:  29%|██▉       | 12/41 [02:5

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.37M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/305M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.37M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/305M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to uk and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to uk and back:   2%|▏         | 1/41 [00:10<06:56, 10.42s/it][A
Translating to uk and back:   5%|▍         | 2/41 [00:23<07:51, 12.08s/it][A
Translating to uk and back:   7%|▋         | 3/41 [00:34<07:22, 11.65s/it][A
Translating to uk and back:  10%|▉         | 4/41 [00:48<07:47, 12.63s/it][A
Translating to uk and back:  12%|█▏        | 5/41 [00:57<06:46, 11.29s/it][A
Translating to uk and back:  15%|█▍        | 6/41 [01:09<06:36, 11.32s/it][A
Translating to uk and back:  17%|█▋        | 7/41 [01:21<06:33, 11.57s/it][A
Translating to uk and back:  20%|█▉        | 8/41 [01:30<05:58, 10.85s/it][A
Translating to uk and back:  22%|██▏       | 9/41 [01:39<05:27, 10.22s/it][A
Translating to uk and back:  24%|██▍       | 10/41 [01:47<04:57,  9.60s/it][A
Translating to uk and back:  27%|██▋       | 11/41 [02:01<05:30, 11.03s/it][A
Translating to uk and back:  29%|██▉       | 12/41 [02:17<06:03, 12.5

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/816k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/848k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/848k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/816k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to ur and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to ur and back:   2%|▏         | 1/41 [00:10<07:05, 10.64s/it][A
Translating to ur and back:   5%|▍         | 2/41 [00:20<06:44, 10.38s/it][A
Translating to ur and back:   7%|▋         | 3/41 [00:33<07:06, 11.23s/it][A
Translating to ur and back:  10%|▉         | 4/41 [00:47<07:37, 12.36s/it][A
Translating to ur and back:  12%|█▏        | 5/41 [00:57<06:59, 11.66s/it][A
Translating to ur and back:  15%|█▍        | 6/41 [01:08<06:38, 11.39s/it][A
Translating to ur and back:  17%|█▋        | 7/41 [01:22<06:59, 12.34s/it][A
Translating to ur and back:  20%|█▉        | 8/41 [01:31<06:13, 11.32s/it][A
Translating to ur and back:  22%|██▏       | 9/41 [01:41<05:45, 10.80s/it][A
Translating to ur and back:  24%|██▍       | 10/41 [01:50<05:17, 10.26s/it][A
Translating to ur and back:  27%|██▋       | 11/41 [02:08<06:22, 12.74s/it][A
Translating to ur and back:  29%|██▉       | 12/41 [02:22<06:12, 12.8

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/756k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/289M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/756k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/289M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to vi and back:   0%|          | 0/41 [00:00<?, ?it/s][AYour input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)

Translating to vi and back:   2%|▏         | 1/41 [00:39<26:31, 39.79s/it][AYour input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)

Translating to vi and back:   5%|▍         | 2/41 [01:02<19:30, 30.02s/

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/786k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/793k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/295M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/800k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/779k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to roa and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to roa and back:   2%|▏         | 1/41 [00:15<10:25, 15.64s/it][A
Translating to roa and back:   5%|▍         | 2/41 [00:26<08:30, 13.09s/it][A
Translating to roa and back:   7%|▋         | 3/41 [00:39<08:13, 12.98s/it][A
Translating to roa and back:  10%|▉         | 4/41 [00:55<08:43, 14.16s/it][A
Translating to roa and back:  12%|█▏        | 5/41 [01:06<07:45, 12.94s/it][A
Translating to roa and back:  15%|█▍        | 6/41 [01:19<07:29, 12.85s/it][A
Translating to roa and back:  17%|█▋        | 7/41 [01:32<07:20, 12.95s/it][A
Translating to roa and back:  20%|█▉        | 8/41 [01:44<06:54, 12.57s/it][A
Translating to roa and back:  22%|██▏       | 9/41 [01:54<06:18, 11.81s/it][A
Translating to roa and back:  24%|██▍       | 10/41 [02:09<06:38, 12.87s/it][A
Translating to roa and back:  27%|██▋       | 11/41 [02:25<06:52, 13.74s/it][A
Translating to roa and back:  29%|██▉       | 12/41 [02:4

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/815k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/295M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/815k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/295M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to sv and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to sv and back:   2%|▏         | 1/41 [00:09<06:32,  9.82s/it][A
Translating to sv and back:   5%|▍         | 2/41 [00:19<06:15,  9.62s/it][A
Translating to sv and back:   7%|▋         | 3/41 [00:30<06:27, 10.20s/it][A
Translating to sv and back:  10%|▉         | 4/41 [00:43<06:56, 11.26s/it][A
Translating to sv and back:  12%|█▏        | 5/41 [00:52<06:17, 10.49s/it][A
Translating to sv and back:  15%|█▍        | 6/41 [01:03<06:15, 10.72s/it][A
Translating to sv and back:  17%|█▋        | 7/41 [01:14<06:11, 10.93s/it][A
Translating to sv and back:  20%|█▉        | 8/41 [01:24<05:50, 10.62s/it][A
Translating to sv and back:  22%|██▏       | 9/41 [01:33<05:20, 10.00s/it][A
Translating to sv and back:  24%|██▍       | 10/41 [01:41<04:50,  9.36s/it][A
Translating to sv and back:  27%|██▋       | 11/41 [01:54<05:20, 10.68s/it][A
Translating to sv and back:  29%|██▉       | 12/41 [02:09<05:47, 11.9

In [5]:
ds=df.to_pandas()
ds = ds.drop(columns=["index"])
ds.reset_index(inplace=True)
if torch.cuda.is_available():
    torch.cuda.empty_cache()
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1298 entries, 0 to 1297
Data columns (total 24 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   index                               1298 non-null   int64 
 1   class                               1298 non-null   object
 2   comment_sentence                    1298 non-null   object
 3   partition                           1298 non-null   int64 
 4   combo                               1298 non-null   object
 5   labels                              1298 non-null   object
 6   retranslated_text_english_from_de   1298 non-null   object
 7   retranslated_text_english_from_fr   1298 non-null   object
 8   retranslated_text_english_from_zh   1298 non-null   object
 9   retranslated_text_english_from_hi   1298 non-null   object
 10  retranslated_text_english_from_es   1298 non-null   object
 11  retranslated_text_english_from_ru   1298 non-null   obje

In [6]:
# Define the model and tokenizer for sentence encoding
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

# List of languages for retranslation analysis

languages = ["de", "fr", "zh", "hi", "es", "ru","ar","nl","it","ca","gl","grk","trk","uk","ur","vi","roa","sv"]

# Function to encode sentences in batches for efficiency
def encode_sentences_in_batches(sentences, batch_size=32):
    """
    Encodes a list of sentences in batches using the pre-trained model.
    Args:
        sentences (list): List of sentences to encode.
        batch_size (int): Size of each batch for encoding.
    Returns:
        torch.Tensor: Tensor containing embeddings for all sentences.
    """
    embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        # Tokenize and move inputs to the appropriate device (CPU/GPU)
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
        with torch.no_grad():  # Disable gradient computation for inference
            model_output = model(**inputs)
        # Take the mean of the last hidden state to get sentence embeddings
        embeddings.append(model_output.last_hidden_state.mean(dim=1).cpu())
    return torch.cat(embeddings)

# Generate embeddings for original comments
comment_embeddings = encode_sentences_in_batches(ds['comment_sentence'].tolist())

# Set a cosine similarity threshold for filtering translations
threshold = 0.7

# Generate embeddings and filter translations for each language
language_columns = [f"retranslated_text_english_from_{lang}" for lang in languages]
for lang_col in language_columns:
    # Generate embeddings for the retranslated sentences
    retranslated_embeddings = encode_sentences_in_batches(ds[lang_col].fillna("").tolist())
    # Compute cosine similarity between original and retranslated embeddings
    cosine_similarities = torch.nn.functional.cosine_similarity(comment_embeddings, retranslated_embeddings).tolist()

    # Filter translations based on similarity threshold
    filtered_translation_col = f"filtered_{lang_col}"
    ds[filtered_translation_col] = [
        row[lang_col] if similarity > threshold else ""  # Keep translation if similarity exceeds threshold
        for (_, row), similarity in zip(ds.iterrows(), cosine_similarities)
    ]

# Prepare for concatenating original and filtered translations
main_columns = ['class', 'comment_sentence', 'labels']  # Main columns to retain in the final dataset
concat_columns = [f"filtered_{lang}" for lang in language_columns]  # Columns containing filtered translations

concat_types = ["de", "fr", "zh", "hi", "es", "ru","ar","nl","it","ca","gl","grk","trk","uk","ur","vi","roa","sv"]

# Create a new DataFrame with original data
df_main = ds[main_columns].copy()
df_main['types'] = 'original'  # Add a column to indicate original comments

# Initialize a list to hold concatenated DataFrames
concat_data = [df_main]

# Add filtered translations to the concatenated DataFrame
for col, col_type in zip(concat_columns, concat_types):
    temp_df = df_main[main_columns].copy()  # Copy main structure
    temp_df['comment_sentence'] = ds[col]  # Replace 'comment_sentence' with filtered translations
    temp_df['types'] = col_type  # Label the type of translation
    concat_data.append(temp_df)  # Append to the list

# Concatenate all DataFrames into one
df_extended = pd.concat(concat_data, ignore_index=True)

# Remove rows with empty or null 'comment_sentence' values
df_extended = df_extended.dropna()  # Drop rows with null values
df_extended = df_extended[df_extended['comment_sentence'] != ""].reset_index(drop=True)  # Drop empty strings
# Display dataset information for debugging or confirmation
df_extended.info()

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17508 entries, 0 to 17507
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   class             17508 non-null  object
 1   comment_sentence  17508 non-null  object
 2   labels            17508 non-null  object
 3   types             17508 non-null  object
dtypes: object(4)
memory usage: 547.2+ KB


In [7]:
file_path = 'NLBSE_Dataset_Pharo.csv'

# Save the DataFrame to a CSV file
df_extended.to_csv(file_path, index=False)
print(f"DataFrame saved to {file_path}")

DataFrame saved to NLBSE_Dataset_Pharo.csv
