# Run in kaggle T4 * 2 gpu

In [1]:
%pip install transformers
%pip install datasets
%pip install accelerate

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import warnings
warnings.filterwarnings("ignore")
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModel
import torch
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
from accelerate import Accelerator

In [3]:
if torch.cuda.is_available():
    # Get the number of GPUs available
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")
    
    # Print out the name of each GPU and memory details
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"Total memory: {torch.cuda.get_device_properties(i).total_memory / 1e9} GB")
    
    # If multiple GPUs are available, use DataParallel for multi-GPU
    if num_gpus > 1:
        print("Using DataParallel for multi-GPU training.")
        # Example: Wrap your model with DataParallel
    else:
        print("Only one GPU available, using single GPU mode.")
else:
    print("No GPUs available, using CPU.")
    
def setup_translation_pipeline(source_lang, target_lang):
    """
    Set up a translation pipeline using a pretrained model from Helsinki-NLP, optimized for multi-GPU with Accelerate.

    Args:
        source_lang (str): The source language code (e.g., "en" for English).
        target_lang (str): The target language code (e.g., "fr" for French).

    Returns:
        transformers.Pipeline: A translation pipeline configured for the given language pair.
    """
    # Define the model name based on the source and target languages
    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    
    # Load the tokenizer for the specified model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Load the translation model with memory optimization and half-precision support
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        return_dict=True,  # Use a dictionary for the model's output for easier access
        low_cpu_mem_usage=True,  # Optimize for low memory usage during model loading
        torch_dtype=torch.float16,  # Use half-precision for faster computation (requires supported GPU)
        trust_remote_code=True,  # Trust the model's remote configuration if specified
    )
    
    # Use the Accelerate library for better multi-GPU support
    accelerator = Accelerator()

    # Move the model to the appropriate device using Accelerator (handles multi-GPU)
    model = accelerator.prepare(model)

    # Set up and return the translation pipeline
    translation_pipeline = pipeline(
        "translation",  # Specify the task as translation
        model=model,  # Provide the loaded model
        tokenizer=tokenizer,  # Provide the loaded tokenizer
        torch_dtype=torch.float16,  # Use half-precision for the pipeline
        max_length=512,
        truncation=True,
        device=accelerator.device  # Use the device from Accelerator (handles multi-GPU)
    )

    return translation_pipeline


Number of GPUs available: 2
GPU 0: Tesla T4
Total memory: 15.828320256 GB
GPU 1: Tesla T4
Total memory: 15.828320256 GB
Using DataParallel for multi-GPU training.


In [4]:
# Function to translate and retranslate for a given target language
def translate_retranslate(ds, source_lang, target_lang, batch_size=32):
    """
    Translate the dataset's 'comment_sentence' column to a target language and back to the source language.
    
    Args:
        ds (Dataset): Hugging Face dataset containing a 'comment_sentence' column.
        source_lang (str): Source language code (e.g., "en" for English).
        target_lang (str): Target language code to translate to and back.
        batch_size (int): Number of rows to process in a single batch.

    Returns:
        Dataset: Hugging Face dataset with a new column for retranslated texts.
    """
    # Set up translation pipelines
    pipe_to_target = setup_translation_pipeline(source_lang, target_lang)  # English to target language
    pipe_to_source = setup_translation_pipeline(target_lang, source_lang)  # Target language back to English

    retranslated_texts = []

    # Process the dataset in batches to optimize memory usage and speed
    for i in tqdm(range(0, len(ds), batch_size), desc=f"Translating to {target_lang} and back"):
        # Extract a batch of 'comment_sentence' without converting to pandas
        batch = ds[i: i + batch_size]["comment_sentence"]

        # Step 1: Translate from source language to target language
        target_translations = pipe_to_target(batch, max_length=512, do_sample=False)
        target_texts = [result["translation_text"] for result in target_translations]

        # Step 2: Translate back to the source language (English)
        english_translations = pipe_to_source(target_texts, max_length=512, do_sample=False)
        retranslated_batch = [result["translation_text"].strip() for result in english_translations]

        # Append the retranslated texts to the result list
        retranslated_texts.extend(retranslated_batch)
                # Clear CUDA cache to prevent memory overload
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Add the retranslated texts as a new column in the dataset
    ds = ds.add_column(f"retranslated_text_english_from_{target_lang}", retranslated_texts)
    return ds

# Load the dataset (e.g., the first 10 rows from the Python training split)
df = load_dataset("NLBSE/nlbse25-code-comment-classification", split="python_train")

# List of target languages to process for retranslation
languages = ["de", "fr", "zh", "hi", "es", "ru","ar","nl","it","ca","gl","grk","trk","uk","ur","vi","roa","sv"]

# Loop through each language and retranslate
for lang in tqdm(languages, desc="Processing languages"):
    df = translate_retranslate(df, "en", lang)


README.md:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

java_train-00000-of-00001.parquet:   0%|          | 0.00/680k [00:00<?, ?B/s]

java_test-00000-of-00001.parquet:   0%|          | 0.00/174k [00:00<?, ?B/s]

python_train-00000-of-00001.parquet:   0%|          | 0.00/126k [00:00<?, ?B/s]

python_test-00000-of-00001.parquet:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

pharo_train-00000-of-00001.parquet:   0%|          | 0.00/113k [00:00<?, ?B/s]

pharo_test-00000-of-00001.parquet:   0%|          | 0.00/30.6k [00:00<?, ?B/s]

Generating java_train split:   0%|          | 0/7614 [00:00<?, ? examples/s]

Generating java_test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Generating python_train split:   0%|          | 0/1884 [00:00<?, ? examples/s]

Generating python_test split:   0%|          | 0/406 [00:00<?, ? examples/s]

Generating pharo_train split:   0%|          | 0/1298 [00:00<?, ? examples/s]

Generating pharo_test split:   0%|          | 0/289 [00:00<?, ? examples/s]

Processing languages:   0%|          | 0/18 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to de and back:   0%|          | 0/59 [00:00<?, ?it/s][AYour input_length: 510 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)

Translating to de and back:   2%|▏         | 1/59 [00:17<16:40, 17.26s/it][AYour input_length: 510 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)

Translating to de and back:   3%|▎         | 2/59 [00:33<15:37, 16.44s/it][A
Translating to de and back:   5%|▌         | 3/59 [00:40<11:17, 12.09s/it][A
Translating to de and back:   7%|▋         | 4/59 [00:47<09:34, 10.45s/it][A
Translating to de and back:   8%|▊         | 5/59 [00:54<08:09,  9.06s/it][A
Translating to de and back:  10%|█         | 6/59 [01:02<07:45,  8.79s/it][A
Translating to de and back:  12%|█▏        | 7/59 [01:10<07:13,  8.34s/it][A
Translating to de and back:  14%|█▎        | 8/59 [01:17<06:48,  8.02s/

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to fr and back:   0%|          | 0/59 [00:00<?, ?it/s][A
Translating to fr and back:   2%|▏         | 1/59 [00:08<07:51,  8.13s/it][A
Translating to fr and back:   3%|▎         | 2/59 [00:15<07:11,  7.57s/it][A
Translating to fr and back:   5%|▌         | 3/59 [00:25<08:08,  8.73s/it][A
Translating to fr and back:   7%|▋         | 4/59 [00:33<07:50,  8.56s/it][A
Translating to fr and back:   8%|▊         | 5/59 [00:40<07:05,  7.88s/it][A
Translating to fr and back:  10%|█         | 6/59 [00:49<07:26,  8.42s/it][A
Translating to fr and back:  12%|█▏        | 7/59 [00:57<07:06,  8.19s/it][A
Translating to fr and back:  14%|█▎        | 8/59 [01:05<06:48,  8.00s/it][A
Translating to fr and back:  15%|█▌        | 9/59 [01:12<06:24,  7.68s/it][A
Translating to fr and back:  17%|█▋        | 10/59 [01:19<06:17,  7.70s/it][A
Translating to fr and back:  19%|█▊        | 11/59 [01:27<06:04,  7.60s/it][A
Translating to fr and back:  20%|██        | 12/59 [01:33<05:43,  7.3

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/806k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to zh and back:   0%|          | 0/59 [00:00<?, ?it/s][A
Translating to zh and back:   2%|▏         | 1/59 [00:07<07:15,  7.51s/it][A
Translating to zh and back:   3%|▎         | 2/59 [00:15<07:20,  7.72s/it][A
Translating to zh and back:   5%|▌         | 3/59 [00:23<07:34,  8.11s/it][A
Translating to zh and back:   7%|▋         | 4/59 [00:32<07:31,  8.22s/it][A
Translating to zh and back:   8%|▊         | 5/59 [00:38<06:53,  7.66s/it][A
Translating to zh and back:  10%|█         | 6/59 [00:46<06:40,  7.56s/it][A
Translating to zh and back:  12%|█▏        | 7/59 [00:54<06:36,  7.62s/it][A
Translating to zh and back:  14%|█▎        | 8/59 [01:01<06:26,  7.58s/it][A
Translating to zh and back:  15%|█▌        | 9/59 [01:08<06:00,  7.22s/it][A
Translating to zh and back:  17%|█▋        | 10/59 [01:15<05:56,  7.28s/it][A
Translating to zh and back:  19%|█▊        | 11/59 [01:23<06:00,  7.52s/it][A
Translating to zh and back:  20%|██        | 12/59 [01:31<05:53,  7.5

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/304M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to hi and back:   0%|          | 0/59 [00:00<?, ?it/s][A
Translating to hi and back:   2%|▏         | 1/59 [00:07<07:31,  7.79s/it][A
Translating to hi and back:   3%|▎         | 2/59 [00:15<07:07,  7.50s/it][A
Translating to hi and back:   5%|▌         | 3/59 [00:22<06:45,  7.24s/it][A
Translating to hi and back:   7%|▋         | 4/59 [00:30<07:00,  7.64s/it][A
Translating to hi and back:   8%|▊         | 5/59 [00:36<06:29,  7.21s/it][A
Translating to hi and back:  10%|█         | 6/59 [00:44<06:32,  7.40s/it][A
Translating to hi and back:  12%|█▏        | 7/59 [00:51<06:25,  7.42s/it][A
Translating to hi and back:  14%|█▎        | 8/59 [00:59<06:19,  7.45s/it][A
Translating to hi and back:  15%|█▌        | 9/59 [01:06<06:02,  7.25s/it][A
Translating to hi and back:  17%|█▋        | 10/59 [01:13<05:55,  7.25s/it][A
Translating to hi and back:  19%|█▊        | 11/59 [01:20<05:42,  7.13s/it][A
Translating to hi and back:  20%|██        | 12/59 [01:26<05:26,  6.9

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to es and back:   0%|          | 0/59 [00:00<?, ?it/s][A
Translating to es and back:   2%|▏         | 1/59 [00:07<07:21,  7.61s/it][A
Translating to es and back:   3%|▎         | 2/59 [00:17<08:26,  8.89s/it][A
Translating to es and back:   5%|▌         | 3/59 [00:24<07:19,  7.85s/it][A
Translating to es and back:   7%|▋         | 4/59 [00:32<07:17,  7.95s/it][A
Translating to es and back:   8%|▊         | 5/59 [00:38<06:45,  7.51s/it][A
Translating to es and back:  10%|█         | 6/59 [00:47<06:52,  7.78s/it][A
Translating to es and back:  12%|█▏        | 7/59 [00:54<06:41,  7.72s/it][A
Translating to es and back:  14%|█▎        | 8/59 [01:02<06:28,  7.62s/it][A
Translating to es and back:  15%|█▌        | 9/59 [01:08<06:02,  7.25s/it][AYour input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)

Translating to es and back:  17%|█▋        | 10/59 [01:24<08:03,  9.87s

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to ru and back:   0%|          | 0/59 [00:00<?, ?it/s][A
Translating to ru and back:   2%|▏         | 1/59 [00:08<08:31,  8.83s/it][A
Translating to ru and back:   3%|▎         | 2/59 [00:16<08:00,  8.42s/it][A
Translating to ru and back:   5%|▌         | 3/59 [00:24<07:35,  8.14s/it][A
Translating to ru and back:   7%|▋         | 4/59 [00:33<07:47,  8.49s/it][A
Translating to ru and back:   8%|▊         | 5/59 [00:41<07:19,  8.13s/it][A
Translating to ru and back:  10%|█         | 6/59 [00:56<09:18, 10.54s/it][A
Translating to ru and back:  12%|█▏        | 7/59 [01:04<08:30,  9.81s/it][A
Translating to ru and back:  14%|█▎        | 8/59 [01:12<07:51,  9.25s/it][A
Translating to ru and back:  15%|█▌        | 9/59 [01:20<07:19,  8.80s/it][A
Translating to ru and back:  17%|█▋        | 10/59 [01:29<07:15,  8.88s/it][A
Translating to ru and back:  19%|█▊        | 11/59 [01:39<07:13,  9.03s/it][A
Translating to ru and back:  20%|██        | 12/59 [01:47<06:48,  8.7

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to ar and back:   0%|          | 0/59 [00:00<?, ?it/s][A
Translating to ar and back:   2%|▏         | 1/59 [00:08<08:16,  8.56s/it][AYour input_length: 512 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 512 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 512 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)

Translating to ar and back:   3%|▎         | 2/59 [00:50<26:46, 28.18s/it][A
Translating to ar and back:   5%|▌         | 3/59 [01:02<19:18, 20.69s/it][AYour input_length: 512 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)

Translating to ar and back:   7%|▋         | 4/59 [01:17<16:56, 18.48s/

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/316M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/316M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to nl and back:   0%|          | 0/59 [00:00<?, ?it/s][A
Translating to nl and back:   2%|▏         | 1/59 [00:11<11:05, 11.47s/it][A
Translating to nl and back:   3%|▎         | 2/59 [00:19<09:06,  9.59s/it][A
Translating to nl and back:   5%|▌         | 3/59 [00:27<08:04,  8.65s/it][A
Translating to nl and back:   7%|▋         | 4/59 [00:35<07:41,  8.40s/it][A
Translating to nl and back:   8%|▊         | 5/59 [00:41<06:58,  7.75s/it][A
Translating to nl and back:  10%|█         | 6/59 [00:51<07:15,  8.22s/it][A
Translating to nl and back:  12%|█▏        | 7/59 [00:59<07:08,  8.24s/it][A
Translating to nl and back:  14%|█▎        | 8/59 [01:06<06:49,  8.03s/it][A
Translating to nl and back:  15%|█▌        | 9/59 [01:15<06:52,  8.24s/it][A
Translating to nl and back:  17%|█▋        | 10/59 [01:23<06:35,  8.07s/it][A
Translating to nl and back:  19%|█▊        | 11/59 [01:31<06:23,  7.99s/it][A
Translating to nl and back:  20%|██        | 12/59 [01:38<06:03,  7.7

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/789k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/343M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.37M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/344M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to it and back:   0%|          | 0/59 [00:00<?, ?it/s][A
Translating to it and back:   2%|▏         | 1/59 [00:08<08:05,  8.36s/it][A
Translating to it and back:   3%|▎         | 2/59 [00:16<07:33,  7.95s/it][A
Translating to it and back:   5%|▌         | 3/59 [00:23<07:02,  7.54s/it][A
Translating to it and back:   7%|▋         | 4/59 [00:31<07:17,  7.96s/it][A
Translating to it and back:   8%|▊         | 5/59 [00:38<06:48,  7.57s/it][A
Translating to it and back:  10%|█         | 6/59 [00:47<06:57,  7.88s/it][A
Translating to it and back:  12%|█▏        | 7/59 [00:59<07:59,  9.22s/it][A
Translating to it and back:  14%|█▎        | 8/59 [01:06<07:27,  8.77s/it][A
Translating to it and back:  15%|█▌        | 9/59 [01:14<06:59,  8.38s/it][A
Translating to it and back:  17%|█▋        | 10/59 [01:22<06:47,  8.31s/it][A
Translating to it and back:  19%|█▊        | 11/59 [01:30<06:35,  8.24s/it][A
Translating to it and back:  20%|██        | 12/59 [01:37<06:12,  7.9

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/815k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/292M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/815k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/292M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to ca and back:   0%|          | 0/59 [00:00<?, ?it/s][A
Translating to ca and back:   2%|▏         | 1/59 [00:07<07:37,  7.88s/it][A
Translating to ca and back:   3%|▎         | 2/59 [00:15<07:15,  7.64s/it][A
Translating to ca and back:   5%|▌         | 3/59 [00:22<06:54,  7.40s/it][A
Translating to ca and back:   7%|▋         | 4/59 [00:30<06:56,  7.57s/it][A
Translating to ca and back:   8%|▊         | 5/59 [00:36<06:29,  7.21s/it][A
Translating to ca and back:  10%|█         | 6/59 [00:44<06:32,  7.40s/it][A
Translating to ca and back:  12%|█▏        | 7/59 [00:52<06:29,  7.48s/it][A
Translating to ca and back:  14%|█▎        | 8/59 [00:59<06:22,  7.49s/it][A
Translating to ca and back:  15%|█▌        | 9/59 [01:06<06:02,  7.24s/it][A
Translating to ca and back:  17%|█▋        | 10/59 [01:13<05:56,  7.28s/it][A
Translating to ca and back:  19%|█▊        | 11/59 [01:21<05:51,  7.32s/it][A
Translating to ca and back:  20%|██        | 12/59 [01:27<05:32,  7.0

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/800k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/820k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/293M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/450k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/443k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/472k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/222M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to gl and back:   0%|          | 0/59 [00:00<?, ?it/s][A
Translating to gl and back:   2%|▏         | 1/59 [00:07<06:53,  7.13s/it][AYour input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)

Translating to gl and back:   3%|▎         | 2/59 [00:21<10:45, 11.32s/it][A
Translating to gl and back:   5%|▌         | 3/59 [00:27<08:28,  9.07s/it][A
Translating to gl and back:   7%|▋         | 4/59 [00:34<07:26,  8.11s/it][A
Translating to gl and back:   8%|▊         | 5/59 [00:40<06:40,  7.42s/it][A
Translating to gl and back:  10%|█         | 6/59 [00:48<06:40,  7.56s/it][A
Translating to gl and back:  12%|█▏        | 7/59 [00:55<06:19,  7.30s/it][A
Translating to gl and back:  14%|█▎        | 8/59 [01:02<06:05,  7.16s/it][A
Translating to gl and back:  15%|█▌        | 9/59 [01:08<05:48,  6.97s/it][A
Translating to gl and back:  17%|█▋        | 10/59 [01:15<05:37,  6.89s

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/441k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/507k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/802k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/225M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/507k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/441k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/802k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/225M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to grk and back:   0%|          | 0/59 [00:00<?, ?it/s][AYour input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)

Translating to grk and back:   2%|▏         | 1/59 [00:23<22:15, 23.03s/it][AYour input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)

Translating to grk and back:   3%|▎         | 2/59 [00:44<21:03, 22.16s/it][AYour input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_len

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/804k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/908k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.97M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/305M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/922k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/799k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/305M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to trk and back:   0%|          | 0/59 [00:00<?, ?it/s][A
Translating to trk and back:   2%|▏         | 1/59 [00:10<09:52, 10.22s/it][A
Translating to trk and back:   3%|▎         | 2/59 [00:24<11:53, 12.51s/it][A
Translating to trk and back:   5%|▌         | 3/59 [00:33<10:17, 11.03s/it][A
Translating to trk and back:   7%|▋         | 4/59 [00:43<09:44, 10.64s/it][A
Translating to trk and back:   8%|▊         | 5/59 [00:54<09:43, 10.81s/it][A
Translating to trk and back:  10%|█         | 6/59 [01:08<10:17, 11.65s/it][A
Translating to trk and back:  12%|█▏        | 7/59 [01:22<11:00, 12.71s/it][A
Translating to trk and back:  14%|█▎        | 8/59 [01:36<11:09, 13.12s/it][A
Translating to trk and back:  15%|█▌        | 9/59 [01:45<09:45, 11.71s/it][A
Translating to trk and back:  17%|█▋        | 10/59 [01:55<09:07, 11.18s/it][A
Translating to trk and back:  19%|█▊        | 11/59 [02:04<08:29, 10.61s/it][A
Translating to trk and back:  20%|██        | 12/59 [02:1

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.37M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/305M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.37M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/305M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to uk and back:   0%|          | 0/59 [00:00<?, ?it/s][A
Translating to uk and back:   2%|▏         | 1/59 [00:07<07:36,  7.88s/it][A
Translating to uk and back:   3%|▎         | 2/59 [00:15<07:10,  7.55s/it][A
Translating to uk and back:   5%|▌         | 3/59 [00:23<07:22,  7.90s/it][A
Translating to uk and back:   7%|▋         | 4/59 [00:31<07:10,  7.82s/it][A
Translating to uk and back:   8%|▊         | 5/59 [00:38<06:45,  7.51s/it][A
Translating to uk and back:  10%|█         | 6/59 [00:46<06:50,  7.75s/it][A
Translating to uk and back:  12%|█▏        | 7/59 [00:54<06:44,  7.78s/it][A
Translating to uk and back:  14%|█▎        | 8/59 [01:02<06:37,  7.79s/it][A
Translating to uk and back:  15%|█▌        | 9/59 [01:09<06:16,  7.54s/it][A
Translating to uk and back:  17%|█▋        | 10/59 [01:16<06:12,  7.60s/it][A
Translating to uk and back:  19%|█▊        | 11/59 [01:24<06:01,  7.54s/it][A
Translating to uk and back:  20%|██        | 12/59 [01:30<05:43,  7.3

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/816k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/848k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/848k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/816k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to ur and back:   0%|          | 0/59 [00:00<?, ?it/s][A
Translating to ur and back:   2%|▏         | 1/59 [00:10<09:43, 10.06s/it][A
Translating to ur and back:   3%|▎         | 2/59 [00:18<08:50,  9.30s/it][A
Translating to ur and back:   5%|▌         | 3/59 [00:26<07:54,  8.47s/it][A
Translating to ur and back:   7%|▋         | 4/59 [00:34<07:42,  8.40s/it][A
Translating to ur and back:   8%|▊         | 5/59 [00:43<07:40,  8.52s/it][A
Translating to ur and back:  10%|█         | 6/59 [00:51<07:23,  8.37s/it][A
Translating to ur and back:  12%|█▏        | 7/59 [00:59<07:14,  8.35s/it][A
Translating to ur and back:  14%|█▎        | 8/59 [01:08<07:19,  8.61s/it][A
Translating to ur and back:  15%|█▌        | 9/59 [01:16<06:52,  8.25s/it][A
Translating to ur and back:  17%|█▋        | 10/59 [01:24<06:40,  8.17s/it][A
Translating to ur and back:  19%|█▊        | 11/59 [01:32<06:27,  8.08s/it][A
Translating to ur and back:  20%|██        | 12/59 [01:40<06:15,  8.0

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/756k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/289M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/756k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/289M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to vi and back:   0%|          | 0/59 [00:00<?, ?it/s][AYour input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)

Translating to vi and back:   2%|▏         | 1/59 [00:21<20:56, 21.67s/it][AYour input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 491 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)

Translating to vi and back:   3%|▎         | 2/59 [00:58<29:19, 30.87s/

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/786k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/793k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/295M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/800k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/779k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to roa and back:   0%|          | 0/59 [00:00<?, ?it/s][A
Translating to roa and back:   2%|▏         | 1/59 [00:12<12:14, 12.66s/it][A
Translating to roa and back:   3%|▎         | 2/59 [00:24<11:32, 12.14s/it][A
Translating to roa and back:   5%|▌         | 3/59 [00:31<09:21, 10.03s/it][A
Translating to roa and back:   7%|▋         | 4/59 [00:40<08:39,  9.44s/it][A
Translating to roa and back:   8%|▊         | 5/59 [00:48<07:59,  8.88s/it][A
Translating to roa and back:  10%|█         | 6/59 [00:57<07:59,  9.04s/it][A
Translating to roa and back:  12%|█▏        | 7/59 [01:05<07:33,  8.72s/it][A
Translating to roa and back:  14%|█▎        | 8/59 [01:14<07:18,  8.60s/it][A
Translating to roa and back:  15%|█▌        | 9/59 [01:21<06:52,  8.25s/it][A
Translating to roa and back:  17%|█▋        | 10/59 [01:30<06:51,  8.39s/it][A
Translating to roa and back:  19%|█▊        | 11/59 [01:38<06:40,  8.35s/it][A
Translating to roa and back:  20%|██        | 12/59 [01:4

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/815k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/295M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/815k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/295M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to sv and back:   0%|          | 0/59 [00:00<?, ?it/s][A
Translating to sv and back:   2%|▏         | 1/59 [00:08<08:25,  8.72s/it][A
Translating to sv and back:   3%|▎         | 2/59 [00:15<07:28,  7.87s/it][A
Translating to sv and back:   5%|▌         | 3/59 [00:22<06:48,  7.29s/it][A
Translating to sv and back:   7%|▋         | 4/59 [00:30<06:46,  7.40s/it][A
Translating to sv and back:   8%|▊         | 5/59 [00:36<06:25,  7.14s/it][A
Translating to sv and back:  10%|█         | 6/59 [00:44<06:35,  7.47s/it][A
Translating to sv and back:  12%|█▏        | 7/59 [00:52<06:30,  7.50s/it][A
Translating to sv and back:  14%|█▎        | 8/59 [00:59<06:16,  7.37s/it][A
Translating to sv and back:  15%|█▌        | 9/59 [01:06<06:01,  7.24s/it][A
Translating to sv and back:  17%|█▋        | 10/59 [01:13<05:53,  7.22s/it][A
Translating to sv and back:  19%|█▊        | 11/59 [01:20<05:45,  7.20s/it][A
Translating to sv and back:  20%|██        | 12/59 [01:27<05:25,  6.9

In [5]:
ds=df.to_pandas()
ds = ds.drop(columns=["index"])
ds.reset_index(inplace=True)
if torch.cuda.is_available():
    torch.cuda.empty_cache()
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1884 entries, 0 to 1883
Data columns (total 24 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   index                               1884 non-null   int64 
 1   class                               1884 non-null   object
 2   comment_sentence                    1884 non-null   object
 3   partition                           1884 non-null   int64 
 4   combo                               1884 non-null   object
 5   labels                              1884 non-null   object
 6   retranslated_text_english_from_de   1884 non-null   object
 7   retranslated_text_english_from_fr   1884 non-null   object
 8   retranslated_text_english_from_zh   1884 non-null   object
 9   retranslated_text_english_from_hi   1884 non-null   object
 10  retranslated_text_english_from_es   1884 non-null   object
 11  retranslated_text_english_from_ru   1884 non-null   obje

In [6]:
# Define the model and tokenizer for sentence encoding
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

# List of languages for retranslation analysis

languages = ["de", "fr", "zh", "hi", "es", "ru","ar","nl","it","ca","gl","grk","trk","uk","ur","vi","roa","sv"]

# Function to encode sentences in batches for efficiency
def encode_sentences_in_batches(sentences, batch_size=32):
    """
    Encodes a list of sentences in batches using the pre-trained model.
    Args:
        sentences (list): List of sentences to encode.
        batch_size (int): Size of each batch for encoding.
    Returns:
        torch.Tensor: Tensor containing embeddings for all sentences.
    """
    embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        # Tokenize and move inputs to the appropriate device (CPU/GPU)
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
        with torch.no_grad():  # Disable gradient computation for inference
            model_output = model(**inputs)
        # Take the mean of the last hidden state to get sentence embeddings
        embeddings.append(model_output.last_hidden_state.mean(dim=1).cpu())
    return torch.cat(embeddings)

# Generate embeddings for original comments
comment_embeddings = encode_sentences_in_batches(ds['comment_sentence'].tolist())

# Set a cosine similarity threshold for filtering translations
threshold = 0.7

# Generate embeddings and filter translations for each language
language_columns = [f"retranslated_text_english_from_{lang}" for lang in languages]
for lang_col in language_columns:
    # Generate embeddings for the retranslated sentences
    retranslated_embeddings = encode_sentences_in_batches(ds[lang_col].fillna("").tolist())
    # Compute cosine similarity between original and retranslated embeddings
    cosine_similarities = torch.nn.functional.cosine_similarity(comment_embeddings, retranslated_embeddings).tolist()

    # Filter translations based on similarity threshold
    filtered_translation_col = f"filtered_{lang_col}"
    ds[filtered_translation_col] = [
        row[lang_col] if similarity > threshold else ""  # Keep translation if similarity exceeds threshold
        for (_, row), similarity in zip(ds.iterrows(), cosine_similarities)
    ]

# Prepare for concatenating original and filtered translations
main_columns = ['class', 'comment_sentence', 'labels']  # Main columns to retain in the final dataset
concat_columns = [f"filtered_{lang}" for lang in language_columns]  # Columns containing filtered translations

concat_types = ["de", "fr", "zh", "hi", "es", "ru","ar","nl","it","ca","gl","grk","trk","uk","ur","vi","roa","sv"]

# Create a new DataFrame with original data
df_main = ds[main_columns].copy()
df_main['types'] = 'original'  # Add a column to indicate original comments

# Initialize a list to hold concatenated DataFrames
concat_data = [df_main]

# Add filtered translations to the concatenated DataFrame
for col, col_type in zip(concat_columns, concat_types):
    temp_df = df_main[main_columns].copy()  # Copy main structure
    temp_df['comment_sentence'] = ds[col]  # Replace 'comment_sentence' with filtered translations
    temp_df['types'] = col_type  # Label the type of translation
    concat_data.append(temp_df)  # Append to the list

# Concatenate all DataFrames into one
df_extended = pd.concat(concat_data, ignore_index=True)

# Remove rows with empty or null 'comment_sentence' values
df_extended = df_extended.dropna()  # Drop rows with null values
df_extended = df_extended[df_extended['comment_sentence'] != ""].reset_index(drop=True)  # Drop empty strings
# Display dataset information for debugging or confirmation
df_extended.info()

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22624 entries, 0 to 22623
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   class             22624 non-null  object
 1   comment_sentence  22624 non-null  object
 2   labels            22624 non-null  object
 3   types             22624 non-null  object
dtypes: object(4)
memory usage: 707.1+ KB


In [7]:
file_path = 'NLBSE_Dataset_Python.csv'

# Save the DataFrame to a CSV file
df_extended.to_csv(file_path, index=False)
print(f"DataFrame saved to {file_path}")

DataFrame saved to NLBSE_Dataset_Python.csv
