# Run in kaggle T4 * 2 gpu

In [1]:
%pip install transformers
%pip install datasets
%pip install accelerate

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import warnings
warnings.filterwarnings("ignore")
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModel
import torch
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
from accelerate import Accelerator

In [3]:
if torch.cuda.is_available():
    # Get the number of GPUs available
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")
    
    # Print out the name of each GPU and memory details
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"Total memory: {torch.cuda.get_device_properties(i).total_memory / 1e9} GB")
    
    # If multiple GPUs are available, use DataParallel for multi-GPU
    if num_gpus > 1:
        print("Using DataParallel for multi-GPU training.")
        # Example: Wrap your model with DataParallel
    else:
        print("Only one GPU available, using single GPU mode.")
else:
    print("No GPUs available, using CPU.")
    
def setup_translation_pipeline(source_lang, target_lang):
    """
    Set up a translation pipeline using a pretrained model from Helsinki-NLP, optimized for multi-GPU with Accelerate.

    Args:
        source_lang (str): The source language code (e.g., "en" for English).
        target_lang (str): The target language code (e.g., "fr" for French).

    Returns:
        transformers.Pipeline: A translation pipeline configured for the given language pair.
    """
    # Define the model name based on the source and target languages
    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    
    # Load the tokenizer for the specified model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Load the translation model with memory optimization and half-precision support
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        return_dict=True,  # Use a dictionary for the model's output for easier access
        low_cpu_mem_usage=True,  # Optimize for low memory usage during model loading
        torch_dtype=torch.float16,  # Use half-precision for faster computation (requires supported GPU)
        trust_remote_code=True,  # Trust the model's remote configuration if specified
    )
    
    # Use the Accelerate library for better multi-GPU support
    accelerator = Accelerator()

    # Move the model to the appropriate device using Accelerator (handles multi-GPU)
    model = accelerator.prepare(model)

    # Set up and return the translation pipeline
    translation_pipeline = pipeline(
        "translation",  # Specify the task as translation
        model=model,  # Provide the loaded model
        tokenizer=tokenizer,  # Provide the loaded tokenizer
        torch_dtype=torch.float16,  # Use half-precision for the pipeline
        max_length=512,
        truncation=True,
        device=accelerator.device  # Use the device from Accelerator (handles multi-GPU)
    )

    return translation_pipeline


Number of GPUs available: 2
GPU 0: Tesla T4
Total memory: 15.828320256 GB
GPU 1: Tesla T4
Total memory: 15.828320256 GB
Using DataParallel for multi-GPU training.


In [4]:
# Function to translate and retranslate for a given target language
def translate_retranslate(ds, source_lang, target_lang, batch_size=32):
    """
    Translate the dataset's 'comment_sentence' column to a target language and back to the source language.
    
    Args:
        ds (Dataset): Hugging Face dataset containing a 'comment_sentence' column.
        source_lang (str): Source language code (e.g., "en" for English).
        target_lang (str): Target language code to translate to and back.
        batch_size (int): Number of rows to process in a single batch.

    Returns:
        Dataset: Hugging Face dataset with a new column for retranslated texts.
    """
    # Set up translation pipelines
    pipe_to_target = setup_translation_pipeline(source_lang, target_lang)  # English to target language
    pipe_to_source = setup_translation_pipeline(target_lang, source_lang)  # Target language back to English

    retranslated_texts = []

    # Process the dataset in batches to optimize memory usage and speed
    for i in tqdm(range(0, len(ds), batch_size), desc=f"Translating to {target_lang} and back"):
        # Extract a batch of 'comment_sentence' without converting to pandas
        batch = ds[i: i + batch_size]["comment_sentence"]

        # Step 1: Translate from source language to target language
        target_translations = pipe_to_target(batch, max_length=512, do_sample=False)
        target_texts = [result["translation_text"] for result in target_translations]

        # Step 2: Translate back to the source language (English)
        english_translations = pipe_to_source(target_texts, max_length=512, do_sample=False)
        retranslated_batch = [result["translation_text"].strip() for result in english_translations]

        # Append the retranslated texts to the result list
        retranslated_texts.extend(retranslated_batch)
                # Clear CUDA cache to prevent memory overload
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Add the retranslated texts as a new column in the dataset
    ds = ds.add_column(f"retranslated_text_english_from_{target_lang}", retranslated_texts)
    return ds

# Load the dataset (e.g., the first 10 rows from the Python training split)
df = load_dataset("NLBSE/nlbse25-code-comment-classification", split="java_train")

# List of target languages to process for retranslation
languages = ["de", "fr", "zh", "hi", "es", "ru","ar","nl","it","ca"]



README.md:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

java_train-00000-of-00001.parquet:   0%|          | 0.00/680k [00:00<?, ?B/s]

java_test-00000-of-00001.parquet:   0%|          | 0.00/174k [00:00<?, ?B/s]

python_train-00000-of-00001.parquet:   0%|          | 0.00/126k [00:00<?, ?B/s]

python_test-00000-of-00001.parquet:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

pharo_train-00000-of-00001.parquet:   0%|          | 0.00/113k [00:00<?, ?B/s]

pharo_test-00000-of-00001.parquet:   0%|          | 0.00/30.6k [00:00<?, ?B/s]

Generating java_train split:   0%|          | 0/7614 [00:00<?, ? examples/s]

Generating java_test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Generating python_train split:   0%|          | 0/1884 [00:00<?, ? examples/s]

Generating python_test split:   0%|          | 0/406 [00:00<?, ? examples/s]

Generating pharo_train split:   0%|          | 0/1298 [00:00<?, ? examples/s]

Generating pharo_test split:   0%|          | 0/289 [00:00<?, ? examples/s]

In [5]:
ds=df.to_pandas()
from datasets import Dataset
filter_ds = ds[~ds['labels'].apply(lambda x: str(x).startswith("[1 0 0 0 0 0 0]"))]
unfilter_ds =  ds[ds['labels'].apply(lambda x: str(x).startswith("[1 0 0 0 0 0 0]"))]
filter_df = Dataset.from_pandas(filter_ds)
filter_ds.info()


<class 'pandas.core.frame.DataFrame'>
Index: 4064 entries, 1 to 7605
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   index             4064 non-null   int64 
 1   class             4064 non-null   object
 2   comment_sentence  4064 non-null   object
 3   partition         4064 non-null   int64 
 4   combo             4064 non-null   object
 5   labels            4064 non-null   object
dtypes: int64(2), object(4)
memory usage: 222.2+ KB


In [6]:
for lang in tqdm(languages, desc="Processing languages"):
    filter_df = translate_retranslate(filter_df, "en", lang)

Processing languages:   0%|          | 0/10 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to de and back:   0%|          | 0/127 [00:00<?, ?it/s][A
Translating to de and back:   1%|          | 1/127 [00:10<23:01, 10.96s/it][A
Translating to de and back:   2%|▏         | 2/127 [00:19<20:12,  9.70s/it][A
Translating to de and back:   2%|▏         | 3/127 [00:28<18:52,  9.13s/it][A
Translating to de and back:   3%|▎         | 4/127 [00:37<18:58,  9.25s/it][AYour input_length: 510 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 510 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 510 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)

Translating to de and back:   4%|▍         | 5/127 [01:09<35:43, 17.57s/it][A
Translating to de and back:   5%|▍         | 6/127 [01:17<28:26, 

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to fr and back:   0%|          | 0/127 [00:00<?, ?it/s][A
Translating to fr and back:   1%|          | 1/127 [00:09<20:52,  9.94s/it][A
Translating to fr and back:   2%|▏         | 2/127 [00:18<19:06,  9.17s/it][A
Translating to fr and back:   2%|▏         | 3/127 [00:26<18:11,  8.80s/it][A
Translating to fr and back:   3%|▎         | 4/127 [00:35<18:03,  8.81s/it][A
Translating to fr and back:   4%|▍         | 5/127 [00:43<16:58,  8.35s/it][A
Translating to fr and back:   5%|▍         | 6/127 [00:50<16:02,  7.96s/it][A
Translating to fr and back:   6%|▌         | 7/127 [00:59<16:52,  8.44s/it][A
Translating to fr and back:   6%|▋         | 8/127 [01:06<15:48,  7.97s/it][A
Translating to fr and back:   7%|▋         | 9/127 [01:15<16:14,  8.26s/it][A
Translating to fr and back:   8%|▊         | 10/127 [01:25<16:59,  8.71s/it][A
Translating to fr and back:   9%|▊         | 11/127 [01:33<16:20,  8.45s/it][A
Translating to fr and back:   9%|▉         | 12/127 [01:4

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/806k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to zh and back:   0%|          | 0/127 [00:00<?, ?it/s][A
Translating to zh and back:   1%|          | 1/127 [00:18<38:26, 18.31s/it][A
Translating to zh and back:   2%|▏         | 2/127 [00:28<27:55, 13.40s/it][A
Translating to zh and back:   2%|▏         | 3/127 [00:36<22:51, 11.06s/it][A
Translating to zh and back:   3%|▎         | 4/127 [00:47<22:36, 11.03s/it][AYour input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)

Translating to zh and back:   4%|▍         | 5/127 [01:03<25:59, 12.79s/it][A
Translating to zh and back:   5%|▍         | 6/127 [01:11<22:25, 11.12s/it][A
Translating to zh and back:   6%|▌         | 7/127 [01:21<21:30, 10.75s/it][A
Translating to zh and back:   6%|▋         | 8/127 [01:30<20:34, 10.38s/it][A
Translating to zh and back:   7%|▋         | 9/127 [01:40<19:45, 10.05s/it][A
Translating to zh and back:   8%|▊         | 10/127 [01:52<21

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/304M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to hi and back:   0%|          | 0/127 [00:00<?, ?it/s][A
Translating to hi and back:   1%|          | 1/127 [00:08<18:33,  8.83s/it][A
Translating to hi and back:   2%|▏         | 2/127 [00:16<17:30,  8.40s/it][A
Translating to hi and back:   2%|▏         | 3/127 [00:24<16:43,  8.09s/it][A
Translating to hi and back:   3%|▎         | 4/127 [00:33<16:53,  8.24s/it][A
Translating to hi and back:   4%|▍         | 5/127 [00:39<15:27,  7.60s/it][A
Translating to hi and back:   5%|▍         | 6/127 [00:46<14:42,  7.29s/it][A
Translating to hi and back:   6%|▌         | 7/127 [00:55<15:36,  7.80s/it][A
Translating to hi and back:   6%|▋         | 8/127 [01:01<14:50,  7.48s/it][A
Translating to hi and back:   7%|▋         | 9/127 [01:10<15:32,  7.91s/it][A
Translating to hi and back:   8%|▊         | 10/127 [01:20<16:13,  8.32s/it][A
Translating to hi and back:   9%|▊         | 11/127 [01:28<16:03,  8.30s/it][A
Translating to hi and back:   9%|▉         | 12/127 [01:3

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to es and back:   0%|          | 0/127 [00:00<?, ?it/s][A
Translating to es and back:   1%|          | 1/127 [00:09<19:48,  9.43s/it][A
Translating to es and back:   2%|▏         | 2/127 [00:18<19:00,  9.13s/it][A
Translating to es and back:   2%|▏         | 3/127 [00:26<17:58,  8.69s/it][A
Translating to es and back:   3%|▎         | 4/127 [00:35<18:10,  8.86s/it][A
Translating to es and back:   4%|▍         | 5/127 [00:45<18:44,  9.21s/it][A
Translating to es and back:   5%|▍         | 6/127 [00:52<17:21,  8.60s/it][A
Translating to es and back:   6%|▌         | 7/127 [01:06<20:27, 10.23s/it][A
Translating to es and back:   6%|▋         | 8/127 [01:13<18:27,  9.30s/it][A
Translating to es and back:   7%|▋         | 9/127 [01:22<18:11,  9.25s/it][A
Translating to es and back:   8%|▊         | 10/127 [01:33<18:39,  9.57s/it][A
Translating to es and back:   9%|▊         | 11/127 [01:41<17:42,  9.16s/it][A
Translating to es and back:   9%|▉         | 12/127 [01:5

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to ru and back:   0%|          | 0/127 [00:00<?, ?it/s][A
Translating to ru and back:   1%|          | 1/127 [00:09<20:49,  9.92s/it][A
Translating to ru and back:   2%|▏         | 2/127 [00:19<20:10,  9.69s/it][A
Translating to ru and back:   2%|▏         | 3/127 [00:28<19:41,  9.53s/it][A
Translating to ru and back:   3%|▎         | 4/127 [00:37<19:02,  9.29s/it][A
Translating to ru and back:   4%|▍         | 5/127 [00:45<17:47,  8.75s/it][A
Translating to ru and back:   5%|▍         | 6/127 [00:53<16:54,  8.38s/it][A
Translating to ru and back:   6%|▌         | 7/127 [01:08<21:01, 10.51s/it][A
Translating to ru and back:   6%|▋         | 8/127 [01:15<19:12,  9.69s/it][A
Translating to ru and back:   7%|▋         | 9/127 [01:25<19:10,  9.75s/it][A
Translating to ru and back:   8%|▊         | 10/127 [01:44<24:17, 12.46s/it][A
Translating to ru and back:   9%|▊         | 11/127 [01:52<21:43, 11.23s/it][A
Translating to ru and back:   9%|▉         | 12/127 [02:1

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to ar and back:   0%|          | 0/127 [00:00<?, ?it/s][A
Translating to ar and back:   1%|          | 1/127 [00:15<32:27, 15.46s/it][A
Translating to ar and back:   2%|▏         | 2/127 [00:25<25:42, 12.34s/it][A
Translating to ar and back:   2%|▏         | 3/127 [00:34<21:56, 10.61s/it][A
Translating to ar and back:   3%|▎         | 4/127 [00:55<30:38, 14.94s/it][A
Translating to ar and back:   4%|▍         | 5/127 [01:05<26:19, 12.95s/it][A
Translating to ar and back:   5%|▍         | 6/127 [01:13<22:39, 11.24s/it][A
Translating to ar and back:   6%|▌         | 7/127 [01:23<21:39, 10.83s/it][A
Translating to ar and back:   6%|▋         | 8/127 [01:31<19:49,  9.99s/it][A
Translating to ar and back:   7%|▋         | 9/127 [01:41<20:05, 10.22s/it][A
Translating to ar and back:   8%|▊         | 10/127 [01:56<22:37, 11.60s/it][A
Translating to ar and back:   9%|▊         | 11/127 [02:10<24:00, 12.42s/it][A
Translating to ar and back:   9%|▉         | 12/127 [02:2

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/316M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/316M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to nl and back:   0%|          | 0/127 [00:00<?, ?it/s][A
Translating to nl and back:   1%|          | 1/127 [00:09<19:34,  9.32s/it][A
Translating to nl and back:   2%|▏         | 2/127 [00:18<18:59,  9.12s/it][A
Translating to nl and back:   2%|▏         | 3/127 [00:30<21:50, 10.57s/it][A
Translating to nl and back:   3%|▎         | 4/127 [00:39<20:28,  9.99s/it][A
Translating to nl and back:   4%|▍         | 5/127 [00:54<23:46, 11.69s/it][A
Translating to nl and back:   5%|▍         | 6/127 [01:04<22:15, 11.04s/it][A
Translating to nl and back:   6%|▌         | 7/127 [01:13<20:49, 10.41s/it][A
Translating to nl and back:   6%|▋         | 8/127 [01:20<18:25,  9.29s/it][A
Translating to nl and back:   7%|▋         | 9/127 [01:28<17:57,  9.13s/it][A
Translating to nl and back:   8%|▊         | 10/127 [01:43<20:48, 10.67s/it][A
Translating to nl and back:   9%|▊         | 11/127 [02:02<26:00, 13.45s/it][A
Translating to nl and back:   9%|▉         | 12/127 [02:1

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/789k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/343M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.37M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/344M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to it and back:   0%|          | 0/127 [00:00<?, ?it/s][A
Translating to it and back:   1%|          | 1/127 [00:09<20:29,  9.76s/it][A
Translating to it and back:   2%|▏         | 2/127 [00:19<19:48,  9.51s/it][A
Translating to it and back:   2%|▏         | 3/127 [00:28<19:12,  9.29s/it][A
Translating to it and back:   3%|▎         | 4/127 [00:37<19:21,  9.45s/it][A
Translating to it and back:   4%|▍         | 5/127 [00:51<22:25, 11.03s/it][A
Translating to it and back:   5%|▍         | 6/127 [00:59<20:14, 10.03s/it][A
Translating to it and back:   6%|▌         | 7/127 [01:09<20:11, 10.09s/it][A
Translating to it and back:   6%|▋         | 8/127 [01:17<18:19,  9.24s/it][A
Translating to it and back:   7%|▋         | 9/127 [01:31<20:53, 10.62s/it][A
Translating to it and back:   8%|▊         | 10/127 [01:42<21:12, 10.87s/it][A
Translating to it and back:   9%|▊         | 11/127 [01:51<19:41, 10.19s/it][A
Translating to it and back:   9%|▉         | 12/127 [02:0

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/815k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/292M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/815k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/292M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to ca and back:   0%|          | 0/127 [00:00<?, ?it/s][A
Translating to ca and back:   1%|          | 1/127 [00:08<17:29,  8.33s/it][A
Translating to ca and back:   2%|▏         | 2/127 [00:16<17:30,  8.40s/it][A
Translating to ca and back:   2%|▏         | 3/127 [00:24<17:04,  8.26s/it][A
Translating to ca and back:   3%|▎         | 4/127 [00:34<17:42,  8.64s/it][A
Translating to ca and back:   4%|▍         | 5/127 [00:41<16:27,  8.10s/it][A
Translating to ca and back:   5%|▍         | 6/127 [00:47<15:23,  7.63s/it][A
Translating to ca and back:   6%|▌         | 7/127 [00:57<16:18,  8.16s/it][A
Translating to ca and back:   6%|▋         | 8/127 [01:04<15:40,  7.90s/it][A
Translating to ca and back:   7%|▋         | 9/127 [01:13<16:04,  8.18s/it][A
Translating to ca and back:   8%|▊         | 10/127 [01:22<16:40,  8.55s/it][A
Translating to ca and back:   9%|▊         | 11/127 [01:30<16:06,  8.33s/it][A
Translating to ca and back:   9%|▉         | 12/127 [01:3

In [7]:
ds=filter_df.to_pandas()
ds.drop(columns=["index","__index_level_0__"],inplace=True)
ds.info()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
ds.sample(1)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4064 entries, 0 to 4063
Data columns (total 15 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   class                              4064 non-null   object
 1   comment_sentence                   4064 non-null   object
 2   partition                          4064 non-null   int64 
 3   combo                              4064 non-null   object
 4   labels                             4064 non-null   object
 5   retranslated_text_english_from_de  4064 non-null   object
 6   retranslated_text_english_from_fr  4064 non-null   object
 7   retranslated_text_english_from_zh  4064 non-null   object
 8   retranslated_text_english_from_hi  4064 non-null   object
 9   retranslated_text_english_from_es  4064 non-null   object
 10  retranslated_text_english_from_ru  4064 non-null   object
 11  retranslated_text_english_from_ar  4064 non-null   object
 12  retran

Unnamed: 0,class,comment_sentence,partition,combo,labels,retranslated_text_english_from_de,retranslated_text_english_from_fr,retranslated_text_english_from_zh,retranslated_text_english_from_hi,retranslated_text_english_from_es,retranslated_text_english_from_ru,retranslated_text_english_from_ar,retranslated_text_english_from_nl,retranslated_text_english_from_it,retranslated_text_english_from_ca
1121,VScrollTable.java,we have rows rendered and,0,we have rows rendered and | VScrollTable.java,"[0, 0, 1, 0, 0, 0, 0]",we have rendered rows and,we have rendered lines and,We're on our own.,And placed a blazing lamp?,We have rendered rows and,"We have set up rows,",We have classes like this and...,We've shown rows and,we have rows yields and,We have colored rows and processed rows


In [8]:
# Define the model and tokenizer for sentence encoding
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

# List of languages for retranslation analysis

languages = ["de", "fr", "zh", "hi", "es", "ru","ar","nl","it","ca"]

# Function to encode sentences in batches for efficiency
def encode_sentences_in_batches(sentences, batch_size=32):
    """
    Encodes a list of sentences in batches using the pre-trained model.
    Args:
        sentences (list): List of sentences to encode.
        batch_size (int): Size of each batch for encoding.
    Returns:
        torch.Tensor: Tensor containing embeddings for all sentences.
    """
    embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        # Tokenize and move inputs to the appropriate device (CPU/GPU)
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
        with torch.no_grad():  # Disable gradient computation for inference
            model_output = model(**inputs)
        # Take the mean of the last hidden state to get sentence embeddings
        embeddings.append(model_output.last_hidden_state.mean(dim=1).cpu())
    return torch.cat(embeddings)

# Generate embeddings for original comments
comment_embeddings = encode_sentences_in_batches(ds['comment_sentence'].tolist())

# Set a cosine similarity threshold for filtering translations
threshold = 0.7

# Generate embeddings and filter translations for each language
language_columns = [f"retranslated_text_english_from_{lang}" for lang in languages]
for lang_col in language_columns:
    # Generate embeddings for the retranslated sentences
    retranslated_embeddings = encode_sentences_in_batches(ds[lang_col].fillna("").tolist())
    # Compute cosine similarity between original and retranslated embeddings
    cosine_similarities = torch.nn.functional.cosine_similarity(comment_embeddings, retranslated_embeddings).tolist()
    # Filter translations based on similarity threshold
    filtered_translation_col = f"filtered_{lang_col}"
    ds[filtered_translation_col] = [
        row[lang_col] if similarity > threshold else ""  # Keep translation if similarity exceeds threshold
        for (_, row), similarity in zip(ds.iterrows(), cosine_similarities)
    ]
    
# Prepare for concatenating original and filtered translations
main_columns = ['class', 'comment_sentence', 'labels']  # Main columns to retain in the final dataset
concat_columns = [f"filtered_{lang}" for lang in language_columns]  # Columns containing filtered translations

concat_types = ["de", "fr", "zh", "hi", "es", "ru","ar","nl","it","ca"]

# Create a new DataFrame with original data
df_main = ds[main_columns].copy()
df_main['types'] = 'original'  # Add a column to indicate original comments

# Initialize a list to hold concatenated DataFrames
concat_data = [df_main]

# Add filtered translations to the concatenated DataFrame
for col, col_type in zip(concat_columns, concat_types):
    temp_df = df_main[main_columns].copy()  # Copy main structure
    temp_df['comment_sentence'] = ds[col]  # Replace 'comment_sentence' with filtered translations
    temp_df['types'] = col_type  # Label the type of translation
    concat_data.append(temp_df)  # Append to the list

# Concatenate all DataFrames into one
df_extended = pd.concat(concat_data, ignore_index=True)

#Remove rows with empty or null 'comment_sentence' values
df_extended = df_extended.dropna()  # Drop rows with null values
df_extended = df_extended[df_extended['comment_sentence'] != ""].reset_index(drop=True)  # Drop empty strings
# Display dataset information for debugging or confirmation
df_extended.info()
unfilter_ds["types"]= 'original'
merged_df = pd.concat([unfilter_ds[['class', 'comment_sentence', 'labels', 'types']], 
                       df_extended[['class', 'comment_sentence', 'labels', 'types']]], 
                      ignore_index=True)
merged_df.info()

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37285 entries, 0 to 37284
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   class             37285 non-null  object
 1   comment_sentence  37285 non-null  object
 2   labels            37285 non-null  object
 3   types             37285 non-null  object
dtypes: object(4)
memory usage: 1.1+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40835 entries, 0 to 40834
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   class             40835 non-null  object
 1   comment_sentence  40835 non-null  object
 2   labels            40835 non-null  object
 3   types             40835 non-null  object
dtypes: object(4)
memory usage: 1.2+ MB


In [9]:
merged_df.to_csv('java_train.csv', index=False)