# Run in kaggle T4 * 2 gpu

In [1]:
%pip install transformers
%pip install datasets
%pip install accelerate

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import warnings
warnings.filterwarnings("ignore")
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModel
import torch
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
from accelerate import Accelerator

In [3]:
if torch.cuda.is_available():
    # Get the number of GPUs available
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")
    
    # Print out the name of each GPU and memory details
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"Total memory: {torch.cuda.get_device_properties(i).total_memory / 1e9} GB")
    
    # If multiple GPUs are available, use DataParallel for multi-GPU
    if num_gpus > 1:
        print("Using DataParallel for multi-GPU training.")
        # Example: Wrap your model with DataParallel
    else:
        print("Only one GPU available, using single GPU mode.")
else:
    print("No GPUs available, using CPU.")
    
def setup_translation_pipeline(source_lang, target_lang):
    """
    Set up a translation pipeline using a pretrained model from Helsinki-NLP, optimized for multi-GPU with Accelerate.

    Args:
        source_lang (str): The source language code (e.g., "en" for English).
        target_lang (str): The target language code (e.g., "fr" for French).

    Returns:
        transformers.Pipeline: A translation pipeline configured for the given language pair.
    """
    # Define the model name based on the source and target languages
    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    
    # Load the tokenizer for the specified model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Load the translation model with memory optimization and half-precision support
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        return_dict=True,  # Use a dictionary for the model's output for easier access
        low_cpu_mem_usage=True,  # Optimize for low memory usage during model loading
        torch_dtype=torch.float16,  # Use half-precision for faster computation (requires supported GPU)
        trust_remote_code=True,  # Trust the model's remote configuration if specified
    )
    
    # Use the Accelerate library for better multi-GPU support
    accelerator = Accelerator()

    # Move the model to the appropriate device using Accelerator (handles multi-GPU)
    model = accelerator.prepare(model)

    # Set up and return the translation pipeline
    translation_pipeline = pipeline(
        "translation",  # Specify the task as translation
        model=model,  # Provide the loaded model
        tokenizer=tokenizer,  # Provide the loaded tokenizer
        torch_dtype=torch.float16,  # Use half-precision for the pipeline
        max_length=512,
        truncation=True,
        device=accelerator.device  # Use the device from Accelerator (handles multi-GPU)
    )

    return translation_pipeline


Number of GPUs available: 2
GPU 0: Tesla T4
Total memory: 15.828320256 GB
GPU 1: Tesla T4
Total memory: 15.828320256 GB
Using DataParallel for multi-GPU training.


In [4]:
# Function to translate and retranslate for a given target language
def translate_retranslate(ds, source_lang, target_lang, batch_size=32):
    """
    Translate the dataset's 'comment_sentence' column to a target language and back to the source language.
    
    Args:
        ds (Dataset): Hugging Face dataset containing a 'comment_sentence' column.
        source_lang (str): Source language code (e.g., "en" for English).
        target_lang (str): Target language code to translate to and back.
        batch_size (int): Number of rows to process in a single batch.

    Returns:
        Dataset: Hugging Face dataset with a new column for retranslated texts.
    """
    # Set up translation pipelines
    pipe_to_target = setup_translation_pipeline(source_lang, target_lang)  # English to target language
    pipe_to_source = setup_translation_pipeline(target_lang, source_lang)  # Target language back to English

    retranslated_texts = []

    # Process the dataset in batches to optimize memory usage and speed
    for i in tqdm(range(0, len(ds), batch_size), desc=f"Translating to {target_lang} and back"):
        # Extract a batch of 'comment_sentence' without converting to pandas
        batch = ds[i: i + batch_size]["comment_sentence"]

        # Step 1: Translate from source language to target language
        target_translations = pipe_to_target(batch, max_length=512, do_sample=False)
        target_texts = [result["translation_text"] for result in target_translations]

        # Step 2: Translate back to the source language (English)
        english_translations = pipe_to_source(target_texts, max_length=512, do_sample=False)
        retranslated_batch = [result["translation_text"].strip() for result in english_translations]

        # Append the retranslated texts to the result list
        retranslated_texts.extend(retranslated_batch)
                # Clear CUDA cache to prevent memory overload
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Add the retranslated texts as a new column in the dataset
    ds = ds.add_column(f"retranslated_text_english_from_{target_lang}", retranslated_texts)
    return ds

# Load the dataset (e.g., the first 10 rows from the Python training split)
df = load_dataset("NLBSE/nlbse25-code-comment-classification", split="pharo_train")

# List of target languages to process for retranslation
languages = ["de", "fr", "zh", "hi", "es", "ru"]

# Loop through each language and retranslate
for lang in tqdm(languages, desc="Processing languages"):
    df = translate_retranslate(df, "en", lang)


README.md:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

java_train-00000-of-00001.parquet:   0%|          | 0.00/680k [00:00<?, ?B/s]

java_test-00000-of-00001.parquet:   0%|          | 0.00/174k [00:00<?, ?B/s]

python_train-00000-of-00001.parquet:   0%|          | 0.00/126k [00:00<?, ?B/s]

python_test-00000-of-00001.parquet:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

pharo_train-00000-of-00001.parquet:   0%|          | 0.00/113k [00:00<?, ?B/s]

pharo_test-00000-of-00001.parquet:   0%|          | 0.00/30.6k [00:00<?, ?B/s]

Generating java_train split:   0%|          | 0/7614 [00:00<?, ? examples/s]

Generating java_test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Generating python_train split:   0%|          | 0/1884 [00:00<?, ? examples/s]

Generating python_test split:   0%|          | 0/406 [00:00<?, ? examples/s]

Generating pharo_train split:   0%|          | 0/1298 [00:00<?, ? examples/s]

Generating pharo_test split:   0%|          | 0/289 [00:00<?, ? examples/s]

Processing languages:   0%|          | 0/6 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to de and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to de and back:   2%|▏         | 1/41 [00:10<07:18, 10.95s/it][A
Translating to de and back:   5%|▍         | 2/41 [00:20<06:35, 10.14s/it][A
Translating to de and back:   7%|▋         | 3/41 [00:31<06:43, 10.63s/it][A
Translating to de and back:  10%|▉         | 4/41 [00:44<07:09, 11.60s/it][A
Translating to de and back:  12%|█▏        | 5/41 [00:59<07:38, 12.75s/it][A
Translating to de and back:  15%|█▍        | 6/41 [01:10<07:08, 12.23s/it][A
Translating to de and back:  17%|█▋        | 7/41 [01:22<06:49, 12.04s/it][A
Translating to de and back:  20%|█▉        | 8/41 [01:32<06:16, 11.41s/it][A
Translating to de and back:  22%|██▏       | 9/41 [01:41<05:38, 10.57s/it][A
Translating to de and back:  24%|██▍       | 10/41 [01:56<06:13, 12.05s/it][AYou seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset

Translating to de and back:  27%|██▋   

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to fr and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to fr and back:   2%|▏         | 1/41 [00:09<06:25,  9.65s/it][A
Translating to fr and back:   5%|▍         | 2/41 [00:19<06:20,  9.77s/it][A
Translating to fr and back:   7%|▋         | 3/41 [00:30<06:40, 10.54s/it][A
Translating to fr and back:  10%|▉         | 4/41 [00:44<07:10, 11.64s/it][A
Translating to fr and back:  12%|█▏        | 5/41 [00:53<06:29, 10.82s/it][A
Translating to fr and back:  15%|█▍        | 6/41 [01:04<06:23, 10.95s/it][A
Translating to fr and back:  17%|█▋        | 7/41 [01:16<06:20, 11.20s/it][A
Translating to fr and back:  20%|█▉        | 8/41 [01:26<06:00, 10.92s/it][A
Translating to fr and back:  22%|██▏       | 9/41 [01:35<05:30, 10.33s/it][A
Translating to fr and back:  24%|██▍       | 10/41 [01:43<04:55,  9.53s/it][A
Translating to fr and back:  27%|██▋       | 11/41 [01:57<05:26, 10.90s/it][A
Translating to fr and back:  29%|██▉       | 12/41 [02:13<05:59, 12.4

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/806k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to zh and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to zh and back:   2%|▏         | 1/41 [00:08<05:50,  8.77s/it][A
Translating to zh and back:   5%|▍         | 2/41 [00:18<06:05,  9.37s/it][A
Translating to zh and back:   7%|▋         | 3/41 [00:29<06:28, 10.21s/it][A
Translating to zh and back:  10%|▉         | 4/41 [00:43<07:03, 11.45s/it][A
Translating to zh and back:  12%|█▏        | 5/41 [00:52<06:28, 10.80s/it][A
Translating to zh and back:  15%|█▍        | 6/41 [01:04<06:31, 11.19s/it][A
Translating to zh and back:  17%|█▋        | 7/41 [01:20<07:14, 12.79s/it][A
Translating to zh and back:  20%|█▉        | 8/41 [01:32<06:52, 12.50s/it][A
Translating to zh and back:  22%|██▏       | 9/41 [01:41<06:05, 11.43s/it][A
Translating to zh and back:  24%|██▍       | 10/41 [01:51<05:34, 10.79s/it][AYour input_length: 510 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_lengt

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/304M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to hi and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to hi and back:   2%|▏         | 1/41 [00:09<06:13,  9.34s/it][A
Translating to hi and back:   5%|▍         | 2/41 [00:18<05:58,  9.18s/it][A
Translating to hi and back:   7%|▋         | 3/41 [00:28<06:02,  9.55s/it][A
Translating to hi and back:  10%|▉         | 4/41 [00:41<06:41, 10.86s/it][A
Translating to hi and back:  12%|█▏        | 5/41 [00:49<06:02, 10.08s/it][A
Translating to hi and back:  15%|█▍        | 6/41 [00:59<05:44,  9.84s/it][A
Translating to hi and back:  17%|█▋        | 7/41 [01:09<05:41, 10.04s/it][A
Translating to hi and back:  20%|█▉        | 8/41 [01:18<05:20,  9.71s/it][A
Translating to hi and back:  22%|██▏       | 9/41 [01:26<04:48,  9.01s/it][A
Translating to hi and back:  24%|██▍       | 10/41 [01:33<04:22,  8.45s/it][A
Translating to hi and back:  27%|██▋       | 11/41 [01:47<05:05, 10.20s/it][A
Translating to hi and back:  29%|██▉       | 12/41 [02:03<05:42, 11.8

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to es and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to es and back:   2%|▏         | 1/41 [00:09<06:15,  9.39s/it][A
Translating to es and back:   5%|▍         | 2/41 [00:24<08:18, 12.77s/it][A
Translating to es and back:   7%|▋         | 3/41 [00:35<07:40, 12.12s/it][A
Translating to es and back:  10%|▉         | 4/41 [00:48<07:41, 12.47s/it][A
Translating to es and back:  12%|█▏        | 5/41 [00:58<06:46, 11.29s/it][A
Translating to es and back:  15%|█▍        | 6/41 [01:09<06:31, 11.17s/it][A
Translating to es and back:  17%|█▋        | 7/41 [01:20<06:25, 11.34s/it][A
Translating to es and back:  20%|█▉        | 8/41 [01:31<06:07, 11.12s/it][A
Translating to es and back:  22%|██▏       | 9/41 [01:40<05:34, 10.46s/it][A
Translating to es and back:  24%|██▍       | 10/41 [01:47<04:55,  9.54s/it][A
Translating to es and back:  27%|██▋       | 11/41 [02:01<05:26, 10.89s/it][A
Translating to es and back:  29%|██▉       | 12/41 [02:17<06:01, 12.4

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to ru and back:   0%|          | 0/41 [00:00<?, ?it/s][A
Translating to ru and back:   2%|▏         | 1/41 [00:10<06:51, 10.30s/it][A
Translating to ru and back:   5%|▍         | 2/41 [00:20<06:41, 10.28s/it][A
Translating to ru and back:   7%|▋         | 3/41 [00:32<06:56, 10.96s/it][A
Translating to ru and back:  10%|▉         | 4/41 [00:46<07:28, 12.12s/it][A
Translating to ru and back:  12%|█▏        | 5/41 [00:55<06:44, 11.23s/it][A
Translating to ru and back:  15%|█▍        | 6/41 [01:06<06:30, 11.15s/it][A
Translating to ru and back:  17%|█▋        | 7/41 [01:19<06:33, 11.58s/it][A
Translating to ru and back:  20%|█▉        | 8/41 [01:31<06:27, 11.73s/it][A
Translating to ru and back:  22%|██▏       | 9/41 [01:41<05:59, 11.23s/it][A
Translating to ru and back:  24%|██▍       | 10/41 [01:52<05:40, 11.00s/it][A
Translating to ru and back:  27%|██▋       | 11/41 [02:06<06:02, 12.08s/it][A
Translating to ru and back:  29%|██▉       | 12/41 [02:23<06:36, 13.6

In [5]:
ds=df.to_pandas()
ds = ds.drop(columns=["index"])
ds.reset_index(inplace=True)
if torch.cuda.is_available():
    torch.cuda.empty_cache()
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1298 entries, 0 to 1297
Data columns (total 12 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   index                              1298 non-null   int64 
 1   class                              1298 non-null   object
 2   comment_sentence                   1298 non-null   object
 3   partition                          1298 non-null   int64 
 4   combo                              1298 non-null   object
 5   labels                             1298 non-null   object
 6   retranslated_text_english_from_de  1298 non-null   object
 7   retranslated_text_english_from_fr  1298 non-null   object
 8   retranslated_text_english_from_zh  1298 non-null   object
 9   retranslated_text_english_from_hi  1298 non-null   object
 10  retranslated_text_english_from_es  1298 non-null   object
 11  retranslated_text_english_from_ru  1298 non-null   object
dtypes: int

In [6]:
# Define the model and tokenizer for sentence encoding
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

# List of languages for retranslation analysis

languages = ["de", "fr", "zh", "hi", "es", "ru"]

# Function to encode sentences in batches for efficiency
def encode_sentences_in_batches(sentences, batch_size=32):
    """
    Encodes a list of sentences in batches using the pre-trained model.
    Args:
        sentences (list): List of sentences to encode.
        batch_size (int): Size of each batch for encoding.
    Returns:
        torch.Tensor: Tensor containing embeddings for all sentences.
    """
    embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        # Tokenize and move inputs to the appropriate device (CPU/GPU)
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
        with torch.no_grad():  # Disable gradient computation for inference
            model_output = model(**inputs)
        # Take the mean of the last hidden state to get sentence embeddings
        embeddings.append(model_output.last_hidden_state.mean(dim=1).cpu())
    return torch.cat(embeddings)

# Generate embeddings for original comments
comment_embeddings = encode_sentences_in_batches(ds['comment_sentence'].tolist())

# Set a cosine similarity threshold for filtering translations
threshold = 0.7

# Generate embeddings and filter translations for each language
language_columns = [f"retranslated_text_english_from_{lang}" for lang in languages]
for lang_col in language_columns:
    # Generate embeddings for the retranslated sentences
    retranslated_embeddings = encode_sentences_in_batches(ds[lang_col].fillna("").tolist())
    # Compute cosine similarity between original and retranslated embeddings
    cosine_similarities = torch.nn.functional.cosine_similarity(comment_embeddings, retranslated_embeddings).tolist()

    # Filter translations based on similarity threshold
    filtered_translation_col = f"filtered_{lang_col}"
    ds[filtered_translation_col] = [
        row[lang_col] if similarity > threshold else ""  # Keep translation if similarity exceeds threshold
        for (_, row), similarity in zip(ds.iterrows(), cosine_similarities)
    ]

# Prepare for concatenating original and filtered translations
main_columns = ['class', 'comment_sentence', 'labels']  # Main columns to retain in the final dataset
concat_columns = [f"filtered_{lang}" for lang in language_columns]  # Columns containing filtered translations

concat_types = ["de", "fr", "zh", "hi", "es", "ru"]

# Create a new DataFrame with original data
df_main = ds[main_columns].copy()
df_main['types'] = 'original'  # Add a column to indicate original comments

# Initialize a list to hold concatenated DataFrames
concat_data = [df_main]

# Add filtered translations to the concatenated DataFrame
for col, col_type in zip(concat_columns, concat_types):
    temp_df = df_main[main_columns].copy()  # Copy main structure
    temp_df['comment_sentence'] = ds[col]  # Replace 'comment_sentence' with filtered translations
    temp_df['types'] = col_type  # Label the type of translation
    concat_data.append(temp_df)  # Append to the list

# Concatenate all DataFrames into one
df_extended = pd.concat(concat_data, ignore_index=True)

# Remove rows with empty or null 'comment_sentence' values
df_extended = df_extended.dropna()  # Drop rows with null values
df_extended = df_extended[df_extended['comment_sentence'] != ""].reset_index(drop=True)  # Drop empty strings
# Display dataset information for debugging or confirmation
df_extended.info()

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7781 entries, 0 to 7780
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   class             7781 non-null   object
 1   comment_sentence  7781 non-null   object
 2   labels            7781 non-null   object
 3   types             7781 non-null   object
dtypes: object(4)
memory usage: 243.3+ KB


In [7]:
file_path = 'NLBSE_Dataset_Pharo.csv'

# Save the DataFrame to a CSV file
df_extended.to_csv(file_path, index=False)
print(f"DataFrame saved to {file_path}")

DataFrame saved to NLBSE_Dataset_Pharo.csv
