# Run in kaggle T4 * 2 gpu

In [1]:
%pip install transformers
%pip install datasets
%pip install accelerate

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import warnings
warnings.filterwarnings("ignore")
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModel
import torch
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
from accelerate import Accelerator

In [3]:
if torch.cuda.is_available():
    # Get the number of GPUs available
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")
    
    # Print out the name of each GPU and memory details
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"Total memory: {torch.cuda.get_device_properties(i).total_memory / 1e9} GB")
    
    # If multiple GPUs are available, use DataParallel for multi-GPU
    if num_gpus > 1:
        print("Using DataParallel for multi-GPU training.")
        # Example: Wrap your model with DataParallel
    else:
        print("Only one GPU available, using single GPU mode.")
else:
    print("No GPUs available, using CPU.")
    
def setup_translation_pipeline(source_lang, target_lang):
    """
    Set up a translation pipeline using a pretrained model from Helsinki-NLP, optimized for multi-GPU with Accelerate.

    Args:
        source_lang (str): The source language code (e.g., "en" for English).
        target_lang (str): The target language code (e.g., "fr" for French).

    Returns:
        transformers.Pipeline: A translation pipeline configured for the given language pair.
    """
    # Define the model name based on the source and target languages
    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    
    # Load the tokenizer for the specified model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Load the translation model with memory optimization and half-precision support
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        return_dict=True,  # Use a dictionary for the model's output for easier access
        low_cpu_mem_usage=True,  # Optimize for low memory usage during model loading
        torch_dtype=torch.float16,  # Use half-precision for faster computation (requires supported GPU)
        trust_remote_code=True,  # Trust the model's remote configuration if specified
    )
    
    # Use the Accelerate library for better multi-GPU support
    accelerator = Accelerator()

    # Move the model to the appropriate device using Accelerator (handles multi-GPU)
    model = accelerator.prepare(model)

    # Set up and return the translation pipeline
    translation_pipeline = pipeline(
        "translation",  # Specify the task as translation
        model=model,  # Provide the loaded model
        tokenizer=tokenizer,  # Provide the loaded tokenizer
        torch_dtype=torch.float16,  # Use half-precision for the pipeline
        max_length=512,
        truncation=True,
        device=accelerator.device  # Use the device from Accelerator (handles multi-GPU)
    )

    return translation_pipeline


Number of GPUs available: 2
GPU 0: Tesla T4
Total memory: 15.828320256 GB
GPU 1: Tesla T4
Total memory: 15.828320256 GB
Using DataParallel for multi-GPU training.


In [4]:
# Function to translate and retranslate for a given target language
def translate_retranslate(ds, source_lang, target_lang, batch_size=32):
    """
    Translate the dataset's 'comment_sentence' column to a target language and back to the source language.
    
    Args:
        ds (Dataset): Hugging Face dataset containing a 'comment_sentence' column.
        source_lang (str): Source language code (e.g., "en" for English).
        target_lang (str): Target language code to translate to and back.
        batch_size (int): Number of rows to process in a single batch.

    Returns:
        Dataset: Hugging Face dataset with a new column for retranslated texts.
    """
    # Set up translation pipelines
    pipe_to_target = setup_translation_pipeline(source_lang, target_lang)  # English to target language
    pipe_to_source = setup_translation_pipeline(target_lang, source_lang)  # Target language back to English

    retranslated_texts = []

    # Process the dataset in batches to optimize memory usage and speed
    for i in tqdm(range(0, len(ds), batch_size), desc=f"Translating to {target_lang} and back"):
        # Extract a batch of 'comment_sentence' without converting to pandas
        batch = ds[i: i + batch_size]["comment_sentence"]

        # Step 1: Translate from source language to target language
        target_translations = pipe_to_target(batch, max_length=512, do_sample=False)
        target_texts = [result["translation_text"] for result in target_translations]

        # Step 2: Translate back to the source language (English)
        english_translations = pipe_to_source(target_texts, max_length=512, do_sample=False)
        retranslated_batch = [result["translation_text"].strip() for result in english_translations]

        # Append the retranslated texts to the result list
        retranslated_texts.extend(retranslated_batch)
                # Clear CUDA cache to prevent memory overload
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Add the retranslated texts as a new column in the dataset
    ds = ds.add_column(f"retranslated_text_english_from_{target_lang}", retranslated_texts)
    return ds

# Load the dataset (e.g., the first 10 rows from the Python training split)
df = load_dataset("NLBSE/nlbse25-code-comment-classification", split="java_train")

# List of target languages to process for retranslation
languages = ["de", "fr", "zh", "hi", "es", "ru"]

# Loop through each language and retranslate
for lang in tqdm(languages, desc="Processing languages"):
    df = translate_retranslate(df, "en", lang)


README.md:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

java_train-00000-of-00001.parquet:   0%|          | 0.00/680k [00:00<?, ?B/s]

java_test-00000-of-00001.parquet:   0%|          | 0.00/174k [00:00<?, ?B/s]

python_train-00000-of-00001.parquet:   0%|          | 0.00/126k [00:00<?, ?B/s]

python_test-00000-of-00001.parquet:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

pharo_train-00000-of-00001.parquet:   0%|          | 0.00/113k [00:00<?, ?B/s]

pharo_test-00000-of-00001.parquet:   0%|          | 0.00/30.6k [00:00<?, ?B/s]

Generating java_train split:   0%|          | 0/7614 [00:00<?, ? examples/s]

Generating java_test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Generating python_train split:   0%|          | 0/1884 [00:00<?, ? examples/s]

Generating python_test split:   0%|          | 0/406 [00:00<?, ? examples/s]

Generating pharo_train split:   0%|          | 0/1298 [00:00<?, ? examples/s]

Generating pharo_test split:   0%|          | 0/289 [00:00<?, ? examples/s]

Processing languages:   0%|          | 0/6 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to de and back:   0%|          | 0/238 [00:00<?, ?it/s][A
Translating to de and back:   0%|          | 1/238 [00:08<33:43,  8.54s/it][A
Translating to de and back:   1%|          | 2/238 [00:18<36:26,  9.26s/it][A
Translating to de and back:   1%|▏         | 3/238 [00:26<34:10,  8.73s/it][A
Translating to de and back:   2%|▏         | 4/238 [00:38<39:30, 10.13s/it][A
Translating to de and back:   2%|▏         | 5/238 [00:47<37:14,  9.59s/it][A
Translating to de and back:   3%|▎         | 6/238 [00:55<35:22,  9.15s/it][AYour input_length: 510 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 510 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 510 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to fr and back:   0%|          | 0/238 [00:00<?, ?it/s][A
Translating to fr and back:   0%|          | 1/238 [00:07<29:49,  7.55s/it][A
Translating to fr and back:   1%|          | 2/238 [00:17<36:04,  9.17s/it][A
Translating to fr and back:   1%|▏         | 3/238 [00:26<34:22,  8.78s/it][A
Translating to fr and back:   2%|▏         | 4/238 [00:34<32:53,  8.43s/it][A
Translating to fr and back:   2%|▏         | 5/238 [00:42<32:27,  8.36s/it][A
Translating to fr and back:   3%|▎         | 6/238 [00:50<31:44,  8.21s/it][A
Translating to fr and back:   3%|▎         | 7/238 [00:58<32:00,  8.31s/it][A
Translating to fr and back:   3%|▎         | 8/238 [01:04<29:12,  7.62s/it][A
Translating to fr and back:   4%|▍         | 9/238 [01:13<30:27,  7.98s/it][A
Translating to fr and back:   4%|▍         | 10/238 [01:21<29:49,  7.85s/it][A
Translating to fr and back:   5%|▍         | 11/238 [01:28<29:32,  7.81s/it][A
Translating to fr and back:   5%|▌         | 12/238 [01:3

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/806k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to zh and back:   0%|          | 0/238 [00:00<?, ?it/s][A
Translating to zh and back:   0%|          | 1/238 [00:09<38:17,  9.70s/it][A
Translating to zh and back:   1%|          | 2/238 [00:26<54:49, 13.94s/it][A
Translating to zh and back:   1%|▏         | 3/238 [00:35<45:59, 11.74s/it][A
Translating to zh and back:   2%|▏         | 4/238 [00:43<40:16, 10.32s/it][A
Translating to zh and back:   2%|▏         | 5/238 [00:53<39:33, 10.19s/it][A
Translating to zh and back:   3%|▎         | 6/238 [01:02<37:22,  9.67s/it][AYour input_length: 511 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)

Translating to zh and back:   3%|▎         | 7/238 [01:19<46:10, 12.00s/it][A
Translating to zh and back:   3%|▎         | 8/238 [01:26<39:43, 10.36s/it][A
Translating to zh and back:   4%|▍         | 9/238 [01:35<38:01,  9.96s/it][A
Translating to zh and back:   4%|▍         | 10/238 [01:43<36

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/304M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to hi and back:   0%|          | 0/238 [00:00<?, ?it/s][A
Translating to hi and back:   0%|          | 1/238 [00:07<29:47,  7.54s/it][A
Translating to hi and back:   1%|          | 2/238 [00:16<31:49,  8.09s/it][A
Translating to hi and back:   1%|▏         | 3/238 [00:23<31:04,  7.93s/it][A
Translating to hi and back:   2%|▏         | 4/238 [00:31<30:11,  7.74s/it][A
Translating to hi and back:   2%|▏         | 5/238 [00:38<29:26,  7.58s/it][A
Translating to hi and back:   3%|▎         | 6/238 [00:46<29:30,  7.63s/it][A
Translating to hi and back:   3%|▎         | 7/238 [00:52<28:14,  7.33s/it][A
Translating to hi and back:   3%|▎         | 8/238 [00:59<26:46,  6.99s/it][A
Translating to hi and back:   4%|▍         | 9/238 [01:07<28:15,  7.40s/it][A
Translating to hi and back:   4%|▍         | 10/238 [01:14<27:51,  7.33s/it][A
Translating to hi and back:   5%|▍         | 11/238 [01:22<27:46,  7.34s/it][A
Translating to hi and back:   5%|▌         | 12/238 [01:3

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to es and back:   0%|          | 0/238 [00:00<?, ?it/s][A
Translating to es and back:   0%|          | 1/238 [00:07<30:54,  7.83s/it][A
Translating to es and back:   1%|          | 2/238 [00:17<34:16,  8.71s/it][A
Translating to es and back:   1%|▏         | 3/238 [00:25<32:47,  8.37s/it][A
Translating to es and back:   2%|▏         | 4/238 [00:33<32:11,  8.25s/it][A
Translating to es and back:   2%|▏         | 5/238 [00:41<32:26,  8.35s/it][A
Translating to es and back:   3%|▎         | 6/238 [00:49<31:42,  8.20s/it][A
Translating to es and back:   3%|▎         | 7/238 [01:00<34:23,  8.93s/it][A
Translating to es and back:   3%|▎         | 8/238 [01:06<30:54,  8.06s/it][A
Translating to es and back:   4%|▍         | 9/238 [01:15<31:43,  8.31s/it][A
Translating to es and back:   4%|▍         | 10/238 [01:26<35:34,  9.36s/it][A
Translating to es and back:   5%|▍         | 11/238 [01:34<33:38,  8.89s/it][A
Translating to es and back:   5%|▌         | 12/238 [01:4

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translating to ru and back:   0%|          | 0/238 [00:00<?, ?it/s][A
Translating to ru and back:   0%|          | 1/238 [00:08<33:00,  8.35s/it][A
Translating to ru and back:   1%|          | 2/238 [00:18<37:31,  9.54s/it][A
Translating to ru and back:   1%|▏         | 3/238 [00:27<35:58,  9.19s/it][A
Translating to ru and back:   2%|▏         | 4/238 [00:36<35:15,  9.04s/it][A
Translating to ru and back:   2%|▏         | 5/238 [00:44<34:15,  8.82s/it][A
Translating to ru and back:   3%|▎         | 6/238 [00:53<33:36,  8.69s/it][A
Translating to ru and back:   3%|▎         | 7/238 [01:01<33:23,  8.67s/it][A
Translating to ru and back:   3%|▎         | 8/238 [01:08<30:36,  7.98s/it][A
Translating to ru and back:   4%|▍         | 9/238 [01:17<31:36,  8.28s/it][A
Translating to ru and back:   4%|▍         | 10/238 [01:30<37:35,  9.89s/it][A
Translating to ru and back:   5%|▍         | 11/238 [01:39<35:55,  9.50s/it][A
Translating to ru and back:   5%|▌         | 12/238 [01:4

In [5]:
ds=df.to_pandas()
ds = ds.drop(columns=["index"])
ds.reset_index(inplace=True)
if torch.cuda.is_available():
    torch.cuda.empty_cache()
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7614 entries, 0 to 7613
Data columns (total 12 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   index                              7614 non-null   int64 
 1   class                              7614 non-null   object
 2   comment_sentence                   7614 non-null   object
 3   partition                          7614 non-null   int64 
 4   combo                              7614 non-null   object
 5   labels                             7614 non-null   object
 6   retranslated_text_english_from_de  7614 non-null   object
 7   retranslated_text_english_from_fr  7614 non-null   object
 8   retranslated_text_english_from_zh  7614 non-null   object
 9   retranslated_text_english_from_hi  7614 non-null   object
 10  retranslated_text_english_from_es  7614 non-null   object
 11  retranslated_text_english_from_ru  7614 non-null   object
dtypes: int

In [6]:
# Define the model and tokenizer for sentence encoding
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

# List of languages for retranslation analysis

languages = ["de", "fr", "zh", "hi", "es", "ru"]

# Function to encode sentences in batches for efficiency
def encode_sentences_in_batches(sentences, batch_size=32):
    """
    Encodes a list of sentences in batches using the pre-trained model.
    Args:
        sentences (list): List of sentences to encode.
        batch_size (int): Size of each batch for encoding.
    Returns:
        torch.Tensor: Tensor containing embeddings for all sentences.
    """
    embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        # Tokenize and move inputs to the appropriate device (CPU/GPU)
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
        with torch.no_grad():  # Disable gradient computation for inference
            model_output = model(**inputs)
        # Take the mean of the last hidden state to get sentence embeddings
        embeddings.append(model_output.last_hidden_state.mean(dim=1).cpu())
    return torch.cat(embeddings)

# Generate embeddings for original comments
comment_embeddings = encode_sentences_in_batches(ds['comment_sentence'].tolist())

# Set a cosine similarity threshold for filtering translations
threshold = 0.7

# Generate embeddings and filter translations for each language
language_columns = [f"retranslated_text_english_from_{lang}" for lang in languages]
for lang_col in language_columns:
    # Generate embeddings for the retranslated sentences
    retranslated_embeddings = encode_sentences_in_batches(ds[lang_col].fillna("").tolist())
    # Compute cosine similarity between original and retranslated embeddings
    cosine_similarities = torch.nn.functional.cosine_similarity(comment_embeddings, retranslated_embeddings).tolist()

    # Filter translations based on similarity threshold
    filtered_translation_col = f"filtered_{lang_col}"
    ds[filtered_translation_col] = [
        row[lang_col] if similarity > threshold else ""  # Keep translation if similarity exceeds threshold
        for (_, row), similarity in zip(ds.iterrows(), cosine_similarities)
    ]

# Prepare for concatenating original and filtered translations
main_columns = ['class', 'comment_sentence', 'labels']  # Main columns to retain in the final dataset
concat_columns = [f"filtered_{lang}" for lang in language_columns]  # Columns containing filtered translations

concat_types = ["de", "fr", "zh", "hi", "es", "ru"]

# Create a new DataFrame with original data
df_main = ds[main_columns].copy()
df_main['types'] = 'original'  # Add a column to indicate original comments

# Initialize a list to hold concatenated DataFrames
concat_data = [df_main]

# Add filtered translations to the concatenated DataFrame
for col, col_type in zip(concat_columns, concat_types):
    temp_df = df_main[main_columns].copy()  # Copy main structure
    temp_df['comment_sentence'] = ds[col]  # Replace 'comment_sentence' with filtered translations
    temp_df['types'] = col_type  # Label the type of translation
    concat_data.append(temp_df)  # Append to the list

# Concatenate all DataFrames into one
df_extended = pd.concat(concat_data, ignore_index=True)

# Remove rows with empty or null 'comment_sentence' values
df_extended = df_extended.dropna()  # Drop rows with null values
df_extended = df_extended[df_extended['comment_sentence'] != ""].reset_index(drop=True)  # Drop empty strings
# Display dataset information for debugging or confirmation
df_extended.info()

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47169 entries, 0 to 47168
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   class             47169 non-null  object
 1   comment_sentence  47169 non-null  object
 2   labels            47169 non-null  object
 3   types             47169 non-null  object
dtypes: object(4)
memory usage: 1.4+ MB


In [7]:
file_path = 'NLBSE_Dataset_Java.csv'

# Save the DataFrame to a CSV file
df_extended.to_csv(file_path, index=False)
print(f"DataFrame saved to {file_path}")

DataFrame saved to NLBSE_Dataset_Java.csv
