## Setup and Model Loading

In [None]:
# Import required libraries
import ast
import gc
import json
import os
import random
import re
from collections import defaultdict
from typing import List, Dict, Tuple, Optional

import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import BitsAndBytesConfig, pipeline

In [None]:
def clear_memory():
    """Run a garbage-collection pass and, if CUDA is available, empty PyTorch's CUDA cache."""
    gc.collect()
    if not torch.cuda.is_available():
        # Nothing GPU-side to release on a CPU-only machine.
        return
    torch.cuda.empty_cache()

# Start from a clean slate before the model is loaded.
clear_memory()

# Report which accelerator (if any) the rest of the notebook will run on.
if not torch.cuda.is_available():
    print("No GPU detected - using CPU (will be very slow)")
else:
    device_props = torch.cuda.get_device_properties(0)
    # total_memory is in bytes; convert to GiB for display.
    gpu_memory = device_props.total_memory / (1024**3)
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"GPU Memory: {gpu_memory:.1f} GB")

## Data Loading and Analysis

In [None]:
# Location of the development-set CSV (placeholder - point it at the real file).
dev_data_path = 'PATH_TO_ORIGINAL_DEV_DATASET'

# Read the CSV and repair the misspelled "Exammple" marker in the instruction
# text, so later cells can split instructions on "Example:".
dev_df = pd.read_csv(dev_data_path).assign(
    instruction=lambda frame: frame['instruction'].str.replace("Exammple", "Example")
)

In [None]:
# Build the text-generation pipeline used by the translation cells below.
use_gpu = torch.cuda.is_available()

if use_gpu:
    # Passing `load_in_4bit` straight through to `from_pretrained` is
    # deprecated; 4-bit loading is requested via a BitsAndBytesConfig instead.
    model_kwargs = {
        "quantization_config": BitsAndBytesConfig(load_in_4bit=True),
        "low_cpu_mem_usage": True,
    }
else:
    # No quantization settings are passed on the CPU path (same as before).
    model_kwargs = {"low_cpu_mem_usage": True}

pipe = pipeline(
    "text-generation",
    model="unsloth/Qwen2.5-Coder-14B-Instruct-bnb-4bit",
    # NOTE(review): trust_remote_code allows code shipped in the model repo to
    # run locally - confirm this checkpoint actually needs it.
    trust_remote_code=True,
    device_map="auto" if use_gpu else None,
    torch_dtype=torch.float16 if use_gpu else torch.float32,
    model_kwargs=model_kwargs,
)

In [None]:
# Inspect the first sample: the full instruction, the Bangla description that
# precedes "Example:", the example text that follows it, and the test list.
first_instruction = dev_df['instruction'].iloc[0]
print(first_instruction)

# partition() cuts at the first "Example:" only - the same boundary the
# translation loop uses - and yields an empty example part instead of raising
# IndexError when the marker is absent.
bangla_part, _marker, example_part = first_instruction.partition("Example:")
bangla_part = bangla_part.strip()
example_part = example_part.strip()
tests_part = dev_df['test_list'].iloc[0]

print(bangla_part)
print(example_part)
print(tests_part)

In [None]:
def translate_bangla_instruction(bangla_instruction):
    """Translate one Bangla coding instruction into English.

    Sends a two-message chat (a fixed system prompt plus the instruction as the
    user turn) through the module-level ``pipe`` and returns the generated text.

    Args:
        bangla_instruction: The Bangla instruction text to translate.

    Returns:
        The ``generated_text`` field of the first pipeline result.
    """
    system_prompt = """You are a professional translator specializing in technical and programming content. Your task is to translate Bangla text containing coding instructions into clear, precise English while preserving all technical meaning and context.

        Translation Guidelines:
        - Preserve Technical Accuracy: Maintain exact meaning of programming concepts
        - Keep Code Elements Intact: Preserve English technical terms in standard form
        - Maintain Instructional Clarity: Ensure natural English coding instructions
        - Precision Over Literal Translation: Focus on exact intended meaning

        Example:
        Input: একটি স্ট্রিং-এ একটি আক্ষরিক স্ট্রিং অনুসন্ধান করার জন্য একটি ফাংশন লিখুন এবং রেজেক্স ব্যবহার করে মূল স্ট্রিং-এর মধ্যে অবস্থানটি খুঁজে বের করুন যেখানে প্যাটার্নটি ঘটে।
        Output: Write a function to search for a literal string within a main string and find the position within the main string where the pattern occurs using regex."""
    user_prompt = f"Translate this Bangla coding instruction to English: {bangla_instruction}"

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Sampling is enabled, so repeated calls may produce different wording.
    generation_settings = {
        "max_new_tokens": 768,
        "temperature": 0.3,
        "top_p": 0.95,
        "do_sample": True,
        # Only the newly generated text is wanted, not the prompt echoed back.
        "return_full_text": False,
    }

    outputs = pipe(conversation, **generation_settings)
    first_output = outputs[0]
    return first_output['generated_text']

In [None]:
# Translate the Bangla part of EVERY instruction in the development set.
# `translated_instructions` ends up with exactly one entry per dev_df row, in
# row order; `failed_ids` records the ids whose translation raised.
translated_instructions = []
failed_ids = []

print(f"Translating {len(dev_df)} instructions...")

for _idx, row in tqdm(dev_df.iterrows(), total=len(dev_df), desc="Translating instructions"):
    full_instruction = row['instruction']
    sample_id = row['id']

    try:
        # Split at the first "Example:" only; everything after it is kept verbatim.
        # Done inside the try so a non-string instruction (e.g. a missing value)
        # is recorded as a failure instead of aborting the whole run.
        bangla_instruction, marker, remainder = full_instruction.partition("Example:")
        translated_text = translate_bangla_instruction(bangla_instruction.strip())

        # Reconstruct the string: translated part + "Example:" + rest
        if marker:
            reconstructed_string = translated_text + " Example:" + remainder
        else:
            reconstructed_string = translated_text

    except Exception as e:
        print(f"\n❌ Error processing sample {sample_id}: {e}")
        failed_ids.append(sample_id)
        # Keep the list aligned with dev_df: the save cell assigns it as a whole
        # column, so a missing entry would fail there with a length mismatch.
        # The untranslated original is used as the placeholder for failed rows.
        reconstructed_string = full_instruction

    translated_instructions.append(reconstructed_string)

# One output per input row, whether or not its translation succeeded.
assert len(translated_instructions) == len(dev_df)

print("\nTranslation complete!")
print(f"Failed samples: {len(failed_ids)}")

if failed_ids:
    print(f"Failed IDs: {failed_ids[:10]}{'...' if len(failed_ids) > 10 else ''}")

In [None]:
# Build the output frame from a copy of dev_df: the instruction column is
# replaced by the translated text and test_list is carried over unchanged.
result_df = dev_df.assign(
    instruction=translated_instructions,
    test_list=dev_df['test_list'],
)

# Write the result next to the notebook, without the index column.
result_df.to_csv('translated_instructions.csv', index=False)
print("\nNew DataFrame saved to 'translated_instructions.csv'")