<h1 style="text-align: center;">Task 3: Crosslingual Honorific Translation</h1>

## Import Library

In [1]:
!pip install torch evaluate transformers accelerate>=0.26.0 sentencepiece einops sacrebleu google.generativeai openai

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opentelemetry-proto 1.26.0 requires protobuf<5.0,>=3.19, but you have protobuf 5.29.4 which is incompatible.[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
import torch
import numpy as np
import pandas as pd
import evaluate
import google.generativeai as genai
from datasets import load_dataset, Dataset, DatasetDict
from dataclasses import dataclass
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer, 
    GenerationConfig, 
    TrainingArguments, 
    Trainer,
    BitsAndBytesConfig,
    pipeline
)
from sacrebleu.metrics import BLEU, CHRF
from typing import List, Dict, Tuple, Optional
from huggingface_hub import login
from openai import OpenAI
from dotenv import load_dotenv

# Disable warnings
import warnings
warnings.filterwarnings('ignore')

# Configure logging
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## Global Config

In [3]:
load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
OPEN_API_KEY = os.getenv("OPEN_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [None]:
# Define models
MODELS = [
    "GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct",
    "GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct",
    "meta-llama/Llama-3.1-8B-Instruct",
    "sail/Sailor2-8B",
    "google/gemma-2-2b-it",
    "google/gemma-2-9b-it",
]

## Load Dataset

In [5]:
data = load_dataset("JavaneseHonorifics/Unggah-Ungguh", "translation")

README.md:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

train-00000-of-00001.csv:   0%|          | 0.00/807k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4024 [00:00<?, ? examples/s]

In [6]:
data

DatasetDict({
    train: Dataset({
        features: ['index', 'label', 'javanese sentence', 'group', 'indonesian sentence', 'english sentence'],
        num_rows: 4024
    })
})

In [7]:
data["train"][2]

{'index': 2,
 'label': 2,
 'javanese sentence': 'Sampeyan mbekta jeram menika!',
 'group': 1,
 'indonesian sentence': 'Membawalah jeruk itu.',
 'english sentence': 'Bring that orange.'}

In [None]:
dataset = data["train"]

## Load Model

In [11]:
login(token=HF_TOKEN)

In [12]:
model_name_1 = MODELS[0] # "GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct"

tokenizer1 = AutoTokenizer.from_pretrained(model_name_1)

model1 = AutoModelForCausalLM.from_pretrained(
    model_name_1,
    device_map="auto",
    trust_remote_code=True,
    cache_dir=f"./cache"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

--- Logging error ---
Traceback (most recent call last):
  File "/share/pkg.8/python3/3.10.12/install/lib/python3.10/logging/__init__.py", line 1100, in emit
    msg = self.format(record)
  File "/share/pkg.8/python3/3.10.12/install/lib/python3.10/logging/__init__.py", line 943, in format
    return fmt.format(record)
  File "/share/pkg.8/python3/3.10.12/install/lib/python3.10/logging/__init__.py", line 678, in format
    record.message = record.getMessage()
  File "/share/pkg.8/python3/3.10.12/install/lib/python3.10/logging/__init__.py", line 368, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/share/pkg.8/python3/3.10.12/install/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/share/pkg.8/python3/3.10.12/install/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/share/pkg.8/python3/3.10.12/install/lib/python3.

In [13]:
model_name_2 = MODELS[1] # "GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct"

tokenizer2 = AutoTokenizer.from_pretrained(model_name_2)

model2 = AutoModelForCausalLM.from_pretrained(
    model_name_2,
    device_map="auto",
    trust_remote_code=True,
    cache_dir=f"./cache"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [14]:
model_name_3 = MODELS[2] # "meta-llama/Llama-3.1-8B-Instruct"

tokenizer3 = AutoTokenizer.from_pretrained(model_name_3)

model3 = AutoModelForCausalLM.from_pretrained(
    model_name_3,
    device_map="auto",
    trust_remote_code=True,
    cache_dir=f"./cache"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [15]:
model_name_4 = MODELS[3] # "sail/Sailor2-8B"

tokenizer4 = AutoTokenizer.from_pretrained(model_name_4)

model4 = AutoModelForCausalLM.from_pretrained(
    model_name_4,
    device_map="auto",
    trust_remote_code=True,
    cache_dir=f"./cache"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [16]:
model_name_5 = MODELS[4] # "google/gemma-2-2b-it"

tokenizer5 = AutoTokenizer.from_pretrained(model_name_5)

model5 = AutoModelForCausalLM.from_pretrained(
    model_name_5,
    device_map="auto",
    trust_remote_code=True,
    cache_dir=f"./cache"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
model_name_6 = MODELS[5] # "google/gemma-2-9b-it"

tokenizer6 = AutoTokenizer.from_pretrained(model_name_6)

model6 = AutoModelForCausalLM.from_pretrained(
    model_name_6,
    device_map="auto",
    trust_remote_code=True,
    cache_dir=f"./cache"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [18]:
ModelOpenAI = OpenAI(api_key=OPEN_API_KEY)

genai.configure(api_key=GEMINI_API_KEY)
ModelGemini = genai.GenerativeModel('gemini-1.5-pro')

In [None]:
from datetime import datetime

class RateLimiter:
    def __init__(self, min_delay_ms=1000):
        self.min_delay_ms = min_delay_ms
        self.last_call = datetime.now()
    
    def wait(self):
        now = datetime.now()
        elapsed_ms = (now - self.last_call).total_seconds() * 1000
        
        if elapsed_ms < self.min_delay_ms:
            remaining_ms = self.min_delay_ms - elapsed_ms
            while (datetime.now() - now).total_seconds() * 1000 < remaining_ms:
                pass
        
        self.last_call = datetime.now()

In [24]:
import torch
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional
import numpy as np
from sacrebleu.metrics import BLEU, CHRF

@dataclass
class TranslationResult:
    predicted: List[str]
    target: List[str]
    bleu_score: float
    chrf_score: float
    groups: List[int]

class JavaneseTranslationEvaluator:
    def __init__(self, model, tokenizer=None):
        self.model = model
        self.tokenizer = tokenizer
        self.bleu = BLEU()
        self.chrf = CHRF(word_order=2)  # Using chrF++ (word_order=2)
        
    def _create_indo_to_javanese_prompt(self, indo_text: str, target_level: int) -> str:
        honorific_levels = {
            0: "Ngoko (informal, familiar conversations)",
            1: "Ngoko Alus (respectful but informal)",
            2: "Krama (polite but not overly formal)",
            3: "Krama Alus (highly respectful)"
        }
        
        prompt = f"""Translate this Indonesian sentence to Javanese using {honorific_levels[target_level]}:
Indonesian: {indo_text}
Javanese: """
        return prompt

    def _create_javanese_to_indo_prompt(self, javanese_text: str) -> str:
        prompt = f"""Translate this Javanese sentence to Indonesian:
Javanese: {javanese_text}
Indonesian: """
        return prompt
    
    def generate_translation(self, prompt: str) -> str:
        torch.cuda.empty_cache()
        rate_limiter = RateLimiter(min_delay_ms=1000)
        try:
            if self.model == ModelOpenAI:
                response = ModelOpenAI.chat.completions.create(
                    model="gpt-4o",
                    messages=[
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.7,
                    max_tokens=100,
                    top_p=0.9
                )
                translation = response.choices[0].message.content
                torch.cuda.empty_cache()
                rate_limiter.wait()
            elif self.model == ModelGemini:
                response = ModelGemini.generate_content(
                    prompt,
                    generation_config=genai.types.GenerationConfig(
                        temperature=0.7,
                        top_p=0.9,
                        top_k=50,
                        max_output_tokens=100
                    )
                )
                translation = response.text
                torch.cuda.empty_cache()
                rate_limiter.wait()
            else:
                generation_config = {
                    'num_return_sequences': 1,
                    'temperature': 0.7,
                    'top_p': 0.9,
                    'top_k': 50,
                    'pad_token_id': self.tokenizer.pad_token_id,
                    'do_sample': True,
                    'early_stopping': True,
                    'repetition_penalty': 1.2,
                    'min_length': 10,
                    'max_new_tokens': 100
                }
                input_ids = self.tokenizer(prompt, return_tensors='pt').input_ids
                outputs = self.model.generate(
                    input_ids=input_ids,
                    **generation_config
                )
                translation = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
                torch.cuda.empty_cache()
            translation = translation.split(":")[-1].strip()
            torch.cuda.empty_cache()
            return translation if translation else "EMPTY_TRANSLATION"
            
        except Exception as e:
            print(f"Error in generation: {str(e)}")
            return "EMPTY_TRANSLATION"
            
    def _calculate_scores(self, predictions: List[str], targets: List[str]) -> Tuple[float, float]:
        """Calculate BLEU and chrF++ scores with error handling."""
        try:
            valid_pairs = [(p, t) for p, t in zip(predictions, targets) 
                          if p != "EMPTY_TRANSLATION" and p and t]
            
            if not valid_pairs:
                return 0.0, 0.0
                
            valid_preds, valid_targets = zip(*valid_pairs)
            
            bleu_score = self.bleu.corpus_score(
                valid_preds, 
                [valid_targets]
            ).score
            
            chrf_score = self.chrf.corpus_score(
                valid_preds,
                [valid_targets]
            ).score
            
            return bleu_score, chrf_score
            
        except Exception as e:
            print(f"Error calculating scores: {str(e)}")
            return 0.0, 0.0
            
    def evaluate_translations(self, dataset: List[Dict]) -> Dict[str, TranslationResult]:
        results = {
            'indo_to_java_0': [], 'indo_to_java_1': [], 
            'indo_to_java_2': [], 'indo_to_java_3': [],
            'java_0_to_indo': [], 'java_1_to_indo': [],
            'java_2_to_indo': [], 'java_3_to_indo': []
        }
        targets = {k: [] for k in results.keys()}
        groups = {k: [] for k in results.keys()}

        honorific_levels = {
            0: "Ngoko",
            1: "Ngoko Alus",
            2: "Krama",
            3: "Krama Alus"
        }

        grouped_data = {}
        for item in dataset:
            if 'group' not in item or 'indo' not in item or 'sentence' not in item or item['sentence'] is None:
                print(f"Skipping invalid data item: {item}")
                continue

            group = item['group']
            if group not in grouped_data:
                grouped_data[group] = {'indo': item['indo'], 'translations': {}}

            if item['sentence'] is not None and item['sentence'].strip():
                grouped_data[group]['translations'][item['label']] = item['sentence']

        for group, data in grouped_data.items():
            if 'indo' not in data or not data['indo']:
                print(f"Skipping group {group}: No Indonesian text found")
                continue

            indo_text = data['indo']

            for level in range(4):
                if level in data['translations'] and data['translations'][level]:
                    prompt = self._create_indo_to_javanese_prompt(indo_text, level)
                    prediction = self.generate_translation(prompt)

                    key = f'indo_to_java_{level}'
                    results[key].append(prediction)
                    targets[key].append(data['translations'][level])
                    groups[key].append(group)

                    if level == 0:
                        print(f"\nTranslation from Indonesian to {honorific_levels[level]}:")
                        print(f"Indonesian: {indo_text}")
                        print(f"Generation: {prediction}")
                        print(f"Gold: {data['translations'][level]}")
                        print("-" * 50)

            for level in range(4):
                if level in data['translations'] and data['translations'][level]:
                    javanese_text = data['translations'][level]
                    prompt = self._create_javanese_to_indo_prompt(javanese_text)
                    prediction = self.generate_translation(prompt)

                    key = f'java_{level}_to_indo'
                    results[key].append(prediction)
                    targets[key].append(indo_text)
                    groups[key].append(group)

                    if level == 0:
                        print(f"\nTranslation from {honorific_levels[level]} to Indonesian:")
                        print(f"Javanese: {javanese_text}")
                        print(f"Generation: {prediction}")
                        print(f"Gold: {indo_text}")
                        print("-" * 50)

        final_results = {}
        for key in results:
            if results[key] and all(t is not None for t in targets[key]):
                bleu_score, chrf_score = self._calculate_scores(results[key], targets[key])

                final_results[key] = TranslationResult(
                    predicted=results[key],
                    target=targets[key],
                    bleu_score=bleu_score,
                    chrf_score=chrf_score,
                    groups=groups[key]
                )

        return final_results

def print_evaluation_results(results: Dict[str, TranslationResult]):
    translation_types = {
        'indo_to_java': 'Indonesian to Javanese',
        'java_to_indo': 'Javanese to Indonesian'
    }
    
    honorific_levels = {
        '0': 'Ngoko',
        '1': 'Ngoko Alus',
        '2': 'Krama',
        '3': 'Krama Alus'
    }
    
    print("\nEvaluation Results:")
    print("=" * 80)
    
    for key, result in results.items():
        direction = 'indo_to_java' if 'indo_to_java' in key else 'java_to_indo'
        level = key.split('_')[-1] if 'indo_to_java' in key else key.split('_')[1]
        
        print(f"\n{translation_types[direction]} ({honorific_levels[level]}):")
        print("-" * 40)
        print(f"BLEU Score: {result.bleu_score:.2f}")
        print(f"chrF++ Score: {result.chrf_score:.2f}")

def calculate_average_scores(results: Dict[str, TranslationResult]) -> Dict[str, Dict[str, float]]:
    """Calculate average BLEU and chrF++ scores for each direction and overall."""
    
    scores = {
        'indo_to_java': {'bleu': [], 'chrf': []},
        'java_to_indo': {'bleu': [], 'chrf': []},
        'overall': {'bleu': [], 'chrf': []}
    }
    
    for key, result in results.items():
        direction = 'indo_to_java' if 'indo_to_java' in key else 'java_to_indo'

        if result.bleu_score > 0 or result.chrf_score > 0:
            scores[direction]['bleu'].append(result.bleu_score)
            scores[direction]['chrf'].append(result.chrf_score)
            scores['overall']['bleu'].append(result.bleu_score)
            scores['overall']['chrf'].append(result.chrf_score)

    averages = {}
    for direction in scores:
        averages[direction] = {
            'bleu': np.mean(scores[direction]['bleu']) if scores[direction]['bleu'] else 0.0,
            'chrf': np.mean(scores[direction]['chrf']) if scores[direction]['chrf'] else 0.0
        }
    
    return averages

In [None]:
import pandas as pd
import numpy as np
from typing import Dict, List

def evaluate_all_models(models_dict: Dict, data: List[Dict]) -> pd.DataFrame:
    results_list = []

    directions = [
        'indo_to_java_0', 'indo_to_java_1', 'indo_to_java_2', 'indo_to_java_3',
        'java_0_to_indo', 'java_1_to_indo', 'java_2_to_indo', 'java_3_to_indo'
    ]
    
    for model_name, (model, tokenizer) in models_dict.items():
        print(f"\nEvaluating {model_name}...")
        evaluator = JavaneseTranslationEvaluator(model, tokenizer)
        results = evaluator.evaluate_translations(data)

        row_data = {'Model': model_name}

        for direction in directions:
            if direction in results:
                row_data[f'{direction}_BLEU'] = results[direction].bleu_score
                row_data[f'{direction}_chrF++'] = results[direction].chrf_score
            else:
                row_data[f'{direction}_BLEU'] = 0.0
                row_data[f'{direction}_chrF++'] = 0.0
        
        results_list.append(row_data)

    df = pd.DataFrame(results_list)

    first_level = []
    second_level = []
    
    for direction in directions:
        first_level.extend([direction, direction])
        second_level.extend(['BLEU', 'chrF++'])

    columns = pd.MultiIndex.from_arrays([
        ['Model'] + first_level,
        [''] + second_level
    ])

    df_columns = ['Model']
    for direction in directions:
        df_columns.extend([f'{direction}_BLEU', f'{direction}_chrF++'])
    
    df = df[df_columns]

    df.columns = columns

    numeric_columns = df.columns[1:]
    df[numeric_columns] = df[numeric_columns].round(2)
    
    return df

models_dict = {
    'Model1': (model1, tokenizer1),
    'Model2': (model2, tokenizer2),
    'Model3': (model3, tokenizer3),
    'Model4': (model4, tokenizer4),
    'Model5': (model5, tokenizer5),
    'Model6': (model6, tokenizer6),
    'OpenAI': (ModelOpenAI, None),
    'Gemini': (ModelGemini, None)
}

results_df = evaluate_all_models(models_dict, dataset)

print("\nEvaluation Results:")
print("=" * 80)
print(results_df)

results_df.to_csv('../../results/task3/translation_evaluation_results_<YOUR MODEL>.csv')

The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.



Evaluating Model6...

Evaluation Results:
    Model indo_to_java_0        indo_to_java_1        indo_to_java_2         \
                    BLEU chrF++           BLEU chrF++           BLEU chrF++   
0  Model6           0.43  11.85           0.29  11.45           0.12  10.98   

  indo_to_java_3        java_0_to_indo        java_1_to_indo         \
            BLEU chrF++           BLEU chrF++           BLEU chrF++   
0           0.11  11.03           3.29  20.43           2.22   17.7   

  java_2_to_indo        java_3_to_indo         
            BLEU chrF++           BLEU chrF++  
0           1.55  15.88           1.77  15.27  


In [21]:
results_df

Unnamed: 0_level_0,Model,indo_to_java_0,indo_to_java_0,indo_to_java_1,indo_to_java_1,indo_to_java_2,indo_to_java_2,indo_to_java_3,indo_to_java_3,java_0_to_indo,java_0_to_indo,java_1_to_indo,java_1_to_indo,java_2_to_indo,java_2_to_indo,java_3_to_indo,java_3_to_indo
Unnamed: 0_level_1,Unnamed: 1_level_1,BLEU,chrF++,BLEU,chrF++,BLEU,chrF++,BLEU,chrF++,BLEU,chrF++,BLEU,chrF++,BLEU,chrF++,BLEU,chrF++
0,Model6,0.43,11.85,0.29,11.45,0.12,10.98,0.11,11.03,3.29,20.43,2.22,17.7,1.55,15.88,1.77,15.27
