# Ancient to Modern Italian Translation with TinyLLaMA and BLOOMZ

This notebook compares two approaches for translating ancient Italian into modern Italian:

1. **TinyLLaMA (Fine-tuned locally on parallel examples)**
2. **BLOOMZ (Zero-shot / Few-shot Inference)**

---

## 🔧 Setup

In [1]:
# --- 1. Install required libraries ---
!pip install -q transformers datasets peft bitsandbytes accelerate evaluate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127

In [None]:
# --- 1. Import Necessary Modules ---
import pandas as pd
import numpy as np
import torch
import ast
import evaluate
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer,
    DataCollatorForLanguageModeling, BitsAndBytesConfig,
    StoppingCriteria, StoppingCriteriaList
)
from tqdm import tqdm

# --- 2. Load and Prepare Dataset ---
df = pd.read_csv('/kaggle/input/dataset-list/dataset_concatenato.csv')[['text', 'translation']].dropna()
df = df.rename(columns={'text': 'ancient', 'translation': 'modern'})

def fix_list_string_to_sentence(text):
    try:
        tokens = ast.literal_eval(text)
        if isinstance(tokens, list):
            return " ".join(tokens)
    except (ValueError, SyntaxError):
        pass
    return text

df['ancient'] = df['ancient'].apply(fix_list_string_to_sentence)
df['modern'] = df['modern'].apply(fix_list_string_to_sentence)

train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)

# --- 3. Load Tokenizer and Model ---
model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")
model = prepare_model_for_kbit_training(model)

# --- 4. Apply LoRA ---
TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
peft_config = LoraConfig(
    r=32,             # Increased from 16 to 32
    lora_alpha=64,    # Conventionally, lora_alpha is 2 * r
    lora_dropout=0.05,
    target_modules=TARGET_MODULES,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)

# --- 5. Preprocess Data ---
MAX_LENGTH = 128

def preprocess_function(examples):
    all_input_ids = []
    all_labels = []
    SYSTEM_PROMPT = "You are an expert translator from ancient to modern Italian."

    for ancient, modern in zip(examples['ancient'], examples['modern']):
        full_text = (
            f"<|system|>\n{SYSTEM_PROMPT}{tokenizer.eos_token}\n"
            f"<|user|>\n{ancient}{tokenizer.eos_token}\n"
            f"<|assistant|>\n{modern}{tokenizer.eos_token}"
        )
        prompt_only = (
            f"<|system|>\n{SYSTEM_PROMPT}{tokenizer.eos_token}\n"
            f"<|user|>\n{ancient}{tokenizer.eos_token}\n"
            f"<|assistant|>\n"
        )
        tokenized_full = tokenizer(full_text, max_length=MAX_LENGTH, truncation=True, padding=False)
        tokenized_prompt = tokenizer(prompt_only, max_length=MAX_LENGTH, truncation=True, padding=False)
        
        input_ids = tokenized_full['input_ids']
        prompt_length = len(tokenized_prompt['input_ids'])
        
        labels = [-100] * prompt_length + input_ids[prompt_length:]
        
        padding_length = MAX_LENGTH - len(input_ids)
        input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
        labels = labels + [-100] * padding_length
        
        all_input_ids.append(input_ids)
        all_labels.append(labels)
        
    return {"input_ids": all_input_ids, "labels": all_labels}

train_ds = train_ds.map(preprocess_function, batched=True, remove_columns=train_ds.column_names)
val_ds = val_ds.map(preprocess_function, batched=True, remove_columns=val_ds.column_names)

# --- 6. Define Training Arguments and Metrics ---
training_args = TrainingArguments(
    output_dir="/kaggle/working/tinyllama-ft",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=4,
    logging_steps=10,
    eval_strategy='steps',
    eval_steps=10,
    save_strategy='steps',
    save_steps=10,
    learning_rate=1e-5,
    report_to='none',
    gradient_checkpointing=True,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="/kaggle/working/logs",
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
    save_total_limit=2
)

bleu_metric = evaluate.load("bleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    
    preds = np.argmax(preds, axis=-1)
    labels = np.where(labels == -100, tokenizer.pad_token_id, labels)
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Isolate the assistant's response for both predictions and labels
    cleaned_preds = [pred.split("<|assistant|>")[-1].strip() for pred in decoded_preds]
    cleaned_labels = [label.split("<|assistant|>")[-1].strip() for label in decoded_labels]
    
    # Format for BLEU metric
    references_for_bleu = [[label] for label in cleaned_labels]

    print("Sample prediction:", cleaned_preds[0])
    print("Ground truth:", cleaned_labels[0])
    
    result = bleu_metric.compute(predictions=cleaned_preds, references=references_for_bleu)
    return {"bleu": result["bleu"]}

# --- 7. Initialize and Run Trainer ---
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
)

print("Starting training...")
trainer.train()
print("Training finished. Best model is loaded.")

# --- 8. Define Inference Function (REVISED with Generation Tuning) ---
class StopOnEosToken(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        if input_ids[0, -1] == tokenizer.eos_token_id:
            return True
        return False

def generate_with_tinyllama(text):
    prompt = (
        f"<|system|>\nYou are an expert translator from ancient to modern Italian.{tokenizer.eos_token}\n"
        f"<|user|>\n{text}{tokenizer.eos_token}\n"
        f"<|assistant|>\n"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    stopping_criteria = StoppingCriteriaList([StopOnEosToken()])

    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        num_beams=5,
        early_stopping=True,
        do_sample=False,
        stopping_criteria=stopping_criteria,
        # --- NEW PARAMETERS TO CONTROL REPETITION ---
        repetition_penalty=1.2, # Penalizes words that have already appeared.
        no_repeat_ngram_size=3, # Prevents the model from repeating any 3-word sequence.
    )
    
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    try:
        return decoded.split("<|assistant|>")[-1].strip()
    except IndexError:
        return f"Model failed to generate a valid response. Full output: {decoded}"

# --- 9. Run Inference on Test Set ---
test_df = pd.read_csv('/kaggle/input/dataset-list/dataset_human_eval.csv')[['Sentence', 'HumanEval']].dropna()
test_df = test_df.rename(columns={'Sentence': 'ancient', 'HumanEval': 'modern'})
# Use tqdm here for a progress bar during the final inference
test_df['tinyllama_output'] = [generate_with_tinyllama(text) for text in tqdm(test_df['ancient'], desc="Generating Final Translations")]

# --- 10. Save Final Predictions ---
test_df[['ancient', 'modern', 'tinyllama_output']].to_csv("/kaggle/working/tinyllama_predictions.csv", index=False)
print("Final predictions saved to /kaggle/working/tinyllama_predictions.csv")

Map:   0%|          | 0/266 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training with early stopping based on validation BLEU...


Step,Training Loss,Validation Loss,Bleu
10,2.8125,2.738626,0.124567
20,2.7984,2.665295,0.125582
30,2.5913,2.590719,0.12571
40,2.5758,2.531405,0.136007
50,2.566,2.467233,0.104028
60,2.4699,2.410282,0.085793
70,2.4229,2.369215,0.086277
80,2.3408,2.336598,0.101537
90,2.3077,2.308064,0.104296
100,2.2183,2.280741,0.10131


Sample prediction: d’ altro pesce in tutta la costaiera;p pescatori e navicelle a schiera,e barche,saettie e galeoni,
Ground truth: e ogni altro pesce in tutta la riviera;con pescatori e navicelle a schiera,e barche, saettie e galeoni,
Sample prediction: di’ altro pesce in tutta la costaiera;p pescatori e navicelle a schiera,e barche,saettie e galeoni,
Ground truth: e ogni altro pesce in tutta la riviera;con pescatori e navicelle a schiera,e barche, saettie e galeoni,
Sample prediction: di’ volta pesce in tutta la Riviera;p pescatori e navicelle a schiera,e barche,saettie e galeoni,
Ground truth: e ogni altro pesce in tutta la riviera;con pescatori e navicelle a schiera,e barche, saettie e galeoni,
Sample prediction: di’ volta pesce in tutta la riviera;con pescatori e navicelle a schiera,e barche, saettie e galeoni,
Ground truth: e ogni altro pesce in tutta la riviera;con pescatori e navicelle a schiera,e barche, saettie e galeoni,
Sample prediction: di con volta pesce in tutta la rivi

In [None]:
# --- 13. Process final_test_set_path (dataset.csv) and save with new columns --

final_output_test_file_path = '/kaggle/input/dataset-list/dataset.csv'

final_test_df = pd.read_csv(final_output_test_file_path)
ancient_text_column_name = 'Sentence'

final_test_df[ancient_text_column_name] = final_test_df[ancient_text_column_name].apply(fix_list_string_to_sentence)
    
# Filter out any empty sentences after cleaning, if any
final_test_df = final_test_df[final_test_df[ancient_text_column_name].str.strip() != ""].copy() # Use .copy() to avoid SettingWithCopyWarning

# Get the list of ancient sentences to translate
ancient_sentences_to_translate = final_test_df[ancient_text_column_name].tolist()

model.eval()
generated_translations = []
print("Generating translations (this may take a while)...")
for text in tqdm(ancient_sentences_to_translate, desc="Translating final test set"):
    generated_translations.append(generate_with_tinyllama(text))
final_test_df['generated_translation'] = generated_translations
final_test_df['score_human'] = 0
output_csv_path = "/kaggle/working/tinyllama_final_dataset_predictions.csv"
output_columns = ['Author', 'Date', 'Region', 'Sentence', 'generated_translation', 'score_human']

final_output_df_columns = [col for col in output_columns if col in final_test_df.columns]
if 'generated_translation' not in final_output_df_columns:
    final_output_df_columns.append('generated_translation')
if 'score_human' not in final_output_df_columns:
    final_output_df_columns.append('score_human')

final_test_df[final_output_df_columns].to_csv(output_csv_path, index=False)
print(f"Final predictions saved to {output_csv_path}")

Generating translations (this may take a while)...


Translating final test set: 100%|██████████| 97/97 [18:34<00:00, 11.49s/it]

Final predictions saved to /kaggle/working/tinyllama_final_dataset_predictions.csv



