# Ancient to Modern Italian Translation with TinyLLaMA and BLOOMZ

This notebook compares two approaches for translating ancient Italian into modern Italian:

1. **TinyLLaMA (Fine-tuned locally on parallel examples)**
2. **BLOOMZ (Zero-shot / Few-shot Inference)**

---

## 🔧 Setup

In [1]:
# --- 1. Install required libraries ---
!pip install -q transformers datasets peft bitsandbytes accelerate evaluate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127

In [9]:
# --- 2. Import modules ---
import pandas as pd
import numpy as np
import torch
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer,
    DataCollatorForLanguageModeling, BitsAndBytesConfig,
    DataCollatorForSeq2Seq
)
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import evaluate
import ast

# --- 3. Load and prepare dataset ---
df = pd.read_csv('/kaggle/input/dataset-list/dataset_concatenato.csv')[['text', 'translation']].dropna()
df = df.rename(columns={'text': 'ancient', 'translation': 'modern'})

def fix_list_string_to_sentence(text):
    # Try to parse string list representation to python list
    try:
        tokens = ast.literal_eval(text)
        if isinstance(tokens, list):
            return " ".join(tokens)
    except:
        pass
    return text  # fallback if parsing fails

df['ancient'] = df['ancient'].apply(fix_list_string_to_sentence)
df['modern'] = df['modern'].apply(fix_list_string_to_sentence)

train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)

# --- 4. Load tokenizer and model (4-bit quantized TinyLLaMA) ---
model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")
model = prepare_model_for_kbit_training(model)

# --- 5. Apply LoRA ---
TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
peft_config = LoraConfig(
    r=16,
    lora_alpha=32, # Often r*2
    lora_dropout=0.05, # Slightly lower
    target_modules=TARGET_MODULES,
    bias="none", task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)

# --- 6. Preprocess: tokenize and MASK input for focused learning ---
def preprocess_function_with_masking(examples):
    # Construct the full input string that the model will see for generation
    # Model will be prompted with: "Translate...\nAncient: {a}\nModern:"
    # And it should generate: "{m}{eos_token}"
    
    prompts_with_targets = []
    prompts_only_for_masking = []

    for ancient, modern in zip(examples['ancient'], examples['modern']):
        # This is the full sequence for training input_ids
        text = f"Translate from ancient to modern Italian:\nAncient: {ancient}\nModern: {modern}{tokenizer.eos_token}"
        prompts_with_targets.append(text)
        
        # This is the part we want to MASK in the labels
        prompt_part = f"Translate from ancient to modern Italian:\nAncient: {ancient}\nModern:"
        prompts_only_for_masking.append(prompt_part)

    # Tokenize the full sequences (prompt + target)
    model_inputs = tokenizer(
        prompts_with_targets,
        truncation=True,
        padding="max_length",
        max_length=256, # Your MAX_LENGTH
        # return_tensors="pt" # Not needed if Trainer handles it
    )

    # Create labels and mask the prompt part
    labels = []
    for i in range(len(model_inputs["input_ids"])):
        input_ids_example = model_inputs["input_ids"][i]
        
        # Tokenize the prompt_only part to find its length *within the context of the full tokenized sequence*
        # Important: tokenize prompts_only_for_masking[i] *without* adding special tokens like BOS,
        # as BOS is already part of model_inputs["input_ids"][i] if the tokenizer adds it.
        prompt_tokens = tokenizer.encode(prompts_only_for_masking[i], add_special_tokens=False)
        prompt_length = len(prompt_tokens)
        
        # Account for BOS token if tokenizer adds it to the beginning of the full sequence
        actual_mask_length = prompt_length
        if tokenizer.bos_token_id is not None and input_ids_example[0] == tokenizer.bos_token_id:
            actual_mask_length +=1 # The BOS token is part of the "context" to be masked

        # Create a copy of input_ids for labels
        label_example = list(input_ids_example)
        
        # Mask the prompt part by setting tokens to -100
        for j in range(actual_mask_length):
            label_example[j] = -100
            
        # Also ensure padding tokens in the target part are -100
        for j in range(actual_mask_length, len(label_example)):
            if label_example[j] == tokenizer.pad_token_id:
                label_example[j] = -100
        
        labels.append(label_example)

    model_inputs["labels"] = labels
    return model_inputs

# Apply the new preprocessing function
train_ds = train_ds.map(preprocess_function_with_masking, batched=True, remove_columns=train_ds.column_names)
val_ds = val_ds.map(preprocess_function_with_masking, batched=True, remove_columns=val_ds.column_names)

# --- 7. Define training ---
training_args = TrainingArguments(
    output_dir="/kaggle/working/tinyllama-ft",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=15,  # Reduced epochs, rely on early stopping
    logging_steps=10,     # Log more frequently if dataset is small
    eval_strategy='steps',
    eval_steps=10,        # Evaluate more frequently if dataset is small
    save_strategy='steps',
    save_steps=10,        # Save more frequently, tied to eval_steps
    learning_rate=2e-5,   # Adjusted learning rate
    # fp16=True, # Not strictly necessary with bnb_config.bnb_4bit_compute_dtype
    report_to='none',
    gradient_checkpointing=True,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="/kaggle/working/logs",
    load_best_model_at_end=True,    # Crucial for preventing overfitting
    metric_for_best_model="bleu", # Monitor BLEU on validation set
    greater_is_better=True,       # For BLEU, higher is better
    save_total_limit=3            # Keep best, last, and maybe one more
)

def clean_decoded_text(text):
        text = text.strip()
        if text.startswith("[") and text.endswith("]"):
            text = text[1:-1]  # remove brackets
        text = text.replace("'", "")  # remove quotes
        return text.strip()

bleu_metric = evaluate.load("bleu")



def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    if isinstance(preds, np.ndarray) and preds.ndim == 3:
        preds = np.argmax(preds, axis=-1)
    if isinstance(preds, torch.Tensor):
        preds = preds.detach().cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.detach().cpu().numpy()
    
    labels = np.where(labels == -100, tokenizer.pad_token_id, labels)
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_preds = [pred.split("Modern:")[-1].strip() for pred in decoded_preds]
    decoded_labels = [label.split("Modern:")[-1].strip() for label in decoded_labels]

    pred_tokens = [pred.split() for pred in decoded_preds]
    label_tokens = [label.split() for label in decoded_labels]

    print("Sample prediction:", decoded_preds[0])
    print("Ground truth:", decoded_labels[0])

    result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["bleu"]}

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics, # This is fine for monitoring with argmax
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding="longest")
)

# --- 8. Train the model ---
print("Starting training with early stopping based on validation BLEU...")
trainer.train()
print("Training finished. Best model (based on val BLEU) is loaded.")

# --- 9. Load test set ---
test_df = pd.read_csv('/kaggle/input/dataset-list/dataset_human_eval.csv')[['Sentence', 'HumanEval']].dropna()
test_df = test_df.rename(columns={'Sentence': 'ancient', 'HumanEval': 'modern'})

# --- 10. Inference with zero-shot prompt ---
zero_shot_prompt = """Translate from ancient to modern Italian:
Ancient: {input}
Modern:"""

def generate_with_tinyllama(text):
    prompt = zero_shot_prompt.format(input=text)
    MAX_INPUT_PROMPT_LENGTH = 256 - 128 # max_length_train - max_new_tokens
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_PROMPT_LENGTH).to(model.device)

    outputs = model.generate(
        input_ids=inputs["input_ids"], # More explicit
        attention_mask=inputs["attention_mask"], # Pass attention_mask
        max_new_tokens=128,
        do_sample=False,
        num_beams=4,
        early_stopping=True, # Good
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id # Ensure this is correctly set
    )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Strip everything before the final "Modern:" to get clean output
    return decoded.split("Modern:")[-1].strip()

# --- 11. Generate on test set ---
test_df['tinyllama_output'] = test_df['ancient'].apply(generate_with_tinyllama)

# --- 12. Save predictions ---
test_df[['ancient', 'modern', 'tinyllama_output']].to_csv("/kaggle/working/tinyllama_predictions.csv", index=False)

Map:   0%|          | 0/266 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training with early stopping based on validation BLEU...


Step,Training Loss,Validation Loss,Bleu
10,2.8125,2.738626,0.124567
20,2.7984,2.665295,0.125582
30,2.5913,2.590719,0.12571
40,2.5758,2.531405,0.136007
50,2.566,2.467233,0.104028
60,2.4699,2.410282,0.085793
70,2.4229,2.369215,0.086277
80,2.3408,2.336598,0.101537
90,2.3077,2.308064,0.104296
100,2.2183,2.280741,0.10131


Sample prediction: d’ altro pesce in tutta la costaiera;p pescatori e navicelle a schiera,e barche,saettie e galeoni,
Ground truth: e ogni altro pesce in tutta la riviera;con pescatori e navicelle a schiera,e barche, saettie e galeoni,
Sample prediction: di’ altro pesce in tutta la costaiera;p pescatori e navicelle a schiera,e barche,saettie e galeoni,
Ground truth: e ogni altro pesce in tutta la riviera;con pescatori e navicelle a schiera,e barche, saettie e galeoni,
Sample prediction: di’ volta pesce in tutta la Riviera;p pescatori e navicelle a schiera,e barche,saettie e galeoni,
Ground truth: e ogni altro pesce in tutta la riviera;con pescatori e navicelle a schiera,e barche, saettie e galeoni,
Sample prediction: di’ volta pesce in tutta la riviera;con pescatori e navicelle a schiera,e barche, saettie e galeoni,
Ground truth: e ogni altro pesce in tutta la riviera;con pescatori e navicelle a schiera,e barche, saettie e galeoni,
Sample prediction: di con volta pesce in tutta la rivi

In [8]:
from IPython.display import display
display(test_df[['ancient', 'modern', 'tinyllama_output']].head(30))

Unnamed: 0,ancient,modern,tinyllama_output
0,quella guerra ben fatta l' opera perché etc. E...,Quella guerra fu condotta bene perchè etc. Dal...,la guerra che è stata effettivamente compiuta ...
1,"crudele, e di tutte le colpe pigli vendetta, c...","È crudele, e punisce ogni colpa come vuole la ...","crudele, e di tutte le colpe pigli vendetta, c..."
2,Non d' altra forza d' animo fue ornato Ponzio ...,"Ponzio Aufidiano, cavaliere romano, non fu dot...",Non d' altra forza d' animo fu ornato Ponzio A...
3,Se questo piace a tutti e se 'l tempo hae biso...,Se questo piace a tutti e se il tempo ha bisog...,Se questo piace a tutti e se 'l tempo hae biso...
4,Officio di questa arte pare che sia dicere app...,Il compito di quest’arte sembra essere quello ...,L'ufficio di questa arte sembra essere dichiar...
5,Ecco e larghi ventipiovoli caggiono delle riso...,Ecco che forti piogge scendono dalle dense neb...,Ecco e larghi ventipiovoli caggiono delle riso...
6,Però che or chi spererebbe quello che eziandio...,Perché ora chi spererebbe ciò che anche quelli...,"Because they do not want to believe in Christ,..."
7,I vendimenti de' morti et le presure de' vivi ...,Le vendite dei morti e le pressioni sui vivi f...,I vendimenti de' morti et le presure de' vivi ...
8,"Acciocché quegli, il quale ora per le sue gran...","Così che colui, che ora è temuto e onorato pe...","Quegli, il quale ora è feroce e onorevole per ..."
9,Gli uomini spessamente a stare fermi nella bug...,Spesso gli uomini incontrano la verità mentre ...,Gli uomini stanno fermi nella bugia incontrand...


In [11]:
# --- 13. Process final_test_set_path (dataset.csv) and save with new columns --

final_output_test_file_path = '/kaggle/input/dataset-list/dataset.csv'

final_test_df = pd.read_csv(final_output_test_file_path)
ancient_text_column_name = 'Sentence'

final_test_df[ancient_text_column_name] = final_test_df[ancient_text_column_name].apply(fix_list_string_to_sentence)
    
# Filter out any empty sentences after cleaning, if any
final_test_df = final_test_df[final_test_df[ancient_text_column_name].str.strip() != ""].copy() # Use .copy() to avoid SettingWithCopyWarning

# Get the list of ancient sentences to translate
ancient_sentences_to_translate = final_test_df[ancient_text_column_name].tolist()

model.eval()
generated_translations = []
print("Generating translations (this may take a while)...")
for text in tqdm(ancient_sentences_to_translate, desc="Translating final test set"):
    generated_translations.append(generate_with_tinyllama(text))
final_test_df['generated_translation'] = generated_translations
final_test_df['score_human'] = 0
output_csv_path = "/kaggle/working/tinyllama_final_dataset_predictions.csv"
output_columns = ['Author', 'Date', 'Region', 'Sentence', 'generated_translation', 'score_human']

final_output_df_columns = [col for col in output_columns if col in final_test_df.columns]
if 'generated_translation' not in final_output_df_columns:
    final_output_df_columns.append('generated_translation')
if 'score_human' not in final_output_df_columns:
    final_output_df_columns.append('score_human')

final_test_df[final_output_df_columns].to_csv(output_csv_path, index=False)
print(f"Final predictions saved to {output_csv_path}")

Generating translations (this may take a while)...


Translating final test set: 100%|██████████| 97/97 [18:34<00:00, 11.49s/it]

Final predictions saved to /kaggle/working/tinyllama_final_dataset_predictions.csv



