# Ancient to Modern Italian Translation with TinyLlama

## Setup

In [None]:
# Install the third-party libraries used by this notebook (-q keeps pip's output quiet).
# NOTE(review): scikit-learn, pandas and tqdm are imported below but are not part of this
# install line — presumably they are preinstalled in the runtime; confirm.
!pip install -q transformers datasets peft bitsandbytes accelerate evaluate

In [None]:
# Import Necessary Modules
import pandas as pd
import numpy as np
import torch
import ast
import evaluate
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer,
    DataCollatorForLanguageModeling, BitsAndBytesConfig,
    StoppingCriteria, StoppingCriteriaList
)
from tqdm import tqdm

## Load and prepare dataset

In [None]:
# Read the parallel corpus, keep only the source/target columns, discard rows
# where either value is missing, and give the columns descriptive names.
df = (
    pd.read_csv('/inputs/dataset_concatenato.csv')
    .loc[:, ['text', 'translation']]
    .dropna()
    .rename(columns={'text': 'ancient', 'translation': 'modern'})
)

# Convert a string representation of a list to a string sentence
def fix_list_string_to_sentence(text):
    """Turn a stringified Python list of tokens into a space-joined sentence.

    Args:
        text: A cell value. When it is a string that parses as a Python list
            literal (e.g. ``"['a', 'b']"``), its items are joined with single
            spaces. Anything else is passed through untouched.

    Returns:
        The joined sentence for list literals, otherwise ``text`` unchanged
        (including non-string values such as NaN).
    """
    # Non-strings (NaN, numbers, None) can never be a stringified list.
    if not isinstance(text, str):
        return text

    try:
        tokens = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Ordinary sentences are not valid Python literals; literal_eval is
        # documented to raise any of these on malformed input.
        return text

    if isinstance(tokens, list):
        # str() each item so lists holding numbers or None do not break join().
        return " ".join(str(token) for token in tokens)
    return text

# Normalise both text columns: cells that hold a stringified token list
# become plain sentences, everything else is left as it is.
df = df.assign(
    ancient=df['ancient'].apply(fix_list_string_to_sentence),
    modern=df['modern'].apply(fix_list_string_to_sentence),
)

# Hold out 10% of the rows for validation; the seed is fixed so the split is repeatable.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.1)

# Wrap each pandas split in a `datasets.Dataset`.
train_ds, val_ds = (Dataset.from_pandas(frame) for frame in (train_df, val_df))

## Load Model and Tokenizer

In [None]:
# Checkpoint identifier handed to `from_pretrained` (TinyLlama chat model)
model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'

# Tokenizer matching that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Padding requires a pad token; fall back to EOS when the tokenizer has none
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

## Quantization and Fine-Tuning configuration

In [None]:
# 4-bit quantization settings: NF4 quantization type, double quantization,
# and float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized base model and hand it straight to PEFT's k-bit training preparation.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
    )
)

# LoRA setup
# Names of the modules that receive LoRA adapters
TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
peft_config = LoraConfig(
    task_type="CAUSAL_LM",  # causal language modelling
    target_modules=TARGET_MODULES,
    r=32,  # adapter rank
    lora_alpha=64,  # adapter scaling factor
    lora_dropout=0.05,
    bias="none",
)

# Wrap the prepared model with the LoRA adapters described above
model = get_peft_model(model, peft_config)

## Preprocess Data

In [None]:
# Maximum sequence length for the model
MAX_LENGTH = 128

# Tokenizes the input data and prepares it for training
def preprocess_function(examples):
    """Build fixed-length causal-LM training features for a batch of sentence pairs.

    Every pair is rendered as system / user / assistant turns. ``labels`` is
    -100 on every prompt and padding position, so only the assistant turn
    (the modern translation) contributes to the loss.

    NOTE: the prompt mask only takes effect if the data collator passes
    ``labels`` through unchanged; a collator that rebuilds labels from
    ``input_ids`` discards it.

    Args:
        examples: Batch mapping with parallel ``'ancient'`` and ``'modern'``
            lists of strings.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels``; every row
        holds exactly ``MAX_LENGTH`` entries.
    """
    # System prompt that guides the model's behavior during translation.
    SYSTEM_PROMPT = "You are an expert translator from ancient to modern Italian."
    eos = tokenizer.eos_token
    # Identical for every example, so build it once outside the loop.
    system_turn = f"<|system|>\n{SYSTEM_PROMPT}{eos}\n"

    all_input_ids = []
    all_attention_masks = []
    all_labels = []

    for ancient, modern in zip(examples['ancient'], examples['modern']):
        # Prompt = everything up to and including the assistant header.
        prompt_only = f"{system_turn}<|user|>\n{ancient}{eos}\n<|assistant|>\n"
        # Full text = prompt followed by the target translation and EOS.
        full_text = f"{prompt_only}{modern}{eos}"

        input_ids = tokenizer(
            full_text, max_length=MAX_LENGTH, truncation=True, padding=False
        )['input_ids']
        prompt_ids = tokenizer(
            prompt_only, max_length=MAX_LENGTH, truncation=True, padding=False
        )['input_ids']

        # Clamp so labels can never become longer than input_ids, even if the
        # two tokenizations disagree at the prompt/answer boundary.
        prompt_length = min(len(prompt_ids), len(input_ids))

        # Mask the prompt so the loss only covers the translation tokens.
        labels = [-100] * prompt_length + input_ids[prompt_length:]

        # Right-pad everything to MAX_LENGTH; padding is excluded from both
        # attention (mask 0) and the loss (label -100).
        padding_length = MAX_LENGTH - len(input_ids)
        attention_mask = [1] * len(input_ids) + [0] * padding_length
        input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
        labels = labels + [-100] * padding_length

        all_input_ids.append(input_ids)
        all_attention_masks.append(attention_mask)
        all_labels.append(labels)

    return {
        "input_ids": all_input_ids,
        "attention_mask": all_attention_masks,
        "labels": all_labels,
    }

# Tokenize both splits in batches; the raw columns are dropped so that only the
# features returned by preprocess_function remain.
train_ds, val_ds = (
    split.map(preprocess_function, batched=True, remove_columns=split.column_names)
    for split in (train_ds, val_ds)
)

## Training and Metrics

In [None]:
# Define Training Arguments and Metrics
training_args = TrainingArguments(
    # Stated explicitly: transformers releases that predate the optional default
    # reject a missing output_dir, and naming it documents where checkpoints go.
    # "trainer_output" is the value newer releases fall back to.
    output_dir="trainer_output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4, # 2 x 4 = effective batch of 8 sequences per device
    num_train_epochs=4,
    logging_steps=10,
    eval_strategy='steps',
    eval_steps=10, # Evaluate every 10 steps
    save_strategy='steps',
    save_steps=10, # Save a checkpoint every 10 steps (aligned with evaluation)
    learning_rate=1e-5, # Learning rate for training
    report_to='none',
    gradient_checkpointing=True, # Enable gradient checkpointing to save memory
    warmup_ratio=0.1, # Warmup ratio for learning rate scheduler
    weight_decay=0.01, # Weight decay for regularization
    load_best_model_at_end=True, # Load the best model at the end of training
    metric_for_best_model="bleu", # Key returned by compute_metrics
    greater_is_better=True,
    save_total_limit=2 # Limit the number of saved checkpoints
)

# BLEU implementation from the `evaluate` library, used by compute_metrics
bleu_metric = evaluate.load("bleu")

# Define the compute_metrics function for evaluation
def compute_metrics(eval_preds):
    """Compute teacher-forced BLEU over the supervised (non-masked) positions.

    Args:
        eval_preds: ``(predictions, labels)`` as supplied by the Trainer.
            ``predictions`` may be a tuple whose first item is the logits,
            a 3-D logits array, or a 2-D array of already-argmaxed token ids.
            ``labels`` uses -100 for positions excluded from the loss.

    Returns:
        ``{"bleu": float}``.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    preds = np.asarray(preds)
    labels = np.asarray(labels)

    # Convert logits to token ids (skipped when ids were already produced upstream)
    if preds.ndim == 3:
        preds = np.argmax(preds, axis=-1)

    # A causal LM's output at position i is its guess for token i + 1, so the
    # predictions must be shifted one step against the labels before comparing.
    preds = preds[:, :-1]
    labels = labels[:, 1:]

    # Only score supervised positions. Without this, whatever the model emits
    # at prompt/padding positions would leak into the decoded hypothesis.
    supervised = labels != -100
    pad_id = tokenizer.pad_token_id
    preds = np.where(supervised, preds, pad_id)
    labels = np.where(supervised, labels, pad_id)

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Isolate the assistant's response for both predictions and labels.
    # When the marker is absent (prompt positions masked out), split() leaves
    # the text unchanged.
    cleaned_preds = [pred.split("<|assistant|>")[-1].strip() for pred in decoded_preds]
    cleaned_labels = [label.split("<|assistant|>")[-1].strip() for label in decoded_labels]

    # The BLEU metric expects a list of reference lists (one list per prediction)
    references_for_bleu = [[label] for label in cleaned_labels]

    # Compute BLEU score
    result = bleu_metric.compute(predictions=cleaned_preds, references=references_for_bleu)
    return {"bleu": result["bleu"]}

In [None]:
# Initialize and Run Trainer
# Imported here so this cell does not depend on edits to the import cell.
from transformers import default_data_collator

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    # The datasets already hold fixed-length `input_ids` and hand-built `labels`
    # (prompt and padding set to -100), so the collator only has to stack them.
    # DataCollatorForLanguageModeling(mlm=False) must NOT be used here: it rebuilds
    # `labels` from `input_ids`, which throws away the prompt masking and - when the
    # pad token is the EOS token - also masks every EOS, so the model never learns
    # where to stop.
    data_collator=default_data_collator,
)

print("Starting training...")
trainer.train()
print("Training finished. Best model is loaded.")

## Inference and Stopping Criteria

In [None]:
# Define Stopping Criteria for Inference
class StopOnEosToken(StoppingCriteria):
    """Stopping criterion that flags each sequence whose latest token is EOS.

    Args:
        eos_token_id: Token id treated as end-of-sequence. When omitted, the
            global ``tokenizer.eos_token_id`` is looked up at call time.
    """

    def __init__(self, eos_token_id=None):
        super().__init__()
        self.eos_token_id = eos_token_id

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
        """Return one boolean per row of ``input_ids``: True where the last token is EOS."""
        eos_id = tokenizer.eos_token_id if self.eos_token_id is None else self.eos_token_id
        # Check every row rather than only row 0, so that batched inputs and
        # beam search (one row per beam) are judged sequence by sequence.
        return input_ids[:, -1] == eos_id

# Define Inference Function
def generate_with_tinyllama(text):
    """Translate one ancient-Italian sentence with the fine-tuned model.

    Args:
        text: Sentence to translate.

    Returns:
        The generated translation with surrounding whitespace removed, or an
        empty string when ``text`` is not a non-blank string (e.g. NaN from a
        DataFrame), so missing inputs are not "translated".
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    prompt = (
        f"<|system|>\nYou are an expert translator from ancient to modern Italian.{tokenizer.eos_token}\n" # System prompt to guide the model's behavior
        f"<|user|>\n{text}{tokenizer.eos_token}\n" # User's input text to be translated
        f"<|assistant|>\n" # Header after which the model writes its answer
    )

    # Tokenize the prompt and move the tensors to the model's device
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]
    stopping_criteria = StoppingCriteriaList([StopOnEosToken()])

    # Generate the output using the model
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        num_beams=5, # Use beam search for better quality translations
        early_stopping=True, # Beam search ends as soon as num_beams finished candidates exist
        do_sample=False, # Disable sampling to ensure deterministic outputs
        stopping_criteria=stopping_criteria, # Custom end-of-sequence stopping criterion
        repetition_penalty=1.2, # Penalizes tokens that have already appeared
        no_repeat_ngram_size=3, # Prevents any 3-token sequence from being repeated
        pad_token_id=tokenizer.pad_token_id, # Explicit pad id for finished beams
    )

    # The returned sequence starts with the prompt tokens. Slicing them off is
    # more reliable than splitting the decoded text on "<|assistant|>", which
    # misfires whenever the input or the output contains that marker.
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Run Inference on Test Set
test_df = pd.read_csv('/inputs/dataset_human_eval.csv')[['Sentence', 'HumanEval']].dropna()
test_df = test_df.rename(columns={'Sentence': 'ancient', 'HumanEval': 'modern'})

# Switch to evaluation mode before generating (as the final-translation cell does),
# so dropout - including the LoRA dropout - cannot perturb the outputs.
model.eval()
test_df['tinyllama_output'] = [generate_with_tinyllama(text) for text in tqdm(test_df['ancient'], desc="Generating Final Translations")]

## Generate final translations

In [None]:
# Process the Test Set
from pathlib import Path  # local import: only needed to create the output folder

final_output_test_file_path = '/inputs/dataset.csv'

final_test_df = pd.read_csv(final_output_test_file_path)
ancient_text_column_name = 'Sentence'

# Drop rows with no sentence at all. NaN compares unequal to "", so such rows
# would otherwise pass the emptiness filter below and be sent to the model.
final_test_df = final_test_df.dropna(subset=[ancient_text_column_name]).copy()

# Turn stringified token lists into plain sentences
final_test_df[ancient_text_column_name] = final_test_df[ancient_text_column_name].apply(fix_list_string_to_sentence)

# Filter out any sentences that are empty after cleaning
# (astype(str) keeps the check working even if a cell is not a string)
final_test_df = final_test_df[final_test_df[ancient_text_column_name].astype(str).str.strip() != ""].copy()

# Get the list of ancient sentences to translate
ancient_sentences_to_translate = final_test_df[ancient_text_column_name].tolist()

model.eval() # Ensure the model is in evaluation mode before generating translations
print("Generating translations (this may take a while)...")
# tqdm renders a progress bar while the sentences are translated one by one
generated_translations = [
    generate_with_tinyllama(text)
    for text in tqdm(ancient_sentences_to_translate, desc="Translating final test set")
]

# Add the generated translations and a placeholder human score to the DataFrame
final_test_df['generated_translation'] = generated_translations
final_test_df['score_human'] = 0
output_csv_path = "/outputs/dataset_with_tinyllama_translations.csv"
output_columns = ['Author', 'Date', 'Region', 'Sentence', 'generated_translation', 'score_human']

# Keep only the expected columns that exist in the input file. The two columns
# added above are always present, so no extra presence checks are needed.
final_output_df_columns = [col for col in output_columns if col in final_test_df.columns]

# Create the destination folder first so the save cannot fail on a missing
# directory after the long generation run.
Path(output_csv_path).parent.mkdir(parents=True, exist_ok=True)

# Save the final DataFrame with translations to a CSV file
final_test_df[final_output_df_columns].to_csv(output_csv_path, index=False)
print(f"Final predictions saved to {output_csv_path}")

Generating translations (this may take a while)...


Translating final test set: 100%|██████████| 97/97 [18:34<00:00, 11.49s/it]

Final predictions saved to /kaggle/working/tinyllama_final_dataset_predictions.csv



