In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.model_selection import train_test_split


In [6]:
train_df = pd.read_csv("../data/train.csv")
test_df  = pd.read_csv("../data/test.csv")

# Create a validation split since validation.csv is not present
train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=train_df["language"] if "language" in train_df.columns else None,
 )

print("Train:", train_df.shape)
print("Validation:", val_df.shape)
print("Test:", test_df.shape)

train_df.head()


Train: (1248, 3)
Validation: (313, 3)
Test: (4, 5)


Unnamed: 0,oare_id,transliteration,translation
1452,ee4f5237-f729-4424-9164-727dcd9fb8f2,um-ma i-li-a-ma a-≈°ur-re-·π£√≠ en-nam-a-≈°ur √π da-...,"Thus Iliya, A≈°≈°ur-rƒì·π£ƒ´, Ennam-A≈°≈°ur, and Dadiy..."
1239,c5670aba-105b-40b5-bac1-12bc43a51091,2 ma-na K√ô.BABBAR ≈°√≠-im-tum ≈°a ·π≠up-p√≠-≈°u ·∏´a-ar...,"2 minas of silver, the agreed amount of his ce..."
774,7ad105d4-158e-457d-955b-b9f6b68d212c,a-na ku-ku-a LUGAL-s√∫-in a-gi-a √π en-nam-a-≈°ur...,"To Kukuwa, ≈†ar-Suen, Agiya and Ennam-A≈°≈°ur fro..."
1260,c9ae599d-8d56-4b81-be61-4a1edadc00b6,14 G√ö URUDU SIG‚ÇÖ ≈°a-bu-ra-am i-·π£√©-er i-d√≠-a-≈°u...,"Iddin-A≈°≈°ur son of Dan-A≈°≈°ur, Puzur-I≈°tar, his..."
858,870b143f-9c29-499b-8c6e-c0ed2be5eea8,·π≠up-pu-√∫ ≈°a ni-i≈° a-lim(ki) ku-nu-uk ba-≈°√≠,Tablets with oaths by the City.


In [7]:
# Rename columns to standard names used throughout the project
train_df = train_df.rename(columns={
    "transliteration": "source",
    "translation": "target"
})

test_df = test_df.rename(columns={
    "transliteration": "source"
})

train_df.head()


Unnamed: 0,oare_id,source,target
1452,ee4f5237-f729-4424-9164-727dcd9fb8f2,um-ma i-li-a-ma a-≈°ur-re-·π£√≠ en-nam-a-≈°ur √π da-...,"Thus Iliya, A≈°≈°ur-rƒì·π£ƒ´, Ennam-A≈°≈°ur, and Dadiy..."
1239,c5670aba-105b-40b5-bac1-12bc43a51091,2 ma-na K√ô.BABBAR ≈°√≠-im-tum ≈°a ·π≠up-p√≠-≈°u ·∏´a-ar...,"2 minas of silver, the agreed amount of his ce..."
774,7ad105d4-158e-457d-955b-b9f6b68d212c,a-na ku-ku-a LUGAL-s√∫-in a-gi-a √π en-nam-a-≈°ur...,"To Kukuwa, ≈†ar-Suen, Agiya and Ennam-A≈°≈°ur fro..."
1260,c9ae599d-8d56-4b81-be61-4a1edadc00b6,14 G√ö URUDU SIG‚ÇÖ ≈°a-bu-ra-am i-·π£√©-er i-d√≠-a-≈°u...,"Iddin-A≈°≈°ur son of Dan-A≈°≈°ur, Puzur-I≈°tar, his..."
858,870b143f-9c29-499b-8c6e-c0ed2be5eea8,·π≠up-pu-√∫ ≈°a ni-i≈° a-lim(ki) ku-nu-uk ba-≈°√≠,Tablets with oaths by the City.


In [8]:
from sklearn.model_selection import train_test_split


In [9]:
train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    shuffle=True
)


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load Kaggle CSV files
train_df = pd.read_csv("../data/train.csv")
test_df  = pd.read_csv("../data/test.csv")

# Rename columns to standard names
train_df = train_df.rename(columns={
    "transliteration": "source",
    "translation": "target"
})

test_df = test_df.rename(columns={
    "transliteration": "source"
})

# Create validation split
train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

print("Train:", train_df.shape)
print("Validation:", val_df.shape)
print("Test:", test_df.shape)

train_df.head()


Train: (1248, 3)
Validation: (313, 3)
Test: (4, 5)


Unnamed: 0,oare_id,source,target
1452,ee4f5237-f729-4424-9164-727dcd9fb8f2,um-ma i-li-a-ma a-≈°ur-re-·π£√≠ en-nam-a-≈°ur √π da-...,"Thus Iliya, A≈°≈°ur-rƒì·π£ƒ´, Ennam-A≈°≈°ur, and Dadiy..."
1239,c5670aba-105b-40b5-bac1-12bc43a51091,2 ma-na K√ô.BABBAR ≈°√≠-im-tum ≈°a ·π≠up-p√≠-≈°u ·∏´a-ar...,"2 minas of silver, the agreed amount of his ce..."
774,7ad105d4-158e-457d-955b-b9f6b68d212c,a-na ku-ku-a LUGAL-s√∫-in a-gi-a √π en-nam-a-≈°ur...,"To Kukuwa, ≈†ar-Suen, Agiya and Ennam-A≈°≈°ur fro..."
1260,c9ae599d-8d56-4b81-be61-4a1edadc00b6,14 G√ö URUDU SIG‚ÇÖ ≈°a-bu-ra-am i-·π£√©-er i-d√≠-a-≈°u...,"Iddin-A≈°≈°ur son of Dan-A≈°≈°ur, Puzur-I≈°tar, his..."
858,870b143f-9c29-499b-8c6e-c0ed2be5eea8,·π≠up-pu-√∫ ≈°a ni-i≈° a-lim(ki) ku-nu-uk ba-≈°√≠,Tablets with oaths by the City.


In [11]:
train_df["src_len"] = train_df["source"].astype(str).apply(lambda x: len(x.split()))
train_df["tgt_len"] = train_df["target"].astype(str).apply(lambda x: len(x.split()))

train_df[["src_len", "tgt_len"]].describe()


Unnamed: 0,src_len,tgt_len
count,1248.0,1248.0
mean,57.59375,91.329327
std,37.169598,88.071951
min,3.0,1.0
25%,28.0,31.0
50%,49.0,68.0
75%,83.0,124.0
max,187.0,744.0


In [12]:
from collections import Counter

def build_vocab(text_series):
    vocab = Counter()
    for sentence in text_series:
        vocab.update(sentence.split())
    return vocab

src_vocab = build_vocab(train_df["source"])
tgt_vocab = build_vocab(train_df["target"])

print("Akkadian vocabulary size:", len(src_vocab))
print("English vocabulary size:", len(tgt_vocab))


Akkadian vocabulary size: 10267
English vocabulary size: 9225


In [13]:
pd.DataFrame(src_vocab.most_common(20), columns=["Token", "Frequency"])


Unnamed: 0,Token,Frequency
0,a-na,3097
1,≈°a,2691
2,K√ô.BABBAR,2499
3,x,2285
4,ma-na,1965
5,G√çN,1487
6,DUMU,1470
7,√π,1457
8,‚Ä¶,1037
9,i-na,993


In [14]:
special_tokens = ["<gap>", "<big_gap>", "{ki}", "{d}", "{m}", "{uru}", "LUGAL"]

for tok in special_tokens:
    count = train_df["source"].str.contains(tok).sum()
    print(f"{tok}: {count} sentences")


<gap>: 0 sentences
<big_gap>: 0 sentences
{ki}: 0 sentences
{d}: 0 sentences
{m}: 0 sentences
{uru}: 0 sentences
LUGAL: 20 sentences


In [15]:
import re

# 1) Basic cleaning for Akkadian transliteration (source)
def clean_source(text: str) -> str:
    text = str(text)

    # normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # remove scribal certainty markers and line dividers
    text = text.replace("!", "").replace("?", "")
    text = text.replace("/", " ")

    # normalize common word dividers (keep hyphens; they matter in transliteration)
    text = text.replace(":", " ")
    text = text.replace(".", " ")

    # keep determinatives in curly braces: {ki}, {d}, etc. (they carry meaning)
    # remove square brackets but keep inside text: [K√ô.BABBAR] -> K√ô.BABBAR
    text = re.sub(r"\[([^\]]+)\]", r"\1", text)

    # remove half brackets Àπ À∫ if present
    text = text.replace("Àπ", "").replace("À∫", "")

    # normalize whitespace again
    text = re.sub(r"\s+", " ", text).strip()

    return text


# 2) Basic cleaning for English translation (target)
def clean_target(text: str) -> str:
    text = str(text)

    # normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # optional: remove repeated spaces and stray control chars
    text = text.replace("\u200b", "")

    return text


In [16]:
train_df["source_clean"] = train_df["source"].apply(clean_source)
train_df["target_clean"] = train_df["target"].apply(clean_target)

val_df["source_clean"] = val_df["source"].apply(clean_source)
val_df["target_clean"] = val_df["target"].apply(clean_target)

test_df["source_clean"] = test_df["source"].apply(clean_source)

train_df[["source", "source_clean", "target", "target_clean"]].head(5)


Unnamed: 0,source,source_clean,target,target_clean
1452,um-ma i-li-a-ma a-≈°ur-re-·π£√≠ en-nam-a-≈°ur √π da-...,um-ma i-li-a-ma a-≈°ur-re-·π£√≠ en-nam-a-≈°ur √π da-...,"Thus Iliya, A≈°≈°ur-rƒì·π£ƒ´, Ennam-A≈°≈°ur, and Dadiy...","Thus Iliya, A≈°≈°ur-rƒì·π£ƒ´, Ennam-A≈°≈°ur, and Dadiy..."
1239,2 ma-na K√ô.BABBAR ≈°√≠-im-tum ≈°a ·π≠up-p√≠-≈°u ·∏´a-ar...,2 ma-na K√ô BABBAR ≈°√≠-im-tum ≈°a ·π≠up-p√≠-≈°u ·∏´a-ar...,"2 minas of silver, the agreed amount of his ce...","2 minas of silver, the agreed amount of his ce..."
774,a-na ku-ku-a LUGAL-s√∫-in a-gi-a √π en-nam-a-≈°ur...,a-na ku-ku-a LUGAL-s√∫-in a-gi-a √π en-nam-a-≈°ur...,"To Kukuwa, ≈†ar-Suen, Agiya and Ennam-A≈°≈°ur fro...","To Kukuwa, ≈†ar-Suen, Agiya and Ennam-A≈°≈°ur fro..."
1260,14 G√ö URUDU SIG‚ÇÖ ≈°a-bu-ra-am i-·π£√©-er i-d√≠-a-≈°u...,14 G√ö URUDU SIG‚ÇÖ ≈°a-bu-ra-am i-·π£√©-er i-d√≠-a-≈°u...,"Iddin-A≈°≈°ur son of Dan-A≈°≈°ur, Puzur-I≈°tar, his...","Iddin-A≈°≈°ur son of Dan-A≈°≈°ur, Puzur-I≈°tar, his..."
858,·π≠up-pu-√∫ ≈°a ni-i≈° a-lim(ki) ku-nu-uk ba-≈°√≠,·π≠up-pu-√∫ ≈°a ni-i≈° a-lim(ki) ku-nu-uk ba-≈°√≠,Tablets with oaths by the City.,Tablets with oaths by the City.


In [17]:
print("Example BEFORE:", train_df["source"].iloc[0])
print("Example AFTER :", train_df["source_clean"].iloc[0])

print("Empty sources (train):", (train_df["source_clean"].str.len() == 0).sum())
print("Empty targets (train):", (train_df["target_clean"].str.len() == 0).sum())


Example BEFORE: um-ma i-li-a-ma a-≈°ur-re-·π£√≠ en-nam-a-≈°ur √π da-d√≠-a a-na e-l√°-ma q√≠-bi‚ÇÑ-ma 4.33333 ma-na 2.5 G√çN K√ô.BABBAR dan-a-≈°ur ub-lam 4.33333 G√çN ≈°a-du-a-t√°m a-na dan-a-≈°ur ni-d√≠-in 1.3333300000000001 G√çN t√©-·π£√∫-be n√©-pu-ul 11 G√çN a-na ni-is-·∏´a-tim ni-d√≠-in ≈°√≠-t√≠ K√ô.BABBAR-p√¨-k√† 4 ma-na 5.83334 G√çN 17 ma-na 4 G√çN K√ô.BABBAR a-ni-nu-um ub-lam ≈†√Ä.BA 0.66666 ma-na 2.5 G√çN a-na ni-is-·∏´a-tim ni-d√≠-in ≈°√≠-t√≠ K√ô.BABBAR-p√¨-k√† 16.33333 ma-na L√Å 0.5 G√çN ≈†U.N√çGIN K√ô.BABBAR-p√¨-k√† 20.33333 ma-na 5.5 G√çN 4 G√ö 20 ma-na AN.NA ku-nu-ku 15.33333 G√çN 15 ≈†E.TA K√ô.BABBAR-√°p-≈°u 17.83333 ma-na 2 G√çN 20 ma-na AN.NA-ak q√°-tim 15 G√çN.TA 1.3333300000000001 ma-na K√ô.BABBAR 10 ku-ta-nu 0.66666 ma-na 6.5 G√çN it-bu-lu 8 ≈°u-ru-tum ≈°a li-wi-tim 0.33333 ma-na 8 G√çN K√ô.BABBAR-≈°u-nu 2 AN≈†E.·∏™I.A ·π£a-l√°-mu 0.66666 ma-na L√Å 4 G√çN K√ô.BABBAR-≈°u-nu 0.33333 ma-na 7 G√çN K√ô.BABBAR i ma-s√°-im im-·π≠√≠ 0.5 ma-na K√ô.BABBAR be-√∫-l√°-at ma-·π£i-√

In [18]:
from transformers import AutoTokenizer

MODEL_NAME = "Helsinki-NLP/opus-mt-en-de"  # placeholder tokenizer base (we only use tokenizer behavior)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [19]:
MAX_SOURCE_LEN = 128
MAX_TARGET_LEN = 128


In [21]:
def encode_batch(sources, targets=None):
    model_inputs = tokenizer(
        list(sources),
        max_length=MAX_SOURCE_LEN,
        truncation=True,
        padding="max_length"
    )
    if targets is not None:
        labels = tokenizer(
            list(targets),
            max_length=MAX_TARGET_LEN,
            truncation=True,
            padding="max_length"
        )
        model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_enc = encode_batch(train_df["source_clean"], train_df["target_clean"])
val_enc   = encode_batch(val_df["source_clean"], val_df["target_clean"])
test_enc  = encode_batch(test_df["source_clean"])

In [22]:
print("input_ids shape:", len(train_enc["input_ids"]), len(train_enc["input_ids"][0]))
print("labels shape:", len(train_enc["labels"]), len(train_enc["labels"][0]))


input_ids shape: 1248 128
labels shape: 1248 128


In [25]:
i = 0
print("SOURCE CLEAN:", train_df["source_clean"].iloc[i])
print("TOKENIZED ->", train_enc["input_ids"][i][:30])
print("DECODED ->", tokenizer.decode(train_enc["input_ids"][i], skip_special_tokens=True))

print("\nTARGET CLEAN:", train_df["target_clean"].iloc[i])
print("DECODED LABELS ->", tokenizer.decode(train_enc["labels"][i], skip_special_tokens=True))


SOURCE CLEAN: um-ma i-li-a-ma a-≈°ur-re-·π£√≠ en-nam-a-≈°ur √π da-d√≠-a a-na e-l√°-ma q√≠-bi‚ÇÑ-ma 4 33333 ma-na 2 5 G√çN K√ô BABBAR dan-a-≈°ur ub-lam 4 33333 G√çN ≈°a-du-a-t√°m a-na dan-a-≈°ur ni-d√≠-in 1 3333300000000001 G√çN t√©-·π£√∫-be n√©-pu-ul 11 G√çN a-na ni-is-·∏´a-tim ni-d√≠-in ≈°√≠-t√≠ K√ô BABBAR-p√¨-k√† 4 ma-na 5 83334 G√çN 17 ma-na 4 G√çN K√ô BABBAR a-ni-nu-um ub-lam ≈†√Ä BA 0 66666 ma-na 2 5 G√çN a-na ni-is-·∏´a-tim ni-d√≠-in ≈°√≠-t√≠ K√ô BABBAR-p√¨-k√† 16 33333 ma-na L√Å 0 5 G√çN ≈†U N√çGIN K√ô BABBAR-p√¨-k√† 20 33333 ma-na 5 5 G√çN 4 G√ö 20 ma-na AN NA ku-nu-ku 15 33333 G√çN 15 ≈†E TA K√ô BABBAR-√°p-≈°u 17 83333 ma-na 2 G√çN 20 ma-na AN NA-ak q√°-tim 15 G√çN TA 1 3333300000000001 ma-na K√ô BABBAR 10 ku-ta-nu 0 66666 ma-na 6 5 G√çN it-bu-lu 8 ≈°u-ru-tum ≈°a li-wi-tim 0 33333 ma-na 8 G√çN K√ô BABBAR-≈°u-nu 2 AN≈†E ·∏™I A ·π£a-l√°-mu 0 66666 ma-na L√Å 4 G√çN K√ô BABBAR-≈°u-nu 0 33333 ma-na 7 G√çN K√ô BABBAR i ma-s√°-im im-·π≠√≠ 0 5 ma-na K√ô BABBAR be-√∫-l√°-at ma-·π£i-√¨-

In [26]:
# Token counts (word counts) from cleaned data
train_df["src_len_clean"] = train_df["source_clean"].astype(str).apply(lambda x: len(x.split()))
train_df["tgt_len_clean"] = train_df["target_clean"].astype(str).apply(lambda x: len(x.split()))

train_df[["src_len_clean","tgt_len_clean"]].quantile([0.50, 0.75, 0.90, 0.95, 0.99])


Unnamed: 0,src_len_clean,tgt_len_clean
0.5,54.0,68.0
0.75,91.25,124.0
0.9,123.0,189.0
0.95,137.0,250.0
0.99,167.0,476.18


In [27]:
MAX_SOURCE_LEN = 160
MAX_TARGET_LEN = 256


In [28]:
train_enc = encode_batch(train_df["source_clean"], train_df["target_clean"])
val_enc   = encode_batch(val_df["source_clean"], val_df["target_clean"])
test_enc  = encode_batch(test_df["source_clean"])


In [29]:
i = 0
print("DECODED SOURCE:", tokenizer.decode(train_enc["input_ids"][i], skip_special_tokens=True))
print("\nDECODED TARGET:", tokenizer.decode(train_enc["labels"][i], skip_special_tokens=True))


DECODED SOURCE: um-ma i-li-a-ma a-≈°ur-re-·π£√≠ en-nam-a-≈°ur √π da-d√≠-a a-na e-l√°-ma q√≠-bi4-ma 4 33333 ma-na 2 5 G√çN K√ô BABBAR dan-a-≈°ur ub-lam 4 33333 G√çN ≈°a-du-a-t√°m a-na dan-a-≈°ur ni-d√≠-in 1 3333300000000001 G√çN t√©-·π£√∫-be n√©-pu-ul 11 G√çN a-na ni-is-·∏´a-tim ni-d√≠-in ≈°√≠-

DECODED TARGET: Thus Iliya, A≈°≈°ur-rƒì·π£ƒ´, Ennam-A≈°≈°ur, and Dadiya, say to Elamma: 4.3333 minas 2.5 shekels of silver DƒÅn-A≈°≈°ur brought. We gave 4.3333 shekels as transport fee to DƒÅn-A≈°≈°ur, paid 1.3333 shekel as addition and 11 gave as excise, so that of your silver there remain 4 minas 5.83333? shekels. 17 minas and 4 shekels of silver Anƒ´num brought here. Thereof we paid 42.5 shekels as excise. There remain of your silver 16 minas 19.5 shekels. The total of your silver amounts to 20 [minas 25.3333 shekels. 4 talents 20 minas of tin under seals, at a rate of 15 5 / 12 shekels : 1 the silver for it was 17 minas 52 shekels. 20 minas tin for expenses, at a rate of 15 shekels : 1 makes

In [30]:
import torch
from transformers import T5ForConditionalGeneration, T5TokenizerFast

MODEL_NAME = "t5-small"

tokenizer = T5TokenizerFast.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

device


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 131/131 [00:00<00:00, 336.19it/s, Materializing param=shared.weight]                                                      


device(type='cpu')

In [31]:
def add_prefix(text):
    return "translate Akkadian to English: " + text


In [32]:
train_sources = train_df["source_clean"].apply(add_prefix)
val_sources   = val_df["source_clean"].apply(add_prefix)
test_sources  = test_df["source_clean"].apply(add_prefix)


In [33]:
MAX_SOURCE_LEN = 160
MAX_TARGET_LEN = 256

def encode_t5(sources, targets=None):
    inputs = tokenizer(
        list(sources),
        max_length=MAX_SOURCE_LEN,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    if targets is not None:
        labels = tokenizer(
            list(targets),
            max_length=MAX_TARGET_LEN,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        inputs["labels"] = labels["input_ids"]
    return inputs

train_enc = encode_t5(train_sources, train_df["target_clean"])
val_enc   = encode_t5(val_sources, val_df["target_clean"])


In [34]:
from torch.utils.data import Dataset, DataLoader

class TranslationDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return self.encodings["input_ids"].shape[0]

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

train_dataset = TranslationDataset(train_enc)
val_dataset   = TranslationDataset(val_enc)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=4)


In [35]:
from torch.optim import AdamW
from tqdm import tqdm

optimizer = AdamW(model.parameters(), lr=5e-5)

EPOCHS = 2  # baseline (enough for coursework)

for epoch in range(EPOCHS):
    model.train()
    total_train_loss = 0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)

    # Validation
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            total_val_loss += outputs.loss.item()

    avg_val_loss = total_val_loss / len(val_loader)

    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")


Training Epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 312/312 [26:36<00:00,  5.12s/it]


Epoch 1 | Train Loss: 2.9143 | Val Loss: 2.1323


Training Epoch 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 312/312 [47:36<00:00,  9.16s/it]  


Epoch 2 | Train Loss: 2.1213 | Val Loss: 1.8874


In [36]:
import numpy as np

def generate_texts(sources, max_new_tokens=128, num_beams=4):
    model.eval()
    preds = []
    with torch.no_grad():
        for s in sources:
            enc = tokenizer(
                s,
                max_length=MAX_SOURCE_LEN,
                truncation=True,
                padding="max_length",
                return_tensors="pt"
            ).to(device)

            out_ids = model.generate(
                **enc,
                max_new_tokens=max_new_tokens,
                num_beams=num_beams,
                early_stopping=True
            )

            preds.append(tokenizer.decode(out_ids[0], skip_special_tokens=True))
    return preds

# Use a small validation subset
N = 100
val_sources_subset = val_sources.iloc[:N].tolist()
val_targets_subset = val_df["target_clean"].iloc[:N].tolist()

val_preds_subset = generate_texts(val_sources_subset)

# Preview a few
for i in range(3):
    print("SOURCE:", val_sources_subset[i][:120], "...")
    print("TRUE  :", val_targets_subset[i][:200], "...")
    print("PRED  :", val_preds_subset[i][:200], "...")
    print("-"*80)


SOURCE: translate Akkadian to English: 10 ma-na K√ô BABBAR ·π£a-ru-pu-um ni-is-·∏´a-s√∫ DIRI ≈°a-du-a-s√∫ ≈°a-bu ≈°a t√°m-k√†-ri-im a-na i-l ...
TRUE  : 10 minas of refined silver, its excise added, his transport fee paid, belonging to the merchant, for Iliya and the representatives of LƒÅ-qƒìp, and separately 0.3333 mina of silver from Iliya's free loa ...
PRED  :  ...
--------------------------------------------------------------------------------
SOURCE: translate Akkadian to English: 2-≈°√≠-ta na-√°≈°-p√©-ra-tum l√° p√°-t√≠-a-tum ≈°a a-ta-ta DUMU ma-num-ba-lum-a-≈°√πr a-·π£√©-ri-a √∫ a- ...
TRUE  : 2 (unopened) letters of Atata, son of Mannum-balum-Assur, addressed to me and to Assur-ennam, son of Kubiya; two (unopened) letters of our father to Belum-bani, son of Su-Belum; a tablet (with the sea ...
PRED  :  ...
--------------------------------------------------------------------------------
SOURCE: translate Akkadian to English: KI≈†IB en-na-nim DUMU a-l√°-bi‚ÇÑ-im KI≈†IB a-gi-a D

In [37]:
import sacrebleu

bleu = sacrebleu.corpus_bleu(val_preds_subset, [val_targets_subset]).score
chrf = sacrebleu.corpus_chrf(val_preds_subset, [val_targets_subset]).score

geom_mean = (bleu * chrf) ** 0.5

print(f"BLEU: {bleu:.2f}")
print(f"chrF++: {chrf:.2f}")
print(f"Geometric Mean (BLEU * chrF++)^0.5: {geom_mean:.2f}")


BLEU: 0.00
chrF++: 0.00
Geometric Mean (BLEU * chrF++)^0.5: 0.00


In [38]:
import os

SAVE_DIR = "../outputs/models/t5_baseline"
os.makedirs(SAVE_DIR, exist_ok=True)

model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

print("Saved model to:", SAVE_DIR)


Writing model shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  3.50it/s]

Saved model to: ../outputs/models/t5_baseline





In [39]:
def translate_akkadian(text, max_new_tokens=128, num_beams=4):
    text = clean_source(text)
    text = add_prefix(text)

    enc = tokenizer(
        text,
        max_length=MAX_SOURCE_LEN,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    ).to(device)

    out_ids = model.generate(
        **enc,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        early_stopping=True
    )

    return tokenizer.decode(out_ids[0], skip_special_tokens=True)

# Test with one example from your dataset
print(translate_akkadian(train_df["source"].iloc[0]))





In [43]:
print(model is not None, tokenizer is not None)


True True


In [45]:
# Sanity check: confirm critical variables exist
for name in ["test_df", "tokenizer", "model", "device"]:
    print(name, "->", "OK" if name in globals() else "MISSING")

# If test_df exists, show its shape
if "test_df" in globals():
    print("test_df rows:", len(test_df))
    print("test_df cols:", test_df.columns.tolist())


test_df -> OK
tokenizer -> OK
model -> OK
device -> OK
test_df rows: 4
test_df cols: ['id', 'text_id', 'line_start', 'line_end', 'source', 'source_clean']


In [46]:
import torch

def add_prefix(text: str) -> str:
    return "translate Akkadian to English: " + text

MAX_SOURCE_LEN = 160
MAX_NEW_TOKENS = 128
BATCH_SIZE = 16
NUM_BEAMS = 4

texts = (
    test_df["source_clean"]
    .astype(str)
    .apply(add_prefix)
    .tolist()
)

preds = []

for i in range(0, len(texts), BATCH_SIZE):
    batch = texts[i:i+BATCH_SIZE]

    enc = tokenizer(
        batch,
        max_length=MAX_SOURCE_LEN,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        out_ids = model.generate(
            **enc,
            max_new_tokens=MAX_NEW_TOKENS,
            num_beams=NUM_BEAMS,
            early_stopping=True
        )

    preds.extend(tokenizer.batch_decode(out_ids, skip_special_tokens=True))

print("‚úÖ Predictions generated:", len(preds))
print("üîç Sample prediction:", preds[0][:200])


‚úÖ Predictions generated: 4
üîç Sample prediction: 


In [47]:
import pandas as pd
import os

submission = pd.DataFrame({
    "id": test_df["id"],
    "translation": preds
})

submission.to_csv("submission.csv", index=False)

print("File exists:", os.path.exists("submission.csv"))
print("Rows:", len(submission))
print("File size:", os.path.getsize("submission.csv"), "bytes")

submission


File exists: True
Rows: 4
File size: 32 bytes


Unnamed: 0,id,translation
0,0,
1,1,
2,2,
3,3,


In [48]:
import pandas as pd
import os

submission = pd.DataFrame({
    "id": test_df["id"],
    "translation": preds
})

submission.to_csv("submission.csv", index=False)

print("‚úÖ submission.csv created")
print("File exists:", os.path.exists("submission.csv"))
print("Rows:", len(submission))
print("File size:", os.path.getsize("submission.csv"), "bytes")

submission


‚úÖ submission.csv created
File exists: True
Rows: 4
File size: 32 bytes


Unnamed: 0,id,translation
0,0,
1,1,
2,2,
3,3,
