# Akan Text Generation with Transformers

This notebook trains a character-level Tokenizer and Transformer model to generate Akan text, based on transcriptions from 'Akan.xlsx'.

In [8]:
!pip install torch pandas openpyxl tqdm



In [9]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import pandas as pd
from tqdm import tqdm
import os
import time

# Hyperparameters
Optimized for the Akan dataset (~3MB text).

In [10]:
# Hyperparameters
batch_size = 64  # Increased batch size for efficiency
block_size = 128 # Context length
max_iters = 3000 # Reduced iterations for efficiency (approx 10 epochs)
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 128     # Reduced embedding size to prevent overfitting
n_head = 4       # Reduced heads
n_layer = 4      # Reduced layers (lighter model)
dropout = 0.2    # Increased dropout for regularization

print(f"Using device: {device}")
torch.manual_seed(42)

Using device: cuda


<torch._C.Generator at 0x7e644bf002f0>

# Data Loading
Loads text from the 'Transcriptions' column of Akan.xlsx.

In [11]:
def load_language_texts():
    """Load Akan dataset from Excel"""
    print("Loading Akan dataset from Excel...")
    file_path = "/content/Akan.xlsx"

    try:
        # Try loading from current directory or absolute path if needed
        if os.path.exists(file_path):
             df = pd.read_excel(file_path)
        else:
             # Fallback to specific user path if running locally
             base_path = "/Users/naalamleboye/Documents/Ashesi_MPhil_ICS/2025-2026_Sem2/Natural Language Processing/Prosit 1/"
             df = pd.read_excel(os.path.join(base_path, file_path))

    except Exception as e:
        print(f"Error reading Excel file: {e}")
        print("Ensure 'Akan.xlsx' is in the directory and openpyxl is installed.")
        raise

    if 'Transcriptions' not in df.columns:
        raise ValueError(f"Column 'Transcriptions' not found. Available: {df.columns.tolist()}")

    # Drop NA and convert to string
    texts = df['Transcriptions'].dropna().astype(str).tolist()
    print(f"Collected {len(texts)} texts.")

    # Combine all texts with newlines
    combined_text = '\n\n'.join(texts)
    return combined_text

def create_vocab_and_encode(text):
    """Create vocabulary and encoding/decoding functions"""
    chars = sorted(list(set(text)))
    vocab_size = len(chars)
    print(f"Vocabulary size: {vocab_size} unique characters")

    stoi = {ch: i for i, ch in enumerate(chars)}
    itos = {i: ch for i, ch in enumerate(chars)}

    encode = lambda s: [stoi[c] for c in s]
    decode = lambda l: ''.join([itos[i] for i in l])

    return vocab_size, encode, decode, stoi, itos

def prepare_data(text, encode):
    """Prepare train/val/test splits"""
    print("Encoding text...")
    data = torch.tensor(encode(text), dtype=torch.long)

    n = len(data)
    train_size = int(0.9 * n) # 90% train
    val_size = int(0.05 * n)  # 5% val
    test_size = n - train_size - val_size

    train_data = data[:train_size]
    val_data = data[train_size:train_size + val_size]
    test_data = data[train_size + val_size:]

    print(f"Train size: {len(train_data):,} chars")
    print(f"Val size:   {len(val_data):,} chars")
    print(f"Test size:  {len(test_data):,} chars")

    return train_data, val_data, test_data

# Model Definition
Standard Decoder-only Transformer.

In [12]:
class Head(nn.Module):
    """One head of self-attention"""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)   # (B,T,head_size)
        q = self.query(x) # (B,T,head_size)

        # Affinity scores
        wei = q @ k.transpose(-2, -1) * (C ** -0.5)  # (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        v = self.value(x)
        out = wei @ v
        return out

class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention in parallel"""

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    """A simple linear layer followed by a non-linearity"""

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """Transformer block: communication followed by computation"""

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class TransformerLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx)  # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C)
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens, temperature=1.0):
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # Get the predictions
            logits, loss = self(idx_cond)
            # Focus only on the last time step
            logits = logits[:, -1, :] / temperature
            probs = F.softmax(logits, dim=-1)
            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# Training & Generation Utilities

In [13]:
def get_batch(data, batch_size, block_size):
    """Generate a small batch of data"""
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss(model, train_data, val_data):
    out = {}
    model.eval()
    for split, data in [('train', train_data), ('val', val_data)]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(data, batch_size, block_size)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

def train_model(model, train_data, val_data, optimizer):
    print("Starting training...")
    best_val_loss = float('inf')
    start_time = time.time()

    for iter in range(max_iters):
        if iter % eval_interval == 0 or iter == max_iters - 1:
            losses = estimate_loss(model, train_data, val_data)
            print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

            if losses['val'] < best_val_loss:
                best_val_loss = losses['val']
                torch.save(model.state_dict(), 'best_model.pt')

        xb, yb = get_batch(train_data, batch_size, block_size)
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    print(f"Training finished in {(time.time()-start_time)/60:.2f} minutes.")

def generate_text(model, decode, start_string, max_new_tokens=200, temperature=0.8):
    model.eval()
    # Handle unknown chars by ignoring or mapping to random known char (simple approach: skip)
    # But for now assume start_string chars exist in vocab
    try:
        context = torch.tensor([encode(start_string)], dtype=torch.long, device=device)
        generated = model.generate(context, max_new_tokens=max_new_tokens, temperature=temperature)
        return decode(generated[0].tolist())
    except KeyError as e:
        return f"Error: Character {e} not in vocabulary."


# Main Execution

# 1. Setup, Data Loading & Model Initialization

In [14]:
# 1. Load Data
text = load_language_texts()

# 2. Vocab
vocab_size, encode, decode, stoi, itos = create_vocab_and_encode(text)

# 3. Splits
train_data, val_data, test_data = prepare_data(text, encode)

# 4. Initialize Model
print(f"Initializing model with embedding size {n_embd}, {n_layer} layers, {n_head} heads.")
model = TransformerLanguageModel(vocab_size)
model = model.to(device)
print(f"Parameters: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")

Loading Akan dataset from Excel...
Collected 18787 texts.
Vocabulary size: 90 unique characters
Encoding text...
Train size: 2,925,977 chars
Val size:   162,554 chars
Test size:  162,555 chars
Initializing model with embedding size 128, 4 layers, 4 heads.
Parameters: 0.83M


# 2. Evaluation Metric
We evaluate language models using cross-entropy loss and perplexity.
- **Loss**: Measures how well the model predicts the next character (lower is better).
- **Perplexity (exp(loss))**: A more intuitive metric representing the geometric mean of the number of choices the model believes are possible for the next character. A lower perplexity indicates the model is more confident and accurate.

In [15]:
@torch.no_grad()
def evaluate_model(model, data, subset_name="test"):
    """Evaluate the model on a given dataset and print Loss and Perplexity"""
    model.eval()
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
        X, Y = get_batch(data, batch_size, block_size)
        logits, loss = model(X, Y)
        losses[k] = loss.item()

    mean_loss = losses.mean().item()
    perplexity = torch.exp(torch.tensor(mean_loss)).item()

    print(f"--- {subset_name.upper()} SET EVALUATION ---")
    print(f"Loss:       {mean_loss:.4f}")
    print(f"Perplexity: {perplexity:.4f}")

    return mean_loss, perplexity

# 3. Training Loop

In [16]:
# 5. Train
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
train_model(model, train_data, val_data, optimizer)

Starting training...
step 0: train loss 4.6322, val loss 4.6264
step 500: train loss 2.1042, val loss 2.1556
step 1000: train loss 1.7842, val loss 1.8632
step 1500: train loss 1.5551, val loss 1.6657
step 2000: train loss 1.4368, val loss 1.5682
step 2500: train loss 1.3641, val loss 1.5077
step 2999: train loss 1.3166, val loss 1.4706
Training finished in 2.27 minutes.


# 4. Final Evaluation & Text Generation

In [17]:
# Load best model
idx_path = 'best_model.pt'
if os.path.exists(idx_path):
    print("Loading best model...")
    model.load_state_dict(torch.load(idx_path))
else:
    print("Warning: 'best_model.pt' not found. Using current model state.")

# Evaluate on Test Set
evaluate_model(model, test_data, subset_name="test")

# Generate Samples
print("\n--- Generating Samples ---")
start_strings = ["Nnipa", "Mmaa", "Baabi a", "Adwuma", "Efie"]

for start in start_strings:
    print(f"\nPrompt: {start}")
    try:
        print(generate_text(model, decode, start))
    except Exception as e:
        print(f"Could not generate for {start}: {e}")

Loading best model...
--- TEST SET EVALUATION ---
Loss:       1.4161
Perplexity: 4.1210

--- Generating Samples ---

Prompt: Nnipa
Nnipa bebree na bi so gyina hɔ a ɔredi wɔn akadaa.

Mmarima mmeranteɛ bi a wɔasorow a ɔrehwɛ wɔn. Mmaa no abɔ baako nso gyina nkyɛn wɔ a wɔresa. Mmarima no ara. Nkurɔfoɔ dan wɔ mu. Ebinom nso wɔ hɔ a ɔkant

Prompt: Mmaa
Mmaa no anim.

Mmarima bi te pono wɔ te hɔ ahyia wɔn so so, na wɔrehwɛ adwuma no yɛ kɔkɔɔ hɔnom. Ebi kyɛ kyɛw. Na ebinom so adeɛ no wɔ hɔnom. Ebi nso yɛ fitaa, wɔde ɛdan no yɛ apadɔtɔ ne mu mmarima nso at

Prompt: Baabi a
Baabi a ɔkurafoɔ de akɔkɔɔ dɛ obi. Nnipa ketewa no so te so.

Nnipadɔm gyinagyina ne ahyia wɔ ne mu a wɔhyɛ ntam. Nkyɛn. Sipa na mmienu wɔn a wɔrekyerɛ egugu adeɛ n’ano a ɔne no ho no ases no mu, ɛna fufuo b

Prompt: Adwuma
Adwumayɛfoɔ baako a ɔrehwɛ regyerɛ ase. Lɔɔreyɛ soro no soro kame ho nso a wɔreda hɔn nnua na si so wɔn atadeɛ bi nso so.

Nkokuo tentena wɔafadaa no so yɛ tadeɛ. Abanin de ahosuo yɛ kuntum. Baako abɔ bi