# 1. Environment and Data Preparation

In [None]:
from nltk.util import ngrams
from sklearn.model_selection import train_test_split
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
import re
import random
import time
import pickle
import os

### Load Data

In [None]:
twi_file = "twi.txt"

# Read the raw corpus line by line; each element keeps its trailing newline.
with open(twi_file, "r", encoding="utf-8") as f:
    twi_sentences = list(f)

# Quick sanity check: corpus size and the first raw line.
print("Number of raw Twi sentences:", len(twi_sentences))
print("Sample sentence:", twi_sentences[0])


### Clean the Text (Twi Specific)

In [None]:


# Anything that is not a lowercase Latin letter, ɔ, ɛ, hyphen, apostrophe or
# whitespace is dropped. Whitespace is kept here on purpose so that tabs and
# newlines still separate words when they are normalised afterwards.
_DISALLOWED_CHARS = re.compile(r"[^a-zɔɛ\-'\s]+")
_WHITESPACE_RUN = re.compile(r"\s+")


def clean_text(sentence):
    """Normalise one raw Twi sentence.

    The text is lowercased, every character outside ``a-z``, ``ɔ``, ``ɛ``,
    ``-``, ``'`` and whitespace is removed, and each run of whitespace
    (spaces, tabs, newlines) is collapsed to a single space.

    Args:
        sentence: Raw sentence text.

    Returns:
        The cleaned sentence without leading or trailing spaces; an empty
        string when nothing survives the filtering.
    """
    sentence = sentence.lower()
    sentence = _DISALLOWED_CHARS.sub("", sentence)
    # Collapse only after filtering: removing a symbol that stood between two
    # spaces would otherwise leave a double space behind.
    return _WHITESPACE_RUN.sub(" ", sentence).strip()

# Clean every raw line and keep only the ones that are non-empty afterwards.
cleaned_sentences = [
    cleaned for cleaned in map(clean_text, twi_sentences) if cleaned != ""
]

print("Cleaned sentences:", len(cleaned_sentences))
print("Example:", cleaned_sentences[0])


In [None]:
# Persist the cleaned corpus, one sentence per line.
with open("twi_clean_for_sketch_engine.txt", "w", encoding="utf-8") as f:
    f.writelines(sentence + "\n" for sentence in cleaned_sentences)

print("File saved: twi_clean_for_sketch_engine.txt")


### Split the Data (80/10/10)

In [None]:
cleaned_file = "twi_clean_for_sketch_engine.txt"

# Reload the cleaned corpus, stripping each line and skipping blank ones.
with open(cleaned_file, "r", encoding="utf-8") as f:
    full_cleaned_data = [text for text in map(str.strip, f) if text]


# --- The Split (80/10/10) ---
# Step 1: hold out 20% of the sentences; the remaining 80% is the train set.
train_sentences, temp_sentences = train_test_split(
    full_cleaned_data,
    test_size=0.2,
    random_state=42,
)

# Step 2: halve the held-out part into dev and test (10% each overall).
dev_sentences, test_sentences = train_test_split(
    temp_sentences,
    test_size=0.5,
    random_state=42,
)

print(f"Split sizes -> Train: {len(train_sentences)}, Dev: {len(dev_sentences)}, Test: {len(test_sentences)}")


# Only the train split is written out; the tokenizer is trained on this file.
with open("twi_train_only.txt", "w", encoding="utf-8") as f:
    f.writelines(s + "\n" for s in train_sentences)

# 3. CUSTOM TWI SUBWORD TOKENIZATION

In [None]:
from tokenizers import Regex, Tokenizer, models, pre_tokenizers, trainers

# BPE model with a Twi-aware pre-tokenization step.
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# A word is a run of letters/digits that may contain internal apostrophes or
# hyphens; any other non-space, non-word symbol becomes a token of its own.
TWI_REGEX = r"[A-Za-zÀ-ÖØ-öø-ÿƐɛƆɔ0-9]+(?:['-][A-Za-zÀ-ÖØ-öø-ÿƐɛƆɔ0-9]+)*|[^\w\s]"

tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
    # Split on whitespace only. `Whitespace()` additionally breaks on
    # punctuation, which would separate "o'clock"-style words before the
    # Twi-aware pattern below ever sees them.
    pre_tokenizers.WhitespaceSplit(),
    # A plain `str` pattern is matched literally by `Split`; it has to be
    # wrapped in `Regex` to be interpreted as a regular expression.
    pre_tokenizers.Split(
        pattern=Regex(TWI_REGEX),
        behavior="isolated",
    ),
])

trainer = trainers.BpeTrainer(
    vocab_size=3000,  # Small subword vocabulary for a low-resource corpus
    min_frequency=2,
    special_tokens=["[PAD]", "[UNK]", "<s>", "</s>", "[MASK]"],
)
# Train on the train split only, so dev/test text never shapes the vocabulary.
tokenizer.train(files=["twi_train_only.txt"], trainer=trainer)

def tokenize_set(data):
    """Encode every sentence in `data` with the module-level `tokenizer`.

    Returns one token list per sentence, each wrapped in an explicit
    "<s>" ... "</s>" pair.
    """
    wrapped = []
    for sentence in data:
        pieces = tokenizer.encode(sentence).tokens
        wrapped.append(["<s>"] + pieces + ["</s>"])
    return wrapped



In [None]:
# Select a random subset of `sample_size` sentences from the test set.
# This drastically reduces the time for perplexity evaluation.
sample_size = 1000
if len(test_sentences) >= sample_size:
    # A dedicated, seeded generator (same seed as the train/dev/test split)
    # keeps the evaluation sample identical across runs.
    test_sample = random.Random(42).sample(test_sentences, sample_size)
else:
    # Fallback if the test set is smaller than `sample_size`.
    test_sample = test_sentences

print(f"Created a random test sample of {len(test_sample)} sentences.")

# Now tokenize this sample instead of the full test set
test_tokenized = tokenize_set(test_sample)

In [None]:

train_tokenized = tokenize_set(train_sentences)
dev_tokenized = tokenize_set(dev_sentences)
# `test_tokenized` is deliberately not rebuilt here: it already holds the
# tokenized random test sample created in the previous cell, and tokenizing
# the full test set again would silently discard that sample.


print(train_tokenized[:30])


## Data Analysis

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

def run_analysis(data):
    """Plot word-frequency diagnostics (Zipf's Law) for a list of sentences.

    Two panels are drawn side by side:

    * a log-log plot of word frequencies sorted in descending order;
    * the cumulative share of all word occurrences covered by the ``i`` most
      frequent words, for ``i = 0 .. min(vocabulary size, 1000) - 1``.

    Args:
        data: Iterable of sentence strings; words are whitespace-separated.
    """
    from itertools import accumulate, islice

    # Count words sentence by sentence instead of joining the whole corpus
    # into one giant string first.
    counts = Counter(word for sentence in data for word in sentence.split())
    freqs = sorted(counts.values(), reverse=True)
    total = sum(freqs)

    # Running prefix sums (0, f0, f0+f1, ...) computed in a single pass; the
    # total is hoisted so it is not recomputed for every plotted point.
    prefix_sums = islice(accumulate(freqs, initial=0), min(len(freqs), 1000))
    coverage = [covered / total for covered in prefix_sums]

    plt.figure(figsize=(12, 4))

    # Zipf's Law Plot
    plt.subplot(1, 2, 1)
    plt.loglog(freqs)
    plt.title("Zipf's Law (Twi Corpus)")

    # Cumulative Coverage
    plt.subplot(1, 2, 2)
    plt.plot(coverage)
    plt.title("Vocab Coverage (Top 1000)")
    plt.show()

# Plot the frequency diagnostics for the training split only.
run_analysis(train_sentences)

# 4. Optimized Training & Performance Benchmarking

### Train with WittenBell Interpolated

In [None]:
from nltk.lm import WittenBellInterpolated

def train_and_save_ngram(train_tokenized, n, model_path):
    """Fit an order-`n` Witten-Bell interpolated LM and pickle it.

    Args:
        train_tokenized: Tokenized training sentences (one token list each).
        n: Highest n-gram order of the model.
        model_path: Destination path of the pickled model.
    """
    print(f"Training {n}-gram model...")

    # Padded everygrams (orders 1..n) plus the flat padded token stream that
    # seeds the vocabulary.
    everygrams, padded_vocab = padded_everygram_pipeline(n, train_tokenized)

    lm = WittenBellInterpolated(n)
    lm.fit(everygrams, padded_vocab)

    with open(model_path, "wb") as out_file:
        pickle.dump(lm, out_file, protocol=pickle.HIGHEST_PROTOCOL)

    print(f"Saved model → {model_path}")


n_orders = [3, 4, 5]

# This cell trains models; it does not evaluate anything.
print(f"Training on {len(train_tokenized)} tokenized sentences")

for n in n_orders:
    start_time = time.perf_counter()

    model_path = f"twi_wittenBell_{n}gram.pkl"

    # --- TRAIN & SAVE ---
    train_and_save_ngram(train_tokenized, n, model_path)

    # Report wall-clock training time per order (the benchmarking part).
    elapsed = time.perf_counter() - start_time
    print(f"{n}-gram training took {elapsed:.1f}s")



### Or, Train with Kneser-Ney

In [None]:
from nltk.lm import KneserNeyInterpolated

def train_and_save_ngram(train_tokenized, n, model_path):
    """Fit an order-`n` interpolated Kneser-Ney LM and pickle it.

    Args:
        train_tokenized: Tokenized training sentences (one token list each).
        n: Highest n-gram order of the model.
        model_path: Destination path of the pickled model.
    """
    print(f"Training {n}-gram model...")

    # Padded everygrams (orders 1..n) plus the flat padded token stream that
    # seeds the vocabulary.
    everygrams, padded_vocab = padded_everygram_pipeline(n, train_tokenized)

    lm = KneserNeyInterpolated(n)
    lm.fit(everygrams, padded_vocab)

    with open(model_path, "wb") as out_file:
        pickle.dump(lm, out_file, protocol=pickle.HIGHEST_PROTOCOL)

    print(f"Saved model → {model_path}")


n_orders = [3, 4, 5]

# This cell trains models; it does not evaluate anything.
print(f"Training on {len(train_tokenized)} tokenized sentences")

# Each order is trained and saved exactly once.
for n in n_orders:
    start_time = time.perf_counter()

    model_path = f"twi_kneser_ney_{n}gram.pkl"

    # --- TRAIN & SAVE ---
    train_and_save_ngram(train_tokenized, n, model_path)

    # Report wall-clock training time per order (the benchmarking part).
    elapsed = time.perf_counter() - start_time
    print(f"{n}-gram training took {elapsed:.1f}s")




# Evaluating the n-grams

### Evaluation loop for WittenBell

In [None]:
import math
from nltk.util import ngrams

def lm_perplexity_and_entropy(model, tokenized_test_set):
    """
    Compute cross-entropy and perplexity of `model` on a tokenized test set.

    Every non-empty sentence is padded with ``model.order - 1`` leading "<s>"
    symbols and one trailing "</s>", and each n-gram of the model's order is
    scored with ``model.logscore``. N-grams whose log score is not finite
    (zero probability) are left out of the average, because a single one
    would make the result infinite. The reported figures therefore describe
    the scored n-grams only; "Coverage" tells how large that share is.

    Args:
        model: Language model exposing ``order`` and ``logscore(word, context)``.
        tokenized_test_set: Iterable of token sequences.

    Returns:
        dict with
          * "CrossEntropy": negative mean log score of the scored n-grams
            (bits, since NLTK's ``logscore`` is base 2);
          * "Perplexity": ``2 ** CrossEntropy``;
          * "Coverage": percentage of n-grams that received a finite score.
        When no n-gram could be scored, both metrics are ``inf`` and the
        coverage is 0.0.
    """
    order = model.order
    left_pad = ["<s>"] * (order - 1)

    log_prob_sum = 0.0
    scored_ngrams = 0
    seen_ngrams = 0

    for sent in tokenized_test_set:
        if not sent:
            continue

        padded = left_pad + list(sent) + ["</s>"]
        for ng in ngrams(padded, order):
            seen_ngrams += 1
            log_score = model.logscore(ng[-1], ng[:-1])

            # Zero-probability events are skipped, but still counted in
            # `seen_ngrams` so the omission shows up in "Coverage".
            if math.isfinite(log_score):
                log_prob_sum += log_score
                scored_ngrams += 1

    if scored_ngrams == 0:
        return {
            "CrossEntropy": float("inf"),
            "Perplexity": float("inf"),
            "Coverage": 0.0,
        }

    cross_entropy = -log_prob_sum / scored_ngrams

    return {
        "CrossEntropy": cross_entropy,
        "Perplexity": 2 ** cross_entropy,
        "Coverage": 100.0 * scored_ngrams / seen_ngrams,
    }


In [None]:

final_results = {}
n_orders = [3, 4, 5]

for n in n_orders:
    model_path = f"twi_wittenBell_{n}gram.pkl"

    # Skip orders whose model has not been trained and saved yet.
    if not os.path.exists(model_path):
        print(f"Warning: Model file {model_path} not found.")
        continue

    with open(model_path, "rb") as f:
        model = pickle.load(f)

    metrics = lm_perplexity_and_entropy(model, test_tokenized)
    final_results[n] = metrics

    print(f"\nResults for {n}-gram:")
    print(f" - Perplexity: {metrics['Perplexity']:.2f}")
    print(f" - Cross-Entropy: {metrics['CrossEntropy']:.3f}")



### Evaluation loop with Kneser-Ney

In [None]:
import math
import pickle
import time
from collections import Counter
from nltk.util import ngrams
from tqdm import tqdm

# --- 1. Optimized Evaluation Functions ---

def fast_metrics(model, tokenized_test_set, unseen_logscore=-100.0):
    """
    Compute perplexity, cross-entropy and coverage over a tokenized test set.

    Identical n-grams are grouped with a Counter so that each distinct n-gram
    is scored once and weighted by its frequency.

    Args:
        model: Language model exposing ``order`` and ``logscore(word, context)``.
        tokenized_test_set: Iterable of token sequences.
        unseen_logscore: Log score substituted for n-grams the model cannot
            score with a finite value, so a single unseen pattern does not
            make the perplexity infinite.

    Returns:
        Tuple ``(perplexity, cross_entropy, coverage)`` where `coverage` is
        the percentage of n-gram occurrences that received a finite score.
        For a test set without any n-gram the result is ``(inf, inf, 0.0)``.
    """
    n = model.order
    ngram_counts = Counter()

    # Extract n-grams from test set (padded on both sides).
    for sent in tokenized_test_set:
        ngram_counts.update(ngrams(sent, n, pad_left=True, pad_right=True,
                                   left_pad_symbol="<s>", right_pad_symbol="</s>"))

    total_ngrams = sum(ngram_counts.values())
    if total_ngrams == 0:
        # Nothing to evaluate: report "undefined" instead of dividing by zero.
        return float("inf"), float("inf"), 0.0

    log_prob_sum = 0.0
    known_ngrams = 0

    # Batch process unique n-grams
    for ng, freq in tqdm(ngram_counts.items(), desc=f"Evaluating {n}-gram", leave=False):
        # logscore is base 2, so the weighted mean is directly in bits.
        lp = model.logscore(ng[-1], ng[:-1])

        if math.isfinite(lp):
            known_ngrams += freq
            log_prob_sum += lp * freq
        else:
            log_prob_sum += unseen_logscore * freq

    # Metric Calculations
    cross_entropy = -log_prob_sum / total_ngrams
    try:
        perplexity = math.pow(2, cross_entropy)
    except OverflowError:
        # 2 ** cross_entropy does not fit into a float.
        perplexity = float("inf")
    coverage = (known_ngrams / total_ngrams) * 100

    return perplexity, cross_entropy, coverage



# --- 2. Execution Loop ---

final_results = {}
n_orders = [3, 4, 5]

for n in n_orders:
    model_path = f"twi_kneser_ney_{n}gram.pkl"

    # Skip orders whose model has not been trained and saved yet.
    if not os.path.exists(model_path):
        print(f"Warning: Model file {model_path} not found.")
        continue

    with open(model_path, "rb") as f:
        model = pickle.load(f)

    ppl, entropy, cov = fast_metrics(model, test_tokenized)

    final_results[n] = {
        "Perplexity": ppl,
        "Cross-Entropy": entropy,
        "Coverage": cov,
    }

    print(f"\nResults for {n}-gram:")
    print(f" - Perplexity: {ppl:.2f}")
    print(f" - Cross-Entropy: {entropy:.4f} bits")
    print(f" - Coverage: {cov:.2f}%")



In [None]:
import pickle

# Specify which model you want to check (e.g., the 3-gram model)
# Path of the pickled model to inspect (here: the 3-gram Kneser-Ney model).
model_path = "twi_kneser_ney_3gram.pkl"

# NOTE: unpickling executes code stored in the file; only load model files
# that were produced by this notebook.
with open(model_path, "rb") as f:
    loaded_model = pickle.load(f)

# Print the size of the loaded model's vocabulary as reported by len().
print(len(loaded_model.vocab))

# Generate Sample Sentences

In [None]:
import pickle
import math
import numpy as np
from collections import defaultdict

# --- 1. LOAD THE MODEL ---
# Order of the model used for generation.
# NOTE(review): 3 mirrors the Kneser-Ney alternative below; set this to the
# order that scored best in the evaluation.
BEST_ORDER = 3
MODEL_PATH = f"twi_wittenBell_{BEST_ORDER}gram.pkl"  # Best WittenBell model
# MODEL_PATH = "twi_kneser_ney_3gram.pkl"  # Best Kneser-Ney model

# NOTE: unpickling executes code stored in the file; only load model files
# that were produced by this notebook.
with open(MODEL_PATH, "rb") as f:
    model = pickle.load(f)

# --- 2. PRE-COMPUTE "KNOWN FOLLOWERS" ---
# Map every highest-order context to the words observed after it, so that
# generation only has to score those words instead of the whole vocabulary.
print("Optimizing search space...")
context_map = defaultdict(list)
highest_order_counts = model.counts[model.order]
if model.order == 1:
    # Unigram counts are a flat word -> count table; the only context is ().
    context_map[()] = list(highest_order_counts)
else:
    # For orders >= 2 NLTK indexes the counts by context: each key is an
    # (order - 1)-tuple and each value holds the counts of its followers.
    for context, followers in highest_order_counts.items():
        context_map[context] = list(followers)

# --- 3. IMPLEMENT FASTER GENERATOR ---
def generate_twi_ultra_fast(model, context_map, max_len=15, temp=0.7, top_k=5):
    """Sample one sentence from an n-gram model with top-k / temperature sampling.

    At every step only the words listed in `context_map` for the current
    context are scored. When the context is not in the map, a bounded slice
    of the vocabulary is scored instead.

    Args:
        model: Language model exposing ``order``, ``vocab`` and
            ``logscore(word, context)`` (base-2 log probability).
        context_map: Mapping from a context tuple of length ``order - 1`` to
            the list of words observed after it.
        max_len: Maximum number of tokens to generate.
        temp: Sampling temperature; must be positive. Values below 1 sharpen
            the distribution.
        top_k: Number of highest-scoring candidates to sample from.

    Returns:
        The generated tokens joined by spaces, without "<s>" / "</s>".

    Raises:
        ValueError: If `temp` is not positive.
    """
    if temp <= 0:
        raise ValueError("temp must be positive")

    context_len = model.order - 1
    max_candidates = 500
    skipped_tokens = {"[UNK]", "[PAD]", "<s>"}

    # Built once: turning the vocabulary into a list on every step (as the
    # default argument of `dict.get` would do) defeats the purpose of the map.
    fallback_words = list(model.vocab)[:max_candidates]

    # Seed with a full-length start context so that lookups in `context_map`
    # (keyed by contexts of length order - 1) can match from the first step.
    content = ["<s>"] * max(context_len, 1)

    for _ in range(max_len):
        # `content[-0:]` would be the whole list, hence the explicit guard.
        context = tuple(content[-context_len:]) if context_len else ()

        followers = context_map.get(context)
        if followers is None:
            search_list = fallback_words
        else:
            # Cap very long follower lists instead of replacing them with an
            # unrelated slice of the vocabulary.
            search_list = followers[:max_candidates]

        candidates = []
        for token in search_list:
            if token in skipped_tokens:
                continue
            ls = model.logscore(token, context)
            if ls != float("-inf"):
                candidates.append((token, math.pow(2, ls)))

        if not candidates:
            break

        # Top-K Pruning
        best = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_k]
        tokens, scores = zip(*best)

        # Temperature Scaling (the epsilon guards log(0)), then renormalise.
        preds = np.array(scores)
        preds = np.exp(np.log(preds + 1e-10) / temp)
        preds /= preds.sum()

        next_token = str(np.random.choice(tokens, p=preds))
        content.append(next_token)
        if next_token == "</s>":
            break

    return " ".join(t for t in content if t not in ("<s>", "</s>"))

# --- 4. EXECUTION ---
print("\nGenerating 10 Twi Sentences (High-Speed Mode):")
# Number the samples 1..10.
for position in range(1, 11):
    print(f"{position}. {generate_twi_ultra_fast(model, context_map)}")