## N-gram Language Model with Twi Bible Text - Evaluation & Experimentation notebook


## 1. Data Loading and Exploration

In [47]:
from preprocessor import Text as tx
from ngram import Ngram
import random
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
from collections import Counter
import time


In [48]:
# Load the three corpus splits; unpacking consumes the map in order,
# so the files are read train -> val -> test.
train_corpus, val_corpus, test_corpus = map(
    tx, ("data/train.twi", "data/val.twi", "data/test.twi")
)


In [49]:
# Pull the `word_tokens` attribute out of each loaded corpus (same split order).
train_tokens, val_tokens, test_tokens = (
    corpus.word_tokens for corpus in (train_corpus, val_corpus, test_corpus)
)


In [50]:
# corpus statistics: token counts, vocabulary sizes and OOV rates per split


def _without_markers(tokens):
    """Return `tokens` as a list with the '<s>' / '</s>' markers removed."""
    return [tok for tok in tokens if tok not in ['<s>', '</s>']]


# content tokens = everything except the sentence-boundary markers
train_content = _without_markers(train_tokens)
val_content = _without_markers(val_tokens)
test_content = _without_markers(test_tokens)

print(f"\nTrain tokens (excluding markers): {len(train_content):,}")
print(f"Validation tokens (excluding markers): {len(val_content):,}")
print(f"Test tokens (excluding markers): {len(test_content):,}")

print(f"\nTrain tokens (total): {len(train_tokens):,}")
print(f"Validation tokens (total): {len(val_tokens):,}")
print(f"Test tokens (total): {len(test_tokens):,}")

# vocabulary statistics (distinct content tokens per split)
train_vocab = {*train_content}
val_vocab = {*val_content}
test_vocab = {*test_content}

print(f"\nTrain vocabulary size: {len(train_vocab):,}")
print(f"Validation vocabulary size: {len(val_vocab):,}")
print(f"Test vocabulary size: {len(test_vocab):,}")

# overlap statistics: word types of a held-out split never seen in training
val_oov = val_vocab.difference(train_vocab)
test_oov = test_vocab.difference(train_vocab)

# percentages are relative to the held-out split's own vocabulary (types, not tokens)
print(f"\nOut-of-vocabulary (OOV) words in validation: {len(val_oov):,} ({len(val_oov)/len(val_vocab)*100:.2f}%)")
print(f"Out-of-vocabulary (OOV) words in test: {len(test_oov):,} ({len(test_oov)/len(test_vocab)*100:.2f}%)")



Train tokens (excluding markers): 642,437
Validation tokens (excluding markers): 17,416
Test tokens (excluding markers): 28,656

Train tokens (total): 700,442
Validation tokens (total): 19,015
Test tokens (total): 31,255

Train vocabulary size: 22,078
Validation vocabulary size: 3,511
Test vocabulary size: 4,639

Out-of-vocabulary (OOV) words in validation: 257 (7.32%)
Out-of-vocabulary (OOV) words in test: 404 (8.71%)


### Word Frequency Distribution

In [51]:
# count every content token in the training split and keep the 20 most frequent
word_counts = Counter(train_content)
most_common = word_counts.most_common(20)

# split the (word, count) pairs into parallel sequences for plotting
top_words = [pair[0] for pair in most_common]
top_counts = [pair[1] for pair in most_common]

# bars are coloured by their own height, with a colour bar as the key
bar_marker = dict(
    color=top_counts,
    colorscale='Viridis',
    showscale=True,
    colorbar=dict(title="Count"),
)

# create bar chart
fig = go.Figure(data=[go.Bar(x=top_words, y=top_counts, marker=bar_marker)])

bar_layout = dict(
    title="Top 20 Most Frequent Words in Training Corpus",
    xaxis_title="Word",
    yaxis_title="Frequency",
    height=500,
    template="plotly_white",
)
fig.update_layout(**bar_layout)

fig.show()


In [52]:
# frequency distribution analysis: how many word types fall into each count range
freq_distribution = list(word_counts.values())

# The last bin is open-ended. Deriving its upper edge from max(freq_distribution)
# makes the edges non-increasing (and pd.cut fail) whenever no word occurs more
# than 100 times; np.inf yields the same bins for larger corpora and is always valid.
freq_bin_edges = [0, 1, 5, 10, 50, 100, np.inf]
freq_bin_labels = ['1', '2-5', '6-10', '11-50', '51-100', '100+']
freq_bins = pd.cut(freq_distribution, bins=freq_bin_edges, labels=freq_bin_labels)

# number of word types per bin, in bin order
freq_counts = freq_bins.value_counts().sort_index()

# create pie chart
fig = go.Figure(data=[
    go.Pie(
        labels=freq_counts.index,
        values=freq_counts.values,
        hole=0.3,
        marker=dict(colors=px.colors.qualitative.Set3)
    )
])

fig.update_layout(
    title="Word Frequency Distribution in Training Corpus",
    annotations=[dict(text='Frequency<br>Bins', x=0.5, y=0.5, font_size=12, showarrow=False)],
    height=500
)

fig.show()

# exact counts for the three rarest bins (sum over a generator avoids building throwaway lists)
singleton_types = sum(1 for c in word_counts.values() if c == 1)
types_2_to_5 = sum(1 for c in word_counts.values() if 2 <= c <= 5)
types_6_to_10 = sum(1 for c in word_counts.values() if 6 <= c <= 10)

print(f"\nWords appearing only once (singletons): {singleton_types:,}")
print(f"Words appearing 2-5 times: {types_2_to_5:,}")
print(f"Words appearing 6-10 times: {types_6_to_10:,}")



Words appearing only once (singletons): 8,885
Words appearing 2-5 times: 7,032
Words appearing 6-10 times: 2,121


In [53]:
# NOTE(review): standalone trigram model with 'IP' smoothing. Nothing in the notebook
# reads `model` before the training loop in section 2 rebinds it, so this cell looks
# like a leftover experiment -- confirm and remove if so.
model = Ngram(train_tokens, n=3, smoothing='IP', eval_set=val_tokens)


## 2. Model Training and Evaluation

### Training Models with Different Configurations

In [54]:
# define configurations to test: every n-gram order crossed with every smoothing method
n_values = [1, 2, 3, 4, 5]
smoothing_methods = ['LP', 'IP', 'KN']

print("="*60)
print("TRAINING ALL MODEL CONFIGURATIONS")
print("="*60)

# storage for results
results = []  # one record per configuration, whether it succeeded or failed
models = {}  # store trained models for later use, keyed by model_name

# Every (n, smoothing) pair is attempted -- nothing is skipped inside the loop --
# so the progress denominator is the full grid size. (Subtracting one here made
# the counter run past its total, e.g. "[15/14]".)
total_configs = len(n_values) * len(smoothing_methods)
current = 0

for n in n_values:
    for smoothing in smoothing_methods:

        current += 1
        model_name = f"{n}-gram {smoothing}"

        print(f"\n[{current}/{total_configs}] Training: {model_name}")

        try:
            # train model; perf_counter is a monotonic clock intended for measuring durations
            start_time = time.perf_counter()
            model = Ngram(train_tokens, n=n, smoothing=smoothing, eval_set=val_tokens)
            train_time = time.perf_counter() - start_time

            # evaluate on validation set (the eval_set passed to the constructor)
            eval_start = time.perf_counter()
            val_perplexity = model.perplexity()
            eval_time = time.perf_counter() - eval_start

            # store model and results
            models[model_name] = model

            results.append({
                'n': n,
                'smoothing': smoothing,
                'model_name': model_name,
                'vocab_size': model.V,
                'train_time': train_time,
                'eval_time': eval_time,
                'val_perplexity': val_perplexity,
                'status': 'success'
            })

            print(f"   Validation Perplexity: {val_perplexity:.2f}")
            print(f"   Training time: {train_time:.2f}s, Eval time: {eval_time:.2f}s")

        except Exception as e:
            # Deliberately broad: one unsupported configuration must not abort the
            # whole sweep. The failure is recorded so it stays visible in `results`.
            print(f"   FAILED: {e}")
            results.append({
                'n': n,
                'smoothing': smoothing,
                'model_name': model_name,
                'status': 'failed',
                'error': str(e)
            })

succeeded = sum(1 for r in results if r['status'] == 'success')

print("\n" + "="*60)
print(f"Training complete! {succeeded}/{total_configs} models succeeded")
print("="*60)


TRAINING ALL MODEL CONFIGURATIONS

[1/14] Training: 1-gram LP
  ✓ Validation Perplexity: 550.13
  ✓ Training time: 1.92s, Eval time: 0.08s

[2/14] Training: 1-gram IP
  ✓ Validation Perplexity: 493.28
  ✓ Training time: 1.81s, Eval time: 0.13s

[3/14] Training: 1-gram KN
  ✗ FAILED: 'int' object is not iterable

[4/14] Training: 2-gram LP
  ✓ Validation Perplexity: 1292.41
  ✓ Training time: 2.78s, Eval time: 1.19s

[5/14] Training: 2-gram IP
  ✓ Validation Perplexity: 161.92
  ✓ Training time: 4.54s, Eval time: 1.17s

[6/14] Training: 2-gram KN
  ✓ Validation Perplexity: 148.96
  ✓ Training time: 4.58s, Eval time: 1.11s

[7/14] Training: 3-gram LP
  ✓ Validation Perplexity: 6879.14
  ✓ Training time: 3.16s, Eval time: 0.28s

[8/14] Training: 3-gram IP
  ✓ Validation Perplexity: 165.28
  ✓ Training time: 7.91s, Eval time: 1.46s

[9/14] Training: 3-gram KN
  ✓ Validation Perplexity: 113.44
  ✓ Training time: 10.32s, Eval time: 1.92s

[10/14] Training: 4-gram LP
  ✓ Validation Perplexity

In [55]:
# build the results table from successful runs only, best (lowest) perplexity first
successful_runs = [record for record in results if record['status'] == 'success']
df_results = pd.DataFrame(successful_runs).sort_values('val_perplexity')

summary_columns = ['model_name', 'n', 'smoothing', 'val_perplexity', 'train_time']
top_models = df_results[summary_columns].head(10)

print("\nTop 10 Models by Validation Perplexity:")
print(top_models.to_string(index=False))



Top 10 Models by Validation Perplexity:
model_name  n smoothing  val_perplexity  train_time
 4-gram KN  4        KN      110.647962   20.721730
 5-gram KN  5        KN      112.518372   41.674789
 3-gram KN  3        KN      113.438904   10.324985
 2-gram KN  2        KN      148.962523    4.576132
 2-gram IP  2        IP      161.923982    4.537529
 3-gram IP  3        IP      165.275282    7.913580
 4-gram IP  4        IP      309.874065   11.174789
 1-gram IP  1        IP      493.277405    1.811440
 1-gram LP  1        LP      550.131998    1.918994
 5-gram IP  5        IP      740.510569   13.919187


## 3. Perplexity Analysis with Visualizations

### Perplexity by N-gram Order

In [56]:
# perplexity by n-gram order for each smoothing method
fig = go.Figure()

# one fixed colour per smoothing method; reused by the later comparison plots
colors = {'None': '#e74c3c', 'LP': '#3498db', 'IP': '#2ecc71', 'KN': '#f39c12'}

for smoothing in smoothing_methods:
    # df_results is ordered by perplexity, not by n. A 'lines+markers' trace joins
    # points in row order, so sort by n to draw each line left-to-right.
    subset = df_results[df_results['smoothing'] == smoothing].sort_values('n')
    if len(subset) > 0:
        fig.add_trace(go.Scatter(
            x=subset['n'],
            y=subset['val_perplexity'],
            mode='lines+markers',
            name=smoothing,
            line=dict(width=3, color=colors.get(smoothing, '#000000')),
            marker=dict(size=10)
        ))

fig.update_layout(
    title="Validation Perplexity by N-gram Order and Smoothing Method",
    xaxis_title="N-gram Order",
    yaxis_title="Perplexity (lower is better)",
    xaxis=dict(tickmode='linear', tick0=1, dtick=1),  # integer ticks: n is discrete
    height=600,
    template="plotly_white",
    hovermode='x unified',
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99
    )
)

fig.show()


### Perplexity Heatmap

In [57]:
# reshape the results into a (smoothing x n) grid of validation perplexities
pivot_data = df_results.pivot(index='smoothing', columns='n', values='val_perplexity')
perplexity_grid = pivot_data.values

# heatmap cells are annotated with the perplexity rounded to two decimals
heatmap = go.Heatmap(
    z=perplexity_grid,
    x=pivot_data.columns,
    y=pivot_data.index,
    colorscale='RdYlGn_r',  # reversed scale: red = high (bad), green = low (good)
    text=np.round(perplexity_grid, 2),
    texttemplate='%{text}',
    textfont=dict(size=14),
    colorbar=dict(title="Perplexity"),
)
fig = go.Figure(data=heatmap)

heatmap_layout = dict(
    title="Perplexity Heatmap: N-gram Order vs Smoothing Method",
    xaxis_title="N-gram Order",
    yaxis_title="Smoothing Method",
    height=500,
    template="plotly_white",
)
fig.update_layout(**heatmap_layout)

fig.show()


### Smoothing Method Comparison

In [58]:
# one box per smoothing method, pooling that method's perplexities over all n-gram orders
fig = go.Figure()

for smoothing in smoothing_methods:
    method_rows = df_results['smoothing'] == smoothing
    subset = df_results.loc[method_rows, 'val_perplexity']
    box = go.Box(
        y=subset,
        name=smoothing,
        marker_color=colors.get(smoothing, '#000000'),
        boxmean='sd',  # overlay mean and standard deviation
    )
    fig.add_trace(box)

box_layout = dict(
    title="Perplexity Distribution by Smoothing Method (across all n-gram orders)",
    yaxis_title="Perplexity",
    height=500,
    template="plotly_white",
    showlegend=True,
)
fig.update_layout(**box_layout)

fig.show()


### Training and Evaluation Time Analysis

In [59]:
# create subplot with training and eval times (left: training, right: evaluation)
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Training Time by N-gram Order', 'Evaluation Time by N-gram Order')
)

for smoothing in smoothing_methods:
    # df_results is ordered by perplexity; sort by n so each line is drawn
    # left-to-right instead of zig-zagging between n-gram orders.
    subset = df_results[df_results['smoothing'] == smoothing].sort_values('n')
    if len(subset) > 0:
        line_style = dict(color=colors.get(smoothing, '#000000'))

        # training time
        fig.add_trace(
            go.Scatter(
                x=subset['n'],
                y=subset['train_time'],
                mode='lines+markers',
                name=smoothing,
                line=line_style,
                showlegend=True
            ),
            row=1, col=1
        )

        # eval time (legend entry suppressed: same name and colour as the training trace)
        fig.add_trace(
            go.Scatter(
                x=subset['n'],
                y=subset['eval_time'],
                mode='lines+markers',
                name=smoothing,
                line=line_style,
                showlegend=False
            ),
            row=1, col=2
        )

fig.update_xaxes(title_text="N-gram Order", row=1, col=1)
fig.update_xaxes(title_text="N-gram Order", row=1, col=2)
fig.update_yaxes(title_text="Time (seconds)", row=1, col=1)
fig.update_yaxes(title_text="Time (seconds)", row=1, col=2)

fig.update_layout(
    height=500,
    template="plotly_white",
    title_text="Computational Cost Analysis"
)

fig.show()


### Best Model Analysis

In [60]:
# get best model: df_results is sorted by ascending validation perplexity,
# so the first row is the best configuration
best_model_info = df_results.iloc[0]
best_model = models[best_model_info['model_name']]

print("="*60)
print("BEST MODEL ON VALIDATION SET")
print("="*60)
print(f"Model: {best_model_info['model_name']}")
print(f"N-gram order: {best_model_info['n']}")
print(f"Smoothing: {best_model_info['smoothing']}")
print(f"Validation Perplexity: {best_model_info['val_perplexity']:.2f}")
print(f"Vocabulary size: {best_model_info['vocab_size']:,}")
print(f"Training time: {best_model_info['train_time']:.2f}s")

# evaluate on test set
print("\nEvaluating best model on TEST set")
best_model.eval_set = test_tokens
try:
    test_perplexity = best_model.perplexity()
finally:
    # `best_model` is the very object stored in `models`. Put back the validation
    # set it was constructed with so that re-running earlier cells, or calling
    # perplexity() on this model later, does not silently score the test split.
    best_model.eval_set = val_tokens
print(f"Test Perplexity: {test_perplexity:.2f}")


BEST MODEL ON VALIDATION SET
Model: 4-gram KN
N-gram order: 4
Smoothing: KN
Validation Perplexity: 110.65
Vocabulary size: 22,080
Training time: 20.72s

Evaluating best model on TEST set...
Test Perplexity: 108.81


## 4. Text Generation Comparison

In [61]:
# pick the model used for generation: prefer the trigram Kneser-Ney ('KN') model,
# and fall back to the best validation model if that configuration was not trained
gen_model_name = '3-gram KN'
try:
    gen_model = models[gen_model_name]
except KeyError:
    # fallback to best model
    gen_model = best_model
    gen_model_name = best_model_info['model_name']

print(f"Using {gen_model_name} for text generation")
print(f"Vocabulary size: {gen_model.V:,}")


Using 3-gram KN for text generation
Vocabulary size: 22,080


In [62]:
# generate texts with different strategies
num_samples = 5
max_words = 25

# Seed the RNG so the sampled sentences are repeatable across reruns.
# NOTE(review): the randomness source inside Ngram.generate_random_sentence is not
# visible from this notebook; this only helps if it draws from Python's `random`
# module -- confirm against ngram.py.
GENERATION_SEED = 42
random.seed(GENERATION_SEED)

# (style, temperature, human-readable label); the label is also the key in generation_results
strategies = [
    ('greedy', 1.0, 'Greedy (deterministic)'),
    ('w_sampling', 0.5, 'Weighted Sampling (T=0.5, conservative)'),
    ('w_sampling', 0.8, 'Weighted Sampling (T=0.8, balanced)'),
    ('w_sampling', 1.2, 'Weighted Sampling (T=1.2, loose)'),  # was misspelled "lose"
    ('random', 1.0, 'Random (uniform)')
]

print("="*60)
print("TEXT GENERATION COMPARISON")
print("="*60)

generation_results = {}  # label -> list of generated sentences

for style, temp, label in strategies:
    print(f"\n{'='*80}")
    print(f"{label}")
    print('='*80)

    samples = []
    for i in range(num_samples):
        sentence = gen_model.generate_random_sentence(
            max_words=max_words,
            temperature=temp,
            style=style
        )
        samples.append(sentence)
        print(f"{i+1}. {sentence}")

    generation_results[label] = samples


TEXT GENERATION COMPARISON

Greedy (deterministic)
1. <s> na awurade ka kyerɛɛ no sɛ sɛ awurade nyankopɔn seɛ nie hwɛ mede bɔne bɛba wo so na ɔde ne ho no </s>
2. <s> na awurade ka kyerɛɛ no sɛ sɛ awurade nyankopɔn seɛ nie hwɛ mede bɔne bɛba wo so na ɔde ne ho no </s>
3. <s> na awurade ka kyerɛɛ no sɛ sɛ awurade nyankopɔn seɛ nie hwɛ mede bɔne bɛba wo so na ɔde ne ho no </s>
4. <s> na awurade ka kyerɛɛ no sɛ sɛ awurade nyankopɔn seɛ nie hwɛ mede bɔne bɛba wo so na ɔde ne ho no </s>
5. <s> na awurade ka kyerɛɛ no sɛ sɛ awurade nyankopɔn seɛ nie hwɛ mede bɔne bɛba wo so na ɔde ne ho no </s>

Weighted Sampling (T=0.5, conservative)
1. <s> na awurade animuonyam ahyɛ awurade wo nyankopɔn de rema woɔ no nyinaa na ɔde bi maa no yɛɛ deɛ ɛyɛ nokware sɛ wɔn a wɔka
2. <s> na wo ho na ne ho mfi na lewifoɔ no mu na wo ne me nyankopɔn </s>
3. <s> na awurade nam mose so hyɛɛ sɛ wɔmmɔ wɔn asu </s>
4. <s> na dawid teeɛ no na na wɔde wɔn ho wɔn ho no </s>
5. <s> na dawid ka kyerɛɛ wɔn se ne na din de na

## 5. Interactive Text Generation

In [65]:
def interactive_generation(model, context_words, num_generations=5, max_words=20, temperature=0.8, style='w_sampling'):
    """
    Generate, print and return several continuations of a custom context.

    Each generation starts from ``['<s>'] + context_words`` and appends one word
    at a time until ``'</s>'`` is produced or ``max_words`` words have been added.

    Args:
        model: trained Ngram model; must expose ``n``, ``smoothing``, ``vocabs``,
            ``P(word, context)`` and ``apply_temperature(probs, temperature)``
        context_words: sequence of words as context (e.g., ['na', 'ɔde'])
        num_generations: how many samples to generate
        max_words: maximum number of words appended after the context
        temperature: sampling temperature (only used by 'w_sampling')
        style: generation style ('greedy', 'w_sampling', 'random'); any other
            value is treated as 'w_sampling'

    Returns:
        list[str]: the generated sentences (without the leading '<s>'), in the
        order they were printed
    """
    print(f"Context: {' '.join(context_words)}")
    print(f"Model: {model.n}-gram with {model.smoothing} smoothing")
    print(f"Strategy: {style}" + (f" (T={temperature})" if style == 'w_sampling' else ""))
    print("="*80)

    context_size = model.n - 1
    # the candidate list does not change between steps, so build it once
    possible_words = list(model.vocabs)

    def _tail_context(tokens):
        # A plain tokens[-(n-1):] slice turns into tokens[-0:] -- the WHOLE list --
        # for a unigram model, so the zero-length context is handled explicitly.
        if context_size <= 0:
            return ()
        return tuple(tokens[-context_size:])

    results = []

    for i in range(num_generations):
        # create sentence starting with context (list() also accepts tuples etc.)
        sentence_tokens = ['<s>'] + list(context_words)
        current_context = _tail_context(sentence_tokens)

        for _ in range(max_words):
            if style == 'random':
                next_word = random.choice(possible_words)
            elif style == 'greedy':
                probs = [model.P(word, current_context) for word in possible_words]
                best_prob = max(probs)
                if best_prob == 0:
                    # no word has any mass in this context: fall back to a uniform pick
                    next_word = random.choice(possible_words)
                else:
                    next_word = possible_words[probs.index(best_prob)]
            else:  # w_sampling
                probs = [model.P(word, current_context) for word in possible_words]
                if sum(probs) == 0:
                    # avoid all-zero weights, which random.choices rejects
                    probs = [1.0] * len(possible_words)
                adjusted_probs = model.apply_temperature(probs, temperature)
                next_word = random.choices(possible_words, weights=adjusted_probs, k=1)[0]

            sentence_tokens.append(next_word)

            if next_word == '</s>':
                break

            current_context = _tail_context(sentence_tokens)

        # format output
        result = ' '.join(sentence_tokens[1:])  # skip initial <s>
        results.append(result)
        print(f"{i+1}. {result}")

    return results


In [66]:
# Example 1: Religious context -- two-word prompt, sampled at T=0.8
context1 = ['na', 'awurade']
print("EXAMPLE 1: Religious Context", "="*80, sep="\n")
interactive_generation(
    model=gen_model,
    context_words=context1,
    num_generations=5,
    temperature=0.8,
    style='w_sampling',
)


EXAMPLE 1: Religious Context
Context: na awurade
Model: 3-gram with KN smoothing
Strategy: w_sampling (T=0.8)
1. na awurade de n ani soɔ na ɔsɔɔ mu na ne ba no ba a ɛtɔ so mmienu no ntam </s>
2. na awurade ne nyankopɔn ani so na wode wo ho nyinaa so </s>
3. na awurade nyankopɔn seɛ nie </s>
4. na awurade ka kyerɛɛ israel fie atemmufoɔ a mehyɛɛ wɔn no na ɔbuu israel atɛn mfeɛ nwɔtwe na ɔbɛdii hene na ɔdii
5. na awurade de ama mo </s>


['na awurade de n ani soɔ na ɔsɔɔ mu na ne ba no ba a ɛtɔ so mmienu no ntam </s>',
 'na awurade ne nyankopɔn ani so na wode wo ho nyinaa so </s>',
 'na awurade nyankopɔn seɛ nie </s>',
 'na awurade ka kyerɛɛ israel fie atemmufoɔ a mehyɛɛ wɔn no na ɔbuu israel atɛn mfeɛ nwɔtwe na ɔbɛdii hene na ɔdii',
 'na awurade de ama mo </s>']

In [67]:
# Example 2: Different context -- three-word prompt, sampled at T=0.8
context2 = ['na', 'wɔ', 'kɔɔ']
print("\nEXAMPLE 2: Narrative Context", "="*80, sep="\n")
interactive_generation(
    model=gen_model,
    context_words=context2,
    num_generations=5,
    temperature=0.8,
    style='w_sampling',
)



EXAMPLE 2: Narrative Context
Context: na wɔ kɔɔ
Model: 3-gram with KN smoothing
Strategy: w_sampling (T=0.8)
1. na wɔ kɔɔ nyinaa </s>
2. na wɔ kɔɔ nea wopɛ sɛ meyɛ m agya fie nyinaa wura ɔhene anim </s>
3. na wɔ kɔɔ ɔse </s>
4. na wɔ kɔɔ ɛserɛ so no te aseɛ yi </s>
5. na wɔ kɔɔ so yɛɛ duru na wo nnwan mma yerenom mfram kɔɔ hɔ na anya ne ho ase </s>


['na wɔ kɔɔ nyinaa </s>',
 'na wɔ kɔɔ nea wopɛ sɛ meyɛ m agya fie nyinaa wura ɔhene anim </s>',
 'na wɔ kɔɔ ɔse </s>',
 'na wɔ kɔɔ ɛserɛ so no te aseɛ yi </s>',
 'na wɔ kɔɔ so yɛɛ duru na wo nnwan mma yerenom mfram kɔɔ hɔ na anya ne ho ase </s>']

In [68]:
# Example 3: Single word context -- one-word prompt, sampled at T=0.8
context3 = ['ɔde']
print("\nEXAMPLE 3: Single Word Context", "="*80, sep="\n")
interactive_generation(
    model=gen_model,
    context_words=context3,
    num_generations=5,
    temperature=0.8,
    style='w_sampling',
)



EXAMPLE 3: Single Word Context
Context: ɔde
Model: 3-gram with KN smoothing
Strategy: w_sampling (T=0.8)
1. ɔde na wɔn a wodi wɔn akɔnnɔ nea mo agya no de worentumi nni m aso saa berɛ no soɔ </s>
2. ɔde ne nsa na obiara a obedi nkonim no mu no wɔyii wɔn firii wɔn afa na wɔasɛe no mu </s>
3. ɔde ne ho adi kyerɛɛ me sɛ me deɛ a wɔadome wɔn no so na wɔn agyapadeɛ a ne ho pɔbɔne
4. ɔde no ho </s>
5. ɔde okuo yɛɛ no te sɛ na deɛ sɛ moasiesie mo ho sɛ nnipa nyinaa so na ɔde ayitoma bɔɔ n


['ɔde na wɔn a wodi wɔn akɔnnɔ nea mo agya no de worentumi nni m aso saa berɛ no soɔ </s>',
 'ɔde ne nsa na obiara a obedi nkonim no mu no wɔyii wɔn firii wɔn afa na wɔasɛe no mu </s>',
 'ɔde ne ho adi kyerɛɛ me sɛ me deɛ a wɔadome wɔn no so na wɔn agyapadeɛ a ne ho pɔbɔne',
 'ɔde no ho </s>',
 'ɔde okuo yɛɛ no te sɛ na deɛ sɛ moasiesie mo ho sɛ nnipa nyinaa so na ɔde ayitoma bɔɔ n']