# N-gram Language Modeling

In [32]:
import string
import pandas as pd
from sklearn.model_selection import train_test_split
import os

kI_lambd = 10

## Helper funtions


In [33]:
def preprocess_input(input_, vocab):
    tokens = input_.lower().split()
    return [w if w in vocab else "<UNK>" for w in tokens]
    
def remove_punctuation(text):
    # Create a string of all punctuation EXCEPT the single quote
    punct_to_remove = string.punctuation.replace("'", "")   
    return text.translate(str.maketrans('', '', punct_to_remove))

In [34]:
def get_lambdas_trigram(w1, w2, unigram_counts, bigram_counts):
    # Context for a trigram is the word pair (w1, w2)
    c_w1_w2 = bigram_counts.get((w1, w2), 0)
    c_w2 = unigram_counts.get(w2, 0)
    
    # l3 = Trust in Trigram
    l3 = c_w1_w2 / (c_w1_w2 + kI_lambd) if c_w1_w2 > 0 else 0
    
    # Remaining trust is split between Bigram and Unigram based on Bigram context
    remaining = 1 - l3
    l2_ratio = c_w2 / (c_w2 + kI_lambd) if c_w2 > 0 else 0
    
    l2 = remaining * l2_ratio
    l1 = remaining * (1 - l2_ratio)
    
    return l1, l2, l3
    


def get_lambdas_bigram(w1, unigram_counts):
    # Context for a bigram is just the previous word (w1)
    context_count = unigram_counts.get(w1, 0)
    
    # l2 = Trust in Bigram expert
    l2 = context_count / (context_count + kI_lambd) if context_count > 0 else 0
    
    # l1 = Trust in Unigram expert (the leftover)
    l1 = 1 - l2
    
    return l1, l2

In [35]:
# Read file and clean sentences
sentences = []
vocab = set()

bigram_counts = {}
trigram_counts = {}
fourgram_counts ={}
unigram_counts = {}
vocab_size = 0
TRAIN = False


with open("train-transcription-data.txt", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        line = remove_punctuation(line)
        if line:
            sentences.append(line)


In [36]:
# Create DataFrame
df = pd.DataFrame(sentences, columns=["sentence"])
print(df.head())

# Split into training and validation sets use dev set as held-out set
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42) 
    # Split temp into dev and test
    # 50% of 20% = 10% each
dev_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
print(f"Train size: {len(train_df)}")
print(f"Dev size: {len(dev_df)}")
print(f"Test size: {len(test_df)}")


                                            sentence
0  Adwumayɛfoɔ nsia adwumayɛfoɔ nson a wɔreyɛ adw...
1  Baabi a yɛbu fangoo na ɛhɔ ahye ama nwisie ɛfi...
2  Atoyerɛnkyɛm asi wɔ bea a wɔgu fangoo ɛgu ɛhyɛ...
3  Mmmaa mmienu ɛne sukuu nkwadaa mmiɛnsa ɛgyina ...
4  Nnipa nnum koto hɔ a nhwiren sisi wɔn anim Wɔa...
Train size: 15029
Dev size: 1879
Test size: 1879


In [37]:
os.makedirs("data", exist_ok=True)

train_df.to_csv("data/train.csv", index=False)
test_df.to_csv("data/test.csv", index=False)
dev_df.to_csv("data/dev.csv")


In [38]:
def heldout_corpus(dev_df):
    total_tokens = 0
    bigram_counts = {}
    trigram_counts = {}
    fourgram_counts ={}
    unigram_counts = {}
    

    for sentence in dev_df["sentence"]:

        
        raw_tokens = sentence.lower().split()  
        vocab.update(raw_tokens)
        for token in raw_tokens:
                unigram_counts[token] = unigram_counts.get(token, 0) + 1
                total_tokens += 1

        bigram_tokens = ['<s>'] + raw_tokens + ['</s>']
      
        for i in range(len(bigram_tokens) - 1):
            bigram = (bigram_tokens[i], bigram_tokens[i + 1])
            bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

        trigram_tokens = ['<s>', '<s>'] + raw_tokens + ['</s>']
        for i in range(len(trigram_tokens) - 2):
            trigram = (trigram_tokens[i], trigram_tokens[i + 1], trigram_tokens[i + 2])
            trigram_counts[trigram] = trigram_counts.get(trigram, 0) + 1

    unigram_df = pd.DataFrame(
        [(w1, count) for (w1), count in unigram_counts.items()],
        columns=["word", "count"]
    )

    bigram_df = pd.DataFrame(
        [(w1, w2, count) for (w1, w2), count in bigram_counts.items()],
        columns=["word_1", "word_2", "count"]
    )

    trigram_df = pd.DataFrame(
        [(w1, w2, w3, count) for (w1, w2, w3), count in trigram_counts.items()],
        columns=["word_1", "word_2", "word_3", "count"]
    )        

    unigram_df.to_csv("data/heldout_unigram_counts.csv")
    bigram_df = bigram_df.sort_values(by="count", ascending=False)
    bigram_df.to_csv("data/heldout_bigram_counts.csv", index=False)
 
    trigram_df = trigram_df.sort_values(by="count", ascending=False)
    trigram_df.to_csv("data/heldout_trigram_counts.csv", index=False)

In [39]:
heldout_corpus(dev_df)

In [40]:
def train_grams(train_df, bigram_counts, trigram_counts, fourgram_counts):
    total_tokens = 0
    for sentence in train_df["sentence"]:

        
        raw_tokens = sentence.lower().split()  
        vocab.update(raw_tokens)
        for token in raw_tokens:
                unigram_counts[token] = unigram_counts.get(token, 0) + 1
                total_tokens += 1

        bigram_tokens = ['<s>'] + raw_tokens + ['</s>']
      
        for i in range(len(bigram_tokens) - 1):
            bigram = (bigram_tokens[i], bigram_tokens[i + 1])
            bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

        trigram_tokens = ['<s>', '<s>'] + raw_tokens + ['</s>']
        for i in range(len(trigram_tokens) - 2):
            trigram = (trigram_tokens[i], trigram_tokens[i + 1], trigram_tokens[i + 2])
            trigram_counts[trigram] = trigram_counts.get(trigram, 0) + 1

        fourgram_tokens = ['<s>', '<s>', '<s>'] + raw_tokens + ['</s>']
        for i in range(len(fourgram_tokens) - 3):
            fourgram = (fourgram_tokens[i], fourgram_tokens[i + 1], fourgram_tokens[i + 2], fourgram_tokens[i + 3])
            fourgram_counts[fourgram] = fourgram_counts.get(fourgram, 0) + 1


    vocab_df = pd.DataFrame(list(vocab),columns=["word"])
    unigram_df = pd.DataFrame(
        [(w1, count) for (w1), count in unigram_counts.items()],
        columns=["word", "count"]
    )
    bigram_df = pd.DataFrame(
        [(w1, w2, count) for (w1, w2), count in bigram_counts.items()],
        columns=["word_1", "word_2", "count"]
    )


    trigram_df = pd.DataFrame(
        [(w1, w2, w3, count) for (w1, w2, w3), count in trigram_counts.items()],
        columns=["word_1", "word_2", "word_3", "count"]
    )
    fourgram_df = pd.DataFrame(
        [(w1, w2, w3, w4, count) for (w1, w2, w3, w4), count in fourgram_counts.items()],
        columns=["word_1", "word_2", "word_3", "word_4", "count"]
    )


    bigram_df = bigram_df.sort_values(by="count", ascending=False)
    fourgram_df = fourgram_df.sort_values(by="count", ascending=False)
    trigram_df = trigram_df.sort_values(by="count", ascending=False)

    vocab_df.to_csv("data/vocab.csv")
    unigram_df.to_csv("data/unigram_counts.csv")
    bigram_df.to_csv("data/bigram_counts.csv", index=False)
    trigram_df.to_csv("data/trigram_counts.csv", index=False)
    fourgram_df.to_csv("data/fourgram_counts.csv", index=False)

    print("Bigram and trigram counts saved to 'data' directory.")
    print(f"""  
        Vocab size: {len(vocab)}
        Number of bigrams: {len(bigram_counts)}
        Number of trigrams: {len(trigram_counts)}
        Number of 4_grams: {len(fourgram_counts)}
    """)
    return vocab

In [41]:
if(TRAIN):
    vocab = train_grams(train_df, bigram_counts, trigram_counts, fourgram_counts)

vocab = pd.read_csv("data/vocab.csv")
bigram_df = pd.read_csv("data/bigram_counts.csv")
trigram_df = pd.read_csv("data/trigram_counts.csv")
hO_unigram_df = pd.read_csv("data/heldout_unigram_counts.csv")
hO_bigram_df = pd.read_csv("data/heldout_bigram_counts.csv")
hO_trigram_df = pd.read_csv("data/heldout_trigram_counts.csv")
fourgram_df = pd.read_csv("data/fourgram_counts.csv")
fourgram_pruned = fourgram_df[~fourgram_df['count']<=1]


In [42]:
vocab_size = len(vocab)

display(vocab_size)
display(bigram_df.head())
display(trigram_df.head())


20372

Unnamed: 0,word_1,word_2,count
0,bi,nso,2632
1,hɔ,a,2234
2,no,mu,2132
3,bi,a,1846
4,no,so,1627


Unnamed: 0,word_1,word_2,word_3,count
0,<s>,<s>,nnipa,1287
1,gyina,hɔ,a,566
2,<s>,<s>,maame,515
3,<s>,<s>,nkurɔfoɔ,432
4,<s>,<s>,mmarima,402


# Laplace Smoothing Functions
## Bigram, Trigram, 4gram


In [43]:
def laplace_smoothing_bigram(input_, unigram_counts, bigram_counts, vocab):
    tokenized_input = input_.lower().split()
    last_word = tokenized_input[-1]

    vocab_size = len(vocab)
    vocab_probabilities = {}

    unigram_count = unigram_counts.get(last_word, 0)

    for vocab_word in vocab:
        bigram = (last_word, vocab_word)
        bigram_count = bigram_counts.get(bigram, 0)

        probability = (bigram_count + 1) / (unigram_count + vocab_size)
        vocab_probabilities[vocab_word] = probability

    top_suggestions = sorted(
        vocab_probabilities.items(),
        key=lambda x: x[1],
        reverse=True
    )[:3]

    return top_suggestions


In [44]:
# Function takes sentence as input and suggests possible words that comes after the sentence  
def laplace_smoothing_trigram(input_, bigram_counts, trigram_counts, vocab):
    # Consider the last bigram of sentence
    tokenized_input = input_.lower().split()
    last_bigram = tuple(tokenized_input[-2:])
    
    # Calculating probability for each word in vocab
    vocab_probabilities = {}

    # Laplace Smoothing
    for vocab_word in vocab:
        trigram = (last_bigram[0], last_bigram[1], vocab_word)

        trigram_count = trigram_counts.get(trigram, 0)
        bigram_count = bigram_counts.get(last_bigram, 0)
        
         # Laplace smoothing formula
        probability = (trigram_count + 1) / (bigram_count + vocab_size)
        vocab_probabilities[vocab_word] = probability
    
    # Sorting the vocab probability in descending order to get top probable words
    top_suggestions = sorted(vocab_probabilities.items(), key=lambda x: x[1], reverse=True)[:3]
    return top_suggestions




# Add-k Smoothing Functions
## Bigram, Trigram, 4-gram


In [45]:

def add_k_smoothing_trigram(input_, bigram_counts, trigram_counts, vocab, k=0.1):
    tokenized_input = input_.lower().split()
    last_bigram = tuple(tokenized_input[-2:])

    vocab_size = len(vocab)
    vocab_probabilities = {}

    bigram_count = bigram_counts.get(last_bigram, 0)

    for vocab_word in vocab:
        trigram = (last_bigram[0], last_bigram[1], vocab_word)
        trigram_count = trigram_counts.get(trigram, 0)

        probability = (trigram_count + k) / (bigram_count + k * vocab_size)
        vocab_probabilities[vocab_word] = probability

    top_suggestions = sorted(
        vocab_probabilities.items(),
        key=lambda x: x[1],
        reverse=True
    )[:3]

    return top_suggestions


In [46]:
import math

def get_add_k_prob(word, context, counts, context_counts, vocab_size, k=0.1):
    """
    Standard Add-K probability for any N-gram level.
    counts: dict of (context + word)
    context_counts: dict of (context)
    """
    count_ngram = counts.get(context + (word,), 0)
    count_context = context_counts.get(context, 0)
    return (count_ngram + k) / (count_context + k * vocab_size)


# Interpolation
## Bigram, Trigram

### P(w2​∣w1​) = λ P_bigram​(w2​∣w1​) + (1−λ)P_unigram​(w2​)

In [47]:

def get_interpolated_trigram_prob(w3, w1, w2, unigram_counts, bigram_counts, trigram_counts, total_tokens):
    """
    Calculates P(w3 | w1, w2) using interpolated weights.
    """
    # Get dynamic lambdas based on context
    l1, l2, l3 = get_lambdas_trigram(w1, w2, unigram_counts, bigram_counts)
    
    # MLE Probabilities
    p3 = trigram_counts.get((w1, w2, w3), 0) / bigram_counts.get((w1, w2), 1) if (w1, w2) in bigram_counts else 0
    p2 = bigram_counts.get((w2, w3), 0) / unigram_counts.get(w2, 1) if w2 in unigram_counts else 0
    p1 = unigram_counts.get(w3, 0) / total_tokens if total_tokens > 0 else 0
    
    return (l3 * p3) + (l2 * p2) + (l1 * p1)

In [48]:
def calculate_perplexity(test_df, prob_func, **kwargs):
    """
    test_df: DataFrame containing sentences
    prob_func: The probability logic to use
    **kwargs: Counts, vocab, and k-values needed by the prob_func
    """
    total_log_prob = 0
    total_word_count = 0
    
    for sentence in test_df["sentence"]:
        # Preprocess and pad
        tokens = ["<s>", "<s>"] + sentence.lower().split() + ["</s>"]
        
        for i in range(2, len(tokens)):
            w1, w2, w3 = tokens[i-2], tokens[i-1], tokens[i]
            
            # Call the passed probability function
            prob = prob_func(w3, w1, w2, **kwargs)
            
            # Use a tiny floor for probability to avoid log(0) if smoothing fails
            prob = max(prob, 1e-10) 
            
            total_log_prob += math.log2(prob)
            total_word_count += 1
            
    avg_log_prob = total_log_prob / total_word_count
    return math.pow(2, -avg_log_prob)

In [49]:
# Wrapper to match the perplexity function signature
def add_k_wrapper(w3, w1, w2, **kwargs):
    return get_add_k_prob(w3, (w1, w2), kwargs['trigram_counts'], kwargs['bigram_counts'], kwargs['vocab_size'], k=0.5)

pp_add_k = calculate_perplexity(test_df, add_k_wrapper, 
                                trigram_counts=trigram_counts, 
                                bigram_counts=bigram_counts, 
                                vocab_size=len(vocab))
print(f"Add-K Perplexity: {pp_add_k}")

Add-K Perplexity: 20372.000000278156


In [52]:
import random

def generate_sentence(primer, vocab, unigram_counts, bigram_counts, trigram_counts, total_tokens, max_len=20):
    """
    Generates a sentence token by token using interpolated trigram probabilities.
    """
    # 1. Prepare the sequence from your primer
    # "me pɛ sɛ me" -> ['me', 'pɛ', 'sɛ', 'me']
    sentence = primer.lower().split()
    
    for _ in range(max_len):
        # 2. Identify the context (the last two words)
        w1 = sentence[-2] if len(sentence) >= 2 else None
        w2 = sentence[-1]
        
        best_word = None
        max_prob = -1
        
        # 3. Efficiency Trick: Only check words that have a chance of appearing
        # We look at words that appeared after w2 in the training data
        candidates = [word for (prev, word) in bigram_counts.keys() if prev == w2]
        
        # If w2 is totally new, fallback to a small subset of the vocab or unigrams
        if not candidates:
            candidates = list(vocab)[:500] 

        for candidate in set(candidates):
            # Calculate P(candidate | w1, w2) using your interpolation function
            prob = get_interpolated_trigram_prob(
                candidate, w1, w2, 
                unigram_counts, bigram_counts, trigram_counts, total_tokens
            )
            
            if prob > max_prob:
                max_prob = prob
                best_word = candidate
        
        # 4. Append and check for sentence end
        if best_word is None: break
        sentence.append(best_word)
        
        if best_word in {'.', '!', '?'}:
            break
            
    return " ".join(sentence)

# Execution
# Ensure unigram_counts is defined as: Counter(lower_case_corpus)
total_tokens = sum(unigram_counts.values())
output = generate_sentence("me pɛ sɛ me", vocab, unigram_counts, bigram_counts, trigram_counts, total_tokens)
print(output)

me pɛ sɛ me word word word word word word word word word word word word word word word word word word word word


In [51]:
generate_sentence("me pɛ sɛ me", vocab, unigram_counts, bigram_counts, trigram_counts, sum(unigram_counts.values()))

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/patrickadu-amankwah/nltk_data'
    - '/Users/patrickadu-amankwah/Documents/Msc Intelligent Computing Systems/NLP/n_gram_twi_model/.venv/nltk_data'
    - '/Users/patrickadu-amankwah/Documents/Msc Intelligent Computing Systems/NLP/n_gram_twi_model/.venv/share/nltk_data'
    - '/Users/patrickadu-amankwah/Documents/Msc Intelligent Computing Systems/NLP/n_gram_twi_model/.venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


## Possible Tweaks

* Adding corpus (nltk, scrapping, etc)
* DIfferent model better than trigram model
* Handling 0 counts in model (Smoothing)