# **Calculating N-Gram Letter Prediction**

In [1]:
import numpy as np
import pandas as pd

## **First Dataset: 5k most frequent words**

Source: [wordfrequency.info](https://www.wordfrequency.info)

In [3]:
# Load a single sheet of the workbook into a DataFrame.
df_words = pd.read_excel(
    '../data/wordFrequency.xlsx',
    sheet_name='4 forms (219k)',
)

In [4]:
# Preview the first five rows of the loaded sheet.
df_words.head(n=5)

Unnamed: 0,rank,word,freq,#texts,%caps,blog,web,TVM,spok,fic,...,news,acad,blogPM,webPM,TVMPM,spokPM,ficPM,magPM,newsPM,acadPM
0,1,the,50074257,483041,0.11,6272412,7101104,3784652,5769026,6311500,...,6582642,7447070,50480.69,55212.83,29550.39,45736.71,53341.69,53975.61,54070.43,62167.47
1,2,to,25557793,478977,0.02,3579158,3590504,2911924,3427348,2871517,...,3013501,2978222,28805.25,27917.05,22736.17,27171.94,24268.65,25264.13,24753.18,24861.93
2,3,and,24821791,478727,0.09,3211226,3458960,1828166,3325442,3064047,...,2995111,3633119,25844.11,26894.26,14274.24,26364.03,25895.82,26215.95,24602.12,30328.95
3,4,of,23605964,478144,0.01,2952017,3462140,1486604,2678416,2330823,...,2893200,4517563,23757.98,26918.99,11607.33,21234.42,19698.97,26054.07,23765.01,37712.21
4,5,a,21889251,477421,0.05,2783458,2827106,2519099,2716641,2749208,...,2959649,2229222,22401.41,21981.44,19669.01,21537.47,23234.95,24619.48,24310.83,18609.35


Keep only the `word` column.

In [5]:
# Pull the `word` column out of the frame and peek at its first ten entries.
words = df_words.loc[:, 'word'].values
print(words[0:10])

['the' 'to' 'and' 'of' 'a' 'in' 'i' 'that' 'you' 'it']


Let's create an n-gram model that calculates the probability of the next letter for every prefix of length n:

In [6]:
N = 4  # prefix length: the 4 previous characters are used to predict the next one


def generate_probs(n, words):
    """Build a next-letter probability table from fixed-length character prefixes.

    For every position ``idx >= 1`` of every word, the ``n`` characters before
    that position form the prefix and the character at ``idx`` is counted as
    its follower. Prefixes near the start of a word are left-padded with ``%``
    so that every key has exactly ``n`` characters. The first letter of a word
    is never counted as a follower.

    Args:
        n: Number of characters in each prefix.
        words: Iterable of words; every item is converted with ``str()``.

    Returns:
        A ``(probs, prev_counts)`` tuple. ``probs[prefix][letter]`` is the
        relative frequency of ``letter`` after ``prefix`` in the corpus and
        ``prev_counts[prefix]`` is the number of times ``prefix`` was seen.
        Both dicts are empty when ``words`` yields no transitions.
    """
    num_transitions = {}

    for word in words:
        word = str(word)
        for idx in range(1, len(word)):
            if idx < n:
                # not enough letters yet: pad on the left with % to reach n characters
                prev = "%" * (n - idx) + word[:idx]
            else:
                # the n characters right before the current one
                prev = word[idx - n:idx]

            nxt = word[idx]  # the letter that follows the prefix

            # count one more occurrence of `nxt` after `prev`
            followers = num_transitions.setdefault(prev, {})
            followers[nxt] = followers.get(nxt, 0) + 1

    # Total occurrences of each prefix. Computed once, after all words were
    # counted, instead of being rebuilt for every word; this also keeps the
    # name defined when `words` is empty.
    prev_counts = {
        prev: sum(followers.values())
        for prev, followers in num_transitions.items()
    }

    # Relative frequency of every follower given its prefix.
    probs = {
        prev: {nxt: count / prev_counts[prev] for nxt, count in followers.items()}
        for prev, followers in num_transitions.items()
    }

    # relative frequency (prob) per prefix/letter and absolute frequency per prefix
    return probs, prev_counts

Let's test!

In [7]:
# Build the tables for the first dataset and display the returned tuple.
generate_probs(n=N, words=words)

({'%%%t': {'h': 0.20973782771535582,
   'o': 0.1348314606741573,
   'i': 0.06741573033707865,
   'w': 0.02247191011235955,
   'a': 0.11235955056179775,
   'e': 0.19101123595505617,
   'r': 0.20973782771535582,
   'u': 0.033707865168539325,
   'y': 0.0149812734082397,
   'v': 0.003745318352059925},
  '%%th': {'e': 0.30357142857142855,
   'a': 0.10714285714285714,
   'i': 0.17857142857142858,
   'o': 0.14285714285714285,
   'r': 0.23214285714285715,
   'u': 0.03571428571428571},
  '%%%a': {'n': 0.11977715877437325,
   's': 0.07799442896935933,
   'r': 0.08913649025069638,
   't': 0.07242339832869081,
   'l': 0.09192200557103064,
   'b': 0.03899721448467967,
   'f': 0.036211699164345405,
   'g': 0.04735376044568245,
   'm': 0.036211699164345405,
   'w': 0.022284122562674095,
   'c': 0.11420612813370473,
   'i': 0.022284122562674095,
   'v': 0.011142061281337047,
   'd': 0.08913649025069638,
   'h': 0.005571030640668524,
   'p': 0.08635097493036212,
   'u': 0.036211699164345405,
   'a': 0.

In [56]:
def get_top3_weights(txt, probs, n):
    """Return the (up to) three most likely next letters for the end of ``txt``.

    Args:
        txt: Text typed so far; its last ``n`` characters are the lookup key.
            When ``txt`` is shorter than ``n`` the whole text is used.
        probs: Mapping ``prefix -> {letter: weight}``.
        n: Prefix length used for the lookup.

    Returns:
        A dict with at most three ``letter: weight`` entries, ordered from the
        lowest to the highest weight (so ``reversed(result)`` is best-first).
        An empty dict when the prefix is unknown or has no followers.
    """
    # Clamp the start index at 0: a negative start would silently select a
    # different, shorter suffix when the text is shorter than n.
    prefix = txt[max(len(txt) - n, 0):]

    weights = probs.get(prefix, {})
    if not weights:
        return {}

    # A full sort fixes the order of the selected entries; a partition only
    # separates the top three from the rest without ordering them.
    ranked = sorted(weights.items(), key=lambda item: item[1])

    return dict(ranked[-3:])


In [11]:
# Quick sanity check of the top-3 selection on a hand-made table.
word_counts = {
    'apple': 10,
    'banana': 5,
    'orange': 8,
    'grape': 3,
    'kiwi': 12,
}
toy_table = {'oi': word_counts}
print(get_top3_weights('oi', toy_table, 2))

{'orange': 8, 'apple': 10, 'kiwi': 12}


In [12]:
def predicter(n, words):
    """Interactively suggest the most likely next letters while a word is typed.

    Builds the probability table from ``words`` and then reads input in a
    loop: an empty input stops the loop, a single space starts a new word, and
    anything else is appended to the current text before the suggestions for
    it are printed.

    Args:
        n: Prefix length passed to ``generate_probs`` and ``get_top3_weights``.
        words: Corpus used to build the probability table.
    """
    probs, _prev_counts = generate_probs(n, words)

    # generate_probs pads the first prefix of a word with n - 1 '%' characters,
    # so the typed text must start with the same padding for lookups to match
    # (this is '%%%' for n = 4).
    padding = '%' * (n - 1)

    txt = padding
    while (curr := input("Next Letter: ")) != '':
        if curr == ' ':
            # a space ends the current word: start over from the padding
            txt = padding
            continue

        txt += curr
        top3 = get_top3_weights(txt, probs, n)
        # Rank explicitly by weight so the printed list is always best-first.
        best = sorted(top3, key=top3.get, reverse=True)
        print(f'\n{txt} \nBest 3 letters are: {best}')

In [13]:
# Start the interactive loop on the first dataset (empty input stops it).
predicter(n=N, words=words)


%%%a 
Best 3 letters are: ['n', 'c', 'l']

%%%an 
Best 3 letters are: ['n', 'y', 'g']

%%%ann 
Best 3 letters are: ['o', 'i', 'e']

%%%anno 
Best 3 letters are: ['u']

%%%annou 
Best 3 letters are: ['n']

%%%announ 
Best 3 letters are: ['c']

%%%announc 
Best 3 letters are: ['e', 'i']

%%%announce 
Best 3 letters are: ['m', 'd']

%%%announcem 
Best 3 letters are: ['e']

%%%announcemn 
Best 3 letters are: []


If we save this data, we can reuse it later in any other project!

In [9]:
# Save the probs dict and the prefix counts as JSON files
import json
from pathlib import Path

probs, prev_counts = generate_probs(N, words)

# Create the output folder if it is missing so the writes below cannot fail
# with FileNotFoundError on a fresh checkout.
out_dir = Path('../data/out')
out_dir.mkdir(parents=True, exist_ok=True)

with open(out_dir / 'probs.json', 'w', encoding='utf-8') as f:
    json.dump(probs, f)

with open(out_dir / 'prefix-count.json', 'w', encoding='utf-8') as f:
    json.dump(prev_counts, f)

## **Second Dataset: 100k most common words**

It seems a bit outdated and it's from a gist, so... let's test!

Source: [https://gist.github.com/h3xx/1976236](https://gist.github.com/h3xx/1976236)

In [46]:
# Read the whole file, then collect every whitespace-separated token from the
# lines that are neither empty nor comments (comment lines start with '#').
with open('../data/wiki-100k.txt', 'r', encoding='utf-8') as f:
    raw_lines = f.read().splitlines()

words100k = np.array([
    token
    for raw_line in raw_lines
    if raw_line and raw_line[0] != '#'
    for token in raw_line.split()
])

In [47]:
# First ten tokens and the total number of tokens read.
(words100k[0:10], len(words100k))

(array(['the', 'of', 'and', 'to', 'a', 'in', 'that', 'I', 'was', 'he'],
       dtype='<U23'),
 98916)

A quick look at the file shows a lot of repetition: the same word appears both capitalized and in lowercase. Proper nouns can also be a problem. Therefore, let's try removing every word that is not entirely lowercase:

In [48]:
def preprocess_text(text):
    """Return, as a list, the items of ``text`` whose ``islower()`` is true.

    Items containing an uppercase character, and items with no cased
    character at all, are dropped.
    """
    kept = []
    for token in text:
        if token.islower():
            kept.append(token)

    # this function can be improved in the future!
    return kept

In [49]:
# Replace the token array with the filtered list of lowercase-only words.
words100k = preprocess_text(text=words100k)

About 33k fewer words!

In [51]:
# Number of words left after filtering, and the first ten of them.
(len(words100k), words100k[0:10])

(65758, ['the', 'of', 'and', 'to', 'a', 'in', 'that', 'was', 'he', 'his'])

In [52]:
# Only the probability table (first element of the returned tuple) is kept.
second_probs = generate_probs(n=N, words=words100k)[0]

In [53]:
# Display the probability table built from the filtered wiki-100k words.
second_probs

{'%%%t': {'h': 0.14613713931120073,
  'o': 0.13217499224325163,
  'i': 0.07787775364567173,
  'w': 0.022339435308718587,
  'a': 0.12131554452373565,
  'e': 0.14613713931120073,
  'u': 0.06143344709897611,
  't': 0.0006205398696866273,
  'r': 0.2544213465715172,
  'y': 0.01768538628606888,
  'ê': 0.000930809804529941,
  "'": 0.003723239218119764,
  'ä': 0.007756748371082842,
  'm': 0.00031026993484331366,
  'ô': 0.0006205398696866273,
  'ú': 0.00031026993484331366,
  'v': 0.0006205398696866273,
  's': 0.00031026993484331366,
  'é': 0.001861619609059882,
  'â': 0.0006205398696866273,
  'l': 0.00031026993484331366,
  'j': 0.0006205398696866273,
  'ã': 0.00031026993484331366,
  'í': 0.000930809804529941,
  'z': 0.00031026993484331366,
  'æ': 0.00031026993484331366},
 '%%th': {'e': 0.2702127659574468,
  'a': 0.08723404255319149,
  'i': 0.2,
  'o': 0.10851063829787234,
  'r': 0.2276595744680851,
  'y': 0.01702127659574468,
  'u': 0.06595744680851064,
  "'": 0.002127659574468085,
  'w': 0.008

In [54]:
def predicterV2(n, probs):
    """Interactively suggest next letters using an already built table.

    Reads input in a loop: an empty input stops the loop, a single space
    starts a new word, and anything else is appended to the current text
    before the suggestions for it are printed.

    Args:
        n: Prefix length the table was built with; also used for the lookup.
        probs: Mapping ``prefix -> {letter: probability}``.
    """
    # The first lookup of a word needs n - 1 '%' characters in front of the
    # first typed letter to form an n-character key ('%%%' for n = 4).
    padding = '%' * (n - 1)

    txt = padding
    while (curr := input("Next Letter: ")) != '':
        if curr == ' ':
            # a space ends the current word: start over from the padding
            txt = padding
            continue

        txt += curr
        top3 = get_top3_weights(txt, probs, n)
        # Rank explicitly by weight so the printed list is always best-first.
        best = sorted(top3, key=top3.get, reverse=True)
        print(f'\n{txt} \nBest 3 letters are: {best}')

In [57]:
# Start the interactive loop with the table built from the second dataset.
predicterV2(n=N, probs=second_probs)


%%%a 
Best 3 letters are: ['n', 'l', 'r']

%%%an 
Best 3 letters are: ['n', 't', 'g']

%%%ann 
Best 3 letters are: ['o', 'u', 'i']

%%%anno 
Best 3 letters are: ['u', 'n', 'y']

%%%annou 
Best 3 letters are: ['n', 's']

%%%announ 
Best 3 letters are: ['c', 's']

%%%announc 
Best 3 letters are: ['e', 'i']

%%%announce 
Best 3 letters are: ['d', 'r', 's']

%%%announced 
Best 3 letters are: ['e', 'i']


In quick tests, the second model behaves very similarly to the first one. Therefore, for visualization, let's focus on the first table in the next notebook!

Now, we are going to test the accuracy of each model:

-----

# **Testing Accuracy**

Now, our goal is to measure the accuracy of our model with an 80/20 hold-out split of the words. We will shuffle the data, split it, build the probability table from the training set, and then use it to predict the next letter for every 4-letter prefix in the test set.

In that way, we can calculate an accuracy for our model and test different N values!

## 1. Words from WordFrequency

In [39]:
# Fix NumPy's global random state so the shuffles below are repeatable.
np.random.seed(seed=42)

In [40]:
# Make sure `words` is a NumPy array, then shuffle it in place.
words = np.array(words)
np.random.shuffle(words)

# The first 80% of the shuffled words train the model; the rest is held out.
split_at = int(len(words) * 0.8)
train, test = words[:split_at], words[split_at:]

In [11]:
# Peek at the first ten words of each split.
(train[0:10], test[0:10])

(array(['demanding', 'terms', 'get', 'minnesota', 'cohen', 'office',
        'correct', 'ground', 'chemical', 'ah'], dtype=object),
 array(['seemingly', 'frequent', 'loans', 'andy', 'goodbye', 'whereas',
        'almost', 'captain', 'afghanistan', 'cast'], dtype=object))

Let's train the model with the training set

In [32]:
N = 4

# Build the tables from the training split only.
train_probs, train_prev_counts = generate_probs(n=N, words=train)

In [33]:
def eval(n, probs, test):
    """Measure next-letter accuracy of a probability table on held-out words.

    Each test word is left-padded with ``n - 1`` ``%`` characters. For every
    position that has a full ``n``-character prefix, the true next letter is
    compared with the table's ranking for that prefix. Prefixes that are
    missing from ``probs`` are skipped and do not count towards the total, so
    the scores are conditional on the prefix being known.

    NOTE: this name shadows the built-in ``eval``; it is kept unchanged
    because the notebook calls it by this name.

    Args:
        n: Prefix length the table was built with.
        probs: Mapping ``prefix -> {letter: probability}``.
        test: Iterable of words; every item is converted with ``str()``.

    Returns:
        ``(exact, top3)``: the fraction of scored positions where the true
        letter is the most probable one, and the fraction where it is among
        the three most probable ones. ``(0.0, 0.0)`` when no position could
        be scored.
    """
    exact_hits = 0
    top3_hits = 0
    total = 0

    for word in test:
        word = '%' * (n - 1) + str(word)
        # Start at n (not at a fixed 4): the first scored prefix is then the
        # n - 1 padding characters plus the first letter, for any n.
        for idx in range(n, len(word)):
            prev = word[idx - n:idx]
            if prev not in probs:
                continue

            nxt = word[idx]

            # Letters for this prefix, most probable first; ties keep the
            # table's insertion order, like max() picking the first maximum.
            ranked = sorted(probs[prev], key=probs[prev].get, reverse=True)

            if ranked and nxt == ranked[0]:
                exact_hits += 1
            if nxt in ranked[:3]:
                top3_hits += 1
            total += 1

    # Nothing could be scored: report zero accuracy instead of dividing by 0.
    if total == 0:
        return 0.0, 0.0

    return exact_hits / total, top3_hits / total

In [34]:
# (exact accuracy, top-3 accuracy) on the held-out words of the first dataset.
print(eval(n=N, probs=train_probs, test=test))

(0.45048504446240906, 0.6808811641067097)


## 2. Wiki-100k.txt

In [58]:
# Convert the filtered words to a NumPy array and shuffle it in place.
words100k = np.array(words100k)
np.random.shuffle(words100k)

# Same 80/20 split as for the first dataset.
split_at_100k = int(len(words100k) * 0.8)
train100k, test100k = words100k[:split_at_100k], words100k[split_at_100k:]

In [59]:
# Rebuild the tables from the 100k training split (overwrites the earlier ones).
train_probs, train_prev_counts = generate_probs(n=N, words=train100k)

In [60]:
# Number of distinct prefixes in the table vs. number of training words.
(len(train_probs), len(train100k))

(27269, 52606)

In [61]:
# (exact accuracy, top-3 accuracy) on the held-out words of the 100k dataset.
print(eval(n=N, probs=train_probs, test=test100k))

(0.44161918742069517, 0.7218794317418455)


That is interesting! Top-3 accuracy is higher on the pre-processed 100k dataset (about 0.72 vs. 0.68), while exact-match accuracy is slightly lower (about 0.44 vs. 0.45).