In [12]:
import numpy as np
import pandas as pd

In [13]:
# Read only the '4 forms (219k)' worksheet out of the word-frequency workbook
df_words = pd.read_excel(
    './data/wordFrequency.xlsx',
    sheet_name='4 forms (219k)',
)

In [14]:
# Show the first five rows of the loaded sheet
df_words.head(n=5)

Unnamed: 0,rank,word,freq,#texts,%caps,blog,web,TVM,spok,fic,...,news,acad,blogPM,webPM,TVMPM,spokPM,ficPM,magPM,newsPM,acadPM
0,1,the,50074257,483041,0.11,6272412,7101104,3784652,5769026,6311500,...,6582642,7447070,50480.69,55212.83,29550.39,45736.71,53341.69,53975.61,54070.43,62167.47
1,2,to,25557793,478977,0.02,3579158,3590504,2911924,3427348,2871517,...,3013501,2978222,28805.25,27917.05,22736.17,27171.94,24268.65,25264.13,24753.18,24861.93
2,3,and,24821791,478727,0.09,3211226,3458960,1828166,3325442,3064047,...,2995111,3633119,25844.11,26894.26,14274.24,26364.03,25895.82,26215.95,24602.12,30328.95
3,4,of,23605964,478144,0.01,2952017,3462140,1486604,2678416,2330823,...,2893200,4517563,23757.98,26918.99,11607.33,21234.42,19698.97,26054.07,23765.01,37712.21
4,5,a,21889251,477421,0.05,2783458,2827106,2519099,2716641,2749208,...,2959649,2229222,22401.41,21981.44,19669.01,21537.47,23234.95,24619.48,24310.83,18609.35


In [15]:
# Take the 'word' column as a NumPy array and print its first ten entries
words = df_words.loc[:, 'word'].values
print(words[:10])

['the' 'to' 'and' 'of' 'a' 'in' 'i' 'that' 'you' 'it']


In [16]:
N = 4 # context length in characters: passed as `n` to generate_probs/predicter, which condition each next letter on the previous N characters

def generate_probs(n, words):
    """Estimate next-letter probabilities from a list of words.

    Every word is scanned letter by letter (starting at its second letter).
    For each position the context is the ``n`` characters that precede it;
    near the start of a word the context is left-padded with ``'%'`` so that
    it is always exactly ``n`` characters long.

    Parameters
    ----------
    n : int
        Length of the context, in characters.
    words : iterable
        The corpus. Each item is converted with ``str()`` before use, so
        non-string entries are counted under their string form.

    Returns
    -------
    probs : dict[str, dict[str, float]]
        ``probs[context][letter]`` is the relative frequency with which
        ``letter`` follows ``context`` in the corpus.
    prev_counts : dict[str, int]
        ``prev_counts[context]`` is the total number of times ``context``
        was followed by any letter.

    Both dicts are empty when ``words`` is empty or contains no word with
    at least two characters.
    """
    num_transitions = {}

    for word in words:
        word = str(word)
        for idx in range(1, len(word)):
            if idx < n:
                # not enough letters yet: pad on the left up to n characters
                prev = "%" * (n - idx) + word[:idx]
            else:
                prev = word[idx - n:idx]

            nxt = word[idx]

            # count one more occurrence of `nxt` right after `prev`
            followers = num_transitions.setdefault(prev, {})
            followers[nxt] = followers.get(nxt, 0) + 1

    # Totals are computed once, after all words have been counted. Doing this
    # inside the loop re-summed the whole table for every word and left the
    # name unbound when `words` was empty.
    prev_counts = {
        prev: sum(followers.values())
        for prev, followers in num_transitions.items()
    }

    # turn the absolute counts of each context into relative frequencies
    probs = {
        prev: {nxt: count / prev_counts[prev] for nxt, count in followers.items()}
        for prev, followers in num_transitions.items()
    }

    return probs, prev_counts

In [17]:
# Build the tables for the full word list; the cell displays the returned (probs, prev_counts) pair
generate_probs(n=N, words=words)

({'%%%t': {'h': 0.20973782771535582,
   'o': 0.1348314606741573,
   'i': 0.06741573033707865,
   'w': 0.02247191011235955,
   'a': 0.11235955056179775,
   'e': 0.19101123595505617,
   'r': 0.20973782771535582,
   'u': 0.033707865168539325,
   'y': 0.0149812734082397,
   'v': 0.003745318352059925},
  '%%th': {'e': 0.30357142857142855,
   'a': 0.10714285714285714,
   'i': 0.17857142857142858,
   'o': 0.14285714285714285,
   'r': 0.23214285714285715,
   'u': 0.03571428571428571},
  '%%%a': {'n': 0.11977715877437325,
   's': 0.07799442896935933,
   'r': 0.08913649025069638,
   't': 0.07242339832869081,
   'l': 0.09192200557103064,
   'b': 0.03899721448467967,
   'f': 0.036211699164345405,
   'g': 0.04735376044568245,
   'm': 0.036211699164345405,
   'w': 0.022284122562674095,
   'c': 0.11420612813370473,
   'i': 0.022284122562674095,
   'v': 0.011142061281337047,
   'd': 0.08913649025069638,
   'h': 0.005571030640668524,
   'p': 0.08635097493036212,
   'u': 0.036211699164345405,
   'a': 0.

In [18]:
def get_top3_weights(txt, probs, n):
    """Return the (up to) three heaviest continuations of the text typed so far.

    Parameters
    ----------
    txt : str
        Text typed so far; only its last ``n`` characters are used as the
        lookup key (the whole text when it is shorter than ``n``).
    probs : dict[str, dict]
        Mapping from a context string to a ``{candidate: weight}`` dict.
    n : int
        Length of the context, in characters.

    Returns
    -------
    dict
        At most three ``candidate: weight`` pairs, ordered from the lowest to
        the highest weight, so ``reversed(result)`` yields the best candidate
        first. Empty when the context is unknown or has no candidates.
    """
    # Clamp the slice start at 0: a negative start would wrap around and
    # silently drop leading characters when txt is shorter than n.
    context = txt[max(len(txt) - n, 0):]

    weights = probs.get(context, {})
    if not weights:
        return {}

    # A full (stable) sort guarantees the ascending order callers rely on;
    # a partition only guarantees *which* items are selected, not their order.
    ranked = sorted(weights.items(), key=lambda item: item[1])

    return dict(ranked[-3:])


In [19]:
# Quick check of get_top3_weights on a hand-made table keyed by the context 'oi'
word_counts = {
    'apple': 10,
    'banana': 5,
    'orange': 8,
    'grape': 3,
    'kiwi': 12,
}
print(
    get_top3_weights('oi', {'oi': word_counts}, 2)
)

{'orange': 8, 'apple': 10, 'kiwi': 12}


In [20]:
def predicter(n, words):
    """Interactively suggest the most likely next letters while the user types.

    The probability table is built once from ``words`` with
    ``generate_probs``. The function then keeps prompting for input:

    * an empty answer ends the session;
    * a single space starts a new word (the typed text is reset);
    * anything else is appended to the current text, and up to three
      suggestions for the next letter are printed, best first.

    Parameters
    ----------
    n : int
        Length of the context, in characters.
    words : iterable
        Corpus handed to ``generate_probs``.
    """
    # only the probability table is needed here; the raw totals are ignored
    probs = generate_probs(n, words)[0]

    # generate_probs pads word starts so that a context is always n characters
    # long; after the first typed letter that means n - 1 pad characters.
    word_start = '%' * (n - 1)

    txt = word_start
    while (curr := input("Next Letter: ")) != '':
        if curr == ' ':
            txt = word_start
            continue

        txt += curr
        top3 = get_top3_weights(txt, probs, n)

        # Rank explicitly instead of trusting the dict order of the helper, and
        # convert to plain str so the list prints without NumPy scalar wrappers.
        best_first = [str(letter) for letter in sorted(top3, key=top3.get, reverse=True)]
        print(f'\n{txt} \nBest 3 letters are: {best_first}')

In [21]:
# Start the interactive session (submit an empty input to stop)
predicter(n=N, words=words)


%%%a 
Best 3 letters are: ['n', 'c', 'l']

%%%an 
Best 3 letters are: ['n', 'y', 'g']

%%%ann 
Best 3 letters are: ['o', 'i', 'e']

%%%anno 
Best 3 letters are: ['u']

%%%annou 
Best 3 letters are: ['n']

%%%announ 
Best 3 letters are: ['c']

%%%announc 
Best 3 letters are: ['e', 'i']

%%%announce 
Best 3 letters are: ['m', 'd']

%%%announcem 
Best 3 letters are: ['e']

%%%announceme 
Best 3 letters are: ['n']

%%%announcemen 
Best 3 letters are: ['t', 'd']

%%%announcement 
Best 3 letters are: ['s', 'a', 'i']

%%%s 
Best 3 letters are: ['t', 'e', 'u']

%%%st 
Best 3 letters are: ['r', 'a', 'o']

%%%str 
Best 3 letters are: ['e', 'u', 'a']

%%%stre 
Best 3 letters are: ['a', 'e', 't']


In [None]:
# Save the probs dict as a json
import json

# Build the table before touching the file: mode 'w' truncates on open, so
# computing inside the `with` block would leave an empty probs.json behind
# if generate_probs raised part-way through.
probs_table = generate_probs(N, words)[0]

with open('./data/probs.json', 'w', encoding='utf-8') as f:
    json.dump(probs_table, f)