# **Calculating N-Gram Letter Prediction**

In [1]:
import numpy as np
import pandas as pd

## **First Dataset: 5k most frequent worrds**

Font: [wordfrequency.info](wordfrequency.info)

In [2]:
# get a specific page from the xlsx
df_words = pd.read_excel('../data/wordFrequency.xlsx', sheet_name='4 forms (219k)')

In [3]:
df_words.head()

Unnamed: 0,rank,word,freq,#texts,%caps,blog,web,TVM,spok,fic,...,news,acad,blogPM,webPM,TVMPM,spokPM,ficPM,magPM,newsPM,acadPM
0,1,the,50074257,483041,0.11,6272412,7101104,3784652,5769026,6311500,...,6582642,7447070,50480.69,55212.83,29550.39,45736.71,53341.69,53975.61,54070.43,62167.47
1,2,to,25557793,478977,0.02,3579158,3590504,2911924,3427348,2871517,...,3013501,2978222,28805.25,27917.05,22736.17,27171.94,24268.65,25264.13,24753.18,24861.93
2,3,and,24821791,478727,0.09,3211226,3458960,1828166,3325442,3064047,...,2995111,3633119,25844.11,26894.26,14274.24,26364.03,25895.82,26215.95,24602.12,30328.95
3,4,of,23605964,478144,0.01,2952017,3462140,1486604,2678416,2330823,...,2893200,4517563,23757.98,26918.99,11607.33,21234.42,19698.97,26054.07,23765.01,37712.21
4,5,a,21889251,477421,0.05,2783458,2827106,2519099,2716641,2749208,...,2959649,2229222,22401.41,21981.44,19669.01,21537.47,23234.95,24619.48,24310.83,18609.35


Filtering the words only.

In [4]:
words = df_words['word'].values
print(words[:10])

['the' 'to' 'and' 'of' 'a' 'in' 'i' 'that' 'you' 'it']


Let's create an n-gram model that calculates the probabilities of the next letter for every n-size preffix:

In [31]:
N = 4 #4-gram on the way!

def generate_probs(n, words):
    num_transitions = {}

    for word in words: 
        word = str(word)
        for idx in range(1, len(word)):
            if idx < n:
                prev = "%"*(n - idx) + word[:idx] # if there aren't enough letters to form an n-gram, adds % to complete it
            else:
                prev = word[idx - n:idx] # get the n-gram behind
            
            nxt = word[idx] # get the curr letter

            # if the n-gram isn't already in the table, adds it as an empty dict
            num_transitions.setdefault(prev, {})

            # then updates the count of that n-gram with the following letter by 1
            num_transitions[prev][nxt] = num_transitions[prev].setdefault(nxt, 0) + 1
        
        # for eacah one of the n-grams, count the sum of the occurences of that prefix
        # we will use this to compute the probability of each following letter
        prev_counts = {prev: sum(num_transitions[prev].values()) for prev in num_transitions}

    probs = {}
    for prev, nxt_dict in num_transitions.items():
        # for each n-gram prefix, compute the prob that the next one is the following letter
        for nxt in nxt_dict:
            probs.setdefault(prev, {})
            probs[prev][nxt] = num_transitions[prev][nxt] / prev_counts[prev]
    
    # returns relative frequency (prob) and absolute frequency for each n-gram and following word on the corpus
    return probs, prev_counts

Let's test!

In [6]:
%%time
generate_probs(N, words)

CPU times: total: 1.41 s
Wall time: 5.16 s


({'%%%t': {'h': 0.20973782771535582,
   'o': 0.1348314606741573,
   'i': 0.06741573033707865,
   'w': 0.02247191011235955,
   'a': 0.11235955056179775,
   'e': 0.19101123595505617,
   'r': 0.20973782771535582,
   'u': 0.033707865168539325,
   'y': 0.0149812734082397,
   'v': 0.003745318352059925},
  '%%th': {'e': 0.30357142857142855,
   'a': 0.10714285714285714,
   'i': 0.17857142857142858,
   'o': 0.14285714285714285,
   'r': 0.23214285714285715,
   'u': 0.03571428571428571},
  '%%%a': {'n': 0.11977715877437325,
   's': 0.07799442896935933,
   'r': 0.08913649025069638,
   't': 0.07242339832869081,
   'l': 0.09192200557103064,
   'b': 0.03899721448467967,
   'f': 0.036211699164345405,
   'g': 0.04735376044568245,
   'm': 0.036211699164345405,
   'w': 0.022284122562674095,
   'c': 0.11420612813370473,
   'i': 0.022284122562674095,
   'v': 0.011142061281337047,
   'd': 0.08913649025069638,
   'h': 0.005571030640668524,
   'p': 0.08635097493036212,
   'u': 0.036211699164345405,
   'a': 0.

In [18]:
def get_top3_weights(txt, probs, n):
    # get the n-gram prefix of the text
    # if that n-gram isn't in the table, returns an empty dict
    weights = probs.get(txt[len(txt)-n:], {})
    if not weights:
        return {}

    # get the 3 most probable following letters
    # if there are less than 3, gets all of them
    words = np.array(list(weights.keys()))
    freq = np.array(list(weights.values()))

    top = np.argpartition(freq, -3)[-3:] if len(freq) >= 3 else np.argpartition(freq, -len(freq))

    return dict(zip(words[top], freq[top]))


In [8]:
word_counts = {'apple': 10, 'banana': 5, 'orange': 8, 'grape': 3, 'kiwi': 12}
print(get_top3_weights('oi', {'oi': word_counts}, 2))

{'orange': 8, 'apple': 10, 'kiwi': 12}


In [9]:
def predicter(n, words):

    # initializes variables
    probs, prev_counts = generate_probs(n, words)

    txt = r'%%%'
    while (curr := input("Next Letter: ")) != '':
        if curr == ' ':
            txt = r'%%%'
            continue

        txt += curr
        top3 = get_top3_weights(txt, probs, n)
        print(f'\n{txt} \nBest 3 letters are: {list(reversed(top3.keys()))}')

In [10]:
predicter(N, words)


%%%hell 
Best 3 letters are: ['e', 'o']


If we save this data, we can use them later on any other project!

In [9]:
# Save the probs dict as a json
import json

probs, prev_counts = generate_probs(N, words)

with open('../data/out/word-frequency/word-frequency_probs.json', 'w') as f:
    json.dump(probs, f)

with open('../data/out/word-frequency/word-frequency_prefix-count.json', 'w') as f:
    json.dump(prev_counts, f)

## **Second Dataset: 100k most common words**

It seems a bit outdated and it's from a gist, so... let's test!

Font: [https://gist.github.com/h3xx/1976236](https://gist.github.com/h3xx/1976236)

In [67]:
words100k = []
with open('../data/wiki-100k.txt', 'r', encoding='utf-8') as f:
    curr = f.read().splitlines()

    for line in curr: # reading all lines and ignoring comments
        if line and line[0] != '#':
            words100k.extend(line.split())
    
    words100k = np.array(words100k)

In [68]:
words100k[:10], len(words100k)

(array(['the', 'of', 'and', 'to', 'a', 'in', 'that', 'I', 'was', 'he'],
       dtype='<U23'),
 98916)

Just from quickly analyzing the file, we can see that there is a big problem of repetition, where the same word appears capitalized and not capitalized. Also, some proper nouns can also be a problem. Therefore, let's test by eliminating any letter that is capitalized:

In [69]:
def preprocess_text(text):
    # filter all capitalized words and words that contain special characters
    words = [word for word in text if word.islower() and word.isalpha()]

    # this function can be improved in the future!

    return words

In [70]:
words100k = preprocess_text(words100k)

30k less words!

In [71]:
len(words100k), words100k[:10]

(63139, ['the', 'of', 'and', 'to', 'a', 'in', 'that', 'was', 'he', 'his'])

In [72]:
%%time
second_probs, second_prefix_count = generate_probs(N, words100k)

CPU times: total: 2min 50s
Wall time: 7min 57s


In [73]:
second_probs

{'%%%t': {'h': 0.1371794871794872,
  'o': 0.13173076923076923,
  'i': 0.07916666666666666,
  'w': 0.022435897435897436,
  'a': 0.1233974358974359,
  'e': 0.14967948717948718,
  'u': 0.0625,
  't': 0.000641025641025641,
  'r': 0.2586538461538462,
  'y': 0.017948717948717947,
  'ê': 0.0009615384615384616,
  'ä': 0.008012820512820512,
  'm': 0.0003205128205128205,
  'ô': 0.000641025641025641,
  'ú': 0.0003205128205128205,
  'v': 0.000641025641025641,
  's': 0.0003205128205128205,
  'é': 0.0019230769230769232,
  'â': 0.000641025641025641,
  'l': 0.0003205128205128205,
  'j': 0.000641025641025641,
  'ã': 0.0003205128205128205,
  'í': 0.0009615384615384616,
  'z': 0.0003205128205128205,
  'æ': 0.0003205128205128205},
 '%%th': {'e': 0.2505854800936768,
  'a': 0.07962529274004684,
  'i': 0.1990632318501171,
  'o': 0.10772833723653395,
  'r': 0.24824355971896955,
  'y': 0.01873536299765808,
  'u': 0.07259953161592506,
  'w': 0.00936768149882904,
  'é': 0.01405152224824356},
 '%%%o': {'f': 0.067

In [74]:
def predicterV2(n, probs):
    txt = r'%%%'
    while (curr := input("Next Letter: ")) != '':
        if curr == ' ':
            txt = r'%%%'
            continue

        txt += curr
        top3 = get_top3_weights(txt, probs, n)
        print(f'\n{txt} \nBest 3 letters are: {list(reversed(top3.keys()))}')

In [20]:
predicterV2(N, second_probs)


%%%hell 
Best 3 letters are: ['e', 's', 'i']

%%%hella 
Best 3 letters are: ['t', 'n', 'r']

%%%a 
Best 3 letters are: ['n', 'l', 'r']

%%%an 
Best 3 letters are: ['n', 't', 'g']

%%%ann 
Best 3 letters are: ['o', 'u', 'i']

%%%anno 
Best 3 letters are: ['u', 'n', 'y']

%%%annou 
Best 3 letters are: ['n', 's']

%%%announ 
Best 3 letters are: ['c', 's']

%%%announc 
Best 3 letters are: ['e', 'i']

%%%announce 
Best 3 letters are: ['d', 'r', 's']


With quick tests, it seems like the second model seems very similar to the first one. Therefore, for visualization, let's focus on the first table on the [next notebook](letter-stats.ipynb)!

In [75]:
with open('../data/out/wiki-100k/wiki-100k_probs.json', 'w') as f:
    json.dump(second_probs, f)

with open('../data/out/wiki-100k/wiki-100k_prefix-count.json', 'w') as f:
    json.dump(second_prefix_count, f)

## **Third Dataset: WikiNews**

Let's try with real texts to get better frequency of words that are very much used.

In [21]:
with open('../data/en_wikinews.txt', 'r', encoding='utf-8') as f:
    wikinews = f.read().splitlines()

In [22]:
def flatmap(lst):
    return [item for sublist in lst for item in sublist.split()]

wikinews = list(
        filter(
            lambda x: x and x.islower() and x.isalpha(),
            flatmap(wikinews)
        )
    )

In [23]:
wikinews[:10], len(wikinews)

(['in',
  'a',
  'statement',
  'today',
  'senior',
  'investigating',
  'officer',
  'detective',
  'superintendent',
  'mike'],
 19425254)

We have A LOT of words! Let's parallelize this process to make it feasible.

Python notebooks are AWFUL with parallelization. Therefore, I created a python script that does the process for us! You can check it [here](parallel_generate_probs.py)

You can run it with the following cell:

In [24]:
!python ./parallel_generate_probs.py ../data/en_wikinews.txt 4 --prepro -o wikinews/wikinews 

Using 16 cores and chunk size of 1214078
Processing chunk with 1214078 words
Processing chunk with 1214078 words
Processing chunk with 1214078 words
Processing chunk with 1214078 words
Processing chunk with 1214078 words
Processing chunk with 1214078 words
Processing chunk with 1214078 words
Processing chunk with 1214078 words
Chunk processed with 1214078 words
Processing chunk with 1214078 words
Chunk processed with 1214078 words
Processing chunk with 1214078 words
Chunk processed with 1214078 words
Processing chunk with 1214078 words
Chunk processed with 1214078 words
Chunk processed with 1214078 words
Processing chunk with 1214078 words
Chunk processed with 1214078 words
Chunk processed with 1214078 words
Chunk processed with 1214078 words
Processing chunk with 1214078 words
Chunk processed with 1214078 words
Processing chunk with 1214078 words
Chunk processed with 1214078 words
Chunk processed with 1214078 words
Processing chunk with 1214078 words
Processing chunk with 6 words
Chun

Now let's import the results and test!

In [25]:
import json

with open('../data/out/wikinews_probs.json', 'r') as f:
    wikinews_probs = json.load(f)

In [26]:
predicterV2(N, wikinews_probs)


%%%a 
Best 3 letters are: ['n', 'r', 's']

%%%an 
Best 3 letters are: ['d', 'y', 'o']


## **Fourth Dataset: Kaggle**

Let's test some Kaggle datasets!

### A. Unigram Frequency

This is the 333,333 most frequent english words and their frequency

In [5]:
df_unigram = pd.read_csv('../data/unigram_freq.csv')

In [6]:
df_unigram.head()

Unnamed: 0,word,count
0,the,23135851162
1,of,13151942776
2,and,12997637966
3,to,12136980858
4,a,9081174698


To be able to consider the frequencies, let's take the log:

In [7]:
df_unigram['log_count'] = np.log(df_unigram['count'])

df_unigram.head()

Unnamed: 0,word,count,log_count
0,the,23135851162,23.864649
1,of,13151942776,23.299835
2,and,12997637966,23.288033
3,to,12136980858,23.219523
4,a,9081174698,22.929469


In [8]:
# Filtering words that are not string
df_unigram = df_unigram[df_unigram['word'].apply(lambda x: type(x) == str)]

# Filtering words that are not alpha
df_unigram = df_unigram[df_unigram['word'].apply(lambda x: x.isalpha())]

# lowering all words
df_unigram['word'] = df_unigram['word'].apply(lambda x: x.lower())

# Filtering words with length 1
df_unigram = df_unigram[df_unigram['word'].apply(len) != 1]

In [9]:
# Let's keep only the first 100k words
df_unigram = df_unigram[:100000]

In [10]:
# To create the training set, we will just remove the "words" list from the dataframe
df_unigram_filtered = df_unigram[~df_unigram['word'].isin(words)]

In [11]:
df_unigram_filtered.head()

Unnamed: 0,word,count,log_count
159,info,352363058,19.680173
216,ebay,293178760,19.496293
230,hotels,275510917,19.434138
255,dvd,256530337,19.362757
256,shipping,256521328,19.362722


Now, we are going to create a txt where each line is a word, and each word on the dataframe will appear log_count times

In [20]:
df_unigram.shape, df_unigram_filtered.shape

((100000, 3), (94993, 3))

In [12]:
with open('../data/unigram_freq_processed.txt', 'w') as f:
    for word, count in zip(df_unigram['word'], df_unigram['log_count']):
        f.write((word + '\n') * int(count))

with open('../data/unigram_freq_filtered.txt', 'w') as f:
    for word, count in zip(df_unigram_filtered['word'], df_unigram_filtered['log_count']):
        f.write((word + '\n') * int(count))

In [13]:
!python ./parallel_generate_probs.py ../data/unigram_freq_processed.txt 4 -o kaggle/unigram

Using 16 cores and chunk size of 79131
Processing chunk with 79131 words
Processing chunk with 79131 words
Processing chunk with 79131 words
Processing chunk with 79131 words
Processing chunk with 79131 words
Processing chunk with 79131 words
Processing chunk with 79131 words
Processing chunk with 79131 words
Processing chunk with 79131 words
Processing chunk with 79131 words
Chunk processed with 79131 words
Processing chunk with 79131 words
Processing chunk with 79131 words
Chunk processed with 79131 words
Processing chunk with 79131 words
Processing chunk with 79131 words
Processing chunk with 79131 words
Chunk processed with 79131 words
Chunk processed with 79131 words
Processing chunk with 79131 words
Processing chunk with 3 words
Chunk processed with 3 words
Chunk processed with 79131 words
Chunk processed with 79131 words
Chunk processed with 79131 words
Chunk processed with 79131 words
Chunk processed with 79131 words
Chunk processed with 79131 words
Chunk processed with 79131 w

In [15]:
!python ./parallel_generate_probs.py ../data/unigram_freq_filtered.txt 4 -o kaggle/unigram-filtered

Using 16 cores and chunk size of 73909
Processing chunk with 73909 words
Processing chunk with 73909 words
Processing chunk with 73909 words
Processing chunk with 73909 words
Processing chunk with 73909 words
Processing chunk with 73909 words
Processing chunk with 73909 words
Processing chunk with 73909 words
Processing chunk with 73909 words
Processing chunk with 73909 words
Chunk processed with 73909 words
Processing chunk with 73909 words
Processing chunk with 73909 words
Processing chunk with 73909 words
Chunk processed with 73909 words
Processing chunk with 73909 words
Chunk processed with 73909 words
Chunk processed with 73909 words
Processing chunk with 73909 words
Chunk processed with 73909 words
Processing chunk with 73909 words
Processing chunk with 10 words
Chunk processed with 10 words
Chunk processed with 73909 words
Chunk processed with 73909 words
Chunk processed with 73909 words
Chunk processed with 73909 words
Chunk processed with 73909 words
Chunk processed with 73909

In [17]:
with open('../data/out/kaggle/unigram-filtered_probs.json', 'r') as f:
    import json
    unigram_probs = json.load(f)

In [20]:
predicterV2(4, unigram_probs)


%%%a 
Best 3 letters are: ['n', 'l', 'r']

%%%an 
Best 3 letters are: ['t', 'a', 'n']

%%%ant 
Best 3 letters are: ['i', 'h', 'e']


## **Fifth Dataset: Dictionary**

Let's use `nltk` library and the wordnet dataset to create our data!

In [21]:
from nltk.corpus import wordnet as wn

def get_all_words():
    words = {}
    for synset in wn.all_synsets():
        for lemma in synset.lemmas():
            words[lemma.name().lower()] = words.get(lemma.name().lower(), 0) + 1

    return words

wordnet = get_all_words()

In [22]:
# Filter words with "-" and "_"

wordnet = {word: count for word, count in wordnet.items() if '-' not in word and '_' not in word and word.isalpha()}

In [23]:
# Filter 

wordnet = {word: count for word, count in wordnet.items() if len(word) > 1 and word not in words}

In [24]:
wordnet

{'abaxial': 1,
 'dorsal': 2,
 'adaxial': 1,
 'ventral': 2,
 'acroscopic': 1,
 'basiscopic': 1,
 'abducent': 2,
 'abducting': 1,
 'adducent': 1,
 'adductive': 1,
 'adducting': 1,
 'nascent': 1,
 'emergent': 2,
 'dissilient': 1,
 'parturient': 2,
 'moribund': 2,
 'abridged': 1,
 'shortened': 4,
 'potted': 3,
 'unabridged': 2,
 'uncut': 7,
 'implicit': 2,
 'unquestioning': 2,
 'infinite': 5,
 'comparative': 3,
 'relational': 1,
 'absorbent': 2,
 'absorptive': 1,
 'absorbefacient': 1,
 'sorbefacient': 1,
 'assimilating': 1,
 'assimilative': 2,
 'assimilatory': 1,
 'hygroscopic': 1,
 'receptive': 4,
 'spongy': 2,
 'spongelike': 2,
 'thirsty': 4,
 'nonabsorbent': 1,
 'nonabsorptive': 1,
 'repellent': 6,
 'resistant': 5,
 'adsorbent': 2,
 'adsorptive': 1,
 'chemisorptive': 1,
 'chemosorptive': 1,
 'nonadsorbent': 1,
 'nonadsorptive': 1,
 'absorbable': 1,
 'adsorbable': 1,
 'adsorbate': 2,
 'abstemious': 2,
 'abstinent': 2,
 'abstentious': 1,
 'ascetic': 3,
 'ascetical': 2,
 'austere': 3,
 'sp

In [25]:
!python ./parallel_generate_probs.py ../data/wordnet_words.txt 4 -o wordnet/wordnet

Using 16 cores and chunk size of 4622
Processing chunk with 4622 words
Processing chunk with 4622 words
Processing chunk with 4622 words
Processing chunk with 4622 words
Processing chunk with 4622 words
Chunk processed with 4622 words
Processing chunk with 4622 words
Chunk processed with 4622 words
Processing chunk with 4622 words
Processing chunk with 4622 words
Chunk processed with 4622 words
Processing chunk with 4622 words
Chunk processed with 4622 words
Chunk processed with 4622 words
Chunk processed with 4622 words
Processing chunk with 4622 words
Processing chunk with 4622 words
Chunk processed with 4622 words
Chunk processed with 4622 words
Processing chunk with 4622 words
Processing chunk with 4622 words
Chunk processed with 4622 words
Processing chunk with 4622 words
Chunk processed with 4622 words
Chunk processed with 4622 words
Processing chunk with 4622 words
Processing chunk with 4622 words
Chunk processed with 4622 words
Chunk processed with 4622 words
Chunk processed wi

Important to note it doesn't include words like "the", "of", "that".

In [26]:
with open('../data/wordnet_words.txt', 'w') as f:
    for word in wordnet.keys():
        f.write(word + '\n')

-----

# **Testing Accuracy**

Now, our goal is to test the accuracy of our model by splitting the words in 80/20 to create a cross-validation-like test. We will shuffle the data, separate it, train the probability table, and then use it to try to predict the next letter for all 4-grams of the testing set. 

In that way, we can calculate an accuracy for our model and test different N values!

## 1. Words from WordFrequency

In [32]:
# set seed
np.random.seed(42)

In [33]:
words = np.array(words)

# shuffle the words
np.random.shuffle(words)

# split the words into 80% training and 20% test
train = words[:int(len(words)*0.8)]
test = words[int(len(words)*0.8):]

In [34]:
train[:10], test[:10]

(array(['ta', 'tips', 'looks', 'powers', 'practically', 'italian',
        'command', 'sleep', 'mine', 'bottom'], dtype=object),
 array(['disagree', 'minority', 'emergency', 'clip', 'awesome', 'jumped',
        'experiences', 'wolf', 'swear', 'voting'], dtype=object))

Let's train the model with the training set

In [35]:
N = 4

train_probs, train_prev_counts = generate_probs(N, train)

In [36]:
N = 4

def evaluate_model(n, probs, test):
    acc_exact = 0
    acc_top3 = 0
    acc_top5 = 0
    total = 0

    for word in test:
        word = r'%'*(n-1) + str(word)
        for idx in range(4, len(word)):
            prev = word[idx - n:idx]
            
            nxt = word[idx]

            if prev in probs:
                if nxt == max(probs[prev], key=probs[prev].get):
                    acc_exact += 1
                if nxt in list(sorted(probs[prev], key=probs[prev].get, reverse=True))[:3]:
                    acc_top3 += 1
                if nxt in list(sorted(probs[prev], key=probs[prev].get, reverse=True))[:5]:
                    acc_top5 += 1
                total += 1

    return acc_exact / total, acc_top3 / total, acc_top5 / total

In [37]:
print(evaluate_model(N, train_probs, test))

(0.42818257956448913, 0.6735762144053601, 0.7644472361809045)


## 2. Wiki-100k.txt

In [45]:
words100k = np.array(words100k)

# filter out the words that are in the list "words"
words100k = np.array([word for word in words100k if word not in words])

In [46]:
train_probs, train_prev_counts = generate_probs(N, words100k)

In [47]:
print(evaluate_model(N, train_probs, words))

(0.43210376976084314, 0.7168810111655674, 0.8335851420569702)


In [48]:
print(evaluate_model(N, train_probs, words100k))

(0.48963141499351964, 0.7848177539280111, 0.8966580640760363)


That is interesting! The accuracy for top 3 increased when we start using some pre-processing into the 100k dataset.

## 3. WikiNews

Let's try to use the probabilites to predict the wiki-100k words!

In [49]:
with open('../data/out/wordnet_probs.json', 'r') as f:
    wikinews_probs = json.load(f)

In [50]:
print(evaluate_model(N, wikinews_probs, words))

(0.41606928423274264, 0.7024125759615735, 0.8325388450201958)


In [51]:
print(evaluate_model(N, wikinews_probs, words100k))

(0.4000644916989125, 0.682248216124108, 0.8022471511235756)


In [53]:
with open('../data/en_wikinews.txt', 'r', encoding='utf-8') as f:
    wikinews_words = f.read().splitlines()

def flatmap(lst):
    return [item for sublist in lst for item in sublist.split()]

wikinews_words = list(
        filter(
            lambda x: x and x.islower() and x.isalpha(),
            flatmap(wikinews_words)
        )
    )

print(evaluate_model(N, wikinews_probs, wikinews_words))

(0.37904610319025484, 0.6429882445285331, 0.8083054314914152)


## 4. Unigram Frequency - Kaggle

In [55]:
with open('../data/out/unigram_probs.json', 'r') as f:
    unigram_probs = json.load(f)

with open('../data/unigram_freq_processed.txt', 'r') as f:
    unigram_words = f.read().splitlines()

Let's test with all the previous sets:

In [56]:
print(evaluate_model(4, unigram_probs, words))

(0.46188422082865116, 0.7519483814840323, 0.8671838184652191)


In [57]:
print(evaluate_model(4, unigram_probs, words100k))

(0.41246404151350335, 0.699955630386427, 0.8207614530901526)


In [58]:
print(evaluate_model(4, unigram_probs, wikinews_words))

(0.42411277866727887, 0.6979728323347932, 0.8505233322804715)


Finally, an "overfitting" test:

In [59]:
print(evaluate_model(4, unigram_probs, unigram_words))

(0.43677892963231385, 0.7277194451999962, 0.8503549837166152)


## 5. Wordnet

In [60]:
with open('../data/out/wordnet_probs.json', 'r') as f:
    wordnet_probs = json.load(f)

with open('../data/wordnet_words.txt', 'r') as f:
    wordnet_words = f.read().splitlines()

Let's test with all the other sets!

In [61]:
print(evaluate_model(4, wordnet_probs, words))

(0.41606928423274264, 0.7024125759615735, 0.8325388450201958)


In [62]:
print(evaluate_model(4, wordnet_probs, words100k))

(0.4000644916989125, 0.682248216124108, 0.8022471511235756)


In [63]:
print(evaluate_model(4, wordnet_probs, unigram_words))

(0.3663244880587427, 0.6324557087923475, 0.7538018180848113)


In [65]:
print(evaluate_model(4, wordnet_probs, wikinews_words))

(0.37904610319025484, 0.6429882445285331, 0.8083054314914152)


In [64]:
print(evaluate_model(4, wordnet_probs, wordnet_words))

(0.4706894691506729, 0.7664761743049329, 0.8842471483629507)


**Conclusion: the best and most stable results come from the *Kaggle Unigram Dataset*!**