In [75]:
import pandas as pd  # do some data
import string
import timeit  # just import for timer
import numpy as np
import multiprocessing as mp
import functools

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer  # tf-idf built in function
from scipy import sparse
from string import ascii_lowercase

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import itertools
import cProfile


In [76]:
COCA = pd.DataFrame(
    [
        ['defeat', 21947],
        ['decet', 6],
        ['defect', 3973],
        ['deft', 1240],
        ['defer', 2239],
        ['Deeft', 0]
    ],
    columns=['word', 'frequency']
)
COCA_pop = 1e9
COCA['P(w)'] = COCA['frequency']/COCA_pop
COCA['rank'] = COCA['frequency'].rank(ascending=False).astype(int)


In [77]:
COCA.head(6)


Unnamed: 0,word,frequency,P(w),rank
0,defeat,21947,2.1947e-05,1
1,decet,6,6e-09,5
2,defect,3973,3.973e-06,2
3,deft,1240,1.24e-06,4
4,defer,2239,2.239e-06,3
5,Deeft,0,0.0,6


In [78]:
WIKI = pd.DataFrame(
    [
        ['defeat', 121408],
        ['decet', 81],
        ['defect', 7793],
        ['deft', 814],
        ['defer', 1416],
        ['Deeft', 0]
    ],
    columns=['word', 'frequency']
)
WIKI_pop = 1.9e9
WIKI['P(w)'] = WIKI['frequency']/WIKI_pop
WIKI['rank'] = WIKI['frequency'].rank(ascending=False).astype(int)


In [79]:
WIKI.head(6)


Unnamed: 0,word,frequency,P(w),rank
0,defeat,121408,6.389895e-05,1
1,decet,81,4.263158e-08,5
2,defect,7793,4.101579e-06,2
3,deft,814,4.284211e-07,4
4,defer,1416,7.452632e-07,3
5,Deeft,0,0.0,6


In [80]:
IULA = pd.DataFrame(
    [
        ['defeat', 11],
        ['decet', 0],
        ['defect', 180],
        ['deft', 0],
        ['defer', 11],
        ['Deeft', 0]
    ],
    columns=['word', 'frequency']
)
IULA_pop = 2.1e6
IULA['P(w)'] = IULA['frequency']/IULA_pop
IULA['rank'] = IULA['frequency'].rank(ascending=False).astype(int)


In [81]:
IULA.head(6)


Unnamed: 0,word,frequency,P(w),rank
0,defeat,11,5e-06,2
1,decet,0,0.0,5
2,defect,180,8.6e-05,1
3,deft,0,0.0,5
4,defer,11,5e-06,2
5,Deeft,0,0.0,5


In [82]:
norvig = pd.read_csv('http://norvig.com/ngrams/count_1edit.txt',
                     sep='\t', encoding="ISO-8859-1", header=None)
norvig.columns = ['term', 'edit']
norvig = norvig.set_index('term')
norvig.head()

Unnamed: 0_level_0,edit
term,Unnamed: 1_level_1
e|i,917
a|e,856
i|e,771
e|a,749
a|i,559


In [83]:
norvig_orig = pd.read_csv('http://norvig.com/ngrams/count_big.txt',
                          sep='\t', encoding="ISO-8859-1", header=None)
norvig_orig = norvig_orig.dropna()
norvig_orig.columns = ['term', 'freq']
norvig_orig.head()


Unnamed: 0,term,freq
0,a,21160
1,aah,1
2,aaron,5
3,ab,2
4,aback,3


In [84]:
def get_count(c, norvig_orig):
    return norvig_orig.apply(lambda x: x.term.count(c) * x.freq, axis=1).sum()


In [85]:
character_set = list(map(''.join, itertools.product(ascii_lowercase, repeat=1))) + list(map(''.join, itertools.product(ascii_lowercase, repeat=2)))

with mp.Pool(processes=8) as pool:
    freq_list = pool.map(functools.partial(get_count, norvig_orig=norvig_orig), character_set)

freq_df = pd.DataFrame([character_set, freq_list], index=['char', 'freq']).T
freq_df = freq_df.set_index('char')

In [89]:
freq_list

[407349,
 73161,
 144964,
 215698,
 632999,
 120870,
 96907,
 294681,
 365618,
 6436,
 32798,
 198622,
 127061,
 368989,
 386705,
 98910,
 4571,
 309545,
 334891,
 460734,
 138716,
 52378,
 100831,
 9797,
 90481,
 3796,
 14,
 7890,
 16151,
 19210,
 204,
 3675,
 7449,
 455,
 15757,
 335,
 4440,
 33275,
 10584,
 84280,
 126,
 8238,
 12,
 40541,
 38393,
 57905,
 4313,
 8422,
 3634,
 563,
 10532,
 517,
 5696,
 419,
 158,
 125,
 22672,
 1,
 0,
 3,
 2598,
 482,
 1,
 8626,
 151,
 65,
 8569,
 2,
 0,
 4825,
 1640,
 652,
 8643,
 122,
 23,
 0,
 6908,
 0,
 17870,
 0,
 3068,
 56,
 25416,
 12,
 0,
 21443,
 7695,
 0,
 5103,
 5453,
 17,
 9,
 29224,
 0,
 234,
 4931,
 402,
 13239,
 5785,
 0,
 7,
 0,
 1415,
 25,
 4876,
 23,
 22,
 2112,
 28545,
 155,
 884,
 165,
 17420,
 375,
 86,
 2096,
 797,
 458,
 9034,
 13,
 49,
 5447,
 4829,
 68,
 3752,
 770,
 116,
 0,
 2006,
 2,
 27583,
 1005,
 14841,
 54733,
 14647,
 6342,
 4266,
 1002,
 6964,
 155,
 522,
 19507,
 13975,
 56167,
 3061,
 7260,
 1151,
 84892,
 48218,

In [86]:
COCA['P(x|w)'] = [
    (norvig.loc['e|ea'].values / freq_df.loc['ea'].values)[0],
    (norvig.loc['f|c'].values / freq_df.loc['c'].values)[0],
    (norvig.loc['e|ec'].values / freq_df.loc['ec'].values)[0],
    (norvig.loc['e| '].values / freq_df.loc['e'].values)[0],
    (norvig.loc['t|r'].values / freq_df.loc['r'].values)[0],
    (norvig.loc['fe|ef'].values / freq_df.loc['ef'].values)[0]
]

In [87]:
COCA['109 P(x|w)P(w)'] = 1e9 * COCA['P(w)'] * COCA['P(x|w)']

In [88]:
IULA['P(x|w)'] = COCA['P(x|w)']
IULA['109 P(x|w)P(w)'] = 1e9 * IULA['P(w)'] * IULA['P(x|w)']