In [2]:
import pandas as pd


# 20-25  Probability tables generation

In [146]:
# Raw COCA counts for the five candidate words, built column by column.
COCA = pd.DataFrame(
    {
        "word": ["deet", "deft", "defer", "defeat", "defect"],
        "frequency": [420, 1240, 2237, 21940, 3972],
    }
)

# Denominator that turns raw counts into relative frequencies P(w).
COCA_pop = 1001610938
COCA["P(w)"] = COCA["frequency"].div(COCA_pop)

# Rank 1 is the most frequent word; ties share the smallest rank.
COCA["rank"] = COCA["frequency"].rank(method="min", ascending=False).astype(int)
COCA


Unnamed: 0,word,frequency,P(w),rank
0,deet,420,4.193245e-07,5
1,deft,1240,1.238006e-06,4
2,defer,2237,2.233402e-06,3
3,defeat,21940,2.190471e-05,1
4,defect,3972,3.965612e-06,2


In [4]:
# Raw Wikipedia counts for the five candidate words, built column by column.
WIKI = pd.DataFrame(
    {
        "word": ["deet", "deft", "defer", "defeat", "defect"],
        "frequency": [124, 814, 1416, 121408, 7793],
    }
)

# Denominator that turns raw counts into relative frequencies P(w).
WIKI_pop = 1.9e9
WIKI["P(w)"] = WIKI["frequency"].div(WIKI_pop)

# Rank 1 is the most frequent word; ties share the smallest rank.
WIKI["rank"] = WIKI["frequency"].rank(method="min", ascending=False).astype(int)
WIKI


Unnamed: 0,word,frequency,P(w),rank
0,deet,124,6.526316e-08,5
1,deft,814,4.284211e-07,4
2,defer,1416,7.452632e-07,3
3,defeat,121408,6.389895e-05,1
4,defect,7793,4.101579e-06,2


In [63]:
import os
import re
import numpy as np

# Read the full text of every file whose name ends in "plain.txt" anywhere
# below ./corpus/iula/en; one list entry per file.
# NOTE(review): open() is called without an explicit encoding, so decoding
# follows the platform default — confirm the corpus encoding and pin it.
all_content = []
for folder, _subfolders, names_in_folder in os.walk("./corpus/iula/en"):
    plain_text_paths = [
        os.path.join(folder, file_name)
        for file_name in names_in_folder
        if file_name.endswith("plain.txt")
    ]
    for path in plain_text_paths:
        with open(path, "r") as handle:
            all_content.append(handle.read())
len(all_content)


128

In [87]:
def process(s: str):
    """Normalise raw text into lowercase, space-separated word tokens.

    Every maximal run of characters that are either non-word characters
    (punctuation, whitespace, symbols) or digits is replaced by one space;
    the result is lowercased and stripped of surrounding whitespace.
    Underscores count as word characters and are therefore kept.
    """
    # A single pass over "non-word or digit" runs is equivalent to blanking
    # each such character and then collapsing the whitespace afterwards.
    collapsed = re.sub(r"[\W\d]+", " ", s)
    return collapsed.lower().strip()

# Normalise each loaded document with process(); the Series keeps the order
# of all_content, one entry per document.
processed_content = pd.Series(list(map(process, all_content)))
processed_content


0      malalties produïdes per rickettsia infeccions ...
1      cell division and chromosomes chapter concepts...
2      introduction the period of crisis and upheaval...
3      cloning for medicine now that genetically modi...
4      evidence for a biological influence in male ho...
                             ...                        
123    sharp disparities in the speed of global integ...
124    sweet medicines sugars play critical roles in ...
125    the basis of immunology innate immunity we liv...
126    integrins and health discovered only recently ...
127    randomised trial of irinotecan plus supportive...
Length: 128, dtype: object

In [188]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a unigram bag-of-words model on the normalised documents.
count_vectorizer = CountVectorizer()
freq_iula = count_vectorizer.fit_transform(processed_content)

# Sum the document-term matrix over documents to get one total per term,
# then label each total with its vocabulary term.
# (Despite the "_df" suffix, freq_iula_df is a Series.)
vocabulary_terms = count_vectorizer.get_feature_names_out()
per_term_totals = np.asarray(freq_iula.sum(axis=0))
freq_iula_df = pd.Series(np.squeeze(per_term_totals), index=vocabulary_terms)

# Sum of every entry in the document-term matrix.
total = freq_iula.sum()
print(f"{total=}")
freq_iula_df


total=1026596


aa         34
aaa         2
aaaaaa      1
aalborg     2
aarhus      1
           ..
úniques     1
ús         12
útil        5
útils       4
überbau     1
Length: 34490, dtype: int64

In [89]:
query = ["deet", "deft", "defer", "defect", "defeat"]

# Pass each query word through the fitted vectorizer and back again.
# inverse_transform returns one array of vocabulary terms per input string,
# and that array is empty when the word is not in the fitted vocabulary.
transformed_query = [
    count_vectorizer.inverse_transform(count_vectorizer.transform([q])) for q in query
]

# Look the term up in freq_iula_df (the per-term totals indexed by vocabulary
# term). freq_iula itself is the sparse matrix returned by fit_transform and
# has no `.loc`, so it cannot be indexed by label.
query_freq = pd.Series(
    [
        int(freq_iula_df.loc[tq[0][0]]) if len(tq[0]) > 0 else 0
        for tq in transformed_query
    ],
    index=query,
)
query_freq


deet       0
deft       0
defer      1
defect    59
defeat     7
dtype: int64

In [134]:
# One row per query word, holding its raw count in the IULA corpus.
IULA = query_freq.to_frame("frequency")

# Take the unigram total straight from the unigram document-term matrix.
# The global `total` is reassigned by the bigram cell further down, so
# relying on it here would make P(w) depend on which cell ran last.
IULA_pop = freq_iula.sum()
IULA["P(w)"] = IULA["frequency"] / IULA_pop

# Same tie handling as the COCA and WIKI tables: tied words share the
# smallest rank, rather than an averaged rank truncated by astype(int).
IULA["rank"] = IULA["frequency"].rank(ascending=False, method="min").astype(int)
IULA


Unnamed: 0,frequency,P(w),rank
deet,0,0.0,4
deft,0,0.0,4
defer,1,9.74093e-07,3
defect,59,5.747149e-05,1
defeat,7,6.818651e-06,2


# 28-33 Update the tables with Norvig's edit counts and calculate the final probability

In [116]:
# Download the tab-separated single-edit count table (no header row), name
# its two columns, and index it by the edit term (for example "e|i").
norvig = (
    pd.read_csv(
        "http://norvig.com/ngrams/count_1edit.txt",
        sep="\t",
        encoding="ISO-8859-1",
        header=None,
    )
    .set_axis(["term", "edit"], axis="columns")
    .set_index("term")
)
norvig


Unnamed: 0_level_0,edit
term,Unnamed: 1_level_1
e|i,917
a|e,856
i|e,771
e|a,749
a|i,559
...,...
|c,1
|a,1
|',1
w|,1


In [104]:
# Download the tab-separated word/count table (no header row).
#
# The first column holds ordinary words, so pandas' default NA markers
# ("null", "nan", "NA", ...) must not be read as missing values: they would
# become NaN and be discarded by dropna(), losing those words' counts.
# Only genuinely empty fields are treated as missing.
norvig_orig = pd.read_csv(
    "http://norvig.com/ngrams/count_big.txt",
    sep="\t",
    encoding="ISO-8859-1",
    header=None,
    names=["term", "freq"],
    keep_default_na=False,
    na_values=[""],
).dropna()  # drop rows with an empty field; the original row labels are kept
norvig_orig


Unnamed: 0,term,freq
0,a,21160
1,aah,1
2,aaron,5
3,ab,2
4,aback,3
...,...,...
29131,zueblin,1
29132,zum,1
29133,zweck,1
29134,zygoma,1


In [117]:
from multiprocessing import cpu_count, Pool
import itertools
from string import ascii_lowercase


def get_count(c, norvig_orig):
    """Return the frequency-weighted number of occurrences of ``c``.

    For every row of ``norvig_orig`` the non-overlapping occurrences of the
    substring ``c`` in the ``term`` column are counted (``str.count``) and
    multiplied by that row's ``freq``; the products are summed.

    Parameters
    ----------
    c : str
        Character sequence to count.
    norvig_orig : pd.DataFrame
        Table with a string column ``term`` and a numeric column ``freq``.

    Returns
    -------
    The weighted total; ``0`` when the table has no rows.
    """
    # Iterating the two columns directly avoids the per-row Series that
    # DataFrame.apply(axis=1) builds, and yields a scalar for empty input.
    return sum(
        term.count(c) * freq
        for term, freq in zip(norvig_orig["term"], norvig_orig["freq"])
    )


# All 26 single lowercase letters followed by all 26 * 26 ordered pairs.
character_set = list(ascii_lowercase) + [
    "".join(pair) for pair in itertools.product(ascii_lowercase, repeat=2)
]

# One get_count task per character sequence, spread over every CPU core.
# Each task receives the same word/count table.
with Pool(cpu_count()) as pool:
    freq_list = pool.starmap(
        get_count, zip(character_set, itertools.repeat(norvig_orig))
    )

# Build the table column-wise so "freq" keeps a numeric dtype; transposing a
# list-of-lists frame would leave it as generic objects.
freq_df = pd.DataFrame(
    {"freq": freq_list},
    index=pd.Index(character_set, name="char"),
)
freq_df


Unnamed: 0_level_0,freq
char,Unnamed: 1_level_1
a,407349
b,73161
c,144964
d,215698
e,632999
...,...
zv,1
zw,1
zx,0
zy,32


In [147]:
def try_get(df: pd.DataFrame, key, default):
    """Return the row of ``df`` labelled ``key``, or ``default`` if absent.

    The lookup is by index label (``df.loc[key]``); ``default`` is returned
    unchanged when ``key`` is not in ``df.index``.
    """
    if key not in df.index:
        return default
    return df.loc[key]


# Channel-model estimate for each candidate word: the count of the edit that
# links the candidate to the observed string, divided by the corpus count of
# the character sequence on the right-hand side of that edit.
# NOTE(review): the edit keys are copied unchanged from the original cell;
# confirm each against the format of the Norvig edit file.
edit_counts = norvig["edit"]
context_counts = freq_df["freq"]

channel_prob = pd.Series(
    {
        # "f| " may be missing from the edit table; a missing edit counts as 0.
        "deet": edit_counts.get("f| ", 0) / context_counts["f"],
        "deft": edit_counts["e| "] / context_counts["e"],
        "defer": edit_counts["t|r"] / context_counts["r"],
        "defeat": edit_counts["e|ea"] / context_counts["ea"],
        "defect": edit_counts["e|ec"] / context_counts["ec"],
    },
    name="P(x|w)",
)

# Index by word only if that has not happened yet, so re-running this cell
# does not fail on a missing "word" column.
if "word" in COCA.columns:
    COCA = COCA.set_index("word")

# Assign by word label instead of relying on the row order of the table.
COCA["P(x|w)"] = channel_prob
COCA["10^9 P(x|w)P(w)"] = 1e9 * COCA["P(w)"] * COCA["P(x|w)"]
COCA.sort_values("10^9 P(x|w)P(w)", ascending=False)


Unnamed: 0_level_0,frequency,P(w),rank,P(x|w),10^9 P(x|w)P(w)
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
defeat,21940,2.190471e-05,1,0.012834,281.124909
defect,3972,3.965612e-06,2,0.003167,12.558705
defer,2237,2.233402e-06,3,3.6e-05,0.079366
deft,1240,1.238006e-06,4,3e-06,0.003912
deet,420,4.193245e-07,5,0.0,0.0


In [148]:
# Reuse the channel-model column computed for COCA, matched to the IULA rows
# by index label; both tables are indexed by word at this point.
IULA["P(x|w)"] = COCA["P(x|w)"].reindex(IULA.index)

# Scaled product of the IULA prior and the channel probability.
scaled_prior = 1e9 * IULA["P(w)"]
IULA["10^9 P(x|w)P(w)"] = scaled_prior * IULA["P(x|w)"]
IULA.sort_values(by="10^9 P(x|w)P(w)", ascending=False)


Unnamed: 0,frequency,P(w),rank,P(x|w),10^9 P(x|w)P(w)
defect,59,5.747149e-05,1,0.003167,182.0066
defeat,7,6.818651e-06,2,0.012834,87.510514
defer,1,9.74093e-07,3,3.6e-05,0.034615
deet,0,0.0,4,0.0,0.0
deft,0,0.0,4,3e-06,0.0


In [190]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a unigram + bigram bag-of-words model. The custom token pattern also
# accepts one-character tokens.
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b")
bigram_freq_iula = bigram_vectorizer.fit_transform(processed_content)

# Sum over documents to get one total per n-gram, labelled by the n-gram.
ngram_labels = bigram_vectorizer.get_feature_names_out()
per_ngram_totals = np.asarray(bigram_freq_iula.sum(axis=0))
bigram_freq_iula_series = pd.Series(np.squeeze(per_ngram_totals), index=ngram_labels)

# NOTE(review): this rebinds the global `total` that the unigram cell also
# sets; from here on it holds the combined unigram + bigram count.
total = bigram_freq_iula.sum()
print(f"{total=}")
bigram_freq_iula_series


total=2132844


_                    1
_ diskette           1
a                24341
a a                 28
a accumulates        1
                 ...  
útils el             1
útils els            1
útils en             2
überbau              1
überbau is           1
Length: 432306, dtype: int64