In [198]:
import pandas as pd
from multiprocessing import cpu_count, Pool
import itertools
from string import ascii_lowercase
import os
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer


### Norvig Prep

In [2]:
# Norvig's single-edit table: the file has no header row. The first column
# (edit labels such as "e|i") becomes the index `term`; the second column,
# named `edit`, holds the count recorded for that label.
norvig = pd.read_csv(
    "http://norvig.com/ngrams/count_1edit.txt",
    sep="\t",
    encoding="ISO-8859-1",
    header=None,
    names=["term", "edit"],
).set_index("term")
norvig


Unnamed: 0_level_0,edit
term,Unnamed: 1_level_1
e|i,917
a|e,856
i|e,771
e|a,749
a|i,559
...,...
|c,1
|a,1
|',1
w|,1


In [3]:
# Word-frequency list: headerless, two tab-separated columns (word, count).
# Rows containing a missing value are discarded before use.
# NOTE(review): pandas' default NA parsing turns tokens such as "null" or
# "nan" into NaN, so dropna() presumably removes those words too - confirm
# that this is intended.
norvig_orig = pd.read_csv(
    "http://norvig.com/ngrams/count_big.txt",
    sep="\t",
    encoding="ISO-8859-1",
    header=None,
    names=["term", "freq"],
).dropna()
norvig_orig


Unnamed: 0,term,freq
0,a,21160
1,aah,1
2,aaron,5
3,ab,2
4,aback,3
...,...,...
29131,zueblin,1
29132,zum,1
29133,zweck,1
29134,zygoma,1


In [89]:
def get_count(c, norvig_orig):
    """Return the frequency-weighted number of occurrences of `c` in the word list.

    Parameters
    ----------
    c : str
        Character or character sequence to count. It is matched literally.
    norvig_orig : pd.DataFrame
        Frame with a string column ``term`` and a numeric column ``freq``.

    Returns
    -------
    The sum over all rows of (non-overlapping occurrences of `c` in ``term``)
    multiplied by ``freq``.

    Notes
    -----
    Kept as a plain top-level function because it is dispatched through
    ``Pool.starmap`` elsewhere in the notebook.
    """
    # `.str.count` interprets its argument as a regular expression, so the
    # needle is escaped to keep the literal semantics of `str.count`.
    occurrences = norvig_orig["term"].str.count(re.escape(c))
    return (occurrences * norvig_orig["freq"]).sum()


# All single lowercase letters followed by every ordered pair of lowercase
# letters ("a".."z", then "aa".."zz").
character_set = list(ascii_lowercase) + [
    "".join(pair) for pair in itertools.product(ascii_lowercase, repeat=2)
]

# Count every entry of `character_set` in the word list, one task per entry,
# spread over all available cores.
with Pool(cpu_count()) as p:
    freq_list = p.starmap(get_count, zip(character_set, itertools.repeat(norvig_orig)))

# One row per character / bigram, indexed by `char`, with its weighted count
# in the numeric column `freq`.
freq_df = pd.DataFrame({"freq": freq_list}, index=pd.Index(character_set, name="char"))
freq_df


Unnamed: 0_level_0,freq
char,Unnamed: 1_level_1
a,407349
b,73161
c,144964
d,215698
e,632999
...,...
zv,1
zw,1
zx,0
zy,32


### Process IULA and English Web corpora

In [219]:
# Read every file whose name ends in "plain.txt" anywhere below the English
# IULA corpus directory; each file's full text becomes one entry.
# NOTE(review): no explicit encoding is passed to open(), so the platform
# default is used - confirm it matches the corpus files.
all_content = []
for root, _subdirs, files in os.walk("./corpus/iula/en"):
    plain_text_files = [fname for fname in files if fname.endswith("plain.txt")]
    for fname in plain_text_files:
        with open(os.path.join(root, fname), "r") as handle:
            all_content.append(handle.read())

def process(s: str):
    """Normalise raw text into lowercase, single-space-separated tokens.

    Non-word characters and digits are each replaced by a space, runs of
    whitespace are collapsed to one space, and the result is lowercased and
    stripped of leading/trailing whitespace. Underscores count as word
    characters and are therefore kept.
    """
    for pattern in (r"[^\w]", r"[\d]", r"\s+"):
        s = re.sub(pattern, " ", s)
    return s.lower().strip()

# Clean every IULA document, then build the document-term count matrix.
processed_content = pd.Series(list(map(process, all_content)))

iula_count_vectorizer = CountVectorizer()
freq_iula = iula_count_vectorizer.fit_transform(processed_content)

# Summing the matrix over documents gives one corpus-wide count per token.
freq_iula_sum = pd.Series(
    np.squeeze(np.asarray(freq_iula.sum(axis=0))),
    index=iula_count_vectorizer.get_feature_names_out(),
)

# Total number of counted tokens in the corpus.
iula_total = freq_iula.sum()
print(f"{iula_total=}")

# Vocabulary bucketed by token length: length -> set of tokens of that length.
iula_freq_len_mapping = freq_iula_sum.groupby(len).apply(lambda grp: set(grp.index))
freq_iula_sum


iula_total=1026596


aa         34
aaa         2
aaaaaa      1
aalborg     2
aarhus      1
           ..
úniques     1
ús         12
útil        5
útils       4
überbau     1
Length: 34490, dtype: int64

In [213]:
# Tab-separated sentence file without a header row; column 1 is the text that
# gets cleaned and counted.
eng_web = pd.read_csv(
    "./corpus/eng-com_web-public_2018/eng-com_web-public_2018_1M-sentences.txt",
    sep="\t",
    header=None,
)
processed_content = eng_web[1].map(process)

eng_web_count_vectorizer = CountVectorizer()
freq_eng_web = eng_web_count_vectorizer.fit_transform(processed_content)

# Summing the matrix over sentences gives one corpus-wide count per token.
freq_eng_web_sum = pd.Series(
    np.squeeze(np.asarray(freq_eng_web.sum(axis=0))),
    index=eng_web_count_vectorizer.get_feature_names_out(),
)

# Total number of counted tokens in the corpus.
eng_web_total = freq_eng_web.sum()
print(f"{eng_web_total=}")

# Vocabulary bucketed by token length: length -> set of tokens of that length.
eng_web_freq_len_mapping = freq_eng_web_sum.groupby(len).apply(lambda grp: set(grp.index))
freq_eng_web_sum


eng_web_total=16918582


aa          93
aaa         66
aaaa         3
aaaaa        1
aaaaaaa      1
            ..
ﬂoored       1
ﬂowering     1
ﬂowing       1
ﬂows         1
ﬂy           1
Length: 199819, dtype: int64

In [None]:
# The IULA corpus is loaded, cleaned and vectorised by the earlier IULA cell.
# `process`, `all_content`, `iula_count_vectorizer`, `freq_iula`,
# `freq_iula_sum`, `iula_total` and `iula_freq_len_mapping` are reused from
# there rather than re-reading the files, redefining `process` and refitting
# the vectorizer a second time.
#
# `processed_content` is shared with the English-web cell, so it is rebound to
# the IULA documents here to leave the namespace in the same state.
processed_content = pd.Series([process(content) for content in all_content])
print(f"{iula_total=}")

freq_iula_sum


### Get Candidates

In [215]:
def edits1_with_correction(word):
    """Return every ``(candidate, edit_label)`` pair one edit away from `word`.

    Edit labels use the form ``"<from>|<to>"``:

    * deletion of ``x``         -> ``"x| "``
    * insertion of ``c``        -> ``" |c"``
    * replacement of ``x`` by c -> ``"x|c"``
    * transposition of ``xy``   -> ``"xy|yx"``

    The result is a set, so identical pairs produced at different positions
    collapse into one entry.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    edits = set()
    for position in range(len(word) + 1):
        head, tail = word[:position], word[position:]

        # Insertions are possible at every split point, including the end.
        for c in alphabet:
            edits.add((head + c + tail, f" |{c}"))

        if not tail:
            continue

        # Drop or replace the first character of the right-hand part.
        edits.add((head + tail[1:], f"{tail[0]}| "))
        for c in alphabet:
            edits.add((head + c + tail[1:], f"{tail[0]}|{c}"))

        # Swap the first two characters of the right-hand part.
        if len(tail) > 1:
            edits.add(
                (
                    head + tail[1] + tail[0] + tail[2:],
                    f"{tail[0]+tail[1]}|{tail[1]+tail[0]}",
                )
            )
    return edits


def get_candidates(word, len_mapping):
    """Return the known-word edits of `word` as ``(candidate, edit_label)`` pairs.

    Parameters
    ----------
    word : str
        The (possibly misspelled) input word.
    len_mapping : mapping of int -> set of str
        Vocabulary bucketed by word length (a dict or a pandas Series indexed
        by length both work).

    Returns
    -------
    set of (str, str)
        Every pair from ``edits1_with_correction(word)`` whose candidate is
        longer than one character, differs from `word`, and is present in
        the bucket for its length.
    """
    known = set()
    for candidate, edit in edits1_with_correction(word):
        if len(candidate) <= 1 or candidate == word:
            continue
        length = len(candidate)
        # A length with no vocabulary words has no bucket; treat that as
        # "not a known word" instead of raising KeyError.
        if length in len_mapping and candidate in len_mapping[length]:
            known.add((candidate, edit))
    return known


def try_get(df: pd.DataFrame, key, default):
    """Return the first value of the row labelled `key`, or `default` if absent.

    `key` is looked up in ``df.index``; when present, the first element of
    that row's values is returned.
    """
    if key not in df.index:
        return default
    row = df.loc[key]
    return row.values[0]

def calc_xw(x: str):
    """Estimate the channel probability for the edit label `x`.

    `x` has the form ``"<from>|<to>"``. The edit count is looked up in
    `norvig` (0 when the label is absent) and divided by the count of the
    normalising character sequence in `freq_df` (1 when absent): the
    ``<from>`` part when ``<to>`` is a single space (a deletion), otherwise
    the ``<to>`` part.

    Returns 0.0 when the normalising count is zero, so that unseen sequences
    do not produce NaN or infinite scores.
    """
    parts = x.split("|")
    target = parts[0] if parts[1] == " " else parts[1]

    edit_count = try_get(norvig, x, 0)
    target_count = try_get(freq_df, target, 1)
    # `freq_df` contains entries whose count is 0; dividing by them would
    # yield NaN (0/0) or inf.
    if target_count == 0:
        return 0.0
    return edit_count / target_count


def _score_candidates(word, len_mapping, freq_sum, total):
    """Rank the known one-edit corrections of `word` against one corpus.

    Parameters
    ----------
    word : str
        The (possibly misspelled) input word.
    len_mapping : mapping of int -> set of str
        Corpus vocabulary bucketed by word length.
    freq_sum : pd.Series
        Corpus-wide count per token, indexed by token.
    total : number
        Total token count of the corpus, used to turn counts into P(w).

    Returns
    -------
    pd.DataFrame
        Indexed by candidate word with columns ``frequency``, ``correction``,
        ``P(w)``, ``P(x|w)`` and ``10^9 P(x|w)P(w)``, sorted by the last
        column in descending order.
    """
    candidates = list(get_candidates(word, len_mapping))
    words = [candidate for candidate, _ in candidates]
    edits = [edit for _, edit in candidates]

    table = pd.DataFrame(
        {
            # Candidates are already vocabulary tokens, so a direct lookup is
            # enough; anything unexpectedly missing counts as 0.
            "frequency": freq_sum.reindex(words, fill_value=0).to_numpy(),
            "correction": edits,
        },
        index=words,
    )
    table["P(w)"] = table["frequency"] / total
    table["P(x|w)"] = table["correction"].apply(calc_xw)
    table["10^9 P(x|w)P(w)"] = 1e9 * table["P(w)"] * table["P(x|w)"]
    return table.sort_values("10^9 P(x|w)P(w)", ascending=False)


def get_corrections(word):
    """Rank one-edit corrections of `word` using the IULA corpus statistics."""
    return _score_candidates(word, iula_freq_len_mapping, freq_iula_sum, iula_total)


def get_corrections_eng_web(word):
    """Rank one-edit corrections of `word` using the English-web corpus statistics."""
    return _score_candidates(
        word, eng_web_freq_len_mapping, freq_eng_web_sum, eng_web_total
    )


### Get Corrections

In [220]:
# Show the four top-ranked IULA-based corrections for the typo "oney".
get_corrections("oney").head(4)


Unnamed: 0,frequency,correction,P(w),P(x|w),10^9 P(x|w)P(w)
only,1695,e|l,0.001651088,2.5e-05,41.563565
ones,194,y|s,0.000188974,2.1e-05,3.949997
obey,4,n|b,3.896372e-06,5.5e-05,0.21303
honey,1,|h,9.74093e-07,7e-06,0.006611


In [217]:
# Same query ranked with the English-web corpus statistics, for comparison.
get_corrections_eng_web("oney").head(4)


Unnamed: 0,frequency,correction,P(w),P(x|w),10^9 P(x|w)P(w)
only,22832,e|l,0.00135,2.5e-05,33.97212
ones,2821,y|s,0.000167,2.1e-05,3.485248
honey,481,|h,2.8e-05,7e-06,0.192956
obey,56,n|b,3e-06,5.5e-05,0.180969
