In [1]:
import pandas as pd
from multiprocessing import cpu_count, Pool
import itertools
from string import ascii_lowercase
import os
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
norvig = pd.read_csv(
    "http://norvig.com/ngrams/count_1edit.txt",
    sep="\t",
    encoding="ISO-8859-1",
    header=None,
)
norvig.columns = ["term", "edit"]
norvig = norvig.set_index("term")
norvig


Unnamed: 0_level_0,edit
term,Unnamed: 1_level_1
e|i,917
a|e,856
i|e,771
e|a,749
a|i,559
...,...
|c,1
|a,1
|',1
w|,1


In [3]:
norvig_orig = pd.read_csv(
    "http://norvig.com/ngrams/count_big.txt",
    sep="\t",
    encoding="ISO-8859-1",
    header=None,
)
norvig_orig.dropna(inplace=True)
norvig_orig.columns = ["term", "freq"]
norvig_orig


Unnamed: 0,term,freq
0,a,21160
1,aah,1
2,aaron,5
3,ab,2
4,aback,3
...,...,...
29131,zueblin,1
29132,zum,1
29133,zweck,1
29134,zygoma,1


In [4]:
def get_count(c, norvig_orig):
    return norvig_orig.apply(lambda x: x["term"].count(c) * x["freq"], axis=1).sum()


character_set = list(map("".join, itertools.product(ascii_lowercase, repeat=1))) + list(
    map("".join, itertools.product(ascii_lowercase, repeat=2))
)

freq_list = []
with Pool(cpu_count()) as p:
    freq_list = p.starmap(get_count, zip(character_set, itertools.repeat(norvig_orig)))

freq_df = pd.DataFrame([character_set, freq_list], index=['char', 'freq']).T
freq_df = freq_df.set_index('char')
freq_df


Unnamed: 0_level_0,freq
char,Unnamed: 1_level_1
a,407349
b,73161
c,144964
d,215698
e,632999
...,...
zv,1
zw,1
zx,0
zy,32


In [5]:
recipes_df = pd.read_parquet("~/resources/food/recipes.parquet")
texts_df = recipes_df[["Name", "Description"]].join(recipes_df["RecipeInstructions"].transform(" ".join).to_frame("RecipeInstructions")).fillna("").agg(" ".join, axis=1)
texts_df.info()


<class 'pandas.core.series.Series'>
RangeIndex: 522517 entries, 0 to 522516
Series name: None
Non-Null Count   Dtype 
--------------   ----- 
522517 non-null  object
dtypes: object(1)
memory usage: 4.0+ MB


In [6]:
texts_df


0         Low-Fat Berry Blue Frozen Dessert Make and sha...
1         Biryani Make and share this Biryani recipe fro...
2         Best Lemonade This is from one of my  first Go...
3         Carina's Tofu-Vegetable Kebabs This dish is be...
4         Cabbage Soup Make and share this Cabbage Soup ...
                                ...                        
522512    Meg's Fresh Ginger Gingerbread Make and share ...
522513    Roast Prime Rib au Poivre with Mixed Peppercor...
522514    Kirshwasser Ice Cream Make and share this Kirs...
522515    Quick & Easy Asian Cucumber Salmon Rolls Extre...
522516    Spicy Baked Scotch Eggs Great way to have hard...
Length: 522517, dtype: object

In [19]:
def process(s: str):
    s = re.sub(r"[^A-Za-z]", " ", s)
    s = re.sub(r"[\d]", " ", s)
    s = re.sub(r"\s+", " ", s)
    s = s.lower()
    return s.strip()

texts_df = texts_df.apply(process)
texts_df


0         low fat berry blue frozen dessert make and sha...
1         biryani make and share this biryani recipe fro...
2         best lemonade this is from one of my first goo...
3         carina s tofu vegetable kebabs this dish is be...
4         cabbage soup make and share this cabbage soup ...
                                ...                        
522512    meg s fresh ginger gingerbread make and share ...
522513    roast prime rib au poivre with mixed peppercor...
522514    kirshwasser ice cream make and share this kirs...
522515    quick easy asian cucumber salmon rolls extreme...
522516    spicy baked scotch eggs great way to have hard...
Length: 522517, dtype: object

In [21]:
from nltk.tokenize import word_tokenize

iula_count_vectorizer = CountVectorizer(preprocessor=process, tokenizer=word_tokenize, stop_words='english', ngram_range=(1, 2))
freq_iula = iula_count_vectorizer.fit_transform(texts_df)
freq_iula


<522517x4384022 sparse matrix of type '<class 'numpy.int64'>'
	with 70525700 stored elements in Compressed Sparse Row format>

In [22]:
freq_iula_sum = pd.Series(
    np.squeeze(np.asarray(freq_iula.sum(axis=0))),
    index=iula_count_vectorizer.get_feature_names_out(),
)
iula_total = freq_iula.sum()
print(f"{iula_total=}")

iula_freq_len_mapping = freq_iula_sum.groupby(lambda x: len(x)).apply(lambda x: set(x.index.to_list()))
freq_iula_sum


iula_total=83383309


aa                                  30
aa aarmy                             1
aa b                                 1
aa bc                                1
aa c                                 1
                                    ..
zzzzzzzzzzzzz epsom                  1
zzzzzzzzzzzzzssssssssssssss          1
zzzzzzzzzzzzzssssssssssssss hear     1
zzzzzzzzzzzzzzzz                     1
zzzzzzzzzzzzzzzz did                 1
Length: 4384022, dtype: int64

In [10]:
freq_iula_sum.sort_values(ascending=False)


and                   3384649
the                   3183145
to                    1723182
in                    1610827
with                  1150063
                       ...   
ingredients heard           1
ingredients heart           1
ingredients hearty          1
ingredients hello           1
ﬂuffy like                  1
Length: 3486587, dtype: int64

In [23]:
def edits1_with_correction(word):
    "All edits that are one edit away from `word`."
    letters = "abcdefghijklmnopqrstuvwxyz"
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [(L + R[1:], f"{R[0]}| ") for L, R in splits if R]
    transposes = [
        (L + R[1] + R[0] + R[2:], f"{R[0]+R[1]}|{R[1]+R[0]}")
        for L, R in splits
        if len(R) > 1
    ]
    replaces = [(L + c + R[1:], f"{R[0]}|{c}") for L, R in splits if R for c in letters]
    inserts = [(L + c + R, f" |{c}") for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)


def get_candidates(word, len_mapping):
    # Get known words
    return {
        candidate
        for candidate in edits1_with_correction(word)
        if (
            len(candidate[0]) > 1 and candidate[0] in len_mapping[len(candidate[0])] and candidate[0] != word
        )
    }


def try_get(df: pd.DataFrame, key, default):
    return df.loc[key].values[0] if key in df.index else default

def calc_xw(x: str):
    target = x.split("|")
    if target[1] == " ":
        target = target[0]
    else:
        target = target[1]
    return try_get(norvig, x, 0) / try_get(freq_df, target, 1)


def get_corrections(word):
    terms_candidates = [get_candidates(term, iula_freq_len_mapping) for term in word.split()]
    query = [candidate[0] for candidate in terms_candidates]

    transformed_query = [
        iula_count_vectorizer.inverse_transform(iula_count_vectorizer.transform([q]))
        for q in query
    ]
    query_freq = pd.Series(
        [
            freq_iula_sum.T.loc[tq[0]].values[0] if len(tq[0]) > 0 else 0
            for tq in transformed_query
        ],
        index=query,
    )

    IULA = pd.DataFrame(query_freq, columns=["frequency"])
    IULA = IULA.join(
        pd.DataFrame(
            pd.Series([candidate[1] for candidate in candidates], index=query),
            columns=["correction"],
        )
    )
    IULA_pop = iula_total
    IULA["P(w)"] = IULA["frequency"] / IULA_pop
    IULA["P(x|w)"] = IULA["correction"].apply(lambda x: calc_xw(x))
    IULA["10^9 P(x|w)P(w)"] = 1e9 * IULA["P(w)"] * IULA["P(x|w)"]
    return IULA.sort_values("10^9 P(x|w)P(w)", ascending=False)


In [24]:
get_corrections("bef wellingon")


Unnamed: 0,frequency,correction,P(w),P(x|w),10^9 P(x|w)P(w)
beef wellingon,74536,|e,0.000894,9e-06,8.472961


In [212]:
word = "boiled beef wellingon"
query = [[candidate[0] for candidate in get_candidates(term, iula_freq_len_mapping)] for term in word.split()]
corrections = [{candidate[0]: candidate[1] for candidate in get_candidates(term, iula_freq_len_mapping)} for term in word.split()]
corrections={k:v for d in corrections for k,v in d.items()}
combinations = np.array(np.meshgrid(*query)).T.reshape(-1,len(word.split()))
joined_combis = [" ".join(combi) for combi in combinations]
transformed_query = iula_count_vectorizer.inverse_transform(iula_count_vectorizer.transform(joined_combis))
transformed_query = [[query_term for query_term in [*query] if len(query_term.split()) == 1] for query in transformed_query]
scores = dict()
for i, terms in enumerate(transformed_query):
    term_freq = pd.Series([freq_iula_sum.get(term) for term in terms], index=terms)
    print(term_freq)
    IULA = IULA.join(
        pd.DataFrame(
            pd.Series([candidate[1] for candidate in candidates], index=query),
            columns=["correction"],
        )
    )
    bigrams = [term for term in terms if len(term.split()) > 1]
    scores[" ".join(transformed_query[i])] = np.prod([term_freq.loc[bigram]/term_freq.loc[bigram.split()[0]] for bigram in bigrams])
out = pd.DataFrame.from_dict(scores, orient='index', columns=["score"])
out.sort_values("score", ascending=False)


boiler        5457
eef              1
wellington     233
dtype: int64
boiler        5457
deef             1
wellington     233
dtype: int64
boiler        5457
reef            19
wellington     233
dtype: int64
bef              3
boiler        5457
wellington     233
dtype: int64
beefy          541
boiler        5457
wellington     233
dtype: int64
beek             1
boiler        5457
wellington     233
dtype: int64
beel             2
boiler        5457
wellington     233
dtype: int64
bee            241
boiler        5457
wellington     233
dtype: int64
beefs           12
boiler        5457
wellington     233
dtype: int64
bees            57
boiler        5457
wellington     233
dtype: int64
beaf             5
boiler        5457
wellington     233
dtype: int64
beet          1827
boiler        5457
wellington     233
dtype: int64
beez             2
boiler        5457
wellington     233
dtype: int64
beefl            1
boiler        5457
wellington     233
dtype: int64
beep           150
b

Unnamed: 0,score
boiler eef wellington,1.0
beefs boyled wellinton,1.0
beaf boyled wellinton,1.0
beet boyled wellinton,1.0
beez boyled wellinton,1.0
...,...
beet coiled wellington,1.0
beez coiled wellington,1.0
beefl coiled wellington,1.0
beep coiled wellington,1.0


In [189]:
[round for round in np.array(c).T]


[['boiler eef',
  'boiler deef',
  'boiler reef',
  'boiler bef',
  'boiler beefy',
  'boiler beek',
  'boiler beel',
  'boiler bee',
  'boiler beefs',
  'boiler bees',
  'boiler beaf',
  'boiler beet',
  'boiler beez',
  'boiler beefl',
  'boiler beep',
  'boiler peef',
  'boiler beed',
  'boiler geef',
  'boiler beeef',
  'boiler beeff',
  'boiler beer',
  'boiler beeg',
  'boiler boef',
  'boiler beff',
  'boiler teef',
  'boiler beef',
  'broiled eef',
  'broiled deef',
  'broiled reef',
  'broiled bef',
  'broiled beefy',
  'broiled beek',
  'broiled beel',
  'broiled bee',
  'broiled beefs',
  'broiled bees',
  'broiled beaf',
  'broiled beet',
  'broiled beez',
  'broiled beefl',
  'broiled beep',
  'broiled peef',
  'broiled beed',
  'broiled geef',
  'broiled beeef',
  'broiled beeff',
  'broiled beer',
  'broiled beeg',
  'broiled boef',
  'broiled beff',
  'broiled teef',
  'broiled beef',
  'loiled eef',
  'loiled deef',
  'loiled reef',
  'loiled bef',
  'loiled beefy',
  

In [166]:
word = "boiled bef wellingon"

query = [[candidate[0] for candidate in get_candidates(term, iula_freq_len_mapping)] + [term] for term in word.split()]
combinations = np.array(np.meshgrid(*query)).T.reshape(-1,len(word.split()))
combinations
joined_combis = [" ".join(combi) for combi in combinations]
joined_combis


['boiler bof wellington',
 'boiler bnf wellington',
 'boiler bet wellington',
 'boiler bel wellington',
 'boiler bed wellington',
 'boiler ref wellington',
 'boiler beg wellington',
 'boiler ber wellington',
 'boiler bea wellington',
 'boiler ef wellington',
 'boiler nef wellington',
 'boiler bee wellington',
 'boiler bif wellington',
 'boiler bep wellington',
 'boiler lef wellington',
 'boiler baf wellington',
 'boiler bsf wellington',
 'boiler def wellington',
 'boiler bev wellington',
 'boiler tef wellington',
 'boiler befc wellington',
 'boiler bei wellington',
 'boiler bff wellington',
 'boiler eef wellington',
 'boiler boef wellington',
 'boiler beef wellington',
 'boiler bf wellington',
 'boiler bec wellington',
 'boiler bew wellington',
 'boiler beaf wellington',
 'boiler beff wellington',
 'boiler beh wellington',
 'boiler ben wellington',
 'boiler bey wellington',
 'boiler bek wellington',
 'boiler bef wellington',
 'broiled bof wellington',
 'broiled bnf wellington',
 'broil