# Module 4 Hands-on

## Module Preparation

### The Essential Libraries

In [1]:
import pandas as pd # do some data
import string
import timeit # just import for timer
import numpy as np
import multiprocessing as mp
import functools

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer # tf-idf built in function
from scipy import sparse

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

---

## Probability Table Generation

### Coca [Sample](https://www.english-corpora.org/coca/)

In [2]:
# Words under comparison and the counts entered for them (see the COCA link in the heading).
coca_counts = {
    'defeat': 21947,
    'decet': 6,
    'defect': 3973,
    'deft': 1240,
    'defer': 2239,
    'Deeft': 0,
}
COCA = pd.DataFrame({
    'word': list(coca_counts.keys()),
    'frequency': list(coca_counts.values()),
})

# Denominator that turns a raw count into P(w); presumably the corpus size in tokens.
COCA_pop = 1e9
COCA['P(w)'] = COCA['frequency'].div(COCA_pop)

# Rank 1 = highest count. rank() averages tied positions by default and
# astype(int) then truncates that average; there are no ties in this table.
COCA['rank'] = COCA['frequency'].rank(ascending=False).astype(int)

COCA.head(6)

Unnamed: 0,word,frequency,P(w),rank
0,defeat,21947,2.1947e-05,1
1,decet,6,6e-09,5
2,defect,3973,3.973e-06,2
3,deft,1240,1.24e-06,4
4,defer,2239,2.239e-06,3
5,Deeft,0,0.0,6


### Wikipedia [sample](https://www.english-corpora.org/wiki/)

In [3]:
# Counts entered for the same six words (see the Wikipedia corpus link in the heading),
# keyed by word so the table can be built from a single Series.
wiki_counts = pd.Series(
    {
        'defeat': 121408,
        'decet': 81,
        'defect': 7793,
        'deft': 814,
        'defer': 1416,
        'Deeft': 0,
    },
    name='frequency',
)
# Move the words out of the index into a 'word' column, leaving a 0..5 row index.
WIKI = wiki_counts.rename_axis('word').reset_index()

# Denominator that turns a raw count into P(w); presumably the corpus size in tokens.
WIKI_pop = 1.9e9
WIKI['P(w)'] = WIKI['frequency'].div(WIKI_pop)

# Rank 1 = highest count; tied positions would be averaged and then truncated by astype(int).
WIKI['rank'] = WIKI['frequency'].rank(ascending=False).astype(int)

WIKI.head(6)

Unnamed: 0,word,frequency,P(w),rank
0,defeat,121408,6.389895e-05,1
1,decet,81,4.263158e-08,5
2,defect,7793,4.101579e-06,2
3,deft,814,4.284211e-07,4
4,defer,1416,7.452632e-07,3
5,Deeft,0,0.0,6


### IULA Spanish-English Technical Corpus [Sample](https://repositori.upf.edu/handle/10230/20052)

In [4]:
# Counts entered for the same six words (see the IULA corpus link in the heading).
iula_words = ['defeat', 'decet', 'defect', 'deft', 'defer', 'Deeft']
iula_hits = [11, 0, 180, 0, 11, 0]
IULA = pd.DataFrame({'word': iula_words, 'frequency': iula_hits})

# Denominator that turns a raw count into P(w); presumably the corpus size in tokens.
IULA_pop = 2.1e6
IULA['P(w)'] = IULA['frequency'].div(IULA_pop)

# Rank 1 = highest count. This table has ties: rank() gives tied rows the average
# of their positions (2.5 for the two 11s, 5.0 for the three 0s) and astype(int)
# truncates that average, which is why the tied rows show 2 and 5.
IULA['rank'] = IULA['frequency'].rank(ascending=False).astype(int)

IULA.head(6)

Unnamed: 0,word,frequency,P(w),rank
0,defeat,11,5e-06,2
1,decet,0,0.0,5
2,defect,180,8.6e-05,1
3,deft,0,0.0,5
4,defer,11,5e-06,2
5,Deeft,0,0.0,5


As you can see, COCA and Wikipedia produce the same ranks for these words (with `Deeft` last, at rank 6), whereas IULA produces different ranks.

This shows that the choice of corpus matters: "Corpus does matter".

---

## Update the tables with Norvig's counts and calculate the final probability

### Channel model probability: we use the collected list of errors from [Peter Norvig's collection](http://norvig.com/ngrams/)

#### from [count_1edit.txt](http://norvig.com/ngrams/count_1edit.txt)

In [5]:
# Tab-separated, header-less file: an edit key such as 'e|i' followed by its count.
NORVIG_EDIT_URL = 'http://norvig.com/ngrams/count_1edit.txt'
edit_counts = pd.read_csv(
    NORVIG_EDIT_URL,
    sep='\t',
    encoding="ISO-8859-1",
    header=None,
)
edit_counts.columns = ['term', 'edit']

# Index by the edit key so a single edit can be looked up with norvig.loc[key].
norvig = edit_counts.set_index('term')
norvig.head()

Unnamed: 0_level_0,edit
term,Unnamed: 1_level_1
e|i,917
a|e,856
i|e,771
e|a,749
a|i,559


#### from [count_big.txt](http://norvig.com/ngrams/count_big.txt)

In [6]:
# Tab-separated, header-less file: a term followed by its count.
NORVIG_WORDS_URL = 'http://norvig.com/ngrams/count_big.txt'
word_counts = pd.read_csv(
    NORVIG_WORDS_URL,
    sep='\t',
    encoding="ISO-8859-1",
    header=None,
)
word_counts.columns = ['term', 'freq']

# Drop rows with a missing value in either column; the surviving rows keep their
# original row labels, so the index may have gaps afterwards.
norvig_orig = word_counts.dropna()
norvig_orig.head()

Unnamed: 0,term,freq
0,a,21160
1,aah,1
2,aaron,5
3,ab,2
4,aback,3


### P(X|W)

In [10]:
# Denominators for the channel model: one count per term of count_big.txt,
# computed in parallel by `get_count`.
# NOTE(review): `get_count` is not defined in any cell shown in this notebook, this
# cell's execution count (In [10]) is out of order with the cells after it
# (In [8], In [9]), and the saved output is a KeyError. Define `get_count` above
# this cell and confirm the notebook passes Restart Kernel -> Run All.
character_set = norvig_orig['term']
with mp.Pool(processes=8) as pool:
    freq_list = pool.map(functools.partial(get_count, norvig_orig=norvig_orig), character_set)

# Pair each term with its count by position. `norvig_orig` went through dropna(),
# so its row labels may have gaps; handing the Series and a plain list to the
# DataFrame constructor together pairs them by index label rather than by
# position, which can attach counts to the wrong terms or leave NaN.
freq_df = pd.DataFrame(
    {'freq': freq_list},
    index=pd.Index(character_set.values, name='char'),
)

# (edit key in count_1edit.txt, term whose count is the denominator), listed in
# the same order as the rows of COCA because the result is assigned positionally.
edit_and_context = [
    ('e|ea', 'ea'),
    ('f|c', 'c'),
    ('e|ec', 'ec'),
    ('e| ', 'e'),
    ('t|r', 'r'),
    ('fe|ef', 'ef'),
]
COCA['P(x|w)'] = [
    (norvig.loc[edit].values / freq_df.loc[context].values)[0]
    for edit, context in edit_and_context
]
COCA.head(6)

KeyError: "None of ['term'] are in the columns"

### P(x|w)P(w) — Using COCA

In [8]:
# Prior from COCA scaled by 1e9, then weighted by the channel probability P(x|w).
coca_scaled_prior = COCA['P(w)'] * 1e9
COCA['109 P(x|w)P(w)'] = coca_scaled_prior * COCA['P(x|w)']

KeyError: 'P(x|w)'

### P(x|w)P(w) — Using IULA

In [9]:
# Reuse the channel probabilities stored on the COCA frame; the assignment
# aligns on the row index, which both frames share (0..5).
channel_prob = COCA['P(x|w)']
IULA['P(x|w)'] = channel_prob

# Prior from IULA scaled by 1e9, then weighted by the channel probability P(x|w).
iula_scaled_prior = IULA['P(w)'] * 1e9
IULA['109 P(x|w)P(w)'] = iula_scaled_prior * IULA['P(x|w)']

KeyError: 'P(x|w)'