# Create dictionary for spellchecking
Data from: [RAE](http://corpus.rae.es/lfrecuencias.html)
Parse it into a word-frequency dictionary for pyspellchecker.

In [None]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell executes, so edits to imported project code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd

In [None]:
# Read the tab-separated CREA frequency listing (latin-1 encoded).
# na_filter=False turns off pandas' NA detection so every cell stays a
# literal string instead of being converted to NaN.
data = pd.read_csv(
    '../data/external/CREA_total.TXT',
    sep="\t",
    encoding='latin-1',
    na_filter=False,
)
# Only the first two columns are used downstream.
data = data.iloc[:, [0, 1]]

In [None]:
# Name the two retained columns: "palabra" (word) and "freq" (frequency).
data.columns = ["palabra", "freq"]

In [None]:
# Turn the frequency column into integers. The values are treated as text
# that may carry comma separators and spaces (presumably thousands separators
# and padding -- confirm against the raw file).
# Both removals must go through the `.str` accessor: a plain `Series.replace`
# only swaps cells whose *entire* value equals " ", so embedded spaces were
# never removed. regex=False makes both replacements literal on every pandas
# version.
data.freq = (
    data.freq.astype('str')
    .str.replace(',', '', regex=False)
    .str.replace(' ', '', regex=False)
    .astype('int')
)
# Drop surrounding whitespace from the words themselves.
data.palabra = data.palabra.str.strip()

In [None]:
# data[data.freq.isna()],
# data[data.palabra.str.contains('banco santander')]

In [None]:
# Build the word -> frequency mapping from the (palabra, freq) rows.
# When a word appears more than once, the last row wins.
spanish_freq = dict(data.values.tolist())

In [None]:
import json

# Write the word-frequency mapping to JSON. ensure_ascii=False keeps accented
# characters as-is instead of \u escapes.
with open('../data/external/dict.json', mode='w', encoding='utf-8') as fh:
    json.dump(spanish_freq, fh, ensure_ascii=False)

In [None]:
from spellchecker import SpellChecker

# Create a case-insensitive checker with language='es'.
# NOTE(review): per the pyspellchecker docs, language='es' loads the bundled
# Spanish dictionary; pass language=None instead if only the custom word list
# should be used -- confirm which is intended.
spell = SpellChecker(language='es', case_sensitive=False)

# Add the word frequencies stored in dict.json to the checker.
spell.word_frequency.load_dictionary('../data/external/dict.json')

# Export the resulting dictionary (gzipped) for later use.
spell.export('../data/interim/dictionary.gz', gzipped=True)

In [None]:
# Sanity check: print the correction candidates for every word of a sample
# sentence that the checker does not recognise.
sample_words = "Tengo que ir al bancosantander".split(' ')
for unknown_word in spell.unknown(sample_words):
    print(spell.candidates(unknown_word))

In [None]:
# Rebuild the checker from the exported dictionary only.
# SpellChecker() with no arguments loads the default English dictionary, which
# would then be merged with the Spanish words loaded below; language=None
# starts from an empty word list instead.
spell = SpellChecker(language=None)
spell.word_frequency.load_dictionary('../data/interim/dictionary.gz')

In [None]:
# Repeat the sanity check with the reloaded checker: list the candidates for
# each unrecognised word of the sample sentence.
probe_tokens = "Tengo que ir al bancosantander".split(' ')
unrecognised = spell.unknown(probe_tokens)
for token in unrecognised:
    print(spell.candidates(token))

In [None]:
## Clean the dataset?

In [None]:
import os

In [None]:
# Project data locations, all relative to the notebook's working directory.
DATA_DIR = '../data'
RAW_DIR = os.path.join(DATA_DIR, 'raw/')
EXTERNAL_DIR = os.path.join(DATA_DIR, 'external/')
TRAIN_DATA = os.path.join(RAW_DIR, 'train.csv')
# TEST_DATA is read when the test set is loaded later in this notebook but was
# never defined, which raised NameError there.
# NOTE(review): the file name is assumed to mirror train.csv -- confirm.
TEST_DATA = os.path.join(RAW_DIR, 'test.csv')
# Despite the _DIR suffix this is the path of the stopwords file itself.
STOPWORDS_DIR = os.path.join(EXTERNAL_DIR, 'stopwords.txt')

In [None]:
# Load the pipe-delimited, UTF-8 encoded training set.
df = pd.read_csv(
    TRAIN_DATA,
    sep='|',
    encoding='utf-8',
)

In [None]:
from src.features.build_features import Cleaner

In [None]:
# Instantiate the project's Cleaner (src.features.build_features) with its
# default settings; its sentence_cleaning method is used by replace_unknown.
cleaner = Cleaner()

In [None]:
from nltk.tokenize import word_tokenize
from tqdm import tqdm

def replace_unknown(r):
    """Return ``r`` with each unrecognised word replaced by its correction.

    ``r`` is passed through ``cleaner.sentence_cleaning`` and the result is
    handed to ``spell.unknown``; every word reported as unknown is looked up
    with ``spell.correction``. ``spell`` and ``cleaner`` are the notebook-level
    objects. Replacements are applied to the original text ``r``, matching
    whole words only and in a single pass, so a misspelling that happens to be
    a substring of another word is left alone and one correction can never be
    rewritten by another.

    Args:
        r: Text to correct. NOTE(review): assumed to be a ``str`` and
            ``cleaner.sentence_cleaning`` is assumed to return an iterable of
            word tokens -- confirm against ``Cleaner``.

    Returns:
        The corrected text, or ``r`` unchanged when nothing needs replacing.
    """
    import re  # local import keeps the function self-contained in its cell

    fixes = {}
    for word in spell.unknown(cleaner.sentence_cleaning(r)):
        if not word:
            # An empty token would match at every position in the text.
            continue
        suggestion = spell.correction(word)
        # Newer pyspellchecker releases return None when no candidate exists;
        # skip those (and no-op corrections) instead of failing in replace.
        if suggestion is not None and suggestion != word:
            fixes[word] = suggestion

    if not fixes:
        return r

    # Longest alternatives first so the regex prefers the longest misspelling.
    alternatives = '|'.join(
        re.escape(word) for word in sorted(fixes, key=len, reverse=True)
    )
    # Lookarounds instead of \b so tokens that start or end with a non-word
    # character are still matched as whole units.
    pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)')
    # A callable replacement avoids backslash/group expansion in corrections.
    return pattern.sub(lambda match: fixes[match.group(0)], r)

def tqdemizado(s, pbar):
    """Advance ``pbar`` by one step, then return ``replace_unknown(s)``.

    Args:
        s: Text forwarded to ``replace_unknown``.
        pbar: Progress bar exposing ``update``; ticked before the correction
            runs.

    Returns:
        Whatever ``replace_unknown(s)`` returns.
    """
    pbar.update(1)
    corrected = replace_unknown(s)
    return corrected

# Correct every question while showing progress.
# The bar is created here because `pbar` was not yet defined at this point
# when the notebook is run top to bottom (NameError on the first row); the
# context manager also closes the bar if a row raises.
with tqdm(total=len(df.Pregunta)) as pbar:
    df["Pregunta_clean"] = df.Pregunta.apply(lambda s: tqdemizado(s, pbar))


In [None]:
# Correct every question with a progress bar that actually advances: the rows
# go through tqdemizado (which ticks `pbar`) rather than calling
# replace_unknown directly, which left the bar stuck at zero.
# The context manager closes the bar even if a row raises.
# TODO: fold the progress update into a decorator.
with tqdm(total=len(df.Pregunta)) as pbar:
    df["Pregunta_clean"] = df.Pregunta.apply(lambda s: tqdemizado(s, pbar))

In [None]:
# Number of rows whose corrected question is not missing.
df["Pregunta_clean"].notna().sum()

In [None]:
# Count the original questions containing the probe string 'hombamnkin'.
df["Pregunta"].str.contains('hombamnkin').sum()

In [None]:
# Save the training frame, including Pregunta_clean, without the index.
df.to_parquet(
    '../data/interim/train.parquet',
    index=False,
)

In [None]:
# Apply the same spelling correction to the comma-delimited test set and save
# the result as parquet without the index.
test = pd.read_csv(TEST_DATA, sep=',')
test["Pregunta_clean"] = test["Pregunta"].apply(replace_unknown)
test.to_parquet(
    '../data/interim/test.parquet',
    index=False,
)