In [None]:
import glob
from symspellpy import SymSpell, Verbosity
from symspellpy.editdistance import EditDistance, DistanceAlgorithm
import pkg_resources
from itertools import islice
import re
from unidecode import unidecode
import pandas as pd
from collections import Counter
from datetime import datetime
import numpy as np
import json

In [None]:
# Load the items table from a zipped, semicolon-delimited CSV file.
df = pd.read_csv("../data/items_druid_original.csv.zip", sep=";")

In [None]:
# Display the loaded frame for a quick visual inspection.
df

In [None]:
# Copy of the frame ordered by the `original` column.
temp = df.sort_values(by="original")


In [None]:
# Print every description containing, at the start of the text or right after
# whitespace, a single digit/capital letter repeated at least five times and
# immediately followed by five or more capital letters.
#
# The pattern has to be a raw string: in a regular string literal "\1" is the
# octal escape for chr(1), not a backreference to group 1, so without the `r`
# prefix the repeated-character test can never succeed.
exp = re.compile(r"(?:^|\s)([0-9A-Z])\1{4,}[A-Z]{5,}")
for text in temp["original"]:
    if exp.search(text):
        print(text)

In [None]:
has_number = re.compile(r"[0-9]")


def normalize(text, min_length=3):
    """Turn a free-text description into a list of clean word tokens.

    Steps, in order:
      1. lower-case the text and fold it to ASCII with ``unidecode``;
      2. blank out list-item numbering with three or more dot-separated
         parts, e.g. the "5.2.4" in "5.2.4 ENCUNHAME...";
      3. replace every character outside ``[a-z0-9]`` with a space;
      4. split on whitespace and keep only tokens that contain no digit and
         are at least ``min_length`` characters long.

    Args:
        text: The string to normalize.
        min_length: Minimum number of characters a token must have to be
            kept. Defaults to 3.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    text = unidecode(text.lower())
    # Every dot is escaped so that only real "1.2.3"-style numbering matches.
    text = re.sub(r"\d+\.\d+(?:\.\d+)+", " ", text)
    text = re.sub(r"[^a-z0-9]", " ", text)
    # Compare the token's *length* with the threshold; comparing the string
    # itself with an int raises TypeError on Python 3.
    return [
        token
        for token in text.split()
        if len(token) >= min_length and not has_number.search(token)
    ]

# Build the dictionary

In [None]:
# Run every value of the `original` column through normalize() and gather
# the resulting tokens into a single flat list.
all_tokens = [token for text in df["original"] for token in normalize(text)]

In [None]:
# Number of occurrences of each normalized token across all descriptions.
freq = Counter(all_tokens)

In [None]:
# List, most frequent first, every token whose last character is a slash
# together with its count.
slash_terminated = [
    (token, count)
    for token, count in freq.most_common()
    if token.endswith("/")
]
for token, count in slash_terminated:
    print(token, count)

In [None]:
# NOTE(review): the word list is read with the platform's default encoding;
# confirm that this matches the file's actual encoding.
with open("../data/dicionario/br-words.txt") as f:
    raw_words = f.readlines()

# Tokens seen in the corpus, and the reference word list folded to
# lower-case ASCII (the same folding normalize() applies to corpus text).
corpus_vocab = set(freq)
dictionary_vocab = {unidecode(word.lower().strip()) for word in raw_words}

# Corpus tokens that are dictionary words ...
known_words = dictionary_vocab & corpus_vocab
# ... and corpus tokens that are not: the candidates for spelling correction.
diff = corpus_vocab - known_words

# One "<word> <corpus frequency>" entry per line, in alphabetical order.
# The output file is opened only after the sets above were computed, so a
# failure while reading or normalizing does not leave a truncated dictionary.
with open("../data/dicionario/br-words-preprocessed.txt", "wt", encoding="utf-8") as f:
    f.writelines(f"{word} {freq[word]}\n" for word in sorted(known_words))
            

# Select valid replacements

In [None]:
# A suggested term must occur more often than this in the corpus before it
# is trusted as the correct spelling of an out-of-dictionary word.
MIN_SUGGESTION_COUNT = 1000

sym_spell = SymSpell()
# load_dictionary() signals a missing file by returning False instead of
# raising, which would silently leave `replacements` empty.
if not sym_spell.load_dictionary("../data/dicionario/br-words-preprocessed.txt", 0, 1):
    raise FileNotFoundError("../data/dicionario/br-words-preprocessed.txt")

replacements = {}
# `diff` is a set: iterate it in sorted order so the printed log and the key
# order of `replacements` are the same on every run.
for word in sorted(diff):
    suggestions = sym_spell.lookup(
        word, Verbosity.CLOSEST, max_edit_distance=1, ignore_token=r"\w+\d"
    )
    # Accept only unambiguous corrections: exactly one closest candidate
    # within edit distance 1.
    if len(suggestions) != 1:
        continue
    suggestion = suggestions[0]
    if suggestion.count > MIN_SUGGESTION_COUNT:
        print(f'replace "{word}" by "{suggestion.term}"')
        replacements[word] = suggestion.term
        


In [None]:
# Percentage of all token occurrences in the corpus that belong to a word
# present in the replacement table.
total = sum(freq[word] for word in replacements)

100 * total / sum(freq.values())

In [None]:
# Persist the misspelling -> correction mapping as a JSON document.
serialized = json.dumps(replacements)
with open("../data/dicionario/replacement.json", "wt", encoding="utf-8") as f:
    f.write(serialized)