In [4]:
import pandas as pd
import numpy as np
import multiprocessing
import nltk
import subprocess
# nltk.download('stopwords')
from difflib import SequenceMatcher
from os import cpu_count
from unidecode import unidecode
from tqdm import tqdm
tqdm.pandas(desc="my bar!")
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem.lancaster import LancasterStemmer

In [5]:
# Read the raw parquet file for the selected year and keep only the two text
# columns plus the three classification columns.
year = 2013
df = pd.read_parquet(
    f'../datasets/raw/{year}.parquet',
    engine='pyarrow',
).loc[:, ['resumo', 'palavra_chave', 'subareas', 'areas', 'colegios']]

In [6]:
# Normalise the free-text keyword field into a ';'-separated list.
#
# The result is assigned back to the column instead of calling
# `df.palavra_chave.replace(..., inplace=True)`: that form mutates a temporary
# Series, which pandas warns about and which leaves `df` untouched once
# Copy-on-Write is active. The substitutions are order-dependent, so the
# sequence below must not be rearranged.
df['palavra_chave'] = (
    df['palavra_chave']
    # "1." and " <digit>." enumeration markers become separators.
    .replace(to_replace=r'1[.]', value=';', regex=True)
    .replace(to_replace=r' \d[.]', value=';', regex=True)
    # A dot that follows two or more non-space characters becomes a separator.
    .replace(to_replace=r'(\S{2,})(?:\.)', value='\\1;', regex=True)
    # Stand-alone "pt" / "en" tokens become separators.
    .replace(to_replace=r'\b(pt|en)\b', value=';', regex=True)
    # Commas and slashes that touch a space become separators.
    .replace(to_replace=r' ,|, | [,] | /|/ | [/] ', value=';', regex=True)
    # An isolated dot or a "(<digit>)" marker becomes a separator.
    .replace(to_replace=r' [.] |\(\d\)', value=';', regex=True)
    # NOTE(review): \x96 / \x93 are presumably Windows-1252 dash/quote residue
    # from the source encoding - confirm.
    .replace(to_replace=r' \x96 |\x93', value=';', regex=True)
    # Drop everything except lowercase ASCII letters, digits, whitespace, the
    # \x80-\xff range and basic punctuation. Uppercase letters are removed too.
    # NOTE(review): assumes the text is already lower-cased - confirm.
    .replace(to_replace=r'[^a-z0-9\s\x80-\xff,.:;?!-]', value='', regex=True)
    # Spaced hyphens and padded semicolons become a bare separator.
    .replace(to_replace=r' - |; | ;|; ;', value=';', regex=True)
    # Collapse runs of separators.
    .replace(to_replace=r';{2,}', value=';', regex=True)
    # Trim trailing non-word characters/underscores and leading separators.
    .replace(to_replace=r'[\W_]+$|^[;]+', value='', regex=True)
    # Remove the "palavras-chave" label together with the character after it.
    .replace(to_replace=r'palavras-chave\S', value='', regex=True)
    # Trim surrounding whitespace and drop any remaining \x80.
    .replace(to_replace=r'^\s+|\s+$|\x80', value='', regex=True)
    # Finally keep only word characters, whitespace and the separator.
    .replace(to_replace=r'[^\w\s;]', value='', regex=True)
)

In [7]:
# Keep only ASCII letters, digits and whitespace in the abstracts.
# The result is assigned back to the column: calling `replace(..., inplace=True)`
# on `df.resumo` mutates a temporary Series and does not update `df` once pandas
# Copy-on-Write is active.
# NOTE(review): this pattern also deletes accented letters, and it runs before
# the stop-word filter and `unidecode`, so accented stop words can no longer
# match and `unidecode` has nothing left to transliterate here - confirm this
# is intended.
df['resumo'] = df['resumo'].replace(to_replace=r'[^a-zA-Z0-9\s]+', value='', regex=True)

In [8]:
def remove_stopwords(text: str) -> str:
    """Return ``text`` without Portuguese stop words.

    The text is split with ``wordpunct_tokenize``; every token that appears
    verbatim (exact, case-sensitive comparison) in
    ``stopwords.words('portuguese')`` is discarded and the surviving tokens
    are joined back with single spaces.
    """
    portuguese_stops = frozenset(stopwords.words('portuguese'))
    kept_tokens = filter(
        lambda token: token not in portuguese_stops,
        wordpunct_tokenize(text),
    )
    return " ".join(kept_tokens)

In [9]:
def lc_stem(text: str) -> str:
    """Return ``text`` with every token reduced by the Lancaster stemmer.

    Tokens come from ``wordpunct_tokenize`` and the stems are joined back with
    single spaces. A new ``LancasterStemmer`` is created on each call.
    """
    stem = LancasterStemmer().stem
    return " ".join(map(stem, wordpunct_tokenize(text)))

In [10]:
# Run both text columns through the same pipeline:
# stop-word removal -> Lancaster stemming -> ASCII transliteration.
# `unidecode` is passed directly; the former `lambda x: unidecode(x)` wrapper
# added nothing.
df['resumo'] = (
    df['resumo']
    .progress_apply(remove_stopwords)
    .progress_apply(lc_stem)
    .progress_apply(unidecode)
)
df['palavra_chave'] = (
    df['palavra_chave']
    .progress_apply(remove_stopwords)
    .progress_apply(lc_stem)
    .progress_apply(unidecode)
)
# The helpers rejoin tokens with single spaces; collapse ' ; ' back to ';'.
# Assigned back to the column because a chained `replace(..., inplace=True)`
# does not update `df` once pandas Copy-on-Write is active.
df['palavra_chave'] = df['palavra_chave'].replace(to_replace=r' ; ', value=';', regex=True)
# The helpers are deleted here, so the cells defining them must be re-run
# before this cell can be executed again.
del remove_stopwords, lc_stem

my bar!: 100%|██████████| 67534/67534 [00:09<00:00, 7379.67it/s]
my bar!: 100%|██████████| 67534/67534 [01:01<00:00, 1105.61it/s]
my bar!: 100%|██████████| 67534/67534 [00:00<00:00, 1431565.43it/s]
my bar!: 100%|██████████| 67534/67534 [00:03<00:00, 17544.35it/s]
my bar!: 100%|██████████| 67534/67534 [00:05<00:00, 13452.53it/s]
my bar!: 100%|██████████| 67534/67534 [00:00<00:00, 145350.47it/s]


In [11]:
# Discard rows whose keyword field or abstract is an empty string, then
# renumber the remaining rows from 0.
df = (
    df
    .drop(index=df.query("palavra_chave == '' | resumo == ''").index)
    .reset_index(drop=True)
)

In [12]:
# Build the vocabulary of distinct keywords found in `palavra_chave`.
#
# Fixes over the previous version:
# * the character filter used `str.replace`, which treats the pattern as a
#   literal string, so nothing was ever removed; it is now applied as a regex;
# * whitespace is stripped *before* the length and digit filters, so they test
#   the keyword itself rather than its padding;
# * the result is sorted, so slices such as `key_words[:3]` are the same on
#   every run (plain `list(set(...))` order varies between kernels).
key_words = (
    df['palavra_chave']
    .str.split(';')
    .explode()
    .str.replace(r'[^a-zA-Z0-9\s]+', '', regex=True)
    .str.strip()
)
# Keep keywords of 3 to 79 characters that are not purely numeric.
key_words = sorted({
    item for item in key_words
    if 2 < len(item) < 80 and not item.isdigit()
})

In [13]:
# Display the number of distinct keywords collected in `key_words`.
len(key_words)

109190

In [14]:
# Variant of `df` with every space removed from both text columns.
# `assign` returns a new frame, so `df` is left untouched. The previous
# `df_test.<col>.replace(..., inplace=True)` calls mutated a temporary Series
# and do not update the frame once pandas Copy-on-Write is active.
df_test = df.assign(
    palavra_chave=df['palavra_chave'].replace(r' ', '', regex=True),
    resumo=df['resumo'].replace(r' ', '', regex=True),
)

In [15]:
# Distinct keywords once every space has been removed from them.
test = list({item.replace(' ', '') for item in key_words})

In [16]:
# Display the number of distinct keywords left after removing spaces.
len(test)

108773

In [21]:
def calculate_similarity_or_pertency(sentence: str, text: str, threshold: int = 75) -> bool:
    """Tell whether ``sentence`` and ``text`` contain or closely resemble each other.

    Args:
        sentence: First string (callers in this notebook pass a keyword).
        text: Second string to compare against.
        threshold: Similarity percentage that must be strictly exceeded for a
            fuzzy match. Defaults to 75.

    Returns:
        ``False`` if either string is empty; ``True`` if one string is a
        substring of the other; otherwise ``True`` only when the
        ``SequenceMatcher`` ratio (spaces treated as junk), truncated to an
        integer percentage, is greater than ``threshold``.
    """
    if sentence == '' or text == '':
        return False
    if sentence in text or text in sentence:
        return True

    matcher = SequenceMatcher(lambda x: x == " ", sentence, text)
    # real_quick_ratio() and quick_ratio() are cheap upper bounds on ratio().
    # If either bound already fails the threshold, the expensive ratio() cannot
    # pass, so it is skipped. The outcome is identical to calling ratio() alone.
    for similarity in (matcher.real_quick_ratio, matcher.quick_ratio, matcher.ratio):
        if int(similarity() * 100) <= threshold:
            return False
    return True

In [18]:
def apply_similarity(row, character):
    """Tell whether ``character`` matches the keyword field or the abstract of ``row``.

    Args:
        row: Mapping-like record exposing ``'palavra_chave'`` and ``'resumo'``.
        character: String to look for (despite the name, callers pass a whole
            keyword).

    Returns:
        ``True`` when ``calculate_similarity_or_pertency`` accepts either field.
    """
    # `or` short-circuits: the abstract is only compared when the keyword field
    # did not already match, which skips the costlier of the two comparisons.
    return (
        calculate_similarity_or_pertency(character, row['palavra_chave'])
        or calculate_similarity_or_pertency(character, row['resumo'])
    )

- 50 palavras_chave = 1m 1.3s
- 500 palavras_chave = 9m 24.4s
- 5000 palavras_chave = 126m 2.3s
- 109190 palavras_chave = ?

In [22]:
def process_character(character):
    """Count, per sub-area, how often ``character`` occurs in the matching rows of ``df``.

    Rows of the global ``df`` are selected with ``apply_similarity``. For each
    distinct ``subareas`` value among them (in sorted order) one record is built.

    Args:
        character: String to look for (despite the name, callers pass a whole
            keyword).

    Returns:
        A list of ``[colegio, area, subarea, character, frequencia]`` lists.
        ``frequencia`` is the number of literal occurrences of ``character`` in
        the ``palavra_chave`` and ``resumo`` columns of that sub-area's rows,
        or the number of those rows when there is no literal occurrence.
    """
    import re  # local import so this cell does not depend on an edit to the import cell

    cut = df[df.apply(lambda row: apply_similarity(row, character), axis=1)]
    # `Series.str.count` interprets its argument as a regular expression;
    # escape it so the keyword is always counted literally.
    literal = re.escape(character)
    results = []
    for subarea in sorted(cut.subareas.unique()):
        new_cut = cut[cut.subareas == subarea]
        # First value in row order, the same element `unique()[0]` would return.
        colegio, area = new_cut.colegios.iloc[0], new_cut.areas.iloc[0]
        frequencia = (
            new_cut.palavra_chave.str.count(literal).sum()
            + new_cut.resumo.str.count(literal).sum()
        )
        if frequencia == 0:
            # Rows matched only by fuzzy similarity: fall back to the row count.
            # Every row here has this `subarea`, so this equals the former
            # `value_counts()[0]` without the deprecated positional lookup.
            frequencia = len(new_cut)
        results.append([colegio, area, subarea, character, frequencia])

    return results

# Spread the keywords over one worker process per CPU and flatten the
# per-keyword record lists into `data` as the workers finish (unordered).
# Only the first three keywords are processed here.
max_threads = cpu_count()
with multiprocessing.Pool(processes=max_threads) as pool:
    data = [
        record
        for records in pool.imap_unordered(process_character, key_words[:3])
        for record in records
    ]

In [None]:
# Column name -> dtype for the frequency table: the four label columns are
# categorical and the counter is an unsigned 64-bit integer.
columns_dtype = dict.fromkeys(
    ('colegio', 'area', 'subarea', 'palavra_chave'),
    'category',
)
columns_dtype['frequencia'] = 'uint64'

# Build the table from the collected records and cast every column.
freq = pd.DataFrame(data, columns=list(columns_dtype)).astype(columns_dtype)

In [None]:
# Show the table indexed by keyword, highest frequency first.
# `kind='stable'` keeps the preceding `sort_index()` order among rows with the
# same frequency; the default sort algorithm does not guarantee that, which
# made the `sort_index()` call ineffective.
freq.set_index('palavra_chave').sort_index().sort_values('frequencia', ascending=False, kind='stable')

In [None]:
# Copy of the frequency table ordered from the highest to the lowest `frequencia`.
alo = freq.sort_values('frequencia', ascending=False)