In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sps
import re
import nltk
from scipy import sparse
from nltk import bigrams    
from unicodedata import normalize
from nltk.corpus import stopwords
from collections import Counter

In [2]:
dados = pd.read_csv('data/estadao_noticias_eleicao.csv')

In [3]:
def limpar_texto(texto):
    pattern = re.compile('[^a-zA-Z0-9 ]')
    texto = normalize('NFKD', texto).encode('ASCII', 'ignore').decode('ASCII')
    return pattern.sub(' ', texto)

In [4]:
words = [limpar_texto(stopword) for stopword in stopwords.words('portuguese')]

In [5]:
conteudo = dados.titulo + " " + dados.subTitulo + " " + dados.conteudo
conteudo = conteudo.fillna("")
conteudo = conteudo.apply(limpar_texto)
ids = dados.idNoticia

In [6]:
tokens = conteudo.apply(nltk.word_tokenize)
term_frequence = tokens.apply(Counter)

In [7]:
index = {}

for i in range(len(tokens)):
    id_noticia = ids[i]
    palavras = tokens[i]
    for palavra in palavras:
        palavra = palavra.lower()
        if palavra not in index:
            index[palavra] = {}
        
        id_rec = index[palavra].get(id_noticia)
        
        if not id_rec:
            docs = index[palavra]
            docs[id_noticia] = term_frequence[i][palavra]

In [8]:
def gera_docs_peso(termos):
    docs_peso = {}
    
    for i in range(len(termos)):
        termo = termos[i]
        docs = index[termo]
        for doc_id in docs:
            tf = docs[doc_id]
            
            if doc_id not in docs_peso:
                docs_peso[doc_id] = np.array([0 if j != i else tf for j in range(len(termos))])
            else:
                doc_vector = docs_peso[doc_id]
                doc_vector[i] = tf
    return docs_peso


In [9]:
def gera_query_vetor(termos):
    vetor = np.array([1 if index.get(termo) else 0 for termo in termos])
    return vetor

In [10]:
def busca(termos, gerador_query, gerador_doc_vetor):
    docs_peso = gerador_doc_vetor(termos)
    query = gerador_query(termos)
    
    doc_rank = sorted(list(docs_peso.items()), key=lambda doc: np.dot(doc[1], query), reverse=True)[:5] 
    return [doc[0] for doc in doc_rank]

In [11]:
def buscar_por_tf(termos):
    return busca(termos, gera_query_vetor, gera_docs_peso)

In [12]:
def co_occurrence_matrix(corpus):
    vocab = set(corpus)
    vocab = list(vocab)
    n = len(vocab)
   
    vocab_to_index = {word:i for i, word in enumerate(vocab)}
    
    bi_grams = list(bigrams(corpus))

    bigram_freq = nltk.FreqDist(bi_grams).most_common(len(bi_grams))

    I=list()
    J=list()
    V=list()
    
    for bigram in bigram_freq:
        current = bigram[0][1]
        previous = bigram[0][0]
        count = bigram[1]

        I.append(vocab_to_index[previous])
        J.append(vocab_to_index[current])
        V.append(count)
        
    co_occurrence_matrix = sparse.coo_matrix((V,(I,J)), shape=(n,n))

    return co_occurrence_matrix, vocab_to_index

In [13]:
tokens_lists = conteudo.apply(lambda text: text.lower().split())

In [14]:
tokens = [token for tokens_list in tokens_lists for token in tokens_list if token not in words]

In [15]:
matrix, vocab = co_occurrence_matrix(tokens)

# Consult Bigram Frequency

In [16]:
consultable_matrix = matrix.tocsr()

In [17]:
def consult_frequency(w1, w2):
    return(consultable_matrix[vocab[w1],vocab[w2]])

In [18]:
def get_co_ocurrence(word):
    list_of_occurency = consultable_matrix[vocab[word]].getrow(0).toarray()[0]
    indexs, frequency = zip(*sorted(enumerate(list_of_occurency), key=lambda x: x[1], reverse=True))
    return indexs[:3], frequency[:3]

In [19]:
print([2871, 2890, 3954, 4235, 4750])
termo = 'lula'
ocurrecy = get_co_ocurrence(termo)
expansao = [word for key in ocurrecy[0] for word in vocab.keys() if vocab[word] == key]
expansao.append(termo)
buscar_por_tf(expansao)

[2871, 2890, 3954, 4235, 4750]


[2871, 2890, 3954, 4235, 4750]

# Example

In [20]:
w1 = 'poucos'
w2 = 'recursos'
consult_frequency(w1, w2)

3