In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import math
import ast
from unicodedata import normalize
from collections import Counter

In [2]:
# Load the reference ("gabarito") table. Later cells read its `str_busca`,
# `google`, `busca_binaria`, `tf`, `tfidf` and `bm25` columns.
gabarito = pd.read_csv('gabarito/gabarito.csv')

In [3]:
def convert_str_in_lst(lista):
    """Turn the textual representation of a Python list into a real list.

    Parameters
    ----------
    lista : str or list or tuple
        A Python literal such as ``"[1, 2, 3]"``. A value that is already a
        list or tuple is accepted and returned unchanged.

    Returns
    -------
    object
        The value produced by ``ast.literal_eval`` for string input, or the
        input itself when it is already a list/tuple.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the string is not a valid
        Python literal.
    """
    # Pass already-parsed values through so that re-running the conversion
    # cell on converted columns does not fail inside literal_eval.
    if isinstance(lista, (list, tuple)):
        return lista
    return ast.literal_eval(lista)

In [4]:
# Every ranking column is stored in the CSV as the text of a Python list;
# parse each one into a real list.
for _coluna in ('google', 'busca_binaria', 'tf', 'tfidf', 'bm25'):
    gabarito[_coluna] = gabarito[_coluna].apply(convert_str_in_lst)


In [5]:
# Load the news articles. Missing cells are turned into empty strings so the
# text columns can be concatenated later without propagating NaN.
# `fillna('')` states this directly; the previous `replace(np.nan, '', regex=True)`
# carried a `regex=True` flag that has no effect on a NaN pattern.
dados = pd.read_csv('data/estadao_noticias_eleicao.csv').fillna('')

# Join do conteúdo
Juntando os títulos e subtítulos das notícias com seus respectivos conteúdos,
para posteriormente facilitar a tokenização

In [6]:
def limpar_texto(texto):
    """Strip accents from ``texto`` and blank out every non-alphanumeric character.

    The text is decomposed with NFKD and re-encoded as ASCII (dropping what
    cannot be encoded, e.g. combining accents); afterwards each character
    that is not an ASCII letter, digit or space is replaced by one space.
    Letter case is left untouched.
    """
    sem_acentos = normalize('NFKD', texto)
    apenas_ascii = sem_acentos.encode('ASCII', 'ignore').decode('ASCII')
    return re.sub('[^a-zA-Z0-9 ]', ' ', apenas_ascii)

In [7]:
# One text per news item: title, subtitle and body joined by single spaces,
# then cleaned with limpar_texto and lowercased. Float values are mapped to
# an empty string instead of being cleaned.
materias = (
    dados['titulo'] + " " + dados['subTitulo'] + " " + dados['conteudo']
).apply(
    lambda texto: limpar_texto(texto).lower() if not isinstance(texto, float) else ""
)
# News identifiers, row-aligned with `materias`.
ids = dados['idNoticia']
    

# Tokenizando conteúdo
Criando tokens com cada palavra do texto para que posteriormente possam ser indexadas e associadas aos respectivos ids das notícias

In [8]:
# Split every cleaned news text into a list of word tokens with NLTK.
tokens = materias.apply(nltk.word_tokenize)
# One Counter per news item (token -> number of occurrences); read later as
# the term frequency when the inverted index is built.
term_frequence = tokens.apply(Counter)

# Indexando tokens
Criando índices invertidos com os tokens para poder aplicar os métodos de busca

In [9]:
# Inverted index: term -> {news id -> occurrences of the term in that news item}.
index = {}

# Walk ids and per-document counters positionally (no reliance on the Series
# labels being 0..n-1) and visit each distinct term once per document rather
# than once per occurrence. Tokens need no extra lowercasing: the text was
# lowercased when `materias` was built.
for id_noticia, contagem in zip(ids, term_frequence):
    for palavra, frequencia in contagem.items():
        # setdefault keeps the first frequency recorded for a (term, id) pair,
        # so a repeated news id does not overwrite an earlier entry.
        index.setdefault(palavra, {}).setdefault(id_noticia, frequencia)

In [10]:
def gera_vetor_binario(frase):
    """Build a binary term-presence vector for each document matching the query.

    Parameters
    ----------
    frase : str
        Query text; terms are obtained by splitting on single spaces.

    Returns
    -------
    dict
        Maps every document id listed in the global ``index`` under at least
        one query term to an integer numpy array with one position per query
        term: 1 when the document is listed under that term, 0 otherwise.
    """
    termos = frase.split(" ")
    doc_binario = {}

    for posicao, termo in enumerate(termos):
        # Terms absent from the index (including the empty strings produced by
        # repeated spaces) simply match no document instead of raising
        # KeyError, consistent with gera_query_vetor.
        for doc_id in index.get(termo, {}):
            if doc_id not in doc_binario:
                doc_binario[doc_id] = np.zeros(len(termos), dtype=int)
            # Update the stored vector in place rather than rebuilding it.
            doc_binario[doc_id][posicao] = 1

    return doc_binario


def gera_tf_vetor(frase):
    """Build a term-frequency vector for each document matching the query.

    Parameters
    ----------
    frase : str
        Query text; terms are obtained by splitting on single spaces.

    Returns
    -------
    dict
        Maps every document id listed in the global ``index`` under at least
        one query term to an integer numpy array with one position per query
        term, holding the frequency stored in ``index`` for that term and
        document (0 when the document is not listed under the term).
    """
    termos = frase.split(" ")
    doc_tf = {}

    for posicao, termo in enumerate(termos):
        # Terms absent from the index (including the empty strings produced by
        # repeated spaces) contribute nothing instead of raising KeyError,
        # consistent with gera_query_vetor.
        for doc_id, tf in index.get(termo, {}).items():
            if doc_id not in doc_tf:
                doc_tf[doc_id] = np.zeros(len(termos), dtype=int)
            # Update the stored vector in place rather than rebuilding it.
            doc_tf[doc_id][posicao] = tf

    return doc_tf

def gera_idf_vetor(frase):
    """Compute the inverse document frequency of every term of the query.

    Parameters
    ----------
    frase : str
        Query text; terms are obtained by splitting on single spaces.

    Returns
    -------
    numpy.ndarray
        One float per query term: ``log((N + 1) / df)``, where ``N`` is
        ``len(materias)`` and ``df`` is the number of documents listed under
        the term in the global ``index``. Terms not present in the index get
        0.0.
    """
    total_docs = len(materias) + 1

    def _idf(termo):
        doc_freq = len(index.get(termo, ()))
        # A term found in no document cannot rank anything; scoring it as 0.0
        # avoids both the KeyError and a division by zero.
        return math.log(total_docs / doc_freq) if doc_freq else 0.0

    return np.array([_idf(termo) for termo in frase.split(" ")])

def gera_query_vetor(frase):
    """Return the binary vector of the query itself.

    The query is split on single spaces; each position is 1 when the global
    ``index`` holds a non-empty entry for that term and 0 otherwise.
    """
    presencas = []
    for termo in frase.split(" "):
        presencas.append(1 if index.get(termo) else 0)
    return np.array(presencas)

def gera_bm25_vetor(frase, k=5):
    """Apply a saturating term-frequency weight to each document vector.

    Parameters
    ----------
    frase : str
        Query text, forwarded to ``gera_tf_vetor``.
    k : int or float, optional
        Saturation constant of the weighting; defaults to 5, the value that
        used to be hard-coded.

    Returns
    -------
    dict
        Maps each document id returned by ``gera_tf_vetor`` to a float numpy
        array where every term frequency ``tf`` is replaced by
        ``(k + 1) * tf / (tf + k)``.
    """
    docs_tf = gera_tf_vetor(frase)
    bm25_vetor = {}
    for doc_id, tf_vetor in docs_tf.items():
        # Vectorised over the whole array instead of one Python-level
        # operation per element.
        tf_array = np.asarray(tf_vetor)
        bm25_vetor[doc_id] = ((k + 1) * tf_array) / (tf_array + k)
    return bm25_vetor

In [11]:
def busca_binaria(frase):
    """Rank documents by how many query terms they contain (binary model).

    Each candidate is scored with the dot product between its binary vector
    and the query vector. Returns the ids of the five best-scored documents,
    highest first; ties keep the order in which the documents were collected.
    """
    vetores = gera_vetor_binario(frase)
    consulta = gera_query_vetor(frase)

    pontuacoes = {doc_id: np.dot(vetor, consulta) for doc_id, vetor in vetores.items()}
    ordenados = sorted(pontuacoes, key=pontuacoes.get, reverse=True)
    return ordenados[:5]

In [12]:
def buscar_por_tf(frase):
    """Rank documents by the term frequency of the query terms.

    Each candidate is scored with the dot product between its term-frequency
    vector and the binary query vector. Returns the ids of the five
    best-scored documents, highest first; ties keep collection order.
    """
    vetores_tf = gera_tf_vetor(frase)
    consulta = gera_query_vetor(frase)

    def pontuar(doc_id):
        return np.dot(vetores_tf[doc_id], consulta)

    return sorted(vetores_tf, key=pontuar, reverse=True)[:5]

In [13]:
def buscar_por_tf_idf(frase):
    """Rank documents by TF-IDF.

    Each candidate is scored with the dot product between its term-frequency
    vector and the IDF vector of the query. Returns the ids of the five
    best-scored documents, highest first; ties keep collection order.
    """
    vetores_tf = gera_tf_vetor(frase)
    pesos_idf = gera_idf_vetor(frase)

    candidatos = list(vetores_tf.items())
    candidatos.sort(key=lambda par: np.dot(par[1], pesos_idf), reverse=True)
    return [doc_id for doc_id, _ in candidatos[:5]]

In [14]:
def buscar_por_bm25(frase):
    """Rank documents with the BM25-style weighting.

    Each candidate is scored with the dot product between its vector from
    ``gera_bm25_vetor`` and the IDF vector of the query. Returns the ids of
    the five best-scored documents, highest first; ties keep collection order.
    """
    vetores_bm25 = gera_bm25_vetor(frase)
    pesos_idf = gera_idf_vetor(frase)

    pontuacoes = {}
    for doc_id, vetor in vetores_bm25.items():
        pontuacoes[doc_id] = np.dot(vetor, pesos_idf)

    melhores = sorted(pontuacoes.keys(), key=lambda doc_id: pontuacoes[doc_id], reverse=True)
    return melhores[:5]

In [15]:
def apk(actual, predicted, k=10):
    """
    Average precision at k between a set of relevant items and a ranking.

    Parameters
    ----------
    actual : list
        The relevant items (their order is irrelevant).
    predicted : list
        The ranked predictions (order matters); only the first ``k`` are used.
    k : int, optional
        Cut-off applied to ``predicted``.

    Returns
    -------
    score : double
        Sum of the precision values measured at each rank holding a relevant
        item not seen earlier in the ranking, divided by
        ``min(len(actual), k)``; 0.0 when ``actual`` is empty.
    """
    top = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(top, start=1):
        # A repeated prediction only counts the first time it appears.
        is_new_hit = item in actual and item not in top[:rank - 1]
        if is_new_hit:
            hits += 1.0
            precision_sum += hits / float(rank)

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k over several queries.

    Parameters
    ----------
    actual : list
        One list of relevant items per query (order inside each list is
        irrelevant).
    predicted : list
        One ranked list of predictions per query (order inside each list
        matters), paired position by position with ``actual``.
    k : int, optional
        Cut-off forwarded to ``apk``.

    Returns
    -------
    score : double
        The mean of ``apk`` over the paired lists.
    """
    scores = []
    for relevant, ranked in zip(actual, predicted):
        scores.append(apk(relevant, ranked, k))
    return np.mean(scores)

In [16]:
# Normalise each reference query the same way the indexed text was prepared:
# clean it with limpar_texto AND lowercase it, since the index was built from
# lowercased text. Cleaning once here also avoids repeating it for every model.
consultas = [limpar_texto(frase).lower() for frase in gabarito.str_busca]

# Top-5 document ids returned by each model for every query.
busca_bin = [busca_binaria(consulta) for consulta in consultas]
busca_tf = [buscar_por_tf(consulta) for consulta in consultas]
busca_tfidf = [buscar_por_tf_idf(consulta) for consulta in consultas]
busca_bm25 = [buscar_por_bm25(consulta) for consulta in consultas]

# MAP@5 of each model against the reference ranking stored for that same model.
print(mapk(gabarito.busca_binaria, busca_bin, k=5))
print(mapk(gabarito.tf, busca_tf, k=5))
print(mapk(gabarito.tfidf, busca_tfidf, k=5))
print(mapk(gabarito.bm25, busca_bm25, k=5))

# MAP@5 of each model against the `google` reference column.
print(mapk(gabarito.google, busca_bin, k=5))
print(mapk(gabarito.google, busca_tf, k=5))
print(mapk(gabarito.google, busca_tfidf, k=5))
print(mapk(gabarito.google, busca_bm25, k=5))

0.24
0.6519999999999999
0.616
0.7539999999999999
0.04
0.048
0.057999999999999996
0.128
