In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import collections
import bisect
from nltk.tokenize import RegexpTokenizer

nltk.download('stopwords')
result = pd.read_csv('https://raw.githubusercontent.com/GersonSales/rec-info/master/lab_07/results.csv')
                                           
json = pd.read_json('https://raw.githubusercontent.com/GersonSales/rec-info/master/lab_07/results_b.json')
feedback = {json['query'][i]:json['docs'][i] for i in range(10)}


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
tknz = RegexpTokenizer(r'([A-Za-zÁáÉéÍíÓóÚúÃãÕõÇçÂâÊê]{3,27})')
stopwords = nltk.corpus.stopwords.words('portuguese') 
indexes = {}
M = result.text.count()

for i in range(len(result)):
  text = result.text[i]
  words = [word for word in tknz.tokenize(text.lower())
           if not bool(re.search(r'\d', word))
           and word not in stopwords and len(word) >= 3]  
  for t in words:
    if t not in indexes.keys():
      indexes[t] = []
    indexes[t].append(i)
    
for elem in indexes.items():
  d = dict(collections.Counter(elem[1]))
  indexes[elem[0]] = list(d.items())
  
for word in indexes:
  k = len(indexes[word])
  IDF = round(np.log((M+1)/k),2)
  indexes[word].append(IDF)

### 1_Escolha um documento dentre aqueles da base do aluno Bernardi e crie uma consulta que você acha que tem boas chances de recuperar este documento. Em seguida, avalie os resultados de tal consulta usando a métrica de avaliação Reciprocal Rank

### Reforma da Previdência

In [3]:
ndoc = 230

document = result.loc[ndoc]
query = 'câmara'

document.title

'Paulo Guedes defende sua reforma da Previdência em audiência na Câmara'

In [0]:
row = result.loc[result.url == document.url]


In [0]:
def binary_vsm(query, document):
  score = 0
  query_tokens = query.split()
  doc_tokens = document.split()
  
  for token in query_tokens:
    score += (token in doc_tokens)
    
  return score

def tf_vsm(query, document):
  score = 0
  doc_tokens = document.split()
  query_tokens = query.split()
  
  for word in query_tokens:
    score += doc_tokens.count(word)
  
  return score


def bm25_vsm(query, document, k):
  score = 0
  doc_tokens = document.split()
  query_tokens = query.split()
  
  words = [word for word in query_tokens if word in doc_tokens]
    
  for word in words:
    cwd = doc_tokens.count(word)
    dfw = 0
    if word in indexes:
      dfw = len(indexes[word][:-1])
    score += (((k+1) * cwd) / (cwd + k)) * np.log10(((M+1) / dfw)) if dfw != 0 else 0
  
  return round(score,2)

def tfidf_vsm(query, document):
  score = 0
  doc_tokens = document.split()
  query_tokens = query.split()
  
  for word in query_tokens:
    cwd = doc_tokens.count(word)
    if word in indexes:
      score += cwd * indexes[word][-1]
  
  return round(score,2)

In [0]:

def create_topk_models(query,k):
  db = []
  dtf = []
  dtfidf = []
  dbm25 = []
  for i in range(len(result)):
    doc = result.text[i].lower()
    bisect.insort(db, (binary_vsm(query, doc), i))
    bisect.insort(dtf, (tf_vsm(query,doc), i))
    bisect.insort(dtfidf, (tfidf_vsm(query,doc), i))
    bisect.insort(dbm25, (bm25_vsm(query,doc,20), i))
  
  db.reverse()
  dtf.reverse()
  dtfidf.reverse()
  dbm25.reverse()
  
  return db[:k], dtf[:k], dtfidf[:k], dbm25[:k]

In [0]:
top_binary, top_tf, top_tfidf, top_bm25 = create_topk_models(query,10)
idoc = [doc for score,doc in top_binary]


In [8]:
query_df = pd.DataFrame()

query_df['Binary'] = top_binary
query_df['TF'] = top_tf
query_df['TF-IDF'] = top_tfidf
query_df['BM25'] = top_bm25

query_df.index+=1
query_df

Unnamed: 0,Binary,TF,TF-IDF,BM25
1,"(1, 241)","(12, 206)","(23.64, 206)","(6.72, 206)"
2,"(1, 239)","(7, 208)","(13.79, 208)","(4.65, 208)"
3,"(1, 234)","(4, 230)","(7.88, 230)","(2.99, 230)"
4,"(1, 230)","(4, 204)","(7.88, 204)","(2.99, 204)"
5,"(1, 228)","(4, 167)","(7.88, 167)","(2.99, 167)"
6,"(1, 226)","(4, 80)","(7.88, 80)","(2.99, 80)"
7,"(1, 224)","(3, 234)","(5.91, 234)","(2.34, 234)"
8,"(1, 221)","(3, 224)","(5.91, 224)","(2.34, 224)"
9,"(1, 219)","(3, 214)","(5.91, 214)","(2.34, 214)"
10,"(1, 217)","(3, 81)","(5.91, 81)","(2.34, 81)"


### Reciprocal Rank

In [9]:
def reciprocal_rank(tuples, docId):
  n = 1.0;
  for r,doc in tuples:
    if doc == docId:
      return  [round(1 / n, 2)]
    else:
      n += 1

rank_df = pd.DataFrame()
rank_df['Binary'] = reciprocal_rank(query_df['Binary'], ndoc)
rank_df['TF'] = reciprocal_rank(query_df['TF'], ndoc)
rank_df['TF-IDF'] = reciprocal_rank(query_df['TF-IDF'], ndoc)
rank_df['BM25'] = reciprocal_rank(query_df['BM25'], ndoc)
rank_df.index+=1
rank_df

Unnamed: 0,Binary,TF,TF-IDF,BM25
1,0.25,0.33,0.33,0.33


### 2_ A partir do gabarito fornecido em OBS1, calcule o MAP para cada algoritmo abaixo e aponte qual obteve o melhor resultado. Para os cálculos do MAP, considere que um documento é relevante para uma dada consulta se este documento estiver entre os documentos do gabarito para essa consulta, senão ele deve ser considerado irrelevante. 

In [0]:
def doc_indexes(model):
  return [doc for score,doc in model]

def intersection(a,b):
  return [elem for elem in a if elem in b]

def calc_AP(query):
  revelant_docs = []

  for doc_info in feedback[query]:
    row = result.loc[result.url == doc_info['URL']]
    revelant_docs.append(row.index[0])
  
  binary, tf, tfidf, bm25 = create_topk_models(query, 10)
  binary = doc_indexes(binary)
  tf = doc_indexes(tf)
  tfidf = doc_indexes(tfidf)
  bm25 = doc_indexes(bm25)
  
  ap_binary = len(intersection(binary, revelant_docs)) / len(binary)
  ap_tf = len(intersection(tf, revelant_docs)) / len(tf)
  ap_tfidf = len(intersection(tfidf, revelant_docs)) / len(tfidf)
  ap_bm25 = len(intersection(bm25, revelant_docs)) / len(bm25)
  
  return ap_binary, ap_tf, ap_tfidf, ap_bm25
  
def calc_MAP(queries):
  sum_binary = 0
  sum_tf = 0
  sum_tfidf = 0
  sum_bm25 = 0
  
  for query in queries:
    ap_binary, ap_tf, ap_tfidf, ap_bm25 = calc_AP(query)
    sum_binary += ap_binary
    sum_tf += ap_tf
    sum_tfidf += ap_tfidf
    sum_bm25 += ap_bm25
  
  map_binary = round(sum_binary / len(queries),2)
  map_tf = round(sum_tf / len(queries),2)
  map_tfidf = round(sum_tfidf / len(queries),2)
  map_bm25 = round(sum_bm25 / len(queries),2)
  
  return map_binary, map_tf, map_tfidf, map_bm25

In [0]:
map_binary, map_tf, map_tfidf, map_bm25 = calc_MAP(feedback.keys())


In [12]:
map_binary


0.08

In [13]:
map_tf


0.05

In [14]:
map_tfidf

0.13

In [15]:
map_bm25

0.14

### 3_Repita Q2 usando a avaliação multi-nível DCG. Utilize o campo "level" do gabarito para o cálculo do DCG e do idealDCG. 