In [160]:
import pandas as pd
import numpy as np
import nltk
import re
import collections
import bisect
from nltk.tokenize import RegexpTokenizer

nltk.download('stopwords')
result = pd.read_csv('https://raw.githubusercontent.com/Leonardomotta/lab06-ri/master/results.csv')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Leonardo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1. 

In [161]:
tknz = RegexpTokenizer(r'([A-Za-zÁáÉéÍíÓóÚúÃãÕõÇçÂâÊê]{3,27})')
stopwords = nltk.corpus.stopwords.words('portuguese') 
i = {}
n = 0

for text in result.text:
  words = [word for word in tknz.tokenize(text.lower())
           if not bool(re.search(r'\d', word))
           and word not in stopwords and len(word) >= 3]  
  n += 1
  for t in words:
    if t not in i.keys():
      i[t] = []
    i[t].append(n)
    
for elem in i.items():
  d = dict(collections.Counter(elem[1]))
  i[elem[0]] = list(d.items())

In [162]:
i_df = pd.DataFrame()
i_df['Word'] = i.keys()
i_df['Documents and Frequencies'] = i.values()

i_df.head(10)

Unnamed: 0,Word,Documents and Frequencies
0,juíza,"[(1, 2), (2, 1)]"
1,federal,"[(1, 2), (2, 2), (3, 1), (6, 1), (7, 3), (15, ..."
2,ivani,"[(1, 1), (2, 1)]"
3,silva,"[(1, 3), (2, 1), (6, 1), (14, 2), (26, 1), (73..."
4,luz,"[(1, 3), (2, 1), (9, 1), (17, 1), (32, 2), (78..."
5,brasília,"[(1, 1), (8, 1), (33, 1), (35, 1), (44, 1), (4..."
6,proibiu,"[(1, 1), (2, 1), (119, 1), (162, 1)]"
7,caráter,"[(1, 1), (15, 1), (36, 1), (60, 1), (89, 1), (..."
8,liminar,"[(1, 1), (2, 3), (119, 1), (217, 1)]"
9,nesta,"[(1, 2), (3, 1), (4, 1), (8, 1), (21, 1), (22,..."


#2

In [163]:
M = result.text.count()
for word in i:
  k = len(i[word])
  IDF = round(np.log((M+1)/k),2)
  i[word].append(IDF)

In [164]:
i_df['IDF'] = [index[-1] for index in i.values()]

i_df.head(10)

Unnamed: 0,Word,Documents and Frequencies,IDF
0,juíza,"[(1, 2), (2, 1), 4.83]",4.83
1,federal,"[(1, 2), (2, 2), (3, 1), (6, 1), (7, 3), (15, ...",1.63
2,ivani,"[(1, 1), (2, 1), 4.83]",4.83
3,silva,"[(1, 3), (2, 1), (6, 1), (14, 2), (26, 1), (73...",3.04
4,luz,"[(1, 3), (2, 1), (9, 1), (17, 1), (32, 2), (78...",2.53
5,brasília,"[(1, 1), (8, 1), (33, 1), (35, 1), (44, 1), (4...",2.19
6,proibiu,"[(1, 1), (2, 1), (119, 1), (162, 1), 4.14]",4.14
7,caráter,"[(1, 1), (15, 1), (36, 1), (60, 1), (89, 1), (...",3.22
8,liminar,"[(1, 1), (2, 3), (119, 1), (217, 1), 4.14]",4.14
9,nesta,"[(1, 2), (3, 1), (4, 1), (8, 1), (21, 1), (22,...",0.91


In [165]:
#3.1

def binary_vsm(query, document):
  s = 0
  query_tokens = query.split()
  doc_tokens = document.split()
  
  for token in query_tokens:
    s += (token in doc_tokens)
    
  return s

In [166]:
#3.2

def tf_vsm(query, document):
  s = 0
  doc_tokens = document.split()
  query_tokens = query.split()
  
  for word in query_tokens:
    s += doc_tokens.count(word)
  
  return s

In [167]:
#3.3
def tfidf_vsm(query, document):
  s = 0
  doc_tokens = document.split()
  query_tokens = query.split()
  
  for word in query_tokens:
    cwd = doc_tokens.count(word)
    if word in i:
      s += cwd * i[word][-1]
  
  return round(s,2)

In [168]:
#3.4
def bm25_vsm(query, document, k):
  s = 0
  doc_tokens = document.split()
  query_tokens = query.split()
  
  words = [word for word in query_tokens if word in doc_tokens]
    
  for word in words:
    cwd = doc_tokens.count(word)
    dfw = 0
    if word in i:
      dfw = len(i[word][:-1])
    s += (((k+1) * cwd) / (cwd + k)) * np.log10(((M+1) / dfw)) if dfw != 0 else 0
  
  return round(s,2)

In [169]:
queries = ['economia', 'bolsonaro', 'ministro da saude']

In [170]:
def create_top5_models(query):
  n = 0
  db = []
  dtf = []
  dtfidf = []
  dbm25 = []
  for doc in result.text:
    doc = doc.lower()
    n += 1
    bisect.insort(db, (binary_vsm(query, doc), n))
    bisect.insort(dtf, (tf_vsm(query,doc), n))
    bisect.insort(dtfidf, (tfidf_vsm(query,doc), n))
    bisect.insort(dbm25, (bm25_vsm(query,doc,20), n))
  
  db.reverse()
  dtf.reverse()
  dtfidf.reverse()
  dbm25.reverse()
  
  return db[:5], dtf[:5], dtfidf[:5], dbm25[:5]

In [171]:
top5_binary = ['','','']
top5_tf = ['','','']
top5_tfidf = ['','','']
top5_bm25 = ['','','']

top5_binary[0], top5_tf[0], top5_tfidf[0], top5_bm25[0] = create_top5_models(queries[0])
top5_binary[1], top5_tf[1], top5_tfidf[1], top5_bm25[1] = create_top5_models(queries[1])
top5_binary[2], top5_tf[2], top5_tfidf[2], top5_bm25[2] = create_top5_models(queries[2])

In [172]:
dfq = pd.DataFrame()

dfq['Query'] = queries
dfq['Binary'] = top5_binary
dfq['TF'] = top5_tf
dfq['TF-IDF'] = top5_tfidf
dfq['BM25'] = top5_bm25

dfq.index+=1
dfq

Unnamed: 0,Query,Binary,TF,TF-IDF,BM25
1,economia,"[(1, 248), (1, 240), (1, 236), (1, 235), (1, 2...","[(7, 138), (7, 125), (6, 34), (5, 127), (4, 26)]","[(11.41, 138), (11.41, 125), (9.78, 34), (8.15...","[(3.85, 138), (3.85, 125), (3.43, 34), (2.97, ..."
2,bolsonaro,"[(1, 248), (1, 240), (1, 238), (1, 237), (1, 2...","[(32, 151), (30, 207), (30, 166), (19, 19), (1...","[(42.24, 151), (39.6, 207), (39.6, 166), (25.0...","[(7.39, 151), (7.21, 207), (7.21, 166), (5.85,..."
3,ministro da saude,"[(2, 247), (2, 246), (2, 240), (2, 236), (2, 2...","[(52, 151), (47, 115), (43, 166), (41, 138), (...","[(15.29, 222), (9.73, 240), (9.73, 209), (8.34...","[(4.51, 222), (3.3, 240), (3.3, 209), (2.93, 2..."


## 5. Compare os resultados encontrados e responda.

### 5.1. Quais modelos você acha que trouxe os melhores resultados? Por que? Inspecione os documentos retornados para melhor embasar sua resposta.

In [173]:
def get_top1_score(top5):
  return [top[0][0] for top in top5]

def get_top1_doc(top5):
  return [top[0][1] for top in top5]

def get_doc_title(docs):
  return [result.title[doc] for doc in docs]

In [174]:
score_top5_binary = get_top1_score(top5_binary)
score_top5_tf = get_top1_score(top5_tf)
score_top5_tfidf = get_top1_score(top5_tfidf)
score_top5_bm25 = get_top1_score(top5_bm25)

doc_top5_binary = get_top1_doc(top5_binary)
doc_top5_tf = get_top1_doc(top5_tf)
doc_top5_tfidf = get_top1_doc(top5_tfidf)
doc_top5_bm25 = get_top1_doc(top5_bm25)

titles_binary = get_doc_title(doc_top5_binary)
titles_tf = get_doc_title(doc_top5_tf)
titles_tfidf = get_doc_title(doc_top5_tfidf)
titles_bm25 = get_doc_title(doc_top5_bm25)

In [175]:
rb = pd.DataFrame()
rb['Query'] = queries
rb['Document'] = doc_top5_binary
rb['Document Title'] = titles_binary
rb['Binary Score'] = score_top5_binary

rb.index += 1
rb

Unnamed: 0,Query,Document,Document Title,Binary Score
1,economia,248,As três espanholas do Estado Islâmico: “Só que...,1
2,bolsonaro,248,As três espanholas do Estado Islâmico: “Só que...,1
3,ministro da saude,247,Tatiana Roque: “O problema da esquerda não é a...,2


#### TF

In [176]:
tf_res = pd.DataFrame()
tf_res['Query'] = queries
tf_res['Document'] = doc_top5_tf
tf_res['Document Title'] = titles_tf
tf_res['TF Score'] = score_top5_tf

tf_res.index += 1
tf_res

Unnamed: 0,Query,Document,Document Title,TF Score
1,economia,138,Estatal espanhola fica com aeroportos do Norde...,7
2,bolsonaro,151,Socialismo ‘millennial’ nos EUA,32
3,ministro da saude,151,Socialismo ‘millennial’ nos EUA,52


#### TF-IDF

In [177]:
tfidf_res = pd.DataFrame()
tfidf_res['Query'] = queries
tfidf_res['Document'] = doc_top5_tfidf
tfidf_res['Document Title'] = titles_tfidf
tfidf_res['TFIDF Score'] = score_top5_tfidf

tfidf_res.index += 1
tfidf_res

Unnamed: 0,Query,Document,Document Title,TFIDF Score
1,economia,138,Estatal espanhola fica com aeroportos do Norde...,11.41
2,bolsonaro,151,Socialismo ‘millennial’ nos EUA,42.24
3,ministro da saude,222,“Não me arrependo de nada”,15.29


#### BM-25

In [178]:
bm25_res = pd.DataFrame()
bm25_res['Query'] = queries
bm25_res['Document'] = doc_top5_bm25
bm25_res['Document Title'] = titles_bm25
bm25_res['BM25 Score'] = score_top5_bm25

bm25_res.index += 1
bm25_res

Unnamed: 0,Query,Document,Document Title,BM25 Score
1,economia,138,Estatal espanhola fica com aeroportos do Norde...,3.85
2,bolsonaro,151,Socialismo ‘millennial’ nos EUA,7.39
3,ministro da saude,222,“Não me arrependo de nada”,4.51


fazendo um comparativo o bm25 apresenta os melhores resultados
Para a consulta economia:
- 'Estatal espanhola fica com aeroportos do nordeste'

Para a consulta bolsonaro:
-  'Socialismo 'milenial' no EUA'

Para Ministro da saude, temos:
- 'Não me arrependo de nada'



In [179]:
#5.2

def inter_size(a,b):
  return len([elem for elem in a if elem in b])

def jaccard_index(a,b):
  n_a = len(a)
  n_b = len(b)
  n_ab = inter_size(a,b)
  exp = n_a + n_b - n_ab
  jaccard = n_ab / exp if exp != 0 else 0
  return jaccard

In [180]:
ad = []
ad.append(doc_top5_binary)
ad.append(doc_top5_bm25)
ad.append(doc_top5_tf)
ad.append(doc_top5_tfidf)

matriz = [['Measure', 'Binary', 'TF', 'TF-IDF', 'BM25']]
titles = matriz
for i in range(len(ad)):
  line = []
  for j in range(len(ad)):
    jaccard = jaccard_index(ad[i], ad[j])
    line.append(round(jaccard,2))
  matriz.append(line)
for i in range(1, len(matriz)):
  matriz[i].insert(0, matriz[0][i])
  
pd.DataFrame(matriz)

Unnamed: 0,0,1,2,3,4
0,Measure,Binary,TF,TF-IDF,BM25
1,Binary,1,0,0,0
2,TF,0,1,0.5,1
3,TF-IDF,0,1,1,1
4,BM25,0,1,0.5,1


Usando Jaccard nota se uma diferença gritante entre o modelo binario e os demais , o bm25 e o tf por sua vez , estão em pé de igualdade ,enquanto que o tf-idf é o mais distindo dos demais isso ficou bem claro para as consultas economia e bolsonaro os quais eu considero que o modelo mostrou o melhor resultado