## IMPLEMENTAÇÃO DE UM SISTEMA DE RECUPERAÇÃO EM MEMÓRIA SEGUNDO O MODELO VETORIAL

### Nome: João Pedro de Faria Sales
### DRE: 121056457



In [1]:
# imports

import os
import configparser
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
import numpy as np
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# GERADOR DE LISTA INVERTIDA

In [2]:
# lendo as configs

config = configparser.ConfigParser()
config.read_file(open(r'CLI.CFG'))
LEIA_LI = config.get('CONFIGS', 'LEIA')
ESCREVA_LI = config.get('CONFIGS', 'ESCREVA')

LEIA_list = LEIA_LI.split("; ")

LEIA_list, ESCREVA_LI


(['../data/cf74.xml',
  '../data/cf75.xml',
  '../data/cf76.xml',
  '../data/cf77.xml',
  '../data/cf78.xml',
  '../data/cf79.xml'],
 '../RESULT/ListaInvertida.csv')

In [3]:
#lendo os XMLs

documents = pd.DataFrame({
    "RECORDNUM": [],
    "ABSTRACT": []
})

records = []
abstracts = []

for path in LEIA_list:
    with open(path, 'r') as f:
        data = f.read()
    Bs_data = BeautifulSoup(data, "xml")
    for doc in Bs_data.find_all("RECORDNUM"):
        records.append( int(doc.text.replace(' ', '') ))
    for doc in Bs_data.find_all(["ABSTRACT","EXTRACT"]):
        abstracts.append( doc.text )
    
documents.RECORDNUM = records
documents.ABSTRACT = abstracts

documents
#Bs_data = BeautifulSoup(data, "xml")

Unnamed: 0,RECORDNUM,ABSTRACT
0,1,The significance of Pseudomonas aeruginosa inf...
1,2,Salivary amylase levels were determined in nor...
2,3,This article reports on the possibility of usi...
3,4,Instrumental neutron activation analysis (INAA...
4,5,The capacity of duodenal juice to hydrolyse ca...
...,...,...
1234,1235,Apart from the sound physiologic basis for the...
1235,1236,"Recently, we found the cystic fibrosis mucocil..."
1236,1237,Tests claimed to detect small airway disease w...
1237,1238,We theorize that cystic fibrosis may be a dise...


In [4]:
#criar um dicionario com as palavras e uma lista dos docs que ela aparece

def tokenize_and_remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    tokens = [stemmer.stem(token).upper() for token in tokens if (token.lower() not in stop_words and len(token) > 2 and not token.isnumeric())] 
    return tokens

def create_inverted_index(dataframe):
    inverted_index_dict = {}
    for index, row in dataframe.iterrows():
        tokens = tokenize_and_remove_stopwords(row['ABSTRACT'])
        for token in tokens:
            if token not in inverted_index_dict:
                inverted_index_dict[token] = [row['RECORDNUM']]
            else:
                inverted_index_dict[token].append(row['RECORDNUM'])
    return inverted_index_dict

inverted_index_dict = create_inverted_index(documents)                 
for word, indices in inverted_index_dict.items():
    print(word + ':', indices)

SIGNIFIC: [1, 6, 19, 24, 30, 47, 52, 53, 54, 62, 62, 65, 68, 74, 78, 80, 84, 88, 89, 102, 102, 106, 106, 107, 115, 121, 137, 137, 141, 145, 145, 145, 147, 147, 147, 154, 156, 157, 161, 167, 179, 181, 181, 181, 181, 185, 185, 185, 192, 195, 204, 205, 217, 221, 223, 223, 230, 231, 232, 238, 242, 243, 244, 246, 253, 258, 259, 272, 278, 278, 283, 284, 284, 286, 306, 306, 313, 313, 313, 316, 317, 319, 323, 323, 323, 345, 346, 348, 349, 355, 355, 356, 356, 356, 356, 357, 358, 361, 366, 366, 373, 375, 375, 400, 402, 411, 412, 414, 421, 424, 426, 440, 440, 447, 455, 462, 463, 471, 485, 519, 523, 526, 526, 528, 528, 530, 536, 536, 545, 546, 555, 567, 577, 587, 588, 593, 597, 604, 604, 613, 625, 631, 631, 635, 639, 643, 645, 652, 661, 662, 663, 668, 676, 678, 679, 686, 689, 689, 691, 691, 700, 702, 707, 716, 735, 737, 737, 740, 742, 743, 746, 750, 751, 753, 754, 757, 758, 779, 780, 790, 795, 803, 807, 809, 811, 812, 812, 813, 820, 831, 853, 858, 859, 865, 874, 876, 878, 890, 895, 896, 904, 923, 

In [5]:
#convertendo para csv

words_list = []
indices_list = []
for word, indices in inverted_index_dict.items():
    words_list.append(word)
    indices_list.append(indices)

df_inverted_index = pd.DataFrame({
    "Word": words_list,
    "Indices": indices_list
})



df_inverted_index.to_csv(ESCREVA_LI, sep=';', index=False)

print (f"csv escrito em {ESCREVA_LI}")

csv escrito em ../RESULT/ListaInvertida.csv


# INDEXADOR

In [6]:
#lendo as configs

config = configparser.ConfigParser()
config.read_file(open(r'INDEX.CFG'))
LEIA_I = config.get('CONFIGS', 'LEIA')
ESCREVA_I = config.get('CONFIGS', 'ESCREVA')

LEIA_I, ESCREVA_I

('../RESULT/ListaInvertida.csv', '../RESULT/Modelo.csv')

In [7]:
#lendo a matriz invertida novamente

Matriz_Invertida = pd.read_csv(LEIA_I, sep = ";")
Matriz_Invertida.head()

Unnamed: 0,Word,Indices
0,SIGNIFIC,"[1, 6, 19, 24, 30, 47, 52, 53, 54, 62, 62, 65,..."
1,PSEUDOMONA,"[1, 1, 1, 7, 8, 18, 18, 61, 61, 62, 62, 62, 62..."
2,AERUGINOSA,"[1, 1, 1, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, ..."
3,INFECT,"[1, 1, 1, 6, 6, 6, 16, 18, 48, 48, 57, 58, 58,..."
4,RESPIRATORI,"[1, 1, 1, 6, 6, 7, 7, 8, 11, 11, 11, 15, 17, 2..."


In [8]:
#matriz documento x termo

Copy_Matrix = Matriz_Invertida.copy()

M_Doc_Term = Copy_Matrix.drop('Indices', axis = 1)


for index, list in enumerate(Matriz_Invertida.Indices):
    for docId in eval(list):
        if docId not in M_Doc_Term.columns:
            # print(docId)
            M_Doc_Term[docId] = 0
        M_Doc_Term.loc[index, docId] += 1

M_Doc_Term.set_index('Word', inplace=True)

M_Doc_Term.head()


Unnamed: 0_level_0,1,6,19,24,30,47,52,53,54,62,...,909,329,537,580,799,558,908,117,1011,939
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SIGNIFIC,1,1,1,1,1,1,1,1,1,2,...,0,0,0,0,0,0,0,0,0,0
PSEUDOMONA,3,0,0,0,0,0,0,0,0,5,...,0,0,0,0,0,0,0,0,0,0
AERUGINOSA,3,6,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
INFECT,3,3,0,0,0,0,0,0,0,9,...,0,0,0,0,0,0,0,0,0,0
RESPIRATORI,3,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
#Matriz com tf normalizado

def normalized_tf_matrix(dataframe):
    df = dataframe.copy()
    for coluna in df: 
        max_t = max(df[coluna])
        df[coluna] = df[coluna] / max_t
    
    return df
        
norm = normalized_tf_matrix(M_Doc_Term)

norm


Unnamed: 0_level_0,1,6,19,24,30,47,52,53,54,62,...,909,329,537,580,799,558,908,117,1011,939
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SIGNIFIC,0.166667,0.111111,0.090909,0.111111,0.2,0.166667,0.333333,0.333333,0.333333,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PSEUDOMONA,0.500000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.416667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AERUGINOSA,0.500000,0.666667,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.083333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
INFECT,0.500000,0.333333,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.750000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RESPIRATORI,0.500000,0.222222,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
THROMBOSI,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MONOSPECIF,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CONSENT,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PATCHI,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
#Matriz com os pesos calculados

def count_nonzero(row):
    return (row > 0).sum()

df_i = norm.apply(count_nonzero, axis=1)

idf_i = np.log(len(norm.columns) / df_i)

W_i = pd.DataFrame()
for col in norm:
    W_i[col] = norm[col] * idf_i

W_i.head()

Unnamed: 0_level_0,1,6,19,24,30,47,52,53,54,62,...,909,329,537,580,799,558,908,117,1011,939
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SIGNIFIC,0.292681,0.19512,0.159644,0.19512,0.351217,0.292681,0.585361,0.585361,0.585361,0.292681,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PSEUDOMONA,1.4158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.179834,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AERUGINOSA,1.408997,1.878663,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.234833,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
INFECT,1.115855,0.743904,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.673783,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RESPIRATORI,1.23405,0.548467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# transformando em csv e salvando

W_i.to_csv(ESCREVA_I, sep=';', index=True)

print(f"Modelo salvo com sucesso em {ESCREVA_I}")

Modelo salvo com sucesso em ../RESULT/Modelo.csv


# PROCESSADOR DE CONSULTAS

In [12]:
## lendo as configs


config = configparser.ConfigParser()
config.read_file(open(r'PC.CFG'))
LEIA_PC = config.get('CONFIGS', 'LEIA')
CONSULTAS_PC = config.get('CONFIGS', 'CONSULTAS')
ESPERADOS_PC = config.get('CONFIGS', 'ESPERADOS')

LEIA_PC, CONSULTAS_PC, ESPERADOS_PC

('../data/cfquery.xml', '../RESULT/Consultas.csv', '../RESULT/Esperados.csv')

In [13]:
## lendo o XML


with open(LEIA_PC, 'r') as f:
    data = f.read()

Bs_data = BeautifulSoup(data, "xml")
 


In [14]:
## pegando os campos QueryNumber

QueryNumberList = Bs_data.find_all('QueryNumber')

NumbersList = [ number.text for number in QueryNumberList]

NumbersList[0:10]

['00001',
 '00002',
 '00003',
 '00004',
 '00005',
 '00006',
 '00007',
 '00008',
 '00009',
 '00010']

In [15]:
## pegando os campos QueryText e tratando os dados


def tokenize_and_remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    tokens = [stemmer.stem(token).upper() for token in tokens if (token.lower() not in stop_words and len(token) > 2 and not token.isnumeric())] 
    return tokens

QueryTextList = Bs_data.find_all('QueryText')

TextList = [ tokenize_and_remove_stopwords(text.text) for text in QueryTextList]

TextList[0:10]

[['EFFECT', 'CALCIUM', 'PHYSIC', 'PROPERTI', 'MUCU', 'PATIENT'],
 ['ONE',
  'DISTINGUISH',
  'EFFECT',
  'MUCU',
  'HYPERSECRET',
  'INFECT',
  'SUBMUCOS',
  'GLAND',
  'RESPIRATORI',
  'TRACT'],
 ['SALIVARI', 'GLYCOPROTEIN', 'PATIENT', 'DIFFER', 'NORMAL', 'SUBJECT'],
 ['LIPID', 'COMPOSIT', 'RESPIRATORI', 'SECRET'],
 ['MUCU', 'ABNORM'],
 ['EFFECT',
  'WATER',
  'THERAPEUT',
  'AGENT',
  'PHYSIC',
  'PROPERTI',
  'VISCOS',
  'ELAST',
  'SPUTUM',
  'BRONCHIAL',
  'SECRET',
  'PATIENT'],
 ['MUCU',
  'GLYCOPROTEIN',
  'DEGRAD',
  'DIFFER',
  'PATIENT',
  'COMPAR',
  'NORMAL',
  'SUBJECT'],
 ['HISTOCHEM', 'DIFFER', 'DESCRIB', 'NORMAL', 'RESPIRATORI', 'EPITHELIA'],
 ['ASSOCI', 'LIVER', 'DISEAS', 'CIRRHOSI', 'VITAMIN', 'METABOL'],
 ['ROLE', 'VITAMIN', 'THERAPI', 'PATIENT']]

In [16]:

#criando um df das consultas 

data = pd.DataFrame({
   'QueryText': TextList,
    'QueryNumber': NumbersList
})

data.head()


Unnamed: 0,QueryText,QueryNumber
0,"[EFFECT, CALCIUM, PHYSIC, PROPERTI, MUCU, PATI...",1
1,"[ONE, DISTINGUISH, EFFECT, MUCU, HYPERSECRET, ...",2
2,"[SALIVARI, GLYCOPROTEIN, PATIENT, DIFFER, NORM...",3
3,"[LIPID, COMPOSIT, RESPIRATORI, SECRET]",4
4,"[MUCU, ABNORM]",5


In [17]:
#gerando csv

data.to_csv(CONSULTAS_PC, sep = ';', index=False)

In [18]:
#extraindo os numeros da consulta, os scores e os documentos
Records = Bs_data.find_all('Records')

queries_dict = {}

def get_votes (string):
    votos = 0
    for i in string:
        if eval(i) > 0:
            votos += 1
    return votos

for index, record in enumerate(Records):
    documents = [ (int(item.text), get_votes (item['score']) ) for item in record.find_all('Item')]
    queries_dict[index] = documents
    

queries_dict

{0: [(139, 4),
  (151, 4),
  (166, 1),
  (311, 1),
  (370, 2),
  (392, 1),
  (439, 1),
  (440, 2),
  (441, 4),
  (454, 1),
  (461, 4),
  (502, 1),
  (503, 1),
  (505, 1),
  (520, 2),
  (522, 1),
  (526, 3),
  (527, 2),
  (533, 4),
  (593, 1),
  (619, 1),
  (737, 1),
  (742, 1),
  (789, 1),
  (827, 1),
  (835, 1),
  (861, 1),
  (875, 4),
  (891, 1),
  (921, 2),
  (922, 2),
  (1175, 1),
  (1185, 1),
  (1222, 1)],
 1: [(169, 1), (434, 2), (454, 1), (498, 1), (499, 1), (592, 1), (875, 3)],
 2: [(23, 1),
  (40, 1),
  (139, 4),
  (190, 1),
  (221, 1),
  (246, 1),
  (309, 1),
  (311, 2),
  (325, 1),
  (345, 1),
  (347, 1),
  (356, 1),
  (370, 2),
  (374, 1),
  (375, 4),
  (439, 1),
  (440, 3),
  (454, 3),
  (515, 1),
  (520, 1),
  (524, 1),
  (526, 2),
  (527, 3),
  (533, 1),
  (535, 1),
  (560, 1),
  (561, 1),
  (571, 1),
  (584, 1),
  (604, 1),
  (623, 1),
  (633, 4),
  (733, 1),
  (742, 2),
  (854, 1),
  (856, 4),
  (950, 1),
  (967, 1),
  (1144, 4),
  (1161, 1),
  (1172, 1),
  (1175, 1),


In [19]:
#montando o arquivo

QueryNumber= []
DocNumber= []
DocVotes= []

for chave in queries_dict:
    lista = queries_dict[chave]
    QueryNumber.extend([chave] * len(queries_dict[chave]))
    for tuple in queries_dict[chave]:
        DocNumber.append(tuple[0])
        DocVotes.append(tuple[1])


df = pd.DataFrame({
    'QueryNumber': QueryNumber,
    'DocNumber': DocNumber,
    'DocVotes': DocVotes
})

In [20]:
#exportando em csv

df.to_csv(ESPERADOS_PC, sep = ';', index=False)


# BUSCADOR

In [21]:
# lendo as configs

config = configparser.ConfigParser()
config.read_file(open(r'BUSCA.CFG'))
MODELO_BU = config.get('CONFIGS', 'MODELO')
CONSULTAS_BU = config.get('CONFIGS', 'CONSULTAS')
RESULTADO_BU = config.get('CONFIGS', 'RESULTADOS')


MODELO_BU, CONSULTAS_BU,RESULTADO_BU

('../RESULT/Modelo.csv', '../RESULT/consultas.csv', '../RESULT/Resultados.csv')

In [22]:
# lendo os csvs

m_doc = pd.read_csv(MODELO_BU, sep =';')
consultas = pd.read_csv(CONSULTAS_BU, sep =';')

m_doc.set_index('Word', inplace=True)

m_doc.info

<bound method DataFrame.info of                     1         6        19       24        30        47  \
Word                                                                     
SIGNIFIC     0.292681  0.195120  0.159644  0.19512  0.351217  0.292681   
PSEUDOMONA   1.415800  0.000000  0.000000  0.00000  0.000000  0.000000   
AERUGINOSA   1.408997  1.878663  0.000000  0.00000  0.000000  0.000000   
INFECT       1.115855  0.743904  0.000000  0.00000  0.000000  0.000000   
RESPIRATORI  1.234050  0.548467  0.000000  0.00000  0.000000  0.000000   
...               ...       ...       ...      ...       ...       ...   
THROMBOSI    0.000000  0.000000  0.000000  0.00000  0.000000  0.000000   
MONOSPECIF   0.000000  0.000000  0.000000  0.00000  0.000000  0.000000   
CONSENT      0.000000  0.000000  0.000000  0.00000  0.000000  0.000000   
PATCHI       0.000000  0.000000  0.000000  0.00000  0.000000  0.000000   
THEORIZ      0.000000  0.000000  0.000000  0.00000  0.000000  0.000000   

     

In [23]:
m_consult = m_doc.copy()

for indice, consulta in consultas.iterrows():
    series = pd.Series(eval(consulta.iloc[0]))
    contagem_palavras = series.value_counts()
    contagem_palavras.name = indice + 1
    m_consult = pd.merge(m_consult, pd.DataFrame(contagem_palavras), left_index=True, right_index=True, how= 'left')
    m_consult.fillna(0, inplace=True)
   
m_consult = m_consult.iloc[:,m_doc.columns.size:]
m_consult

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,90,91,92,93,94,95,96,97,98,99
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SIGNIFIC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PSEUDOMONA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AERUGINOSA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
INFECT,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RESPIRATORI,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
THROMBOSI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MONOSPECIF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CONSENT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PATCHI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
#agora resta fazer os calculos

def norma_euclid (serie):
    norma = np.linalg.norm(serie)
    return norma

def similaridade (documento, consulta):
    produto_escalar = np.dot(documento, consulta)
    norma_doc = norma_euclid(documento)
    norma_con = norma_euclid(consulta)
 
    if (norma_doc != 0 and norma_doc != 0):
        return  produto_escalar / (norma_doc * norma_con)
    else:
        return 0

m_sim = pd.DataFrame ({
     'documentos': m_doc.columns   
})

for consulta in m_consult:
    
    sim = []
    for documento in m_doc:
        try:
            sim.append(similaridade(m_consult[consulta], m_doc[documento]))
        except:
            print(documento)
            continue
    m_sim[consulta] = sim

m_consult.head()

 

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,90,91,92,93,94,95,96,97,98,99
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SIGNIFIC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PSEUDOMONA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AERUGINOSA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
INFECT,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RESPIRATORI,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
#montando os resultas

resultados = pd.DataFrame({
    'Consulta': [],
    'Resultados': []
})
        

for consulta in m_sim.iloc[:,1:]:
    serie = m_sim[consulta]
    sorted = serie.sort_values(ascending = False)
    indices = sorted.index
    result = []
    for index, dist in enumerate(sorted):
        result += [(index, m_sim.documentos[indices[index]], dist)]
    nova_linha = pd.Series({'Consulta': consulta, 'Resultados': result})
    resultados.loc[consulta -1] = nova_linha
    

resultados

Unnamed: 0,Consulta,Resultados
0,1,"[(0, 437, 0.2739693801379849), (1, 498, 0.2067..."
1,2,"[(0, 761, 0.3971012546677342), (1, 160, 0.2654..."
2,3,"[(0, 856, 0.26246709138444785), (1, 1206, 0.24..."
3,4,"[(0, 715, 0.3351386598206889), (1, 357, 0.2555..."
4,5,"[(0, 498, 0.3935789407876576), (1, 437, 0.2409..."
...,...,...
94,95,"[(0, 455, 0.3061954146912846), (1, 871, 0.1469..."
95,96,"[(0, 1225, 0.3325028889615067), (1, 383, 0.256..."
96,97,"[(0, 1118, 0.2137060280222947), (1, 180, 0.191..."
97,98,"[(0, 268, 0.46158392638824614), (1, 449, 0.385..."


In [26]:
#exportando para csv

resultados.to_csv(RESULTADO_BU, sep = ';', index=False)
