## IMPLEMENTAÇÃO DE UM SISTEMA DE RECUPERAÇÃO EM MEMÓRIA SEGUNDO O MODELO VETORIAL

### Nome: João Pedro de Faria Sales
### DRE: 121056457



In [1]:
# imports

import os
import configparser
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
import numpy as np

# GERADOR DE LISTA INVERTIDA

In [2]:
# Read the inverted-list generator settings from CLI.CFG.

config = configparser.ConfigParser()
# Open the file explicitly (a missing CLI.CFG still raises) and make sure the
# handle is closed as soon as it has been parsed.
with open(r'CLI.CFG') as cfg_file:
    config.read_file(cfg_file)
LEIA_LI = config.get('CONFIGS', 'LEIA')
ESCREVA_LI = config.get('CONFIGS', 'ESCREVA')

# LEIA carries several input paths separated by ';'. Stripping each piece
# accepts "a; b" as well as "a;b" or "a ;  b", and empty pieces are dropped.
LEIA_list = [path.strip() for path in LEIA_LI.split(";") if path.strip()]

LEIA_list, ESCREVA_LI


(['../data/cf74.xml',
  '../data/cf75.xml',
  '../data/cf76.xml',
  '../data/cf77.xml',
  '../data/cf78.xml',
  '../data/cf79.xml'],
 '../RESULT/ListaInvertida.csv')

In [3]:
# Read the XML files listed in LEIA_list into a RECORDNUM / ABSTRACT table.
# NOTE(review): `find` is called once per file, so each file contributes a
# single RECORDNUM and a single ABSTRACT/EXTRACT (the table printed below has
# one row per file). Confirm whether every record of each file should be
# indexed instead.

records = []
abstracts = []

for path in LEIA_list:
    with open(path, 'r') as xml_file:
        data = xml_file.read()
    Bs_data = BeautifulSoup(data, "xml")

    # The record number may contain blanks, which are removed before the
    # integer conversion.
    record_tag = Bs_data.find("RECORDNUM")
    records.append(int(record_tag.text.replace(' ', '')))

    # Take the first ABSTRACT or EXTRACT element found in the file.
    text_tag = Bs_data.find(["ABSTRACT", "EXTRACT"])
    abstracts.append(text_tag.text)

documents = pd.DataFrame({
    "RECORDNUM": records,
    "ABSTRACT": abstracts,
})

documents

Unnamed: 0,RECORDNUM,ABSTRACT
0,1,The significance of Pseudomonas aeruginosa inf...
1,168,Cystic Fibrosis is a generalized hereditary di...
2,356,Total amylase activity of serum and mixed sali...
3,583,Twelve homozygote patients and thirty-two hete...
4,782,The aim of the present study was to determine ...
5,981,Sweat from 8 'rusters' and 8 control persons w...


In [4]:
#criar um dicionario com as palavras e uma lista dos docs que ela aparece

def tokenize_and_remove_stopwords(text):
    """Return the upper-cased Porter stems of the relevant words in ``text``.

    The text is split on runs of word characters. A token is discarded when
    it is an English stop word (case-insensitive), has two characters or
    fewer, or is purely numeric; every remaining token is stemmed and
    upper-cased. The original token order is preserved.
    """
    english_stopwords = set(stopwords.words('english'))
    porter = PorterStemmer()
    raw_tokens = RegexpTokenizer(r'\w+').tokenize(text)

    kept = []
    for word in raw_tokens:
        if word.lower() in english_stopwords:
            continue
        if len(word) <= 2 or word.isnumeric():
            continue
        kept.append(porter.stem(word).upper())
    return kept

def create_inverted_index(dataframe):
    """Build a term -> record-number list mapping from ``dataframe``.

    ``dataframe`` must provide the columns ``ABSTRACT`` (text) and
    ``RECORDNUM``. Every occurrence of a term appends the record number, so
    a record shows up once per occurrence of the term in its abstract.
    """
    inverted_index_dict = {}
    for _, row in dataframe.iterrows():
        record_number = row['RECORDNUM']
        for token in tokenize_and_remove_stopwords(row['ABSTRACT']):
            # Create the posting list on first sight, then append to it.
            inverted_index_dict.setdefault(token, []).append(record_number)
    return inverted_index_dict

inverted_index_dict = create_inverted_index(documents)
# Show every term followed by the record numbers it was found in.
for term, postings in inverted_index_dict.items():
    print(f"{term}:", postings)

SIGNIFIC: [1, 356, 356, 356, 356]
PSEUDOMONA: [1, 1, 1]
AERUGINOSA: [1, 1, 1]
INFECT: [1, 1, 1, 782]
RESPIRATORI: [1, 1, 1]
TRACT: [1, 1, 1]
CYSTIC: [1, 168, 168, 583, 782, 981]
FIBROSI: [1, 168, 168, 583, 782, 981]
PATIENT: [1, 1, 1, 1, 168, 168, 583, 583, 782, 782, 782, 981, 981]
STUDI: [1, 356, 782, 782, 981]
MEAN: [1, 356, 356, 356]
IMMUNOELECTROPHORET: [1]
ANALYSI: [1]
SERA: [1, 1]
NUMBER: [1, 1, 1]
PRECIPITIN: [1, 1, 1, 1, 1, 1]
CONCENTR: [1, 1, 1, 583, 981, 981, 981]
SERUM: [1, 1, 1, 356, 356, 356, 356]
PROTEIN: [1, 1, 1, 1]
ADDIT: [1, 782]
CLINIC: [1, 782]
RADIOGRAPH: [1]
STATU: [1]
LUNG: [1]
EVALU: [1]
USE: [1, 782, 782]
SCORE: [1]
SYSTEM: [1]
DEMONSTR: [1]
MAXIMUM: [1]
ONE: [1, 168, 782]
SIGNIFICANTLI: [1, 356]
CHANG: [1, 1, 356, 782, 782, 782]
COMPAR: [1]
MATCH: [1]
CONTROL: [1, 356, 356, 356, 981]
PERSON: [1, 981]
NOTABL: [1, 1]
IGG: [1]
IGA: [1]
ELEV: [1, 356, 981]
ACUT: [1, 1, 782, 782, 782]
PHASE: [1, 1]
LATTER: [1, 583]
SUGGEST: [1, 1]
ACTIV: [1, 356, 356, 356, 356, 356

In [5]:
# Export the inverted index as a ';'-separated CSV (one row per term).

# Keys and values of a dict iterate in the same order, so both lists stay
# aligned row by row.
words_list = [*inverted_index_dict.keys()]
indices_list = [*inverted_index_dict.values()]

inverted_columns = {
    "Word": words_list,
    "Indices": indices_list,
}
df_inverted_index = pd.DataFrame(inverted_columns)

df_inverted_index.to_csv(ESCREVA_LI, sep=';', index=False)

print (f"csv escrito em {ESCREVA_LI}")

csv escrito em ../RESULT/ListaInvertida.csv


# INDEXADOR

In [6]:
# Read the indexer settings from INDEX.CFG.

config = configparser.ConfigParser()
# Open the file explicitly (a missing INDEX.CFG still raises) and make sure
# the handle is closed as soon as it has been parsed.
with open(r'INDEX.CFG') as cfg_file:
    config.read_file(cfg_file)
LEIA_I = config.get('CONFIGS', 'LEIA')
ESCREVA_I = config.get('CONFIGS', 'ESCREVA')

LEIA_I, ESCREVA_I

('../RESULT/ListaInvertida.csv', '../RESULT/Modelo.csv')

In [7]:
# Read the inverted list back from disk.

# Terms are data, not missing-value markers: with pandas' default NA parsing
# a stem such as "NULL" would silently become NaN. Only truly empty cells
# are treated as missing here.
Matriz_Invertida = pd.read_csv(LEIA_I, sep=";", keep_default_na=False, na_values=[''])
Matriz_Invertida.head()

Unnamed: 0,Word,Indices
0,SIGNIFIC,"[1, 356, 356, 356, 356]"
1,PSEUDOMONA,"[1, 1, 1]"
2,AERUGINOSA,"[1, 1, 1]"
3,INFECT,"[1, 1, 1, 782]"
4,RESPIRATORI,"[1, 1, 1]"


In [8]:
# Term x document frequency matrix: one row per term, one column per
# document id, each cell holding how often the term occurs in that document.
import ast

Copy_Matrix = Matriz_Invertida.copy()

M_Doc_Term = Copy_Matrix.drop('Indices', axis = 1)

# The loop variable is called `postings` so the builtin `list` is not rebound
# for the rest of the session.
for index, postings in enumerate(Matriz_Invertida.Indices):
    # The posting list was stored as text ("[1, 356, 356]"). literal_eval only
    # parses Python literals, so file content can never be executed as code.
    for docId in ast.literal_eval(postings):
        if docId not in M_Doc_Term.columns:
            # First time this document is seen: start its column at zero.
            M_Doc_Term[docId] = 0
        M_Doc_Term.loc[index, docId] += 1

M_Doc_Term.set_index('Word', inplace=True)

M_Doc_Term.head()


Unnamed: 0_level_0,1,356,782,168,583,981
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SIGNIFIC,1,4,0,0,0,0
PSEUDOMONA,3,0,0,0,0,0
AERUGINOSA,3,0,0,0,0,0
INFECT,3,0,1,0,0,0
RESPIRATORI,3,0,0,0,0,0


In [9]:
#Matriz com tf normalizado

def normalized_tf_matrix(dataframe):
    """Return a copy of ``dataframe`` with every column divided by its maximum.

    Each column holds the term frequencies of one document, so the result is
    the max-normalised tf (values in [0, 1] for non-negative counts). The
    input is not modified.

    A column whose maximum is 0 is returned as all zeros instead of NaN
    (0 / 0): its divisor is replaced by 1.
    """
    column_max = dataframe.max(axis=0)
    safe_divisor = column_max.where(column_max != 0, 1)
    # Dividing a DataFrame by a Series aligns the Series with the columns.
    return dataframe / safe_divisor
        
# Max-normalised term frequencies of the term x document matrix.
norm = normalized_tf_matrix(dataframe=M_Doc_Term)

norm


Unnamed: 0_level_0,1,356,782,168,583,981
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SIGNIFIC,0.166667,0.666667,0.0,0.0,0.0,0.0
PSEUDOMONA,0.500000,0.000000,0.0,0.0,0.0,0.0
AERUGINOSA,0.500000,0.000000,0.0,0.0,0.0,0.0
INFECT,0.500000,0.000000,0.2,0.0,0.0,0.0
RESPIRATORI,0.500000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
SOLUT,0.000000,0.000000,0.0,0.0,0.0,0.2
ABSOLUT,0.000000,0.000000,0.0,0.0,0.0,0.2
ETHYL,0.000000,0.000000,0.0,0.0,0.0,0.2
ALCOHOL,0.000000,0.000000,0.0,0.0,0.0,0.2


In [10]:
#Matriz com os pesos calculados

def count_nonzero(row):
    """Return how many entries of ``row`` are strictly greater than zero."""
    positive_mask = row > 0
    return positive_mask.sum()

# Document frequency: number of documents (columns) in which each term occurs.
df_i = norm.apply(count_nonzero, axis=1)

# Inverse document frequency: log(N / df), N being the number of documents.
total_docs = norm.shape[1]
idf_i = np.log(total_docs / df_i)

# tf-idf weights: every document column is scaled term by term by the idf.
W_i = norm.mul(idf_i, axis=0)

W_i.head()

Unnamed: 0_level_0,1,356,782,168,583,981
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SIGNIFIC,0.183102,0.732408,0.0,0.0,0.0,0.0
PSEUDOMONA,0.89588,0.0,0.0,0.0,0.0,0.0
AERUGINOSA,0.89588,0.0,0.0,0.0,0.0,0.0
INFECT,0.549306,0.0,0.219722,0.0,0.0,0.0
RESPIRATORI,0.89588,0.0,0.0,0.0,0.0,0.0


In [11]:
# Persist the weight matrix as a ';'-separated CSV, keeping the term index
# as the first column.

W_i.to_csv(path_or_buf=ESCREVA_I, sep=";", index=True)

print(f"Modelo salvo com sucesso em {ESCREVA_I}")

Modelo salvo com sucesso em ../RESULT/Modelo.csv


# PROCESSADOR DE CONSULTAS

In [12]:
## Read the query-processor settings from PC.CFG.


config = configparser.ConfigParser()
# Open the file explicitly (a missing PC.CFG still raises) and make sure the
# handle is closed as soon as it has been parsed.
with open(r'PC.CFG') as cfg_file:
    config.read_file(cfg_file)
LEIA_PC = config.get('CONFIGS', 'LEIA')
CONSULTAS_PC = config.get('CONFIGS', 'CONSULTAS')
ESPERADOS_PC = config.get('CONFIGS', 'ESPERADOS')

LEIA_PC, CONSULTAS_PC, ESPERADOS_PC

('../data/cfquery.xml', '../RESULT/Consultas.csv', '../RESULT/Esperados.csv')

In [13]:
## Load the query XML file and parse it.

# Mode 'r' is open()'s default; the whole file is read before parsing.
with open(LEIA_PC) as query_file:
    data = query_file.read()

Bs_data = BeautifulSoup(data, "xml")
 


In [14]:
## Collect the text of every QueryNumber element, in document order.

QueryNumberList = Bs_data.find_all('QueryNumber')

NumbersList = []
for number_node in QueryNumberList:
    NumbersList.append(number_node.text)

NumbersList[:10]

['00001',
 '00002',
 '00003',
 '00004',
 '00005',
 '00006',
 '00007',
 '00008',
 '00009',
 '00010']

In [15]:
## pegando os campos QueryText e tratando os dados


def tokenize_and_remove_stopwords(text):
    """Return the upper-cased Porter stems of the relevant words in ``text``.

    The text is split on runs of word characters. A token is discarded when
    it is an English stop word (case-insensitive), has two characters or
    fewer, or is purely numeric; every remaining token is stemmed and
    upper-cased. The original token order is preserved.
    """
    english_stopwords = set(stopwords.words('english'))
    porter = PorterStemmer()
    raw_tokens = RegexpTokenizer(r'\w+').tokenize(text)

    kept = []
    for word in raw_tokens:
        if word.lower() in english_stopwords:
            continue
        if len(word) <= 2 or word.isnumeric():
            continue
        kept.append(porter.stem(word).upper())
    return kept

QueryTextList = Bs_data.find_all('QueryText')

# One list of normalised terms per query, in document order.
TextList = []
for query_node in QueryTextList:
    TextList.append(tokenize_and_remove_stopwords(query_node.text))

TextList[:10]

[['EFFECT', 'CALCIUM', 'PHYSIC', 'PROPERTI', 'MUCU', 'PATIENT'],
 ['ONE',
  'DISTINGUISH',
  'EFFECT',
  'MUCU',
  'HYPERSECRET',
  'INFECT',
  'SUBMUCOS',
  'GLAND',
  'RESPIRATORI',
  'TRACT'],
 ['SALIVARI', 'GLYCOPROTEIN', 'PATIENT', 'DIFFER', 'NORMAL', 'SUBJECT'],
 ['LIPID', 'COMPOSIT', 'RESPIRATORI', 'SECRET'],
 ['MUCU', 'ABNORM'],
 ['EFFECT',
  'WATER',
  'THERAPEUT',
  'AGENT',
  'PHYSIC',
  'PROPERTI',
  'VISCOS',
  'ELAST',
  'SPUTUM',
  'BRONCHIAL',
  'SECRET',
  'PATIENT'],
 ['MUCU',
  'GLYCOPROTEIN',
  'DEGRAD',
  'DIFFER',
  'PATIENT',
  'COMPAR',
  'NORMAL',
  'SUBJECT'],
 ['HISTOCHEM', 'DIFFER', 'DESCRIB', 'NORMAL', 'RESPIRATORI', 'EPITHELIA'],
 ['ASSOCI', 'LIVER', 'DISEAS', 'CIRRHOSI', 'VITAMIN', 'METABOL'],
 ['ROLE', 'VITAMIN', 'THERAPI', 'PATIENT']]

In [16]:

# Build the query table: the processed terms next to the query number.

query_columns = {
    'QueryText': TextList,
    'QueryNumber': NumbersList,
}
data = pd.DataFrame(query_columns)

data.head()


Unnamed: 0,QueryText,QueryNumber
0,"[EFFECT, CALCIUM, PHYSIC, PROPERTI, MUCU, PATI...",1
1,"[ONE, DISTINGUISH, EFFECT, MUCU, HYPERSECRET, ...",2
2,"[SALIVARI, GLYCOPROTEIN, PATIENT, DIFFER, NORM...",3
3,"[LIPID, COMPOSIT, RESPIRATORI, SECRET]",4
4,"[MUCU, ABNORM]",5


In [17]:
# Write the query table as a ';'-separated CSV without the row index.

data.to_csv(path_or_buf=CONSULTAS_PC, sep=";", index=False)

In [18]:
# Extract, for every query, the judged documents and their scores.
Records = Bs_data.find_all(name='Records')

# Filled further down: position of the Records element -> (document, votes) pairs.
queries_dict = dict()

def get_votes (string):
    """Return how many characters of ``string`` are digits greater than zero.

    ``string`` is a score made of one digit per evaluator (e.g. ``"1220"``),
    so the result is the number of evaluators that gave a non-zero score.

    Each character is converted with ``int`` rather than ``eval``, so text
    read from the XML file is never executed as code.

    Raises:
        ValueError: if ``string`` contains a character that is not a digit.
    """
    return sum(1 for digit in string if int(digit) > 0)

# NOTE(review): the key is the 0-based position of the Records element, not
# the query's QueryNumber - confirm this is the numbering expected downstream.
for index, record in enumerate(Records):
    # A dedicated name is used here so the `documents` DataFrame built by the
    # inverted-list generator is not overwritten by this loop.
    judged_docs = [
        (int(item.text), get_votes(item['score']))
        for item in record.find_all('Item')
    ]
    queries_dict[index] = judged_docs


queries_dict

{0: [(139, 4),
  (151, 4),
  (166, 1),
  (311, 1),
  (370, 2),
  (392, 1),
  (439, 1),
  (440, 2),
  (441, 4),
  (454, 1),
  (461, 4),
  (502, 1),
  (503, 1),
  (505, 1),
  (520, 2),
  (522, 1),
  (526, 3),
  (527, 2),
  (533, 4),
  (593, 1),
  (619, 1),
  (737, 1),
  (742, 1),
  (789, 1),
  (827, 1),
  (835, 1),
  (861, 1),
  (875, 4),
  (891, 1),
  (921, 2),
  (922, 2),
  (1175, 1),
  (1185, 1),
  (1222, 1)],
 1: [(169, 1), (434, 2), (454, 1), (498, 1), (499, 1), (592, 1), (875, 3)],
 2: [(23, 1),
  (40, 1),
  (139, 4),
  (190, 1),
  (221, 1),
  (246, 1),
  (309, 1),
  (311, 2),
  (325, 1),
  (345, 1),
  (347, 1),
  (356, 1),
  (370, 2),
  (374, 1),
  (375, 4),
  (439, 1),
  (440, 3),
  (454, 3),
  (515, 1),
  (520, 1),
  (524, 1),
  (526, 2),
  (527, 3),
  (533, 1),
  (535, 1),
  (560, 1),
  (561, 1),
  (571, 1),
  (584, 1),
  (604, 1),
  (623, 1),
  (633, 4),
  (733, 1),
  (742, 2),
  (854, 1),
  (856, 4),
  (950, 1),
  (967, 1),
  (1144, 4),
  (1161, 1),
  (1172, 1),
  (1175, 1),


In [19]:
# Flatten queries_dict into one row per (query, document) pair.

QueryNumber = []
DocNumber = []
DocVotes = []

for chave, avaliacoes in queries_dict.items():
    # Unpack each pair directly instead of binding it to a variable named
    # `tuple`, which would hide the builtin for the rest of the session.
    for doc_number, votes in avaliacoes:
        QueryNumber.append(chave)
        DocNumber.append(doc_number)
        DocVotes.append(votes)


df = pd.DataFrame({
    'QueryNumber': QueryNumber,
    'DocNumber': DocNumber,
    'DocVotes': DocVotes
})

In [20]:
# Write the expected results as a ';'-separated CSV without the row index.

df.to_csv(path_or_buf=ESPERADOS_PC, sep=";", index=False)


# BUSCADOR

In [21]:
# Read the search settings from BUSCA.CFG.

config = configparser.ConfigParser()
# Open the file explicitly (a missing BUSCA.CFG still raises) and make sure
# the handle is closed as soon as it has been parsed.
with open(r'BUSCA.CFG') as cfg_file:
    config.read_file(cfg_file)
MODELO_BU = config.get('CONFIGS', 'MODELO')
CONSULTAS_BU = config.get('CONFIGS', 'CONSULTAS')
RESULTADO_BU = config.get('CONFIGS', 'RESULTADOS')


MODELO_BU, CONSULTAS_BU,RESULTADO_BU

('../RESULT/Modelo.csv', '../RESULT/consultas.csv', '../RESULT/Resultados.csv')

In [22]:
# Load the model (term weights per document) and the processed queries.

# Terms are data, not missing-value markers: with pandas' default NA parsing
# a stem such as "NULL" would silently become NaN and could never be matched
# against a query term. Only truly empty cells are treated as missing.
m_doc = pd.read_csv(MODELO_BU, sep=';', keep_default_na=False, na_values=[''])
consultas = pd.read_csv(CONSULTAS_BU, sep=';', keep_default_na=False, na_values=[''])

m_doc.set_index('Word', inplace=True)

m_doc

Unnamed: 0_level_0,1,356,782,168,583,981
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SIGNIFIC,0.183102,0.732408,0.000000,0.0,0.0,0.000000
PSEUDOMONA,0.895880,0.000000,0.000000,0.0,0.0,0.000000
AERUGINOSA,0.895880,0.000000,0.000000,0.0,0.0,0.000000
INFECT,0.549306,0.000000,0.219722,0.0,0.0,0.000000
RESPIRATORI,0.895880,0.000000,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...
SOLUT,0.000000,0.000000,0.000000,0.0,0.0,0.358352
ABSOLUT,0.000000,0.000000,0.000000,0.0,0.0,0.358352
ETHYL,0.000000,0.000000,0.000000,0.0,0.0,0.358352
ALCOHOL,0.000000,0.000000,0.000000,0.0,0.0,0.358352


In [23]:
# Term x query matrix: one column per query (numbered from 1 in file order),
# one row per term of the model, each cell holding how many times the term
# occurs in the query. Query terms absent from the model are ignored.
import ast

query_vectors = {}
for indice, consulta in consultas.iterrows():
    # The first column holds the term list stored as text. literal_eval only
    # parses Python literals, so file content can never be executed as code.
    termos = ast.literal_eval(consulta.iloc[0])
    contagem_palavras = pd.Series(termos, dtype=object).value_counts()
    # Align the counts with the model vocabulary; missing terms count as 0.
    query_vectors[indice + 1] = contagem_palavras.reindex(m_doc.index, fill_value=0)

# Building the frame in one step removes the need to merge into a copy of the
# model and then drop a hard-coded number of document columns.
m_consult = pd.DataFrame(query_vectors, index=m_doc.index).astype(float)
m_consult

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,90,91,92,93,94,95,96,97,98,99
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SIGNIFIC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PSEUDOMONA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AERUGINOSA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
INFECT,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RESPIRATORI,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SOLUT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ABSOLUT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ETHYL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ALCOHOL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
#agora resta fazer os calculos

def norma_euclid (serie):
    """Return the Euclidean (L2) norm of ``serie``."""
    return np.linalg.norm(serie)


def similaridade (documento, consulta):
    """Return the cosine similarity between ``documento`` and ``consulta``.

    Both arguments are vectors of the same length. The result is their dot
    product divided by the product of their Euclidean norms. When either
    vector has norm zero the similarity is defined as 0, which avoids a
    0 / 0 division producing NaN.
    """
    norma_doc = norma_euclid(documento)
    norma_con = norma_euclid(consulta)

    # Both norms must be checked: a zero norm on either side makes the
    # denominator zero.
    if norma_doc == 0 or norma_con == 0:
        return 0

    produto_escalar = np.dot(documento, consulta)
    return produto_escalar / (norma_doc * norma_con)

# Similarity table: one row per document (column of m_doc) and one column per
# query (column of m_consult).
m_sim = pd.DataFrame({'documentos': m_doc.columns})

for consulta in m_consult:
    vetor_consulta = m_consult[consulta]
    # The query vector is passed first and the document vector second.
    m_sim[consulta] = [
        similaridade(vetor_consulta, m_doc[documento]) for documento in m_doc
    ]

m_sim
 

Unnamed: 0,documentos,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1,0.018348,0.242763,0.014212,0.165628,0.0,0.018348,0.044849,0.135235,0.0,...,0.250123,0.045078,0.031779,0,0.018348,0.01589,0.018348,0.045078,0.022471,0.018348
1,356,0.0,0.0,0.18396,0.0,0.0,0.0,0.133231,0.139343,0.0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,782,0.014114,0.030267,0.010932,0.0,0.0,0.014114,0.00998,0.0,0.034719,...,0.092302,0.028348,0.024445,0,0.014114,0.156892,0.014114,0.138701,0.017286,0.106581
3,168,0.091154,0.118349,0.01194,0.092763,0.185525,0.091154,0.064456,0.0,0.113754,...,0.013349,0.12218,0.026698,0,0.091154,0.093785,0.091154,0.0,0.111641,0.015414
4,583,0.016724,0.0,0.037578,0.0,0.0,0.016724,0.034304,0.03179,0.100643,...,0.014483,0.03179,0.028966,0,0.016724,0.014483,0.098899,0.0,0.020482,0.016724
5,981,0.069567,0.037841,0.053886,0.0,0.0,0.069567,0.049191,0.057803,0.0,...,0.010188,0.022361,0.020375,0,0.12737,0.010188,0.011764,0.057803,0.014407,0.011764


In [25]:
# Build the results: for every query, the documents ranked by decreasing
# similarity as (rank position starting at 0, document id, similarity).

resultados = pd.DataFrame({
    'Consulta': [],
    'Resultados': []
})


# The first column of m_sim holds the document ids; the others are queries.
for consulta in m_sim.columns[1:]:
    # Named `ranking` so the builtin `sorted` is not hidden for the rest of
    # the session.
    ranking = m_sim[consulta].sort_values(ascending = False)
    result = [
        (posicao, m_sim.documentos[linha], dist)
        for posicao, (linha, dist) in enumerate(ranking.items())
    ]
    nova_linha = pd.Series({'Consulta': consulta, 'Resultados': result})
    # Query columns are numbered from 1, rows of the result table from 0.
    resultados.loc[consulta - 1] = nova_linha


resultados

Unnamed: 0,Consulta,Resultados
0,1,"[(0, 168, 0.09115436345739746), (1, 981, 0.069..."
1,2,"[(0, 1, 0.24276298332592008), (1, 168, 0.11834..."
2,3,"[(0, 356, 0.18395975771962617), (1, 981, 0.053..."
3,4,"[(0, 1, 0.16562790010184245), (1, 168, 0.09276..."
4,5,"[(0, 168, 0.1855252189851393), (1, 1, 0.0), (2..."
...,...,...
94,95,"[(0, 782, 0.15689150114400746), (1, 168, 0.093..."
95,96,"[(0, 583, 0.09889851440843972), (1, 168, 0.091..."
96,97,"[(0, 782, 0.13870104185693266), (1, 981, 0.057..."
97,98,"[(0, 168, 0.1116408391494124), (1, 1, 0.022471..."


In [26]:
# Write the ranked results as a ';'-separated CSV without the row index.

resultados.to_csv(path_or_buf=RESULTADO_BU, sep=";", index=False)
