In [1]:
import sys
from xml.dom import minidom

import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from numpy import dot
from numpy.linalg import norm
from unidecode import unidecode

# The project-local helpers live in ../utils; the path must be extended
# before read_config can be imported.
sys.path.insert(1, '../utils')
from read_config import read_config_file

In [2]:
# Directories (relative to this notebook) that hold the configuration
# inputs and the files produced by the pipeline.
inputs_path, results_path = '../inputs/', '../results/'

## Read Config File

In [3]:
# Start from an empty dictionary and hand it, together with the path of
# the BUSCA.CFG file, to the project's config reader.
config_dict = dict()
config_file_path = inputs_path + 'BUSCA.CFG'
read_config_file(config_file_path, config_dict)
config_dict

{'MODELO': ['modelo.csv.gz'],
 'CONSULTAS': ['consultas.csv'],
 'RESULTADOS': ['resultados.csv']}

## Read Vector Model and Queries

In [4]:
# Load the vector model named by the MODELO config entry. The 'doc'
# column is read as text so document ids are not parsed as numbers.
model_file = results_path + config_dict['MODELO'][0]
vector_model = pd.read_csv(model_file, sep=';', dtype={'doc': str})


In [5]:
# Index the model by document id. The guard keeps this cell idempotent:
# once 'doc' has been moved into the index it is no longer a column, and
# calling set_index('doc') again would raise KeyError on a re-run.
if 'doc' in vector_model.columns:
    vector_model = vector_model.set_index('doc')

In [11]:
# Load the queries named by the CONSULTAS config entry, keeping
# QueryNumber as text, and preview the first rows.
queries_file = results_path + config_dict['CONSULTAS'][0]
consultas = pd.read_csv(queries_file, sep=';', dtype={'QueryNumber': str})
consultas.head()

Unnamed: 0,QueryNumber,QueryText
0,1,what are the effects of calcium on the physica...
1,2,can one distinguish between the effects of muc...
2,3,how are salivary glycoproteins from cf patient...
3,4,what is the lipid composition of cf respirator...
4,5,is cf mucus abnormal?


In [12]:
# Number of rows in the vector model (one row per document).
N = len(vector_model.index)
N

1215

## Implement Buscador

In [13]:
# Rank every document of the vector model against every query.
#
# For each query the model is restricted to the columns whose label is one
# of the query tokens, a query vector of ones is built over those columns,
# and the cosine distance (1 - cosine similarity) to each document row is
# computed. Documents are then sorted by ascending distance and stored as
# [ranking, doc, distance] lists next to the query number.
#
# Output: `results`, a DataFrame with columns 0 (query number) and
# 1 (the [ranking, doc, distance] list), one row per (query, document).
tokenizer = RegexpTokenizer(r'\w+')

# One frame per query. They are concatenated once after the loop: calling
# pd.concat on the growing `results` inside the loop re-copies every row
# accumulated so far on each iteration.
results_per_query = []

for index, row in consultas.iterrows():
    num_consulta = row['QueryNumber']
    consulta = row['QueryText']
    tokens = tokenizer.tokenize(consulta)

    # Keep only the model columns that match a query token; each matched
    # term gets weight 1 in the query vector.
    tf_idf_matrix = vector_model.loc[:, vector_model.columns.isin(tokens)]
    query_vector = np.ones(tf_idf_matrix.shape[1])

    # NOTE(review): both norms are taken over the matched query-term columns
    # only, not over the full document vector. Confirm this is the intended
    # normalisation before relying on the absolute distance values.
    # Vectorised row norms replace the original per-row apply().
    matrix_norm = pd.Series(
        norm(tf_idf_matrix.to_numpy(dtype=float), axis=1),
        index=tf_idf_matrix.index,
    )
    query_norm = norm(query_vector)

    # A document with no matching term yields 0/0 = NaN; it is mapped to
    # similarity 0, i.e. distance 1.
    query_docs_similarity = (
        (tf_idf_matrix @ query_vector)
        .divide(matrix_norm * query_norm)
        .replace(np.nan, 0)
    )
    query_docs_distance = 1 - query_docs_similarity

    # mergesort is stable, so documents with equal distance keep their model
    # order and the ranking is reproducible between runs.
    query_docs_df = (
        query_docs_distance
        .sort_values(kind='mergesort')
        .rename('distance')
        .reset_index()
    )
    n_docs = len(query_docs_df)
    query_docs_df['ranking'] = np.arange(1, n_docs + 1)

    # .tolist() yields plain Python int/str/float values, so each entry
    # serialises as "[1, '00568', 0.25...]" when written to CSV.
    result_entries = [
        list(entry)
        for entry in zip(
            query_docs_df['ranking'].tolist(),
            query_docs_df['doc'].tolist(),
            query_docs_df['distance'].tolist(),
        )
    ]
    new_results = pd.DataFrame({
        0: [num_consulta] * n_docs,
        1: result_entries,
    })
    results_per_query.append(new_results)

# pd.concat raises on an empty list, so fall back to an empty frame when
# there are no queries.
results = pd.concat(results_per_query) if results_per_query else pd.DataFrame()

## Generate Results

In [14]:
# Give the two positional columns produced by the ranking step their
# final names, then display the frame.
column_labels = {0: 'QueryNumber', 1: 'Result'}
results = results.rename(columns=column_labels)
results

Unnamed: 0,QueryNumber,Result
0,00001,"[1, 00568, 0.2567705545226937]"
1,00001,"[2, 00392, 0.30154363532400863]"
2,00001,"[3, 00767, 0.34648276399395195]"
3,00001,"[4, 00552, 0.3473665695005088]"
4,00001,"[5, 00043, 0.36078081267395345]"
...,...,...
1210,00100,"[1211, 00229, 0.7028979001133253]"
1211,00100,"[1212, 00561, 0.704207401001172]"
1212,00100,"[1213, 01007, 0.7072940040889628]"
1213,00100,"[1214, 00940, 0.7196677320148781]"


In [15]:
# Write the ranking to the file named by the RESULTADOS config entry,
# ';'-separated and without the DataFrame index.
output_file = results_path + config_dict['RESULTADOS'][0]
results.to_csv(output_file, sep=';', index=None)