In [1]:
import nltk
import math
import pandas as pd

from common import *

from collections import Counter
from unidecode import unidecode
from typing import List, Dict, Set

# importing dependencies and load data.
data = load_data_from_cvs("estadao_news.csv")

In [2]:
# joyning column of title of artile with column of content.
data['articles'] = data['titulo']  + ' ' + data['conteudo']

In [3]:
# normalizing and tokenizing articles.
data['articles'] = data['articles'].apply(normalize)
data['tokens'] = data['articles'].apply(tokenize)

In [4]:
# applying summarization to articles to produce inverted index.
inverted_index = summarize(data['tokens'], data['idNoticia'])

In [5]:
DISJUNCTION = 'OR'
FIRST_WORD_INDEX = 0
SECOND_WORD_INDEX = 2
QUERY_INDEX = 1

def calc_word_distance(str1, str2):
    return nltk.edit_distance(str1, str2)

def get_more_sismilar_word(word):
    tokens = inverted_index.keys()
    candidate = {
        'token': None,
        'distance': None
    }
    for token in tokens:
        distance = calc_word_distance(word, token)
        if distance == 1: return token
        
        if candidate['distance'] == None:
            candidate['token'] = token
            candidate['distance'] = distance
        elif distance < candidate['distance']:
            candidate['token'] = token
            candidate['distance'] = distance
    
    return candidate['token']

def get_index_value(word):
    word = word.lower()
    if word in inverted_index.keys():
        return inverted_index.get(word).get('IDs')
    
    similar_word = get_more_sismilar_word(word)
    return inverted_index.get(similar_word).get('IDs')

def split_query(query):
    return list(map((lambda w: unidecode(w)), query.split()))

def search(query: str) -> Set[str]:
    """Search in inverted index using passed query.
    :param query: Query with two elements that will be searched in
                  inverted index and between them a conjunction or disjunction.
                  Example: "<word1> AND/OR <word2>"
    :returns: Return result of query execution on inverted index.
    """
    elements = split_query(query)
    operation = elements[QUERY_INDEX]
    
    result = []
    if operation == DISJUNCTION:
        result = list(get_index_value(elements[FIRST_WORD_INDEX]))
        result.extend(list(get_index_value(elements[SECOND_WORD_INDEX])))
    else:
        result = set(get_index_value(elements[FIRST_WORD_INDEX])).intersection((
                    get_index_value(elements[SECOND_WORD_INDEX])
                 ))
    
    return set(result)

In [6]:
# Tests by assertion.

assert len(search("debate OR presidencial")) == 1770
assert len(search("debate AND presidencial")) == 201

assert len(search("presidenciáveis OR corruptos")) == 164
assert len(search("presidenciáveis AND corruptos")) == 0

assert len(search("Belo OR Horizonte")) == 331
assert len(search("Belo AND Horizonte")) == 242

TypeError: unhashable type: 'dict'

In [7]:
def conjunctive_search(query: str) -> int:
    """
    :param query: Query with n words that will be searched in
                  inverted index separated by space.
                  Example: "<word1> <word2> <word3> <word4>"
                 
    :returns: Return result of conjunction of the search between 
             all words on inverted index.
    """
    elements = split_query(query)
    
    index = {}
    for element in elements:
        index[len(inverted_index[element])] = element
    
    ordered_frequence = sorted(index.keys())
    
    # conjuntion between result of all elements.
    result = set(inverted_index[index[ordered_frequence[0]]])
    for i in range(1, len(ordered_frequence)):
        result = result.intersection(inverted_index[index[ordered_frequence[i]]])
        
    return result
        

print(len(conjunctive_search("inflação foi culpa do pt")))
print(len(conjunctive_search("inflação foi culpa do temer")))

2
2
