In [26]:
from collections import Counter
from typing import Dict, Iterable, List, Optional, Set, Union

import nltk
import pandas as pd

# Fetch the NLTK 'punkt' package; NLTK skips the transfer when the package is
# already up to date locally.
# NOTE(review): no NLTK call is visible in the rest of this notebook — confirm
# the download is still required.
nltk.download('punkt')

# Load the news dataset from the CSV file in the working directory.
data = pd.read_csv("estadao_news.csv")

[nltk_data] Downloading package punkt to /home/gustavo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
# Build one text per row: the article title, a single space, then the article content.
data['articles'] = data['titulo'].add(' ').add(data['conteudo'])

In [28]:
def to_lower_case(text: str) -> str:
    """Normalize `text` by converting it to lower case."""
    return text.lower()

In [29]:
def tokenize(row: str) -> List[str]:
    """Split `row` into tokens on runs of whitespace."""
    return row.split()

In [30]:
# Lower-case every article, then derive the whitespace-split token list of each one.
data['articles'] = data['articles'].map(to_lower_case)
data['tokens'] = data['articles'].map(tokenize)

In [31]:
def counter(row):
    """Summarize how many times each item occurs in `row` as a Counter."""
    return Counter(row)

In [32]:
def count_frequence(article: Union[str, Iterable[str]], token: str) -> int:
    """
    Count how many times a token occurs in an article.

    param article: the article, either as raw text (split on whitespace into
                   tokens first) or as an already tokenized iterable.
    param token: token to look for; the comparison is exact (case-sensitive).

    returns: number of occurrences of `token`, 0 when it never occurs.
    """
    # A Counter built from a raw string tallies single characters, so a
    # word-sized token could never match; split text into tokens first.
    tokens = article.split() if isinstance(article, str) else article
    return Counter(tokens)[token]

def summarize(matrix_of_tokens: List[List[str]], docIds: List) -> Dict[str, list]:
    """
    Build an inverted index mapping each token to the ids of the documents
    in which it occurs.

    param matrix_of_tokens: one list of tokens per article.
    param docIds: document ids, in the same order as `matrix_of_tokens`.

    returns: dict token -> list of document ids. An id is appended once per
             occurrence of the token, so it repeats when the token appears
             more than once in the same article.

    raises ValueError: when there are fewer document ids than token lists.
    """
    if len(docIds) < len(matrix_of_tokens):
        raise ValueError(
            "docIds has %d entries but matrix_of_tokens has %d"
            % (len(docIds), len(matrix_of_tokens))
        )

    index = {}
    # zip pairs the two sequences by position, so pandas Series are handled
    # correctly whatever their index labels are (obj[i] would look up label i).
    for tokens, doc_id in zip(matrix_of_tokens, docIds):
        for token in tokens:
            index.setdefault(token, []).append(doc_id)

    return index

In [33]:
# Build the inverted index from the tokenized articles and their document ids.
inverted_index = summarize(matrix_of_tokens=data['tokens'], docIds=data['idNoticia'])

In [34]:
# Operator that selects the union of both posting lists; anything else is
# treated as a conjunction.
DISJUNCTION = 'OR'
# Positions of each part in the whitespace-split query "word1 OPERATOR word2".
FIRST_WORD_INDEX = 0
SECOND_WORD_INDEX = 2
QUERY_INDEX = 1  # position of the operator

def search(query: str, index: Optional[Dict] = None) -> Set[str]:
    """
    Run a two-word boolean query against an inverted index.

    param query: two words with an operator between them, separated by
                 whitespace. Example: "word1 AND word2" or "word1 OR word2".
                 The words are lower-cased before the lookup; the operator is
                 compared as written. Any operator other than 'OR' is treated
                 as a conjunction. Tokens after the second word are ignored.
    param index: inverted index (token -> document ids) to search. Defaults
                 to the notebook-level `inverted_index`.

    returns: set of document ids matching the query. A word that is missing
             from the index contributes no documents.

    raises ValueError: when the query has fewer than three tokens.
    """
    elements = query.split()
    if len(elements) <= SECOND_WORD_INDEX:
        raise ValueError(
            "query must look like '<word1> AND/OR <word2>', got: %r" % (query,)
        )

    postings = inverted_index if index is None else index
    first_docs = set(postings.get(elements[FIRST_WORD_INDEX].lower(), []))
    second_docs = set(postings.get(elements[SECOND_WORD_INDEX].lower(), []))

    if elements[QUERY_INDEX] == DISJUNCTION:
        return first_docs | second_docs
    return first_docs & second_docs

In [35]:
# Regression checks: each boolean query must match the expected number of documents.
for query_text, expected_hits in (
    ("debate OR presidencial", 1770),
    ("debate AND presidencial", 201),
    ("presidenciáveis OR corruptos", 164),
    ("presidenciáveis AND corruptos", 0),
    ("Belo OR Horizonte", 331),
    ("Belo AND Horizonte", 242),
):
    assert len(search(query_text)) == expected_hits

In [38]:
def conjunctive_search(query: str, index: Optional[Dict] = None) -> Set:
    """
    Return the documents that contain every word of the query.

    param query: n words separated by whitespace.
                 Example: "word1 word2 word3 word4".
                 Words are lower-cased before the lookup.
    param index: inverted index (token -> document ids) to search. Defaults
                 to the notebook-level `inverted_index`.

    returns: set of document ids present in the posting list of every query
             word. Empty when the query has no words or when any word is
             missing from the index.
    """
    postings_by_token = inverted_index if index is None else index

    posting_sets = []
    for term in query.split():
        doc_ids = postings_by_token.get(term.lower())
        if not doc_ids:
            # A word with no documents makes the whole conjunction empty.
            return set()
        posting_sets.append(set(doc_ids))

    if not posting_sets:
        return set()

    # Intersect from the rarest word upwards so intermediate results stay
    # small. Sorting the sets themselves (instead of keying words by their
    # posting length) keeps words whose posting lists have the same size.
    posting_sets.sort(key=len)
    result = posting_sets[0]
    for other in posting_sets[1:]:
        result = result & other
        if not result:
            break

    return result
        

# Show how many documents contain every word of each sample query.
for sample_query in ("inflação foi culpa do pt", "inflação foi culpa do temer"):
    print(len(conjunctive_search(sample_query)))

12
2
