In [109]:
#coding: utf-8

import pandas
import nltk

__author__ = "Ionesio Junior"

# Color consts
OKGREEN = '\033[92m'
FAIL = '\033[91m'
ENDC = '\033[0m'

match_words = {}

class Documents(object):
        def __init__(self,index, first_word, tf_first_word):
            self.__index = index
            self.__first_word = first_word
            self.__dict_of_words = { first_word: tf_first_word}
        
        def get_index(self):
            return self.__index
        
        def append_new_term(self,word, word_frequency):
                self.__dict_of_words[word] = word_frequency
        
        def get_term_frequency(self, word=None):
                if( word == None ):
                    self.__dict_of_words[this.__first_word]
                else:
                    try:
                        return self.__dict_of_words[word]
                    except KeyError:
                        return 0
        
        def get_words_dict(self):
            return self.__dict_of_words
        
        def update(self, other_doc):
            self.__dict_of_words.update(other_doc.get_words_dict())

        def __eq__(self,other):
            return self.__index == other.get_index()

        def __hash__(self):
            return hash(self.__index)
        
        
def add_dictionary( info_tuple ):
    '''
        This method fill lists of news index with match words

        Args:
            info_tuple(String, Int, Text ) : this tuple contain a specific word and index of news
    '''
    word, index, text = info_tuple
    try:
        match_words[ word.lower() ].add( Documents(index, word.lower(), text.count(word)) )
    except KeyError:
        match_words[ word.lower() ] = set( [ Documents(index, word.lower(), text.count(word))] )


def extract_words( text , id_col ):
    ''' 
        This method extract match words from all text tables and store in a dictionary correlating with news index
        
        Args:
                text() : table with text to extract words
                id_col : table with news index
    '''
    [ map( add_dictionary, [ ( word, id_col[i],text[i] ) for word in nltk.word_tokenize( text[i] ) ] ) for i in xrange( len(text) ) ]


In [110]:
file_csv = pandas.read_csv("noticias_estadao.csv")
text = file_csv.titulo + " " + file_csv.conteudo
extract_words(text,file_csv.idNoticia)

In [207]:
from math import log
k = 21

def AND_between_documents(x,y):
    new_list = []
    for first_doc in x:
        for second_doc in y:
            if(second_doc == first_doc):
                first_doc.update(second_doc)
                new_list.append(first_doc)
    return set(new_list)

def idf(word):
    return log( len(file_csv.idNoticia) + 1 / len(match_words[word]) )

def bm25(term_frequency):
        return ((k + 1)* term_frequency) / (k + term_frequency)

def search( type_of_search, words ):
    '''
        This method search a set of words using AND/OR operators to guide how to search

        Args:
                words(String) : string with words to be searched (Ex: word1 AND word2 AND ... | word1 OR word2 OR ... )
        Return:
                index_list[Int] : list of news index that matches with words
    '''
    if ( " " in words ):
        list_of_words = list( set( map( lambda x: ( x.strip().lower(), words.count(x) ), words.split(" ") ) ) )
        results = reduce( lambda x,y : AND_between_documents(x,y) , [ match_words[word[0].lower()] for word in list_of_words ] )
    else:
        results =  list( match_words[words] )
    if type_of_search.lower() == "binary":
        return map(lambda x: x.get_index(), results)
    elif type_of_search.lower() == "tf":
        tf_results = [ ( result.get_index(), sum( map(lambda x: x[1] * result.get_words_dict()[x[0]], list_of_words )) ) for result in results ]
        return sorted(tf_results, key=lambda x: x[1],reverse=True)
    elif type_of_search.lower() == "tf-idf":
        tf_idf_results = [ ( result.get_index(), sum( map(lambda x: x[1] * result.get_words_dict()[x[0]] * idf(x[0]), list_of_words )) ) for result in results ]
        return sorted(tf_idf_results, key=lambda x: x[1],reverse=True)
    elif type_of_search.lower() == "bm25":
        bm25_results = [ ( result.get_index(), sum( map(lambda x: x[1] * bm25(result.get_words_dict()[x[0]]) * idf(x[0]), list_of_words )) ) for result in results ]
        return sorted(bm25_results, key=lambda x: x[1],reverse=True)

In [208]:
def debug_test(type_of_search, text, expected_value):
    result =  len(search(type_of_search,text)) == expected_value
    if result == True:
        print "Testing \"" + text + "\"... " + OKGREEN + "Sucess!!" + ENDC
    else:
        print "Testing \"" + text + "\"... " + FAIL + "Fail!!" + ENDC
    

def test_search():
    #OR
    #debug_test("debate OR presidencial",1770)
    #debug_test("presidenciáveis OR corruptos", 164)
    #debug_test("Belo OR Horizonte", 331)

    #AND
    debug_test("binary","Belo Horizonte", 242)
    debug_test("binary","presidenciáveis corruptos", 0)
    debug_test("binary","debate presidencial",201)
    debug_test("binary","Campina Grande",12)

search("bm25","Belo Horizonte")

[(1883, 125.18163665340039),
 (1013, 89.41545475242884),
 (3183, 71.53236380194308),
 (1431, 71.53236380194308),
 (7128, 71.53236380194308),
 (7126, 71.53236380194308),
 (1877, 53.64927285145731),
 (1598, 53.64927285145731),
 (4765, 53.64927285145731),
 (4720, 53.64927285145731),
 (5100, 53.64927285145731),
 (4294, 53.64927285145731),
 (5111, 53.64927285145731),
 (1046, 35.76618190097154),
 (624, 35.76618190097154),
 (7453, 35.76618190097154),
 (1230, 35.76618190097154),
 (6591, 35.76618190097154),
 (1872, 35.76618190097154),
 (1880, 35.76618190097154),
 (1389, 35.76618190097154),
 (6587, 35.76618190097154),
 (3531, 35.76618190097154),
 (3534, 35.76618190097154),
 (3915, 26.824636425728656),
 (4, 17.88309095048577),
 (1033, 17.88309095048577),
 (1367, 17.88309095048577),
 (13, 17.88309095048577),
 (2063, 17.88309095048577),
 (2064, 17.88309095048577),
 (2065, 17.88309095048577),
 (4115, 17.88309095048577),
 (3604, 17.88309095048577),
 (2074, 17.88309095048577),
 (6174, 17.8830909504857