# A simple Boolean retrieval system

Except for reading the corpus, every step is independent of the specific corpus.

In [1]:
from functools import total_ordering, reduce
import csv #reading data
import re #performing normalization --> regular expressions

## Posting

In [2]:
@total_ordering
class Posting:
    '''
    A single entry of a posting list, identified by a document ID.

    Postings compare, order and hash by their document ID only.
    '''

    def __init__(self, docID):
        '''
        docID: identifier of the document this posting refers to
        '''
        self._docID = docID

    def get_from_corpus(self, corpus):
        '''
        returns corpus[docID], i.e. the corpus entry this posting points to
        '''
        return corpus[self._docID]

    #an order on the postings is needed to keep posting lists sorted;
    #total_ordering derives the remaining comparisons from __eq__ and __gt__
    def __eq__(self, other):
        if not isinstance(other, Posting):
            #let Python try the reflected operation instead of failing
            #with AttributeError on a foreign type
            return NotImplemented
        return self._docID == other._docID

    def __gt__(self, other):
        if not isinstance(other, Posting):
            return NotImplemented
        return self._docID > other._docID

    def __hash__(self):
        #defining __eq__ disables the default hash; provide one that is
        #consistent with equality
        return hash(self._docID)

    def __repr__(self):
        return str(self._docID)
    

## Posting Lists

In [3]:
class PostingList:

    '''
    class for the management of the posting list

    intersection and union walk both lists with two indices and therefore
    rely on the postings being sorted in increasing order
    '''

    def __init__(self):
        '''
        attribute is a list of postings
        '''
        self._postings = []

    #we can create a posting list from the document ID
    #it is another constructor
    @classmethod
    def from_docID(cls, docID):
        '''
        returns a posting list holding a single posting for docID
        '''
        plist = cls()
        plist._postings = [Posting(docID)]
        return plist

    @classmethod
    def from_dicID(cls, docID):
        '''
        misspelled name of from_docID, kept so existing callers still work
        '''
        return cls.from_docID(docID)

    #we can create a posting list out of an existing list of postings
    @classmethod
    def from_posting_list(cls, postingList):
        '''
        returns a posting list wrapping postingList (the list is not copied)
        '''
        plist = cls()
        plist._postings = postingList
        return plist

    def merge(self, other):
        '''
        appends the postings of other to this list, in place

        we have to assume that all the docID of the second list are
        greater than or equal to the last docID of this list
        '''
        if not self._postings:
            #nothing to compare against: take every posting of other
            self._postings += other._postings
            return
        i = 0
        last = self._postings[-1]
        #skip the leading elements of the second list that are equal to our
        #last docID, so the merged list does not repeat that docID
        while i < len(other._postings) and last == other._postings[i]:
            i += 1
        self._postings += other._postings[i:]

    def intersection(self, other):
        '''
        returns a new posting list with the postings present in both lists
        '''
        intersection = []
        i = 0 #index for the first list
        j = 0 #index for the second posting list

        #until we reach the end of one of the posting lists
        while i < len(self._postings) and j < len(other._postings):
            if self._postings[i] == other._postings[j]:
                intersection.append(self._postings[i])
                i += 1
                j += 1
            elif self._postings[i] < other._postings[j]:
                i += 1
            else:
                j += 1
        #the result is complete only once the loop is over
        return PostingList.from_posting_list(intersection)

    def union(self, other):
        '''
        returns a new posting list with the postings present in either list
        '''
        union = []
        i = 0
        j = 0
        while i < len(self._postings) and j < len(other._postings):
            if self._postings[i] == other._postings[j]:
                union.append(self._postings[i])
                i += 1
                j += 1
            elif self._postings[i] < other._postings[j]:
                #we append the smallest one
                union.append(self._postings[i])
                i += 1
            else:
                union.append(other._postings[j])
                j += 1
        #at most one of the two lists still has elements left
        union.extend(self._postings[i:])
        union.extend(other._postings[j:])
        return PostingList.from_posting_list(union)

    #we want the collection of all documents of the posting lists
    def get_from_corpus(self, corpus):
        '''
        returns the list of corpus entries referenced by the postings
        '''
        return [posting.get_from_corpus(corpus) for posting in self._postings]

    def __repr__(self):
        return ", ".join(map(str, self._postings))

## Terms

In [4]:
class ImpossibleMergeError(Exception):
    '''
    signals an attempt to merge two terms whose term strings differ
    '''

@total_ordering #all possible comparisons are derived from __eq__ and __gt__
class Term:
    '''
    a term string together with the posting list of the documents containing it

    terms compare, order and hash by their term string only
    '''

    def __init__(self, term, docID):
        '''
        term: the term string
        docID: the first document in which the term was seen
        '''
        self.term = term
        self.posting_list = PostingList.from_dicID(docID)

    def merge(self, other):
        '''
        merges the posting list of other into this term, in place

        raises ImpossibleMergeError when the two term strings differ
        '''
        if self.term != other.term:
            raise ImpossibleMergeError(
                "cannot merge term " + repr(other.term)
                + " into term " + repr(self.term))
        self.posting_list.merge(other.posting_list)

    #we need to order the terms
    def __eq__(self, other):
        if not isinstance(other, Term):
            #let Python try the reflected operation instead of failing
            #with AttributeError on a foreign type
            return NotImplemented
        return self.term == other.term

    def __gt__(self, other):
        if not isinstance(other, Term):
            return NotImplemented
        return self.term > other.term

    def __hash__(self):
        #defining __eq__ disables the default hash; provide one that is
        #consistent with equality
        return hash(self.term)

    def __repr__(self):
        return self.term + ": " + repr(self.posting_list)

#example: merging two terms with the same string joins their posting lists
x, y = Term("cat", 3), Term("cat", 6)
x.merge(y)
print(x)

cat: 3, 6


## Inverted index

In [6]:
def normalize(text):
    '''
    remove punctuation and everything that is not a word, then lowercase

    text: the string to normalize
    returns the lowercased text containing only word characters,
    whitespace and dashes
    '''
    #the leading ^ negates the character class, so we delete every character
    #that is NOT one of:
    # \w a word character (letters, digits, underscore)
    # \s a whitespace character
    # -  a dash (placed last so that it is taken literally)
    no_punctuation = re.sub(r'[^\w\s-]', '', text)
    downcase = no_punctuation.lower()
    return downcase

def tokenize(movie):
    '''
    returns the list of whitespace-separated tokens of the normalized
    description of movie
    '''
    normalized = normalize(movie.description)
    tokens = normalized.split()
    return tokens

class InvertedIndex:
    '''
    inverted index: a sorted list of terms, each carrying its posting list
    '''

    def __init__(self):
        '''
        attribute is the (initially empty) list of terms
        '''
        self._dictionary = []

    @classmethod
    def from_corpus(cls, corpus):
        '''
        builds the index from an iterable of documents

        the position of a document in the corpus is used as its docID
        '''
        #keys are the tokens
        #actual terms are the values
        intermediate_dict = {}
        for docID, document in enumerate(corpus):
            for token in tokenize(document):
                term = Term(token, docID)
                existing = intermediate_dict.get(token)
                if existing is None:
                    #first time we see this token: insert the new term
                    intermediate_dict[token] = term
                else:
                    #merge the posting lists
                    existing.merge(term)
            #progress report every 1000 documents
            if docID % 1000 == 0:
                print("ID: " + str(docID))
        #now we have all the terms and we can create the actual inverted index
        idx = cls()
        #the dictionary is the sorted list of all the terms
        idx._dictionary = sorted(intermediate_dict.values())
        return idx

    #we have to index the inverted index using terms
    def __getitem__(self, key):
        '''
        returns the posting list of the term equal to key

        raises KeyError when the term is not in the index
        '''
        #since everything is sorted in principle we could do a binary search
        for term in self._dictionary:
            if term.term == key: #the one we are searching for
                return term.posting_list

        raise KeyError(key)

    def __repr__(self):
        return "A dictionary with " + str(len(self._dictionary)) + " terms."
    

#example
#tokenize reads the `description` attribute of its argument, so the raw
#text is wrapped in a small object; passing a plain string raised
#"AttributeError: 'str' object has no attribute 'description'"
#tokenize already normalizes the text, so normalize is not called here
from types import SimpleNamespace
tokenize(SimpleNamespace(description="e.g., this is a text"))

## Reading the corpus

Dependent on the specific corpus. 

In [None]:
class MovieDescription:
    '''
    record pairing the title of a movie with its description
    '''

    def __init__(self, title, description):
        '''
        title: shown when the object is printed
        description: text of the movie description
        '''
        self.title, self.description = title, description

    def __repr__(self):
        #a movie is displayed through its title only
        return self.title
    
    
def read_movie_description(descriptions_path='Lab/plot_summaries.txt',
                           metadata_path='Lab/movie.metadata.tsv'):
    '''
    reads the corpus from two tab-separated files

    descriptions_path: file whose rows hold the movie ID in column 0 and
                       the description in column 1
    metadata_path: file whose rows hold the movie ID in column 0 and the
                   title in column 2
    returns a list of MovieDescription objects, in the order of the
    descriptions file; descriptions whose ID has no title are skipped
    '''
    #NOTE(review): both files are assumed to be UTF-8 encoded - confirm
    #against the dataset; newline='' is what the csv module asks for
    with open(metadata_path, 'r', encoding='utf-8', newline='') as csv_file:
        #dictionary where the key is the ID of the movie
        #and the value is the title of the movie
        names_table = {row[0]: row[2]
                       for row in csv.reader(csv_file, delimiter='\t')}
    #corpus is a list of objects
    corpus = []
    #open the file containing the descriptions
    with open(descriptions_path, 'r', encoding='utf-8', newline='') as csv_file:
        for desc in csv.reader(csv_file, delimiter='\t'):
            #only the title lookup is guarded, since there are some
            #descriptions with errors in the ID
            try:
                title = names_table[desc[0]]
            except KeyError:
                continue
            corpus.append(MovieDescription(title, desc[1]))
    return corpus

In [None]:
#load the corpus: a list of MovieDescription objects read from the dataset files
corpus = read_movie_description()

In [None]:
#build the inverted index from the corpus (binding the bare class would
#leave nothing to look terms up in)
idx = InvertedIndex.from_corpus(corpus)

In [None]:
#show the index, then look up the entry stored under the term 'batman'
print(idx)
idx['batman'] #docIDs of all the movies whose description contains batman

## Putting it all together

In [None]:
class IRsystem:
    #in the system we have both the index and the corpus
    def __init__(self, corpus, index):
        self._corpus = corpus
        self._index = index
    
    #we want to geenrate the entire inverted index 
    #calling the constructor
    @classmethod
    def from_corpus(cls, corpus):
        index = InvertedIndex.from_corpus(corpus)
        return cls(corpus, index)
    
    def answer_query(self, words): 
        #same normalization in the query and in the index
        norm_words = map(normalize, words)
        postings = map(lambda w : self._index[w], norm_words)
        plist = reduce(lambda x, y: x.intersection(y), postings)
        return plist.get_from_corpus(self._corpus)

#perform a query on an IR system
def query(ir, text):
    '''
    splits text on whitespace, asks ir for the matching movies
    and prints them one per line
    '''
    for hit in ir.answer_query(text.split()):
        print(hit)

In [None]:
#bundle the corpus and the index into a single retrieval system
ir = IRsystem(corpus, idx)

#print every movie returned for the two-word query
query(ir, "frodo Gandalf")