# <span style='color:Blue'> Implementing a Relevance Feedback Information Retrieval System </span>

In [None]:
import pickle
import re
from itertools import islice
from functools import reduce
from collections import defaultdict
from math import sqrt

## Importing the VectorSpace, Corpus, ChampionLists

In [None]:
class MovieDescription:
    """One movie of the corpus: its document id, its title and its description.

    NOTE(review): presumably these are the objects stored in the pickled
    corpus; if so, the class name and the attribute names must not change,
    otherwise unpickling would break -- confirm before renaming anything.
    """

    def __init__(self, docID, title, description):
        self.title = title
        self.description = description
        self.docID = docID

    def __repr__(self):
        """Return the title as a plain string.

        ``__repr__`` must return a ``str``. Other code in this file joins
        ``title`` with spaces, which suggests it may be a list of tokens;
        returning such a list directly would make ``repr()`` raise TypeError.
        """
        title = self.title
        if isinstance(title, str):
            return title
        if isinstance(title, (list, tuple)):
            return ' '.join(str(token) for token in title)
        return str(title)

In [None]:
def loadObject(name):
    """Unpickle and return the object stored in ``objects/<name>.pkl``.

    The path is relative to the current working directory.
    NOTE: pickle can execute arbitrary code while loading; only use this on
    files you produced yourself.
    """
    path = 'objects/' + name + '.pkl'
    with open(path, 'rb') as infile:
        loaded = pickle.load(infile)
    return loaded

In [None]:
# Load the four pickled structures used by the rest of the notebook, in the
# same order as their names are listed.
vectorSpace, corpus, champList, inv_index_normalized = (
    loadObject(object_name)
    for object_name in ("vectorSpace", "corpus", "champList", "inv_index_normalized")
)

## Importing all the methods needed to perform a free-form query

In [None]:
def normalize(text):
    """Drop every character that is neither an ASCII letter nor whitespace, then lowercase.

    Removed characters are deleted, not replaced by a space, so the
    whitespace around them is left untouched.
    """
    return re.sub(r'[^a-zA-Z\s]+', '', text).lower()

def tokenize(text):
    """Normalize ``text`` and split it on whitespace into a list of tokens."""
    cleaned = normalize(text)
    tokens = cleaned.split()
    return tokens

def sortVector(vector):
    """Return a new dict with the entries of ``vector`` ordered by value, largest first.

    The sort is stable: entries with equal values keep their original
    relative order.
    """
    ranked_pairs = sorted(vector.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked_pairs)

def normalizeVector(vector):
    """Scale a sparse vector (dict key -> weight) to unit Euclidean length.

    Returns a new dict; ``vector`` is not modified. A vector whose length is
    zero (empty, or all weights equal to zero) cannot be scaled, so a plain
    copy is returned instead of raising ZeroDivisionError.
    """
    length = sqrt(sum(weight ** 2 for weight in vector.values()))
    if length == 0:
        return dict(vector)
    return {key: weight / length for key, weight in vector.items()}

def sortAndNormalize(vector):
    """Normalize ``vector`` with ``normalizeVector`` and order the result with ``sortVector``."""
    unit_vector = normalizeVector(vector)
    return sortVector(unit_vector)

def queryAsVector(query):
    """Turn a free-form query string into a normalized term vector.

    Every distinct token gets weight 1 (repetitions are not counted) before
    the vector is passed to ``normalizeVector``.
    """
    binary_weights = dict.fromkeys(tokenize(query), 1)
    return normalizeVector(binary_weights)

def innerProduct(vectorA, vectorB):
    """Return the dot product of two sparse vectors (dicts key -> weight).

    Only keys present in both vectors contribute; the result is 0 when the
    vectors share no key.
    """
    shared_terms = set(vectorA) & set(vectorB)
    total = 0
    for shared in shared_terms:
        total = total + vectorA[shared] * vectorB[shared]
    return total

def docIDListToTitles(docID_list):
    """Map each docID of ``docID_list`` to its title, in the given order.

    The title is read from the module-level ``corpus`` and its parts are
    joined with single spaces.
    """
    titles_by_docID = {}
    for docID in docID_list:
        titles_by_docID[docID] = ' '.join(corpus[docID].title)
    return titles_by_docID

def searchVectorSpace(query_vector, vectorSpace):
    """Rank every document of ``vectorSpace`` against ``query_vector``.

    Documents are scored with ``innerProduct``; only strictly positive scores
    are kept. Returns a dict docID -> title ordered by decreasing score.
    """
    scored = (
        (docID, innerProduct(query_vector, doc_vector))
        for docID, doc_vector in vectorSpace.items()
    )
    matching = {docID: score for docID, score in scored if score > 0}
    ranked_docIDs = list(sortVector(matching))
    return docIDListToTitles(ranked_docIDs)

def union(listA, listB):
    """Return a list with the distinct elements found in ``listA`` or ``listB``.

    The order of the returned list is the iteration order of the underlying
    set and should not be relied upon.
    """
    return list(set(listA) | set(listB))

In [None]:
def searchChampionList(query):
    """Answer a free-form query with the union of the champion lists of its terms.

    The result is a dict docID -> title. It carries no relevance ranking: for
    several terms the order is whatever ``union`` produces.

    Terms that have no champion list are ignored, and a query without any
    known term (including an empty query) yields an empty dict. Previously
    the first case raised KeyError and the second made ``reduce`` fail on an
    empty sequence.
    """
    terms = tokenize(query)
    # Membership test instead of try/except so that a defaultdict-like
    # champList is not silently grown by unknown terms.
    champion_lists = [champList[term] for term in terms if term in champList]
    if not champion_lists:
        return {}
    merged_docIDs = reduce(union, champion_lists)
    return docIDListToTitles(merged_docIDs)

In [None]:
def searchVectorSpaceAsInvertedIndex(query_vector, inv_index_normalized):
    """Rank documents against ``query_vector`` using the inverted index.

    ``inv_index_normalized`` maps term -> {docID: weight}. For every query
    term the postings are walked and the per-term products are collected per
    document; a document's score is the sum of its products.

    Returns a dict docID -> title ordered by decreasing score. The title is
    read from the module-level ``corpus``. Query terms that are not in the
    index are skipped, and an empty query (or one with no indexed term)
    returns an empty dict.
    """
    partial_products = defaultdict(dict)
    for term, query_tfidf in query_vector.items():
        if term not in inv_index_normalized:
            continue
        for docID, tfidf_normalized in inv_index_normalized[term].items():
            partial_products[docID][term] = tfidf_normalized * query_tfidf

    # Scoring and sorting are done once, after all postings were visited
    # (they used to be recomputed inside the loop for every query term).
    scores = {
        docID: sum(products.values())
        for docID, products in partial_products.items()
    }
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return {docID: ' '.join(corpus[docID].title) for docID, _ in ranked}

## Implementing Relevance Feedback -> Rocchio Algorithm!!
- #### Reasonable values might be α = 1, β = 0.75, and γ = 0 (only positive feedback with γ = 0)

In [None]:
def sumVector(vectorA, vectorB):
    """Return the component-wise sum of two sparse vectors (dicts key -> weight).

    A key present in only one vector keeps its weight. Neither input is
    modified. The keys of the result follow the order of ``vectorA`` and then
    the new keys of ``vectorB``, so the output no longer depends on set
    iteration order (which changes with the hash seed).
    """
    summation = dict(vectorA)
    for component, weight in vectorB.items():
        if component in summation:
            summation[component] = summation[component] + weight
        else:
            summation[component] = weight
    return summation

In [None]:
def queryUpdateRocchio(query_vector, preference, vectorSpace, alpha=1, beta=0.75):
    """Move the query towards the relevant documents (positive-only Rocchio).

    new_query = alpha * query_vector + beta * centroid(relevant documents)

    Parameters:
        query_vector: dict term -> weight of the current query.
        preference:   iterable of docIDs judged relevant.
        vectorSpace:  dict docID -> document vector (dict term -> weight).
        alpha, beta:  weights of the original query and of the centroid.

    Returns a new dict term -> weight; the inputs are not modified.

    docIDs that are not in ``vectorSpace`` are ignored. When no relevant
    document remains, only the ``alpha``-scaled query is returned, which
    avoids the division by zero of an empty centroid.
    """
    relevant_vectors = [vectorSpace[docID] for docID in preference if docID in vectorSpace]
    new_query_vector = {term: alpha * tfidf for term, tfidf in query_vector.items()}
    if not relevant_vectors:
        return new_query_vector

    # Sum the relevant document vectors in place instead of rebuilding the
    # whole running sum for every document.
    summation = {}
    for doc_vector in relevant_vectors:
        for term, tfidf in doc_vector.items():
            summation[term] = summation.get(term, 0) + tfidf

    # beta / |Dr| turns the sum into the beta-weighted centroid.
    centroid_weight = beta / len(relevant_vectors)
    for term, total in summation.items():
        new_query_vector[term] = new_query_vector.get(term, 0) + centroid_weight * total
    return new_query_vector

## Some Heuristic for the Relevance Feedback
- #### At the first iteration, answer with the Champion Lists; if there are more than 50 results, run a search on the Vector Space instead
##### Why? With the Champion Lists we cannot rank the results, so the user would have to scan the whole list without any sense of ordering, which is a little too messy. It is better to display fewer results, ordered by relevance -> search on the Vector Space instead
- #### When the user expresses some preferences, move to the Vector Space and refine the query using the Rocchio Algorithm
##### Why? With the Champion Lists, few terms -> few results. When we express a preference towards a set of documents, we need a search space wider than the union of the Champion Lists in order to refine the query more precisely!

## Workflow: 
- 1. Search for a query
- 2. Respond to the query using Champion Lists / VectorSpace (depends on the Champion Lists result)
- 3. Give positive feedback for some titles in the result (specifying the docIDs, separated by spaces)
- 4. Use the Rocchio Algorithm to refine the query, starting from its current position in the Vector Space
- 5. Respond to the new query displaying only the 15 most relevant documents (just to have a nice view of the results and of the cycle of iterations)
- 6. Back to point 3. 

In [None]:
def searchVectorSpaceSliced(query_vector, inv_index_normalized, max_length):
    """Run ``searchVectorSpaceAsInvertedIndex`` and keep only the first ``max_length`` results.

    The order of the full result is preserved in the truncated dict.
    """
    ranked = searchVectorSpaceAsInvertedIndex(query_vector, inv_index_normalized)
    leading_items = islice(ranked.items(), max_length)
    return {docID: title for docID, title in leading_items}

In [None]:
max_length = 15

query = input("Insert free-form query:")
print()

# The Rocchio algorithm moves the query inside the vector space, so the
# free-form text must first be turned into a vector.
query_vector = queryAsVector(query)

# First answer: champion lists.
result = searchChampionList(query)

# Too many hits: switch to the vector-space search, truncated to the first
# max_length results.
if len(result) > 50:
    result = searchVectorSpaceSliced(query_vector, inv_index_normalized, max_length)

for docID, title in result.items():
    print('docID: {}, title: "{}"'.format(docID, title))

# Parse the answer into integer docIDs. Tokens that are not plain numbers or
# that do not name a known document are ignored instead of crashing the loop.
preference = [
    int(token)
    for token in input("\n Insert docIDs of relevant results").split()
    if token.isdecimal() and int(token) in vectorSpace
]

while preference:  # an empty answer ends the session
    print()
    new_query = queryUpdateRocchio(query_vector, preference, vectorSpace)
    result = searchVectorSpaceSliced(new_query, inv_index_normalized, max_length)
    for docID, title in result.items():
        print('docID: {}, title: "{}"'.format(docID, title))
    # The refined query becomes the starting point of the next iteration.
    query_vector = new_query
    preference = [
        int(token)
        for token in input("Insert docIDs of relevant results").split()
        if token.isdecimal() and int(token) in vectorSpace
    ]

print("\n Bye bye!")

## Trying the pseudo-feedback
- #### Perform the query as usual
- #### Consider the first K retrieved documents in the ranking as relevant and perform Relevance Feedback

In [None]:
def pseudoFeedback(query_vector, vectorSpace, inv_index_normalized, max_length, K):
    """Answer a query with pseudo (blind) relevance feedback.

    The first ``K`` results of a plain search are assumed to be relevant, the
    query is updated with ``queryUpdateRocchio`` and the search is run again
    with the UPDATED query. Returns the first ``max_length`` results of that
    second search as a dict docID -> title.

    If the first search returns nothing there is no feedback to apply and the
    original query is used for the final search.
    """
    first_pass = searchVectorSpaceSliced(query_vector, inv_index_normalized, K)
    preference = list(first_pass.keys())
    if preference:
        search_vector = queryUpdateRocchio(query_vector, preference, vectorSpace)
    else:
        # Nothing to learn from; also avoids averaging over zero documents.
        search_vector = query_vector
    # The second search must use the refined vector (it used to reuse
    # query_vector, which silently discarded the feedback).
    return searchVectorSpaceSliced(search_vector, inv_index_normalized, max_length)

## Workflow: 
- 1. Search for a query
- 2. Respond to the query using pseudo-feedback
- 3. Give positive feedback for some titles in the result (specifying the docIDs, separated by spaces)
- 4. Use the Rocchio Algorithm to refine the query, starting from its current position in the Vector Space
- 5. Respond to the new query displaying only the 15 most relevant documents (just to have a nice view of the results and of the cycle of iterations)
- 6. Back to point 3. 

In [None]:
max_length = 15
K = 3  # number of top results assumed relevant by the pseudo-feedback step

query = input("Insert free-form query:")
print()
query_vector = queryAsVector(query)

# First answer: results of the pseudo-feedback search.
result = pseudoFeedback(query_vector, vectorSpace, inv_index_normalized, max_length, K)

for docID, title in result.items():
    print('docID: {}, title: "{}"'.format(docID, title))

# Parse the answer into integer docIDs. Tokens that are not plain numbers or
# that do not name a known document are ignored instead of crashing the loop.
preference = [
    int(token)
    for token in input("\n Insert docIDs of relevant results").split()
    if token.isdecimal() and int(token) in vectorSpace
]

while preference:  # an empty answer ends the session
    print()
    new_query = queryUpdateRocchio(query_vector, preference, vectorSpace)
    result = searchVectorSpaceSliced(new_query, inv_index_normalized, max_length)
    for docID, title in result.items():
        print('docID: {}, title: "{}"'.format(docID, title))
    # The refined query becomes the starting point of the next iteration.
    query_vector = new_query
    preference = [
        int(token)
        for token in input("Insert docIDs of relevant results").split()
        if token.isdecimal() and int(token) in vectorSpace
    ]

print("\n Bye bye!")