# <span style='color:Blue'> Vector Space Model with a Champion List Implementation </span>

- #### The aim is to create an Information Retrieval model able to retrieve documents, given a 'free-form' query in input through an union of the champion lists of the single terms contained in the query.
- #### After the first query, the user can express his preferences, then the Rocchio algorithm moves the query in the direction desired by the user.

## Just some useful imports

In [1]:
from collections import defaultdict
from itertools import islice
from math import log, sqrt
from functools import reduce
import csv
import re
import pickle

## Let's rertrieve the corpus and store it in a dictionary!

The good old csv module do the job as seen in class

In [2]:
class MovieDescription:
    
    def __init__(self,docID, title, description):
        self.title = title
        self.description = description
        self.docID = docID
        
    def __repr__(self):
        return self.title

def readMovieDescriptions():
    filename = 'MovieSummaries/plot_summaries.txt'
    movie_names_file = 'MovieSummaries/movie.metadata.tsv'
    with open(movie_names_file, 'r') as csv_file:
        movie_names = csv.reader(csv_file, delimiter='\t')
        names_table = {}
        for name in movie_names:
            names_table[name[0]] = name[2]
    with open(filename, 'r') as csv_file:
        descriptions = csv.reader(csv_file, delimiter='\t')
        corpus = []
        for docID, desc in enumerate(descriptions):
            try:
                movie = MovieDescription(docID, names_table[desc[0]], desc[1])
                corpus.append(movie)
            except KeyError:
                pass
        return corpus

## Exploring the corpus...

In [3]:
corpus = readMovieDescriptions()
corpus

[Taxi Blues,
 The Hunger Games,
 Narasimham,
 The Lemon Drop Kid,
 A Cry in the Dark,
 End Game,
 Dark Water,
 Sing,
 Meet John Doe,
 Destination Meatball,
 Husband for Hire,
 Up and Down,
 Ghost In The Noonday Sun,
 Exodus,
 House Party 2,
 Forest of the Damned 2,
 Charlie Chan's Secret,
 The Biggest Fan,
 Ashes to Ashes,
 Green Dragon,
 The Rats of Tobruk,
 Red's Dream,
 A la salida nos vemos,
 Class of '61,
 Come Back, Africa,
 Nee Sneham,
 Bhagwan Dada,
 A Merry Mixup,
 Kehtaa Hai Dil Baar Baar,
 Mr. & Mrs. '55,
 Samson and Delilah,
 Getting Even,
 Love, Mary,
 The Deep End of the Ocean,
 Pieces,
 River of No Return,
 Amici miei,
 Yankee Doodle Daffy,
 The Good Life,
 Mickey's Big Game Hunt,
 Red Cliff,
 The Good, the Bad, the Weird,
 Eastern Promises,
 The Dancing Fool,
 To Fili Tis Zois,
 No Entry,
 Tahaan,
 Men In The City,
 Bikini Beach,
 The Storm Riders,
 Against Her Will: An Incident in Baltimore,
 Milk and Honey,
 La Cité de la peur,
 Cheap Kisses,
 The Saddest Music in the

In [4]:
corpus[0].description

"Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all."

## Mhh... we need to normalize the description of the movies

The good old re module do the job as seen in class

In [5]:
def normalize(text):
    no_punctuation = re.sub(r'[^a-zA-Z\s]+','',text)
    downcase = no_punctuation.lower()
    return downcase

def tokenize(text):
    text = normalize(text)
    return list(text.split())

In [6]:
for movie in corpus:
    movie.description, movie.title = tokenize(movie.description), tokenize(movie.title)

## Let's create an Inverted Index and compute Term Frequency for each docID associated with the current Term
- #### We'll need it to compute tf-idf and create the whole space vector in an efficient way (this is the hope...).
- #### First of all we need to deal with the title and description of the movies, then create an inverted index.
#### (For the sake of simplicity, i merged together title and description).

In [7]:
def makeInvertedIndex(corpus):
    """
    Each posting is not only a document id, but the term frequency
    where the term is contained in the article.
    
    We are creating an entity of this type {term: {docID: termFrequency}}
    """
    index = defaultdict(dict)
    for docID, _ in enumerate(corpus):
        for term in corpus[docID].title + corpus[docID].description:
            try:
                index[term][docID] += 1
            except KeyError:
                index[term][docID] = 1
    return index, docID+1


In [8]:
inv_index, length_corpus = makeInvertedIndex(corpus)

In [9]:
inv_index['hello']

{6: 2,
 418: 1,
 747: 1,
 1252: 1,
 2025: 1,
 3490: 1,
 4346: 1,
 4891: 1,
 5727: 1,
 5905: 1,
 6828: 1,
 9252: 1,
 9447: 1,
 9615: 1,
 9629: 1,
 9826: 3,
 10212: 1,
 10798: 1,
 11108: 1,
 11880: 3,
 12016: 1,
 13107: 1,
 13478: 1,
 13500: 1,
 13799: 2,
 14303: 1,
 14477: 1,
 14984: 1,
 15035: 1,
 15456: 1,
 16000: 1,
 16641: 2,
 16743: 1,
 16824: 1,
 17405: 1,
 17678: 1,
 17707: 1,
 17894: 1,
 17972: 2,
 18362: 2,
 18815: 1,
 18962: 1,
 19699: 1,
 20127: 1,
 20466: 1,
 20676: 1,
 20791: 1,
 20981: 1,
 21352: 1,
 21440: 1,
 21789: 1,
 22625: 1,
 23091: 1,
 23552: 1,
 23861: 1,
 24887: 1,
 24916: 1,
 25139: 1,
 25400: 2,
 25979: 2,
 28439: 1,
 28485: 1,
 30139: 1,
 31473: 1,
 31577: 1,
 32263: 1,
 32826: 2,
 32935: 1,
 33501: 1,
 34518: 1,
 34841: 2,
 34843: 1,
 34888: 1,
 38051: 1,
 39164: 1,
 39657: 1,
 40265: 1,
 41748: 1}

## Creating an Inverted Index containing for each term, the 15 most relevant docIDs (sorted by Term Frequency) -> ChampionLists

In [10]:
def makeInvertedIndexChampionList(inv_index):

    inv_index_champList = defaultdict(list)
    max_length_champList = 15

    """
    We are creating an entity of this type {term: [docID1, docID2, docID3...]}
    """
    
    for term in inv_index:
        sorted_dict = {docID: tf for docID, tf in sorted(inv_index[term].items(), key=lambda item: item[1], reverse=True)}
        if len(sorted_dict) > max_length_champList:
            for i in range(0,max_length_champList):
                docID = list(sorted_dict.keys())[i]
                inv_index_champList[term].append(docID)
        else:
            for i in range(0,len(sorted_dict)):
                docID = list(sorted_dict.keys())[i]
                inv_index_champList[term].append(docID)
    return inv_index_champList

In [11]:
champList = makeInvertedIndexChampionList(inv_index)

In [12]:
champList['hello']

[9826,
 11880,
 6,
 13799,
 16641,
 17972,
 18362,
 25400,
 25979,
 32826,
 34841,
 418,
 747,
 1252,
 2025]

## Creating an Inverted Index with the tfidf for each document of each term -> {term: {docID: tfidf}}

In [13]:
def makeInvertedIndex_tfidf(inv_index, length_corpus):
    N = length_corpus
    inv_index_tfidf = defaultdict(dict)
    
    for term in inv_index.keys():
        idf = log(N/len(inv_index[term]))
        inv_index_tfidf[term] = {docID: tf * idf for docID, tf in inv_index[term].items()}
    return inv_index_tfidf

In [14]:
inv_index_tfidf = makeInvertedIndex_tfidf(inv_index, length_corpus)

In [15]:
inv_index_tfidf['taxi']

{0: 9.003603972688673,
 107: 4.501801986344336,
 155: 4.501801986344336,
 174: 18.007207945377345,
 175: 4.501801986344336,
 178: 4.501801986344336,
 185: 4.501801986344336,
 340: 4.501801986344336,
 435: 4.501801986344336,
 548: 4.501801986344336,
 791: 4.501801986344336,
 944: 13.505405959033009,
 1009: 4.501801986344336,
 1019: 4.501801986344336,
 1086: 4.501801986344336,
 1109: 13.505405959033009,
 1198: 18.007207945377345,
 1423: 4.501801986344336,
 1697: 4.501801986344336,
 1805: 4.501801986344336,
 2002: 4.501801986344336,
 2053: 9.003603972688673,
 2191: 4.501801986344336,
 2210: 4.501801986344336,
 2243: 22.509009931721682,
 2250: 4.501801986344336,
 2346: 9.003603972688673,
 2506: 4.501801986344336,
 2674: 4.501801986344336,
 2759: 4.501801986344336,
 2769: 13.505405959033009,
 2830: 4.501801986344336,
 2930: 9.003603972688673,
 3151: 9.003603972688673,
 3284: 4.501801986344336,
 3312: 4.501801986344336,
 3315: 4.501801986344336,
 3423: 4.501801986344336,
 3515: 4.50180198634

## Let's investigate the term 'taxi' in the list regarding the first document in the corpus

In [16]:
docID = 0 ## first document
term = 'taxi' ## first term object
print("term:", term, "tfidf_docID_0:", inv_index_tfidf[term][docID])

term: taxi tfidf_docID_0: 9.003603972688673


## Our Inverted Index represent the whole Vector Space in an efficient way!!!
- #### Represent a vector of the Vector Space for the first document -> {Term: tf-idf}
- #### Absence of a term in the vector -> tfidf = 0 -> Compact Representation!!!

In [17]:
def documentToVector(docID, inv_index_tfidf):
    vector = {}

    for term in inv_index_tfidf.keys():
        try:
            vector[term] = inv_index_tfidf[term][docID]
        except KeyError:
            pass
    return vector

In [18]:
docID = 0
vector = documentToVector(docID, inv_index_tfidf)
vector

{'taxi': 9.003603972688673,
 'blues': 5.896680091155618,
 'shlykov': 10.650270282261983,
 'a': 0.10871331596846709,
 'hardworking': 6.0755593037586,
 'driver': 3.4498453893170256,
 'and': 0.11799982333012017,
 'lyosha': 9.957123101702038,
 'saxophonist': 7.759898524365818,
 'develop': 4.270147745362218,
 'bizarre': 4.734068219654548,
 'lovehate': 7.705831303095542,
 'relationship': 2.3162790350670073,
 'despite': 2.41226203304358,
 'their': 0.7022874898393036,
 'prejudices': 6.779069271354092,
 'realize': 3.182899215344423,
 'they': 0.6796382320965244,
 'arent': 5.29368400758997,
 'so': 1.4280013394205262,
 'different': 3.070080864317442,
 'after': 0.7337184403966536,
 'all': 1.142050012857573}

## We can do better, sort by tfidf and normalize the vector!

In [19]:
def sortVector(vector):
    sorted_vector = {k: v for k, v in sorted(vector.items(), key=lambda item: item[1], reverse=True)}
    return sorted_vector

def normalizeVector(vector):
    length = sqrt(sum([x**2 for x in vector.values()]))
    normalized = {k: tfidf/length for k, tfidf in vector.items()}
    return normalized

def sortAndNormalize(vector):
    return sortVector(normalizeVector(vector))

In [20]:
vector = sortAndNormalize(vector)
vector

{'shlykov': 0.41869060935019553,
 'lyosha': 0.39144113983378664,
 'taxi': 0.3539557526489571,
 'saxophonist': 0.30506236513768187,
 'lovehate': 0.3029368380646999,
 'prejudices': 0.26650334393650754,
 'hardworking': 0.23884648554606175,
 'blues': 0.23181426527935517,
 'arent': 0.2081088764983428,
 'bizarre': 0.18610888316081145,
 'develop': 0.16787092854335986,
 'driver': 0.13562264899725454,
 'realize': 0.12512828094065775,
 'different': 0.1206930898248001,
 'despite': 0.09483247220584608,
 'relationship': 0.09105912384519446,
 'so': 0.05613855189672154,
 'all': 0.04489704046179188,
 'after': 0.028844434249973266,
 'their': 0.0276087995202866,
 'they': 0.02671839662211341,
 'and': 0.004638888650139529,
 'a': 0.004273811208634466}

## Let's create a a method to parse a query of terms in a normalized vector

In [21]:
def queryAsVector(query):
    query = tokenize(query)
    query_vector = {}

    for term in query: #iterate through all the query terms
        query_vector[term] = 1
    query_vector = normalizeVector(query_vector)
    return query_vector

In [22]:
query = "christmas murder love"
query_vector = queryAsVector(query)
query_vector

{'christmas': 0.5773502691896258,
 'murder': 0.5773502691896258,
 'love': 0.5773502691896258}

## Creating the VectorSpace

In [23]:
def createVectorSpace(inv_index_tfidf, length_corpus):
    vectorSpace = defaultdict(dict)
    for term in inv_index_tfidf.keys():
        for docID in inv_index_tfidf[term].keys():
            vectorSpace[docID][term] = inv_index_tfidf[term][docID]
    return vectorSpace

In [24]:
vectorSpace = createVectorSpace(inv_index_tfidf, length_corpus)

In [25]:
vectorSpace[0]

{'taxi': 9.003603972688673,
 'blues': 5.896680091155618,
 'shlykov': 10.650270282261983,
 'a': 0.10871331596846709,
 'hardworking': 6.0755593037586,
 'driver': 3.4498453893170256,
 'and': 0.11799982333012017,
 'lyosha': 9.957123101702038,
 'saxophonist': 7.759898524365818,
 'develop': 4.270147745362218,
 'bizarre': 4.734068219654548,
 'lovehate': 7.705831303095542,
 'relationship': 2.3162790350670073,
 'despite': 2.41226203304358,
 'their': 0.7022874898393036,
 'prejudices': 6.779069271354092,
 'realize': 3.182899215344423,
 'they': 0.6796382320965244,
 'arent': 5.29368400758997,
 'so': 1.4280013394205262,
 'different': 3.070080864317442,
 'after': 0.7337184403966536,
 'all': 1.142050012857573}

## Now we need to normalize the whole Vector Space

In [26]:
def normalizeVectorSpace(vectorSpace):
    for docID, vector in vectorSpace.items():
        vectorSpace[docID] = sortAndNormalize(vectorSpace[docID])
    return vectorSpace

In [27]:
vectorSpace = normalizeVectorSpace(vectorSpace)

In [28]:
vectorSpace[12]

{'noonday': 0.3239736863884414,
 'treasure': 0.30267974325552427,
 'milligan': 0.2941604478478744,
 'ghost': 0.26182766511643074,
 'sellers': 0.23479531254667912,
 'crewman': 0.222669622749888,
 'relies': 0.21277153477809374,
 'onetime': 0.21070500692895736,
 'goon': 0.1918595529122728,
 'halfway': 0.18942828210504772,
 'bumbling': 0.1834961080966215,
 'spike': 0.18160021981906,
 'sorts': 0.179808746639155,
 'pirate': 0.1759764545337077,
 'reunion': 0.15687084601934384,
 'provided': 0.15510159573135077,
 'colleague': 0.1434874451328957,
 'sun': 0.1392698480150779,
 'buried': 0.13553675378120614,
 'memory': 0.12728809399165777,
 'learning': 0.12099808741082416,
 'lose': 0.11477069668272677,
 'hidden': 0.10950121109448223,
 'more': 0.10665528247882013,
 'captain': 0.1048124427614891,
 'murdered': 0.10184158639532254,
 'hes': 0.09122300038137932,
 'appears': 0.08161869258814977,
 'show': 0.08098087809881621,
 'film': 0.0799085289435067,
 'kills': 0.0781621863994239,
 'begins': 0.055686090

## It's always faster to work with Inverted Index 😃 -> Inverted Index of the normalized Vector Space

In [29]:
def makeInvertedIndexNormalized(vectorSpace):
    inv_index_normalized = defaultdict(dict)
    for docID in vectorSpace.keys():
        for term, tfidf_normalized in vectorSpace[docID].items():
            inv_index_normalized[term][docID] = tfidf_normalized
    return inv_index_normalized

In [30]:
inv_index_normalized = makeInvertedIndexNormalized(vectorSpace)

In [31]:
inv_index_normalized['taxi']

{0: 0.3539557526489571,
 107: 0.04923040112876527,
 155: 0.014210565476356131,
 174: 0.04998939392506807,
 175: 0.01740690231114301,
 178: 0.010545512913208585,
 185: 0.012198968043002481,
 340: 0.03772018487811873,
 435: 0.02355007675996912,
 548: 0.11054258282951285,
 791: 0.02675043138386501,
 944: 0.09497126924540071,
 1009: 0.015004108731079286,
 1019: 0.051836332998080444,
 1086: 0.018511733010045515,
 1109: 0.04094256151875057,
 1198: 0.3393662013442858,
 1423: 0.02669753532765486,
 1697: 0.12090295764128459,
 1805: 0.1727461716999423,
 2002: 0.0426831778686505,
 2053: 0.04301513383441717,
 2191: 0.0420453959945048,
 2210: 0.02881821621644794,
 2243: 0.18542630597312662,
 2250: 0.061765777988104294,
 2346: 0.0847361154854706,
 2506: 0.16363752213533525,
 2674: 0.014599923868220848,
 2759: 0.01171486469503855,
 2769: 0.06842763143426256,
 2830: 0.07483690367919482,
 2930: 0.012134643476191836,
 3151: 0.048834951725034735,
 3284: 0.022392162828212724,
 3312: 0.07872148585623145,
 

## Now we have all the ingredients to compute the Cosine similarity between a query represented as a vector and the Vector Space!!
- #### Cosine similarity between normalized vectors -> Inner Product
- #### Here an example of inner product between the query vector and another document

In [32]:
def innerProduct(vectorA, vectorB):
    setA = set(vectorA.keys())
    setB = set(vectorB.keys())
    product = 0
    intersection = setA.intersection(setB)
    
    for term in intersection:
        product += vectorA[term] * vectorB[term]
    return product

In [33]:
query = "christmas murder love"
query_vector = queryAsVector(query)

vector = vectorSpace[10]

innerProduct(query_vector, vector)

0.00566679140659617

## Search for the best answer to a given query in the whole VectorSpace... 🤖
- Compute Inner Product between a query and every document of the vectorSpace -> Compute a search on the normalized Inverted Index!!!
- sort results by value of the Inner Product

In [34]:
query = "christmas murder love"
query_vector = queryAsVector(query)
query_vector

{'christmas': 0.5773502691896258,
 'murder': 0.5773502691896258,
 'love': 0.5773502691896258}

In [35]:
result_products = defaultdict(dict)

for term, query_tfidf in query_vector.items():
    for docID, tfidf_normalized in inv_index_normalized[term].items():
        result_products[docID][term] = tfidf_normalized * query_tfidf
for docID, vector_products in result_products.items():
    print(docID, vector_products)

2210 {'christmas': 0.014717224667187447}
2674 {'christmas': 0.0521924225479512}
3151 {'christmas': 0.037409408900457075, 'murder': 0.00787734221728514}
3315 {'christmas': 0.020097219057213164, 'love': 0.0034664412577598215}
7336 {'christmas': 0.005277043359456971, 'love': 0.00546122150576162}
10184 {'christmas': 0.032615630533597666, 'love': 0.0056256622872872105}
15086 {'christmas': 0.008980789888055656, 'love': 0.01549039223118527}
18567 {'christmas': 0.018092849540088808}
19102 {'christmas': 0.06055353893866328}
19130 {'christmas': 0.02469756770354843, 'love': 0.002839950721877586}
20411 {'christmas': 0.005681534761049472}
24059 {'christmas': 0.025908102192122088}
31614 {'christmas': 0.012397815699181377, 'love': 0.021384202323415476}
35897 {'christmas': 0.018055181517793294, 'love': 0.0031142232142390798}
36942 {'christmas': 0.10744262519501682}
37033 {'christmas': 0.032444607249906714, 'love': 0.017587942651168004}
8432 {'christmas': 0.011336448492943836}
15615 {'christmas': 0.013

In [36]:
result_innerProduct = {}
for docID, vector_products in result_products.items():
    result_innerProduct[docID] = sum(vector_products.values())
result_innerProduct

{2210: 0.014717224667187447,
 2674: 0.0521924225479512,
 3151: 0.04528675111774222,
 3315: 0.023563660314972985,
 7336: 0.01073826486521859,
 10184: 0.03824129282088488,
 15086: 0.024471182119240924,
 18567: 0.018092849540088808,
 19102: 0.06055353893866328,
 19130: 0.027537518425426018,
 20411: 0.005681534761049472,
 24059: 0.025908102192122088,
 31614: 0.03378201802259685,
 35897: 0.021169404732032375,
 36942: 0.10744262519501682,
 37033: 0.05003254990107472,
 8432: 0.011336448492943836,
 15615: 0.013120758653112855,
 3: 0.021281187588358483,
 7: 0.014029353075316534,
 8: 0.030046712193433172,
 64: 0.013174600919912927,
 86: 0.05393721104946987,
 146: 0.13944111579231666,
 156: 0.008456898486725987,
 215: 0.05471571704579079,
 292: 0.05332320332763527,
 609: 0.018776628629288014,
 623: 0.06543053303352393,
 716: 0.03201827733496171,
 721: 0.16886886323117475,
 914: 0.052589270784934576,
 987: 0.03377440627555867,
 1030: 0.005587587141946851,
 1044: 0.06885771908915074,
 1104: 0.06301

In [37]:
result_innerProduct = sortVector(result_innerProduct)
result_innerProduct

{2891: 0.3559888685187781,
 40389: 0.27789695036164047,
 12278: 0.27425966364063126,
 36802: 0.26710354825880145,
 11832: 0.26423974448934573,
 7940: 0.25686356919086206,
 18490: 0.2534087897365302,
 2777: 0.2474259505164858,
 21176: 0.2440585409996114,
 39775: 0.23234069411326938,
 41829: 0.23219173870906853,
 36378: 0.22523169256248446,
 5065: 0.2246995468813512,
 26991: 0.22400486026127905,
 9812: 0.22173844334459072,
 18043: 0.22137431030763538,
 30824: 0.21141564615892225,
 15708: 0.2107546543125741,
 7202: 0.20840504852135588,
 13390: 0.20777459419171476,
 14269: 0.2044394036218686,
 17433: 0.20361908739712895,
 38736: 0.20049950711711897,
 33411: 0.1991751822598946,
 1896: 0.1988136202344582,
 30405: 0.1952037585049157,
 35351: 0.19507856020874706,
 39733: 0.19352712212767495,
 6367: 0.19338359451248882,
 7953: 0.19222259097910743,
 23411: 0.19142562082567915,
 21895: 0.19101857223813118,
 9607: 0.19087079003542473,
 18467: 0.19069101170048164,
 17818: 0.19037320578364836,
 2084

In [38]:
result_titles = {}
for docID in result_innerProduct.keys():
    result_titles[docID] = ' '.join(corpus[docID].title)
result_titles

{2891: 'christmas under fire',
 40389: 'a christmas carol',
 12278: 'christmas doover',
 36802: 'stealing christmas',
 11832: 'a christmas carol',
 7940: 'a carol christmas',
 18490: 'mickeys magical christmas snowed in at the house of mouse',
 2777: 'young pioneers christmas',
 21176: 'a very christmas story',
 39775: 'the christmas kid',
 41829: 'the good witchs gift',
 36378: 'a christmas snow',
 5065: 'treevenge',
 26991: 'a cosmic christmas',
 9812: 'a boyfriend for christmas',
 18043: 'dates of christmas',
 30824: 'christmas with a capital c',
 15708: 'a mom for christmas',
 7202: 'national lampoons christmas vacation',
 13390: 'the muppet christmas carol',
 14269: 'how the grinch stole christmas',
 17433: 'elmo saves christmas',
 38736: 'a dennis the menace christmas',
 33411: 'christmas comes to pacland',
 1896: 'a fairly odd christmas',
 30405: 'beauty and the beast the enchanted christmas',
 35351: 'accused',
 39733: 'caspers first christmas',
 6367: 'the country mouse and th

## End of the show, wrap everything in a function!

In [39]:
def searchVectorSpaceSad(query_vector, inv_index_normalized):
    result_products = defaultdict(dict)

    for term, query_tfidf in query_vector.items():
        for docID, tfidf_normalized in inv_index_normalized[term].items():
            result_products[docID][term] = tfidf_normalized * query_tfidf

        result_innerProduct = {}
        for docID, vector_products in result_products.items():
            result_innerProduct[docID] = sum(vector_products.values())
        result_innerProduct = sortVector(result_innerProduct)

        result_titles = {}
        for docID, inner_product in result_innerProduct.items():
            result_titles[docID] = ' '.join(corpus[docID].title)
    return result_titles

In [40]:
searchVectorSpaceSad(query_vector, inv_index_normalized)

{2891: 'christmas under fire',
 40389: 'a christmas carol',
 12278: 'christmas doover',
 36802: 'stealing christmas',
 11832: 'a christmas carol',
 7940: 'a carol christmas',
 18490: 'mickeys magical christmas snowed in at the house of mouse',
 2777: 'young pioneers christmas',
 21176: 'a very christmas story',
 39775: 'the christmas kid',
 41829: 'the good witchs gift',
 36378: 'a christmas snow',
 5065: 'treevenge',
 26991: 'a cosmic christmas',
 9812: 'a boyfriend for christmas',
 18043: 'dates of christmas',
 30824: 'christmas with a capital c',
 15708: 'a mom for christmas',
 7202: 'national lampoons christmas vacation',
 13390: 'the muppet christmas carol',
 14269: 'how the grinch stole christmas',
 17433: 'elmo saves christmas',
 38736: 'a dennis the menace christmas',
 33411: 'christmas comes to pacland',
 1896: 'a fairly odd christmas',
 30405: 'beauty and the beast the enchanted christmas',
 35351: 'accused',
 39733: 'caspers first christmas',
 6367: 'the country mouse and th

## Mhhhh... Turns out that doing the search using the normalized inverted index is suuuuuper slow... 😞
## -> Let's use the vectorSpace!!

In [41]:
query = "christmas murder love"
query_vector = queryAsVector(query)
query_vector

{'christmas': 0.5773502691896258,
 'murder': 0.5773502691896258,
 'love': 0.5773502691896258}

In [42]:
result_innerProduct = {}
for docID, current_vector in vectorSpace.items():
    inner_product = innerProduct(query_vector, current_vector)
    if inner_product > 0:
        result_innerProduct[docID] = inner_product
result_innerProduct

{107: 0.01588226069594743,
 155: 0.0045844823596297,
 175: 0.011231310494154281,
 340: 0.013290475127264457,
 435: 0.008297724691299246,
 944: 0.022308361704114855,
 1019: 0.018264213089772795,
 1086: 0.0032612476307783264,
 1697: 0.08519882691647836,
 1805: 0.030433025523690697,
 2002: 0.013770055548974631,
 2053: 0.0037890294557775908,
 2210: 0.014717224667187447,
 2243: 0.019600145509143213,
 2346: 0.007464062271165777,
 2674: 0.0521924225479512,
 2759: 0.004127660517883538,
 2769: 0.008036685013901269,
 2830: 0.01318416134708286,
 2930: 0.0030262754647496348,
 3151: 0.04528675111774222,
 3284: 0.007223954293071542,
 3312: 0.027737031330418167,
 3315: 0.023563660314972985,
 3423: 0.018947149797275027,
 3515: 0.013381508864488056,
 3751: 0.01119581990282765,
 4012: 0.01275985673959487,
 4212: 0.004877375580355177,
 4539: 0.015233400210964329,
 4733: 0.0019735963752168257,
 4737: 0.013971383499636406,
 5010: 0.012324934664454699,
 5195: 0.00873359387525483,
 5295: 0.022033943273902595

In [43]:
result_innerProduct = sortVector(result_innerProduct)
result_innerProduct

{2891: 0.3559888685187781,
 40389: 0.27789695036164047,
 12278: 0.27425966364063126,
 36802: 0.26710354825880145,
 11832: 0.26423974448934573,
 7940: 0.25686356919086206,
 18490: 0.2534087897365302,
 2777: 0.2474259505164858,
 21176: 0.2440585409996114,
 39775: 0.23234069411326938,
 41829: 0.23219173870906853,
 36378: 0.22523169256248446,
 5065: 0.2246995468813512,
 26991: 0.22400486026127905,
 9812: 0.22173844334459072,
 18043: 0.22137431030763538,
 30824: 0.21141564615892225,
 15708: 0.2107546543125741,
 7202: 0.20840504852135588,
 13390: 0.20777459419171476,
 14269: 0.2044394036218686,
 17433: 0.20361908739712895,
 38736: 0.20049950711711897,
 33411: 0.1991751822598946,
 1896: 0.1988136202344582,
 30405: 0.1952037585049157,
 35351: 0.19507856020874706,
 39733: 0.19352712212767495,
 6367: 0.19338359451248882,
 7953: 0.19222259097910743,
 23411: 0.19142562082567915,
 21895: 0.19101857223813118,
 9607: 0.19087079003542473,
 18467: 0.19069101170048164,
 17818: 0.19037320578364836,
 2084

In [44]:
result_titles = {}
for docID in result_innerProduct.keys():
    result_titles[docID] = ' '.join(corpus[docID].title)
result_titles

{2891: 'christmas under fire',
 40389: 'a christmas carol',
 12278: 'christmas doover',
 36802: 'stealing christmas',
 11832: 'a christmas carol',
 7940: 'a carol christmas',
 18490: 'mickeys magical christmas snowed in at the house of mouse',
 2777: 'young pioneers christmas',
 21176: 'a very christmas story',
 39775: 'the christmas kid',
 41829: 'the good witchs gift',
 36378: 'a christmas snow',
 5065: 'treevenge',
 26991: 'a cosmic christmas',
 9812: 'a boyfriend for christmas',
 18043: 'dates of christmas',
 30824: 'christmas with a capital c',
 15708: 'a mom for christmas',
 7202: 'national lampoons christmas vacation',
 13390: 'the muppet christmas carol',
 14269: 'how the grinch stole christmas',
 17433: 'elmo saves christmas',
 38736: 'a dennis the menace christmas',
 33411: 'christmas comes to pacland',
 1896: 'a fairly odd christmas',
 30405: 'beauty and the beast the enchanted christmas',
 35351: 'accused',
 39733: 'caspers first christmas',
 6367: 'the country mouse and th

## End of the show, wrap everything in a function!

In [45]:
def docIDListToTitles(result):
    res_titles = {docID: ' '.join(corpus[docID].title) for docID in result}
    return res_titles

def searchVectorSpace(query_vector, vectorSpace):
    result_innerProduct = {}
    for docID, current_vector in vectorSpace.items():
        inner_product = innerProduct(query_vector, current_vector)
        if inner_product > 0:
            result_innerProduct[docID] = inner_product
    result_sorted_by_innerProduct = sortVector(result_innerProduct)
    docID_list = list(result_sorted_by_innerProduct.keys())
    result_titles = docIDListToTitles(docID_list)
    return result_titles

In [46]:
searchVectorSpace(query_vector, vectorSpace)

{2891: 'christmas under fire',
 40389: 'a christmas carol',
 12278: 'christmas doover',
 36802: 'stealing christmas',
 11832: 'a christmas carol',
 7940: 'a carol christmas',
 18490: 'mickeys magical christmas snowed in at the house of mouse',
 2777: 'young pioneers christmas',
 21176: 'a very christmas story',
 39775: 'the christmas kid',
 41829: 'the good witchs gift',
 36378: 'a christmas snow',
 5065: 'treevenge',
 26991: 'a cosmic christmas',
 9812: 'a boyfriend for christmas',
 18043: 'dates of christmas',
 30824: 'christmas with a capital c',
 15708: 'a mom for christmas',
 7202: 'national lampoons christmas vacation',
 13390: 'the muppet christmas carol',
 14269: 'how the grinch stole christmas',
 17433: 'elmo saves christmas',
 38736: 'a dennis the menace christmas',
 33411: 'christmas comes to pacland',
 1896: 'a fairly odd christmas',
 30405: 'beauty and the beast the enchanted christmas',
 35351: 'accused',
 39733: 'caspers first christmas',
 6367: 'the country mouse and th

## OK! Everything is working, let's work with the Champion Lists!!!
- #### The champList entity it's of the type {term: [docID1, docID14, docID23, ...]}

## Responding to a query using the CampionList... 🤖

In [47]:
def union(listA, listB):
    setA = set(listA)
    setB = set(listB)
    union = setA.union(setB)
    return list(union)

def searchChampionList(query, champList):
    query = tokenize(query)
    result_list = []
    
    for term in query:
        result_list.append(champList[term])
    union_result_list = reduce(union, result_list)
    return docIDListToTitles(union_result_list)

In [48]:
query = "christmas murder love"
searchChampionList(query, champList)

{5386: 'a lizard in a womans skin',
 37899: 'christmas cupid',
 33294: 'twisted desire',
 17170: 'embrace me tightly',
 35605: 'ek ruka hua faisla',
 9750: 'santa buddies',
 27031: 'west nd',
 16665: 'call me claus',
 17433: 'elmo saves christmas',
 21919: 'down with love',
 41760: 'summer wars',
 27936: 'moulin rouge',
 7202: 'national lampoons christmas vacation',
 40866: 'minority report',
 41256: 'cyrano de bergerac',
 37033: '',
 39984: 'merry christmas drake josh',
 4915: 'the lincoln lawyer',
 36153: 'jesse stone no remorse',
 6972: 'love actually',
 14269: 'how the grinch stole christmas',
 3266: 'the man who wasnt there',
 9924: 'heman and shera a christmas special',
 30405: 'beauty and the beast the enchanted christmas',
 28870: 'murder at',
 11463: 'one magic christmas',
 36679: 'rope',
 13643: 'mohabbatein',
 29133: 'kadhir',
 13390: 'the muppet christmas carol',
 16334: 'the horsemen',
 29903: 'mr brooks',
 25556: 'mickeys twice upon a christmas',
 1876: 'white zombie',
 2

## Store the ChampList and the vector Space

In [49]:
def saveObject(obj, name):
    with open('objects/' + name + '.pkl', 'wb') as outfile:
        pickle.dump(obj, outfile, pickle.HIGHEST_PROTOCOL)

def loadObject(name):
    with open('objects/' + name + '.pkl', 'rb') as infile:
        return pickle.load(infile)

In [50]:
saveObject(champList, "champList")
saveObject(vectorSpace, "vectorSpace")
saveObject(corpus, "corpus")