# Groep Opdracht Week 4 Zoekmachines

## Students: Jasper van Eck, Ghislaine van den Boogerd, Joris Galema, Lotte Bottema
## Student IDs: 6228194, 10996087, 11335165, 11269642


# Table of Content<a name='Top'></a>
[Import data](#ImportData)

[Create the TF Dict](#TFDict)

[Create the TF-IDF and Normalize](#TFIDFNorm)

[Vectorize Query](#InputQuery)

[Results](#Results)

- [WordCloud](#WordCloud) Requirement 3
- [Interact with Filters](#Filters) Requirements 1, 2, 4 and 5

[Cohen's Kappa](#Cohen) Requirement 6



# Import Data<a name='ImportData'></a>

In [1]:
#Imports
import pandas as pd
import math
import numpy as np
from elasticsearch import Elasticsearch
import nltk
import PIL
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import re
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import json
from collections import Counter, defaultdict
from sklearn import preprocessing
from datetime import datetime

In [2]:
#Open & read JSON file
#Init empty list for json data to be stored
jsonDataReviews = []
with open('IMDB_reviews.json') as json_file:
    #Loop through lines in json file, each review is on seperate line
    for line in json_file:
        #Append to the list of json data
        jsonDataReviews.append(json.loads(line))

#Read the data from the json file
dataReviews = pd.DataFrame(jsonDataReviews)

#Add Review_id column
#Create index range
review_id = list(range(len(dataReviews)))
#Insert the index range into the DF
dataReviews.insert(0,'review_id',review_id,True)
#Cast to string from obj
dataReviews['review_summary'] = dataReviews['review_summary'].astype(str)
dataReviews['review_text'] = dataReviews['review_text'].astype(str)
#Cast to int from str
dataReviews['rating'] = dataReviews['rating'].astype(int)
#Cast to bool from obj
dataReviews['is_spoiler'] = dataReviews['is_spoiler'].astype(bool)
#Create datetime objects from the review_date string
dataReviews['review_date'] = [datetime.strptime(dateString, '%d %B %Y') for dateString in dataReviews['review_date'].values]

In [3]:
#Example of data
dataReviews.head(10)

Unnamed: 0,review_id,is_spoiler,movie_id,rating,review_date,review_summary,review_text,user_id
0,0,True,tt0111161,10,2006-02-10,A classic piece of unforgettable film-making.,"In its Oscar year, Shawshank Redemption (writt...",ur1898687
1,1,True,tt0111161,10,2000-09-06,Simply amazing. The best film of the 90's.,The Shawshank Redemption is without a doubt on...,ur0842118
2,2,True,tt0111161,8,2001-08-03,The best story ever told on film,I believe that this film is the best story eve...,ur1285640
3,3,True,tt0111161,10,2002-09-01,Busy dying or busy living?,"**Yes, there are SPOILERS here**This film has ...",ur1003471
4,4,True,tt0111161,8,2004-05-20,"Great story, wondrously told and acted",At the heart of this extraordinary movie is a ...,ur0226855
5,5,True,tt0111161,8,2004-08-12,"Good , But It Is Overrated By Some",In recent years the IMDB top 250 movies has ha...,ur1532177
6,6,True,tt0111161,9,2005-10-09,This Movie Saved My Life.,I have been a fan of this movie for a long tim...,ur6574726
7,7,True,tt0111161,10,2012-02-04,Movie you can see 1000 times,I made my account on IMDb Just to Rate this mo...,ur31182745
8,8,True,tt0111161,10,2008-10-24,The Shawshank Redemption,"A friend of mine listed ""The Shawshank Redempt...",ur9871443
9,9,True,tt0111161,10,2011-07-30,"""I'm a convicted murderer who provides sound f...",Well I guess I'm a little late to the party as...,ur2707735


# Create the TF Dict<a name='TFDict'></a>

[Top](#Top)

In [9]:
#Init a default dict
tfDict = defaultdict(lambda: defaultdict(int))

#Init Porter Stemmer
ps = nltk.stem.PorterStemmer()

#Use less reviews to reduce runtimes for testing/practice
dataReviewsLess = dataReviews.head(10000).copy()

#Retrieve the actual reviews
reviewTexts = dataReviewsLess['review_text'].values

#Loop through reviews
for i in range(len(reviewTexts)):
    #Tokenize reviews and lowercase the text
    line = re.split('\W+',reviewTexts[i].lower())
    #Loop through tokens in review
    for word in line:
        #Stem token
        stem = ps.stem(word)
        #Increment frequency
        tfDict[stem][i] += 1

#Add in Corpus Frequency, Document Frequency and reposition the frequencies per document
tfDictXtra = defaultdict(lambda: defaultdict(int))
for word in tfDict:
    tfDictXtra[word]['CorpusFreq'] = sum(tfDict[word].values())
    tfDictXtra[word]['DocFreq'] = len(tfDict[word])
    tfDictXtra[word]['Freq_per_doc'] = tfDict[word]


# Create the TF-IDF and Normalize<a name='TFIDFNorm'></a>

[Top](#Top)

In [10]:
#Get the total number of reviews/documents
totalDocs = len(dataReviewsLess)

#Total unique words
totalUniqueWords = len(tfDictXtra)

#Create np matrix with zeros
tfIdf = np.zeros((totalUniqueWords,totalDocs))

#Create dataframe of words with index list to get the word position in matrix for future reference
wordsIndex = pd.DataFrame(list(tfDictXtra.keys()),columns=['Words'])
#Create index range
wordID = list(range(totalUniqueWords))
#Insert the index range
wordsIndex.insert(0,'Index',wordID,True)
#Index counter, to keep track of location in word list
wordCounter = 0


#loop through words in dict
for word in tfDictXtra:
    #Loop through frequencies of word in a doc from dict; LET OP deze regel geeft soms AttributeError: 'int' object has no attribute 'keys'
    #run de vorige cellen dan weer even opnieuw. Dat verhelpt t meestal
    dictLoop = list(tfDictXtra[word]['Freq_per_doc'].keys())
    for doc in dictLoop:
        #Calculate the TF-IDF
        tfIdf[wordCounter,doc] = tfDictXtra[word]['Freq_per_doc'][doc]*math.log((totalDocs/(1+tfDictXtra[word]['DocFreq'])))
    wordCounter += 1


In [11]:
#Transpose the tfIdf matrix and normalize, since the normalize works on rows, and we need to normalize the columns
tfIdfNorm = preprocessing.normalize(tfIdf.T, norm='l2')

# Vectorize query<a name='InputQuery'></a>

[Top](#Top)

In [13]:
#Starting/test query
query = "Shawshank remdemption interesting cool"

#Create a normalized vector of query
def vectorizeQuery(query):
    #Create empty base vector for Term Freq
    queryVector = np.zeros(totalUniqueWords)
    #Tokenize and make lowercase
    line = re.split('\W+',query.lower())
    #Loop through words
    for word in line:
        #Stem each word
        stem = ps.stem(word)
        #Increase term freq of query term
        queryVector[wordsIndex[wordsIndex['Words']==stem]['Index'].values] += 1
    
    #Create empty base vector for TF-IDF
    queryVectorTfIdf = np.zeros(totalUniqueWords)
    #Loop through TF vector of query
    for i in range(len(queryVector)):
        #Act where a term frequency was recorded
        if queryVector[i] != 0:
            #Determine the which word it was based on the index
            word = str(wordsIndex[wordsIndex['Index']==i]['Words'].values)
            #Calculate the TF-IDF
            queryVectorTfIdf[i] = queryVector[i]*math.log((totalDocs/(1+tfDictXtra[word]['DocFreq'])))
    
    #Make the TF-IDF vector a unit vector
    length = np.sqrt(queryVectorTfIdf.dot(queryVectorTfIdf))
    queryVectorNorm = queryVectorTfIdf/length
    
    #Return the unit vector
    return queryVectorNorm


In [14]:
#Cosine similarity matching
def cosineSim(vector, docVector):
    #Only dot product needed since vectors are already unit vectors and therefore the lengths are 1
    return vector.dot(docVector)#/(length vector * length docVector)
    
def rankedList(queryVector):
    #Create empty score list
    scoreList = np.zeros(totalDocs)
    #Loop through each doc
    for i in range(len(tfIdfNorm)):
        #Calculate for each doc the cosine sim. Index of scoreList = review_id
        scoreList[i] = cosineSim(queryVector,tfIdfNorm[i])
    
    #Create new data frame for ranked list based on smaller DF of data
    rankedDocList = dataReviewsLess.copy()
    #Insert the similarity score for each review
    rankedDocList.insert(0,'Score',scoreList,True)
    #Sort the review similarity based on the score and return
    return rankedDocList.sort_values(by='Score',ascending=False)

In [15]:
#Create the ranking list
rankings = rankedList(vectorizeQuery(query))

# Results<a name='Results'></a>

[Top](#Top)

### WordCloud <a name='WordCloud'></a>

[Top](#Top)

In [16]:
#Source: https://stackoverflow.com/questions/16645799/how-to-create-a-word-cloud-from-a-corpus-in-python
stopwords = set(STOPWORDS)

def show_wordcloud(data, title = "WordCloud of Query Results"):
    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=40,
        max_font_size=40, 
        scale=3,
        random_state=1 # chosen at random by flipping a coin; it was heads
    ).generate(str(data))

    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

@interact
def showingWordcloudsOfKRanking(k=(1,50,1)):
    show_wordcloud(rankings.head(k)['review_text'])
    

@interact
def showingWordCloudOfOneReview(i=(1,len(dataReviewsLess),1)):
    show_wordcloud(dataReviewsLess[dataReviewsLess['review_id']==i]['review_text'].values,'WordCloud of a review')

A Jupyter Widget

A Jupyter Widget

### Interact with Filters<a name='Filters'></a>

[Top](#Top)

In [17]:
#Function to filter on the variables created by interact widget
def showResultsTime(start_date, end_date, AmountResults, AtleastRating, spoiler):
    start_date = pd.Timestamp(start_date)
    end_date = pd.Timestamp(end_date)
    if spoiler == 'Both':
        return rankings[(rankings.review_date > start_date) 
                    & (rankings.review_date < end_date) 
                    & (rankings.rating >= AtleastRating)].head(AmountResults)
    elif spoiler == 'Yes':
        return rankings[(rankings.review_date > start_date) 
                    & (rankings.review_date < end_date) 
                    & (rankings.rating >= AtleastRating)
                    & (rankings.is_spoiler == True)].head(AmountResults)
    elif spoiler == 'No':
        return rankings[(rankings.review_date > start_date) 
                    & (rankings.review_date < end_date) 
                    & (rankings.rating >= AtleastRating)
                    & (rankings.is_spoiler == False)].head(AmountResults)

#The interact function for faceted search
_ = interact(showResultsTime,
             start_date=widgets.DatePicker(value=pd.to_datetime('2014-01-01')),
             end_date=widgets.DatePicker(value=pd.to_datetime('2019-01-01')),
             AmountResults=(10, 100, 10),
             AtleastRating=(1,10,1),
             spoiler=['Both','Yes','No'])

A Jupyter Widget

## Cohen's Kappa<a name='Cohen'></a>
[Top](#Top)

In [18]:
# all the queries that will be judged by two judges

query1 = "best comedy movie of 2015"

query2 = "worst acting in drama movie"

query3 = "sad ending, but great storyline"

query4 = "well-acted and a very loving movie"

query5 = "The Notebook was a real tear-puller but a loving movie"


In [19]:
#Create the ranking lists
rankings1 = rankedList(vectorizeQuery(query1))

rankings2 = rankedList(vectorizeQuery(query2))

rankings3 = rankedList(vectorizeQuery(query3))

rankings4 = rankedList(vectorizeQuery(query4))

rankings5 = rankedList(vectorizeQuery(query5))


In [22]:
rankings1.head(10)

Unnamed: 0,Score,review_id,is_spoiler,movie_id,rating,review_date,review_summary,review_text,user_id
4260,0.260692,4260,False,tt0111161,10,2004-02-22,Best film of all time.,This movie is the best movie ever made. I'm no...,ur2918742
3215,0.232432,3215,False,tt0111161,10,2002-06-23,Shawshank is the best movie.,"The Shawshank Redemption was ,is and will alwa...",ur1811770
5565,0.190863,5565,False,tt0068646,10,2015-10-24,+10,"One of the best movies so far, never gets bore...",ur22260662
4818,0.184331,4818,False,tt0068646,10,2015-11-18,Best movie of all time,"One of the best movies I have ever see, and I ...",ur58484064
9966,0.182864,9966,False,tt0468569,10,2017-12-22,Great Movie,"Without a doubt, one of the best if not the be...",ur83525829
4507,0.180775,4507,True,tt0068646,10,2017-07-19,Masterpiece,A masterpiece must watch one of the best movie...,ur78548123
9726,0.178607,9726,False,tt0468569,10,2008-12-03,Probably the best movie ever !!!,",,The Dark Knight'' ! Another Batman movie ! I...",ur20243389
3860,0.174167,3860,False,tt0111161,9,2017-02-20,My favorite movie,"This is my go to favorite movie of all time, t...",ur70486050
176,0.169786,176,True,tt0111161,10,2017-08-21,Short and Simple,"One of the best movies i have seen till date,I...",ur79657064
2156,0.162738,2156,False,tt0111161,10,2015-10-24,One of top five best movies I have ever seen!,What else to write about The Shawshank Redempt...,ur49897769


In [28]:
rankings2.head(10)

Unnamed: 0,Score,review_id,is_spoiler,movie_id,rating,review_date,review_summary,review_text,user_id
3653,0.303825,3653,False,tt0111161,1,2003-06-14,I am so sorry to who likes this movie but...,It is one of the worst movies I have ever seen...,ur2438017
2317,0.215119,2317,False,tt0111161,9,2015-01-10,The Best of the Best,"Hi guys,This is the best of the best and it is...",ur57802321
5427,0.19737,5427,False,tt0068646,10,2004-07-16,Definitely deserves its high rating,"I haven't seen all dramas, nor have I ever see...",ur2093818
5422,0.19602,5422,False,tt0068646,1,2008-09-13,i h8 this movie,This is the worst movie ever. It is so over ra...,ur15821933
6272,0.183658,6272,False,tt0068646,10,2002-11-28,THE BEST MOVIE OF ALL TIME,This is the best movie of all time. It has eve...,ur2064005
5748,0.18038,5748,False,tt0068646,10,2002-12-31,The Greatest Drama Ever,The Godfather is the greatest Drama movie ever...,ur1435717
2424,0.179896,2424,False,tt0111161,9,2014-07-26,Not a bad movie,I've watched this movie for the first time lik...,ur38041134
9014,0.179276,9014,False,tt0468569,1,2009-03-07,shockingly bad,One of the worst films I have ever seen. I nea...,ur20774362
1480,0.176842,1480,False,tt0111161,1,2003-03-08,What a bad movie,This is the worst movie I have ever seen. It's...,ur0552804
6211,0.176327,6211,False,tt0068646,10,2003-12-12,The Godfather of all movies.,"I made my day by watching ""The Godfather"". Whe...",ur2572884


In [29]:
rankings3.head(10)

Unnamed: 0,Score,review_id,is_spoiler,movie_id,rating,review_date,review_summary,review_text,user_id
3039,0.28697,3039,False,tt0111161,10,2005-12-25,Best of the Best,This is one of those movies that you have to w...,ur7279698
3897,0.283494,3897,False,tt0111161,10,1998-10-03,One of the best ever,"This film is quite simply fantastic, great per...",ur0120998
217,0.253918,217,True,tt0111161,10,2017-05-14,All around good movie,"Great Acting, Great Storyline, Great Graphics,...",ur66920671
9929,0.224031,9929,False,tt0468569,4,2008-07-27,"Disappointing, Poor Joker, Poor Ending, Awful ...",Good film but lacked depth. Sad performance fr...,ur12489948
8763,0.212152,8763,False,tt0468569,10,2015-04-02,Great Movie Ever Seen,"Great acting of Joker,Salute to him The best s...",ur59604422
3895,0.21001,3895,False,tt0111161,10,1998-10-08,"The greatest movie, especially the storyline (...",This is simply the best movie I have ever come...,ur0129989
2944,0.20333,2944,False,tt0111161,10,2007-11-06,Interesting,"I have to say that at first, I thought this mo...",ur2324311
9944,0.19712,9944,False,tt0468569,6,2015-01-05,Great Movie but didn't like it very much,"This is a great movie, because it has a lot of...",ur57660921
3120,0.196748,3120,False,tt0111161,10,2003-04-17,a phenomenal film,"Such a great film. Great acting, great cinema...",ur2230604
8438,0.191319,8438,True,tt0468569,10,2013-10-15,Must Watch,"The Greatest Movie of All Time!!!, everything ...",ur38673567


In [30]:
rankings4.head(10)

Unnamed: 0,Score,review_id,is_spoiler,movie_id,rating,review_date,review_summary,review_text,user_id
7997,0.295344,7997,True,tt0468569,10,2013-08-31,Amazing And Awesome,"This film is Amazing,and Cool,this is the best...",ur46286132
5190,0.294058,5190,False,tt0068646,10,2017-10-12,Loving movie.,A very good movie. Some real things at late. L...,ur69130034
9970,0.237074,9970,False,tt0468569,9,2017-12-19,Excellent!,I LOVE this movie. Its one of my favorite movi...,ur83367451
1635,0.233459,1635,False,tt0111161,8,2017-09-25,Awesome,The Shawshank Redemption is my all time favori...,ur80763822
497,0.221867,497,True,tt0111161,10,2014-08-15,excellent film,The Shawshank Redemption is a great movie. Mor...,ur50195663
4060,0.209169,4060,False,tt0111161,10,2014-09-10,Awesome Movie,This is truly one of the best movies ever made...,ur27327039
516,0.204996,516,True,tt0111161,10,2014-05-24,The Best Movie ever,I have seen the movie first and than read the ...,ur0781734
1326,0.19449,1326,False,tt0111161,10,2015-12-10,Number 1 for a reason,This movie has been rated number one by IMDb f...,ur46143670
5505,0.190946,5505,False,tt0068646,10,2013-09-05,Great Movie,I love to think that dramas start one way and ...,ur43986175
1059,0.179384,1059,False,tt0111161,10,1999-05-12,"Hope is a good thing, maybe the best of things.",Truly a masterpiece of cinema. I hope this fi...,ur0327799


In [31]:
rankings5.head(10)

Unnamed: 0,Score,review_id,is_spoiler,movie_id,rating,review_date,review_summary,review_text,user_id
5190,0.273003,5190,False,tt0068646,10,2017-10-12,Loving movie.,A very good movie. Some real things at late. L...,ur69130034
7997,0.193992,7997,True,tt0468569,10,2013-08-31,Amazing And Awesome,"This film is Amazing,and Cool,this is the best...",ur46286132
9970,0.181068,9970,False,tt0468569,9,2017-12-19,Excellent!,I LOVE this movie. Its one of my favorite movi...,ur83367451
3339,0.176967,3339,False,tt0111161,8,2001-07-16,over-rated,When I saw this movie on the top 250 movies of...,ur1256157
7770,0.173517,7770,True,tt0468569,9,2017-07-05,"Great movie, One of the best.",A lot of people have fallen in love with this ...,ur77832802
1561,0.163451,1561,False,tt0111161,10,2017-12-15,Top,Loved how true this movie was to the original ...,ur7832547
3669,0.157369,3669,False,tt0111161,10,1999-05-17,The best movie ever made!,"The acting is top-notch, the dialogue is super...",ur0009235
1262,0.147863,1262,False,tt0111161,10,1999-01-27,a really good movie,The first time I saw this movie I loved it. Bu...,ur0224544
4455,0.145298,4455,True,tt0068646,8,2012-08-25,A classic that you can't miss!,I LOVED IT! The movie was so long but had grea...,ur26315465
3116,0.143788,3116,False,tt0111161,10,2003-05-30,The best movie i have ever seen....,Best movie i have ever seen. Tim Robbins and M...,ur2449059


In [48]:
## A one means the judges thinks the review is relevant. A zero means the judge thinks the reviews is irrelevant

## The first number is from the first judge, the second from the second judge for the rankings of query 1
judges1 = [[0,0], [0,0], [1,1], [1,1], [0,0], [0,1], [0,1], [0,0], [0,0], [1,1]]

## The first number is from the first judge, the second from the second judge for the rankings of query 2
judges2 = [[1,1], [0,0], [0,0], [1,1], [0,0], [1,0], [0,0], [0,0], [1,1], [0,0]]

## The first number is from the first judge, the second from the second judge for the rankings of query 3
judges3 = [[0,0], [1,0], [1,1], [0,0], [1,0], [0,1], [0,0], [0,0], [0,1], [0,0]]

## The first number is from the first judge, the second from the second judge for the rankings of query 3
judges4 = [[1,0], [1,1], [0,0], [1,0], [0,0], [0,0], [1,1], [0,0], [1,0], [1,1]]

## The first number is from the first judge, the second from the second judge for the rankings of query 3
judges5 = [[0,1], [0,1], [1,1], [0,1], [1,1], [1,1], [1,0], [0,0], [1,1], [0,0]]


In [49]:
def AveragePrecision(ranked_list_of_results, list_of_relevant_objects):
    begin = 1/len(list_of_relevant_objects)
    count = 0
    for i, res in enumerate(ranked_list_of_results):
        for j, obj in enumerate(list_of_relevant_objects):
            if obj == res:
                itera = (j+1) / (i+1)
            count = count + itera
    return begin * count

def PE(data):
    '''On input data, return the P(E) (expected agreement).'''
    relevant = 0
    nonrelevant = 0
    # Iterate over the data
    for i in data:
        for j in i:
            
            # Top up the relevant documents by one if 1 is encountered
            if j == 1:
                relevant += 1
            # Top up the nonrelevant documents by one if 0 is encountered
            if j == 0:
                nonrelevant += 1

    # Calculates the total of inspected documents for the judges combined
    total = len(data)*2

    # Calculates the pooled marginals
    rel = relevant/total
    nonrel = nonrelevant/total

    # Calculates the P(E)
    P_E = nonrel**2 + rel **2    
    return    P_E 


def kappa(data, P_E):
    agree = 0
    for i in data:
        temp = None
        for j in i:
            if temp == j:
                agree += 1
            temp = j
    P_A = agree / len(data)
    kappa = (P_A - P_E)/(1 - P_E)   
    return kappa

In [50]:
print(kappa(judges1, PE(judges1)))

print(kappa(judges2, PE(judges2)))

print(kappa(judges3, PE(judges3)))

print(kappa(judges4, PE(judges4)))

print(kappa(judges5, PE(judges5)))


0.5833333333333334
0.7802197802197802
0.04761904761904766
0.3939393939393937
0.1666666666666666
