# Groep Opdracht Week 4 Zoekmachines

## Students: Jasper van Eck, Ghislaine, Joris Galema, Lotte
## Student IDs: 6228194, -, 11335165, 11269642


# Import Data

In [119]:
#Imports
import pandas as pd
import math
import numpy as np
import nltk
import re
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import json
from collections import Counter, defaultdict
from sklearn import preprocessing

In [120]:
#Open JSON file
jsonDataReviews = []
with open('IMDB_reviews.json') as json_file:
    for line in json_file:
        jsonDataReviews.append(json.loads(line))

#Read the data from the json file
dataReviews = pd.DataFrame(jsonDataReviews)

#Add Review_id column
review_id = list(range(len(dataReviews)))
dataReviews.insert(0,'review_id',review_id,True)
dataReviews['review_summary'] = dataReviews['review_summary'].astype(str)
dataReviews['review_text'] = dataReviews['review_text'].astype(str)

In [121]:
#Example of data
dataReviews.head(10)

Unnamed: 0,review_id,is_spoiler,movie_id,rating,review_date,review_summary,review_text,user_id
0,0,True,tt0111161,10,10 February 2006,A classic piece of unforgettable film-making.,"In its Oscar year, Shawshank Redemption (writt...",ur1898687
1,1,True,tt0111161,10,6 September 2000,Simply amazing. The best film of the 90's.,The Shawshank Redemption is without a doubt on...,ur0842118
2,2,True,tt0111161,8,3 August 2001,The best story ever told on film,I believe that this film is the best story eve...,ur1285640
3,3,True,tt0111161,10,1 September 2002,Busy dying or busy living?,"**Yes, there are SPOILERS here**This film has ...",ur1003471
4,4,True,tt0111161,8,20 May 2004,"Great story, wondrously told and acted",At the heart of this extraordinary movie is a ...,ur0226855
5,5,True,tt0111161,8,12 August 2004,"Good , But It Is Overrated By Some",In recent years the IMDB top 250 movies has ha...,ur1532177
6,6,True,tt0111161,9,9 October 2005,This Movie Saved My Life.,I have been a fan of this movie for a long tim...,ur6574726
7,7,True,tt0111161,10,4 February 2012,Movie you can see 1000 times,I made my account on IMDb Just to Rate this mo...,ur31182745
8,8,True,tt0111161,10,24 October 2008,The Shawshank Redemption,"A friend of mine listed ""The Shawshank Redempt...",ur9871443
9,9,True,tt0111161,10,30 July 2011,"""I'm a convicted murderer who provides sound f...",Well I guess I'm a little late to the party as...,ur2707735


# Create the TF Dict

In [136]:
#Init a default dict
tfDict = defaultdict(lambda: defaultdict(int))

#Init Porter Stemmer
ps = nltk.stem.PorterStemmer()

#Use less reviews to reduce runtimes for testing/practice
dataReviewsLess = pd.DataFrame(dataReviews.head(10000))

#Retrieve the actual reviews
reviewTexts = dataReviewsLess['review_text'].values

#Loop through reviews
for i in range(len(reviewTexts)):
    #Tokenize reviews and lowercase the text
    line = re.split('\W+',reviewTexts[i].lower())
    #Loop through tokens in review
    for word in line:
        #Stem token
        stem = ps.stem(word)
        #Increment frequency
        tfDict[stem][i] += 1

#Add in Corpus Frequency and Document Frequency
tfDictXtra = defaultdict(lambda: defaultdict(int))
for word in tfDict:
    tfDictXtra[word]['CorpusFreq'] = sum(tfDict[word].values())
    tfDictXtra[word]['DocFreq'] = len(tfDict[word])
    tfDictXtra[word]['Freq_per_doc'] = tfDict[word]


# Create the TF-IDF and Normalize

In [137]:
#Get the total number of reviews/documents
totalDocs = len(dataReviewsLess)

#Total unique words
totalUniqueWords = len(tfDictXtra)

#Create np matrix with zeros
tfIdf = np.zeros((totalUniqueWords,totalDocs))

#Create dataframe of words with index list to get the word position in matrix
wordsIndex = pd.DataFrame(list(tfDictXtra.keys()),columns=['Words'])
wordID = list(range(totalUniqueWords))
wordsIndex.insert(0,'Index',wordID,True)
wordCounter = 0


#loop through words in dict
for word in tfDictXtra:
    #Loop through frequencies of word in a doc from dict; LET OP deze regel geeft soms AttributeError: 'int' object has no attribute 'keys'
    #run de vorige cellen dan weer even opnieuw. Dat verhelpt t meestal
    dictLoop = list(tfDictXtra[word]['Freq_per_doc'].keys())
    for doc in dictLoop:
        #Calculate the TF-IDF
        tfIdf[wordCounter,doc] = tfDictXtra[word]['Freq_per_doc'][doc]*math.log((totalDocs/(1+tfDictXtra[word]['DocFreq'])))
    wordCounter += 1


In [138]:
#Transpose the tfIdf matrix and normalize, since the normalize works on rows, and we need to normalize the columns
tfIdfNorm = preprocessing.normalize(tfIdf.T, norm='l2')
len(tfIdfNorm[0])

20377

# Input query

In [142]:
query = "Shawshank redemption interesting cool"

#Create a normalized
def vectorizeQuery(query):
    queryVector = np.zeros(totalUniqueWords)
    line = re.split('\W+',query.lower())
    for word in line:
        stem = ps.stem(word)
        queryVector[wordsIndex[wordsIndex['Words']==stem]['Index'].values] += 1
    
    queryVectorTfIdf = np.zeros(totalUniqueWords)
    for i in range(len(queryVector)):
        if queryVector[i] != 0:
            word = str(wordsIndex[wordsIndex['Index']==i]['Words'].values)
            queryVectorTfIdf[i] = queryVector[i]*math.log((totalDocs/(1+tfDictXtra[word]['DocFreq'])))
    
    length = np.sqrt(queryVectorTfIdf.dot(queryVectorTfIdf))
    queryVectorNorm = queryVectorTfIdf/length
    
    return queryVectorNorm


In [149]:
#Cosine similarity matching
def cosineSim(vector, docVector):
    #Only dot product needed since vectors are already unit vectors and therefore the lengths are 1
    return vector.dot(docVector)#/(length vector * length docVector)

def cosineSimFast(vector, docVector):
    similarity = np.dot(vector, docVector.T)

    # squared magnitude of preference vectors (number of occurrences)
    square_mag = np.diag(similarity)

    # inverse squared magnitude
    inv_square_mag = 1 / square_mag

    # if it doesn't occur, set it's inverse magnitude to zero (instead of inf)
    inv_square_mag[np.isinf(inv_square_mag)] = 0

    # inverse of the magnitude
    inv_mag = np.sqrt(inv_square_mag)

    # cosine similarity (elementwise multiply by inverse magnitudes)
    cosine = similarity * inv_mag
    return cosine.T * inv_mag
    
def rankedList(queryVector):
    scoreList = np.zeros(totalDocs)
    for i in range(len(tfIdfNorm)):
        scoreList[i] = cosineSim(queryVector,tfIdfNorm[i])
    
    rankedDocList = pd.DataFrame(scoreList,columns=['Score'])
    rankedDocList.insert(0,'DocID',list(range(totalDocs)))
    return rankedDocList.sort_values(by='Score',ascending=False)

In [150]:
rankedList(vectorizeQuery(query))

Unnamed: 0,DocID,Score
2288,2288,0.653760
441,441,0.653760
3417,3417,0.347825
1562,1562,0.324774
487,487,0.290769
2053,2053,0.289090
57,57,0.283424
837,837,0.265075
676,676,0.252843
191,191,0.248598


# Results