# Speed running entire project

In [2]:
import pandas as pd
import numpy as np
import mmh3

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

import networkx as nx
import math as math
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

from tqdm import tqdm

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Get reviews
Movies = pd.read_csv('data/movies_786.csv')
Reviews = pd.read_csv('data/reviews_786.csv')
print(f"Movies data has {Movies.columns.size} columns with {Movies.shape[0]} rows each.")
print(f"Reviews data has {Reviews.columns.size} columns with {Reviews.shape[0]} rows each.")
Reviews.head(n=2)

Movies data has 2 columns with 786 rows each.
Reviews data has 7 columns with 7855 rows each.


Unnamed: 0,Movie ID,Reviewer Name,Reviewer URL,Title,Review,Helpful,Total
0,tt0111161,hitchcockthelegend,ur16161013,Some birds aren't meant to be caged.,The Shawshank Redemption is written and direct...,1111,1203
1,tt0111161,Sleepin_Dragon,ur15311310,An incredible movie. One that lives with you.,It is no wonder that the film has such a high ...,362,394


In [8]:
# extract first 10 ids
Movies['ID'].head(24).to_list()

['tt0111161',
 'tt0068646',
 'tt0110912',
 'tt0071562',
 'tt1375666',
 'tt0167260',
 'tt0076759',
 'tt0120737',
 'tt0133093',
 'tt0114369',
 'tt0211915',
 'tt0103064',
 'tt1520211',
 'tt1156398',
 'tt0365748',
 'tt0480249',
 'tt0455407',
 'tt0462322',
 'tt0289043',
 'tt0463854',
 'tt0432021',
 'tt0363547',
 'tt0120804',
 'tt1077258']

# Book based model <a id="model1"></a> <br>

## Jaccard Similarity on Movie Data <a id="jaccard-sim"></a> <br>


Function jaccard, that takes two titles and outputs the estimated jaccard similarity. \
Function max_jaccard takes a list of titles and compare the titles with each other. It then returns the two titles with the highest jaccard similarity.

In [12]:
Movies.head(n=2)
# Extract all titles

Unnamed: 0,ID,Title
0,tt0111161,The Shawshank Redemption
1,tt0068646,The Godfather


In [None]:
# Finds jaccard similarity between two titles.
def jaccard(title1, title2):
    words1 = set(str(title1).lower().split())
    words2 = set(str(title2).lower().split())
    
    # Compute the intersection and union of the sets
    intersection = len(words1.intersection(words2))
    union = len(words1) + len(words2) - intersection
    
    # Calculate and return the Jaccard similarity
    return intersection / union if union > 0 else 0.0

# Finds title set with highest jaccard similarity
def max_jaccard(title_list):
    max_similarity = 0.0
    idx1 = 0
    idx2 = 0
    for i in range(len(title_list)):
        for j in range(i + 1, len(title_list)):
            similarity = jaccard(title_list[i], title_list[j])
            if similarity > max_similarity:
                idx1 = i
                idx2 = j
                max_similarity = similarity
    return max_similarity, idx1, idx2

"""
Example usage for comparing two titles
title1 = books_data_1['title'][0]
title2 = books_data_1['title'][1]
print(title1,title2)
similarity = jaccard(title1, title2)
print(f"Jaccard Similarity: {similarity}")
"""

# Example usage with a list of titles
title_list = Movies['Title'].tolist()
max_similarity, idx1, idx2 = max_jaccard(title_list)
print(f"Max Jaccard Similarity: {max_similarity}")
print(title_list[idx1])
print(title_list[idx2])
print("shocking :o")


Max Jaccard Similarity: 1.0
Dawn of the Dead
Dawn of the Dead
shocking :o


### Finding similar titles and reviews

In [19]:
# Finds similar titles/descriptions
def similar(texts, jaccard_threshold=0.6):
    # Create a dictionary to store the similar texts
    similar_texts = {}
    
    # Loop through each name in the list
    for i in tqdm(range(len(texts))):
        for j in range(i+1, len(texts)):
            similarity_score = jaccard(texts[i], texts[j])
            if similarity_score >= jaccard_threshold:
                similar_texts[(texts[i], texts[j])] = similarity_score
    return similar_texts

# Example usage:    
titles = Movies['Title'][0:500]
similar_titles = similar(titles)
# Print titles in a way that is easier to read

for (desc1, desc2), score in similar_titles.items():
    print(f"Similarity Score: {score}")
    print(f"Title 1: {desc1}")
    print(f"Title 2: {desc2}")
    print()
# 0.5 secs on 786

100%|██████████| 500/500 [00:00<00:00, 943.89it/s]

Similarity Score: 0.6
Title 1: The Godfather Part II
Title 2: The Godfather Part III

Similarity Score: 0.6666666666666666
Title 1: The Matrix
Title 2: The Matrix Reloaded

Similarity Score: 0.6666666666666666
Title 1: The Matrix
Title 2: The Matrix Revolutions

Similarity Score: 0.6
Title 1: Shaun of the Dead
Title 2: Dawn of the Dead

Similarity Score: 0.6
Title 1: Shaun of the Dead
Title 2: Land of the Dead

Similarity Score: 0.6
Title 1: Shaun of the Dead
Title 2: Survival of the Dead

Similarity Score: 0.6
Title 1: Shaun of the Dead
Title 2: Diary of the Dead

Similarity Score: 0.6
Title 1: Shaun of the Dead
Title 2: Day of the Dead

Similarity Score: 0.6
Title 1: Dawn of the Dead
Title 2: Land of the Dead

Similarity Score: 1.0
Title 1: Dawn of the Dead
Title 2: Dawn of the Dead

Similarity Score: 0.6
Title 1: Dawn of the Dead
Title 2: Survival of the Dead

Similarity Score: 0.6
Title 1: Dawn of the Dead
Title 2: Diary of the Dead

Similarity Score: 0.6
Title 1: Dawn of the Dead





In [20]:
similar(['J a s o n', 'N o j a n', 'N i k l a s', 'A m a l i e'])

100%|██████████| 4/4 [00:00<?, ?it/s]


{('J a s o n', 'N o j a n'): 0.8}

In [None]:
# Example usage for finding similar descriptions
reviews = Reviews['Review']
similar_descriptions = similar(reviews, jaccard_threshold=0.4)

for (desc1, desc2), score in similar_descriptions.items():
    print(f"Similarity Score: {score}")
    print(f"Description 1: {desc1}")
    print(f"Description 2: {desc2}") 
    print()

# tqdm estimates ~1:20 hours for 786 reviews


  1%|          | 95/7855 [01:05<1:29:34,  1.44it/s]


KeyboardInterrupt: 

In [None]:
def clean_text_string(text):
    tokens = word_tokenize(text)
    clean_tokens = []
    
    for token in tokens:
        # Convert to lowercase
        token = token.lower()
        
        # Remove punctuation
        if token not in string.punctuation:
            # Remove stopwords
            if token not in stopwords.words('english'):
                clean_tokens.append(token)

    return ' '.join(clean_tokens)

def lemmatize_text_string(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_tokens = []
    
    for token in tokens:
        lemmatized_tokens.append(lemmatizer.lemmatize(token))
    
    return ' '.join(lemmatized_tokens)

def get_movie_reviews(movie_id):
    movie_reviews = reviews[reviews['Movie ID'] == movie_id]['Review']
    return movie_reviews

def get_movie_reviews_cleaned(movie_id):
    movie_reviews = get_movie_reviews(movie_id)
    movie_reviews_cleaned = movie_reviews.apply(clean_text_string)
    return movie_reviews_cleaned

def get_movie_reviews_lemmatized(movie_id):
    movie_reviews = get_movie_reviews(movie_id)
    movie_reviews_lemmatized = movie_reviews.apply(lemmatize_text_string)
    return movie_reviews_lemmatized

# Test on first movie, first review
movie_id = movies['ID'][0]
raw = get_movie_reviews(movie_id)[0]
clean = get_movie_reviews_cleaned(movie_id)[0]
lemmatized = get_movie_reviews_lemmatized(movie_id)[0]
print(f"Raw review: {raw}")
print(f"Cleaned review: {clean}")
print(f"Lemmatized review: {lemmatized}")

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\PC/nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.12_3.12.2032.0_x64__qbz5n2kfra8p0\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.12_3.12.2032.0_x64__qbz5n2kfra8p0\\share\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.12_3.12.2032.0_x64__qbz5n2kfra8p0\\lib\\nltk_data'
    - 'C:\\Users\\PC\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
clean[0]

'The Shawshank Redemption is written and directed by Frank Darabont. It is an adaptation of the Stephen King novella Rita Hayworth and Shawshank Redemption. Starring Tim Robbins and Morgan Freeman, the film portrays the story of Andy Dufresne (Robbins), a banker who is sentenced to two life sentences at Shawshank State Prison for apparently murdering his wife and her lover. Andy finds it tough going but finds solace in the friendship he forms with fellow inmate Ellis "Red" Redding (Freeman). While things start to pick up when the warden finds Andy a prison job more befitting his talents as a banker. However, the arrival of another inmate is going to vastly change things for all of them.There was no fanfare or bunting put out for the release of the film back in 94, with a title that didn\'t give much inkling to anyone about what it was about, and with Columbia Pictures unsure how to market it, Shawshank Redemption barely registered at the box office. However, come Academy Award time the