In [1]:
%%time

! pip install datasketch

Defaulting to user installation because normal site-packages is not writeable
CPU times: total: 0 ns
Wall time: 1.22 s



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
import pandas as pd
import re
import string
import nltk
import time
import itertools
import multiprocessing
from joblib import Parallel, delayed
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datasketch import MinHash
from datasketch import MinHashLSHForest
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Fetch the NLTK corpora used by clean_text further down: 'stopwords' feeds the
# stop-word filter and 'wordnet' backs WordNetLemmatizer.
nltk.download('stopwords')
# NOTE(review): no code visible in this notebook calls a tokenizer that needs
# 'punkt' — confirm whether this download is still required.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Main\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Main\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Main\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
#Import the CSV file
df = pd.read_csv("https://raw.githubusercontent.com/LaertXh/LLM-Project/refs/heads/main/Data/rotten_tomatoes_movies.csv")

In [5]:
df.shape

(17712, 22)

In [6]:
df.head()

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,...,Sony Pictures Classics,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19
2,m/10,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,...,Waner Bros.,Fresh,67.0,24.0,Spilled,53.0,14684.0,2,16,8
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,...,Criterion Collection,Certified-Fresh,100.0,54.0,Upright,97.0,105386.0,6,54,0
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,...,Disney,Fresh,89.0,27.0,Upright,74.0,68918.0,5,24,3


### Dataset Cleaning

In [7]:
# Drop the columns that are not used as features by the recommender
columns_to_drop = [
    'rotten_tomatoes_link', 'critics_consensus', 'original_release_date',
    'streaming_release_date', 'runtime', 'authors', 'production_company',
    'tomatometer_status', 'tomatometer_rating', 'tomatometer_count',
    'audience_status', 'audience_rating', 'audience_count',
    'tomatometer_top_critics_count', 'tomatometer_fresh_critics_count',
    'tomatometer_rotten_critics_count',
]
# Rebind instead of mutating with inplace=True, and ignore columns that are
# already gone, so re-running this cell is a no-op rather than a KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [8]:
df.head()

Unnamed: 0,movie_title,movie_info,content_rating,genres,directors,actors
0,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Logan Lerman, Brandon T. Jackson, Alexandra Da..."
1,Please Give,Kate (Catherine Keener) and her husband Alex (...,R,Comedy,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R..."
2,10,"A successful, middle-aged Hollywood songwriter...",R,"Comedy, Romance",Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ..."
3,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,NR,"Classics, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G...."
4,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,"James Mason, Kirk Douglas, Paul Lukas, Peter L..."


#### Keep only the first director, and the first 4 actors

In [9]:
#Limit the number of directors to the first listed director
# (rows without a director stay null here; they are removed by the dropna cell below)
df['directors'] = df['directors'].str.split(',').str[0]
#Limit the number of actors to the first four listed actors
# (splitting on ',' keeps the space that follows each comma, so joining with ','
#  reproduces the original "Name, Name" layout)
df['actors'] = df['actors'].str.split(',').str[:4].str.join(',')

In [10]:
df['directors'].head()

0       Chris Columbus
1    Nicole Holofcener
2        Blake Edwards
3         Sidney Lumet
4    Richard Fleischer
Name: directors, dtype: object

In [11]:
df['actors'].head()

0    Logan Lerman, Brandon T. Jackson, Alexandra Da...
1    Catherine Keener, Amanda Peet, Oliver Platt, R...
2    Dudley Moore, Bo Derek, Julie Andrews, Robert ...
3    Martin Balsam, John Fiedler, Lee J. Cobb, E.G....
4    James Mason, Kirk Douglas, Paul Lukas, Peter L...
Name: actors, dtype: object

#### Check Duplicates and drop all null values

In [12]:
#Check for duplicate rows
duplicate_rows = df[df.duplicated()]
print(duplicate_rows)

Empty DataFrame
Columns: [movie_title, movie_info, content_rating, genres, directors, actors]
Index: []


In [13]:
#Identify null values
df.isnull().sum()

movie_title         0
movie_info        321
content_rating      0
genres             19
directors         194
actors            352
dtype: int64

In [14]:
#Drop rows with null values
df_cleaned = df.dropna(subset=['movie_info', 'genres', 'directors', 'actors'])

In [15]:
df_cleaned.isnull().sum()

movie_title       0
movie_info        0
content_rating    0
genres            0
directors         0
actors            0
dtype: int64

#### Reset index, we will use this dataset moving forward 

In [16]:
# reset index so the row labels are contiguous again after the dropna above
df_cleaned = df_cleaned.reset_index(drop=True)
# Snapshot taken before any of the text normalisation below. It has the same
# index as df_cleaned, so a row label found in one frame can be looked up in the
# other (predict and recommend_movies rely on this).
df_cleaned_english = df_cleaned.copy() # used for readable outputs 
df_cleaned

Unnamed: 0,movie_title,movie_info,content_rating,genres,directors,actors
0,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Logan Lerman, Brandon T. Jackson, Alexandra Da..."
1,Please Give,Kate (Catherine Keener) and her husband Alex (...,R,Comedy,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R..."
2,10,"A successful, middle-aged Hollywood songwriter...",R,"Comedy, Romance",Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ..."
3,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,NR,"Classics, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G...."
4,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,"James Mason, Kirk Douglas, Paul Lukas, Peter L..."
...,...,...,...,...,...,...
16903,Zoom,"Capt. Zoom, or Jack (Tim Allen), as he is now ...",PG,"Action & Adventure, Comedy, Kids & Family",Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Spencer..."
16904,Zoot Suit,Mexican-American gangster Henry Reyna (Daniel ...,R,"Drama, Musical & Performing Arts",Luis Valdez,"Daniel Valdez, Edward James Olmos, Charles Aid..."
16905,Zootopia,From the largest elephant to the smallest shre...,PG,"Action & Adventure, Animation, Comedy",Byron Howard,"J.K. Simmons, Kristen Bell, Octavia Spencer, A..."
16906,Zulu,"In 1879, the Zulu nation hands colonial Britis...",PG,"Classics, Drama",Cy Endfield,"Stanley Baker, Jack Hawkins, Ulla Jacobsson, J..."


### Remove white space of movie_info

In [17]:
# Display the full contents of each cell 
pd.set_option('display.max_colwidth', None)

In [18]:
# Turn every run of carriage returns, newlines and tabs in movie_info into one space
df_cleaned['movie_info'] = df_cleaned['movie_info'].replace(
    to_replace=r'[\r\n\t]+',
    value=' ',
    regex=True,
)

In [19]:
df_cleaned['movie_info'].head()

0                                       Always trouble-prone, the life of teenager Percy Jackson (Logan Lerman) gets a lot more complicated when he learns he's the son of the Greek god Poseidon. At a training ground for the children of deities, Percy learns to harness his divine powers and prepare for the adventure of a lifetime: he must prevent a feud among the Olympians from erupting into a devastating war on Earth, and rescue his mother from the clutches of Hades, god of the underworld.
1       Kate (Catherine Keener) and her husband Alex (Oliver Platt) are wealthy New Yorkers who prowl estate sales and make a tidy profit reselling items they bought cheaply. They buy the apartment next door and plan to remodel just as soon as its current occupant, a cranky old woman, dies. Kate is troubled by the way she and her husband earn a living, and tries to assuage her guilt by befriending her tenant and the woman's granddaughters, but her overtures lead to unexpected consequences.
2         

### Remove spaces from actor, director and genre names

In [20]:
def _squash_entry_spaces(comma_separated):
    """Delete the spaces inside each comma-separated entry and re-join the entries with ', '."""
    return ', '.join(entry.replace(' ', '') for entry in comma_separated.split(','))


# 'directors' holds a single name per row, so the spaces can be removed directly
# (e.g., "John Smith" -> "JohnSmith")
df_cleaned['directors'] = df_cleaned['directors'].str.replace(' ', '')

# 'actors' and 'genres' hold comma-separated lists; each entry is collapsed into
# one token (e.g., "Tom Hanks, Meg Ryan" -> "TomHanks, MegRyan" and
# "Action & Adventure" -> "Action&Adventure")
for column in ('actors', 'genres'):
    df_cleaned[column] = df_cleaned[column].apply(_squash_entry_spaces)

In [21]:
# Text normalisation helpers
def _clean_text_resources():
    """Return the (lemmatizer, stop-word set, punctuation table) used by clean_text.

    The triple is built on the first call and cached on the function object, so
    the stop-word list is not re-read for every row that clean_text processes.
    """
    cached = getattr(_clean_text_resources, '_cached', None)
    if cached is None:
        cached = (
            WordNetLemmatizer(),
            frozenset(stopwords.words('english')),
            str.maketrans('', '', string.punctuation),
        )
        _clean_text_resources._cached = cached
    return cached


def clean_text(text):
    """Normalise a string into space-separated, lemmatized, lowercase tokens.

    Steps: collapse whitespace runs, delete ASCII punctuation, lowercase, split
    on non-word characters, drop English stop words and lemmatize what is left.

    Args:
        text: the string to clean.

    Returns:
        The surviving tokens joined by single spaces (possibly an empty string).
    """
    lem, stop, punct_table = _clean_text_resources()
    text = re.sub(r'\s+', ' ', text)
    text = text.translate(punct_table).lower()
    tokens = re.split(r'\W+', text)
    # re.split yields '' when the text starts or ends with a separator; without
    # the `word` check those empty tokens leak into the output as stray spaces
    # (and, for titles, end up being salted as if they were real words).
    tokens = [lem.lemmatize(word) for word in tokens if word and word not in stop]
    return ' '.join(tokens)

In [22]:
def weight_and_salt_names(name_string, n):
    """Repeat the leading names of a comma-separated list as distinct "salted" tokens.

    Each of the first four names is emitted once unchanged and then as
    ``<name>__salt0``, ``<name>__salt1``, ... so that it contributes as many
    distinct tokens as its weight. Weights by position are
    ``n``, ``n // 2 + 1``, ``n // 2`` and ``n // 3``; names after the fourth
    are dropped, and a name whose weight is zero or negative emits nothing.

    Args:
        name_string: comma-separated names; spaces inside a name are removed.
        n: weight of the first name; the other weights are derived from it.

    Returns:
        All generated tokens joined by single spaces ('' when there are no names).
    """
    # Normalise every entry, then discard the empty ones produced by an empty
    # input or by stray/trailing commas — otherwise they would be emitted as
    # meaningless "" and "__saltN" tokens and shift real names down the ranking.
    names = [name.strip().replace(' ', '') for name in name_string.split(',')]
    names = [name for name in names if name]

    # Position-based weights, most important name first
    weights = (n, (n // 2) + 1, n // 2, n // 3)

    salted_names = []
    for name, weight in zip(names, weights):
        if weight <= 0:
            continue
        salted_names.append(name)  # original name once
        salted_names.extend(f"{name}__salt{i}" for i in range(weight - 1))

    # Return all salted names as a space-separated string
    return ' '.join(salted_names)


# Weighted salting of the actor list (n=6) and of the director (n=5)
df_cleaned['actors'] = df_cleaned['actors'].apply(
    lambda actor_list: weight_and_salt_names(actor_list, n=6)
)
df_cleaned['directors'] = df_cleaned['directors'].apply(
    lambda director: weight_and_salt_names(director, n=5)
)

# Movie titles go through three steps:
#   1. clean with the custom cleaning function,
#   2. replace spaces with commas (e.g., "die hard" -> "die,hard") so every word
#      is seen as a separate entry by weight_and_salt_names,
#   3. apply weighted salting with n=3.
df_cleaned['movie_title'] = (
    df_cleaned['movie_title']
    .apply(clean_text)
    .str.replace(' ', ',', regex=False)
    .apply(lambda title: weight_and_salt_names(title, n=3))
)


Get key words from movie_info

In [23]:
# Keep only the key words of the movie_info column and store them in the movie_info_reduced column
corpus = df_cleaned['movie_info'].fillna('')

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names_out()


def extract_keywords(row_index, top_n=8):
    """Return the `top_n` best-scoring TF-IDF terms of one document as a space-separated string.

    Reads the notebook-level TF-IDF matrix `X` and its vocabulary `features`.

    Args:
        row_index: row of `X` (i.e. position of the document in `corpus`).
        top_n: maximum number of terms to keep.
    """
    doc_vector = X[row_index]
    # (column, score) pairs of the entries stored for this row, highest score first
    scored_terms = sorted(
        zip(doc_vector.indices, doc_vector.data),
        key=lambda pair: -pair[1],
    )
    return " ".join(features[column] for column, _score in scored_terms[:top_n])


df_cleaned['movie_info_reduced'] = [
    extract_keywords(doc_position) for doc_position in range(X.shape[0])
]


In [24]:
# Build the text that is cleaned and then fed to the LSH model: the salted title
# followed by actors, director, reduced movie info and content rating, each
# separated by a single space. Genres are kept out on purpose; they are passed
# separately to the shingling step.
combined = df_cleaned['movie_title']
for column in ('actors', 'directors', 'movie_info_reduced', 'content_rating'):
    combined = combined + ' ' + df_cleaned[column]
df_cleaned['movie_combined'] = combined

In [25]:
#Use custom clean_text function to clean the movie_combined column
# (punctuation removal also strips the "__" from salted tokens, e.g.
#  "percy__salt0" -> "percysalt0"; the tokens remain distinct from each other)
df_cleaned['movie_combined'] = df_cleaned['movie_combined'].apply(clean_text)
# Genres become space-separated lowercase tokens,
# e.g. "Action&Adventure, Comedy" -> "actionadventure comedy"
df_cleaned['genres'] = df_cleaned['genres'].apply(clean_text)

In [26]:
# Show the df_cleaned df, allows inspection of the weights, movie_combined and the overall set up so far
df_cleaned.head(5)

Unnamed: 0,movie_title,movie_info,content_rating,genres,directors,actors,movie_info_reduced,movie_combined
0,percy percy__salt0 percy__salt1 jackson jackson__salt0 olympian lightning,"Always trouble-prone, the life of teenager Percy Jackson (Logan Lerman) gets a lot more complicated when he learns he's the son of the Greek god Poseidon. At a training ground for the children of deities, Percy learns to harness his divine powers and prepare for the adventure of a lifetime: he must prevent a feud among the Olympians from erupting into a devastating war on Earth, and rescue his mother from the clutches of Hades, god of the underworld.",PG,actionadventure comedy drama sciencefictionfantasy,ChrisColumbus ChrisColumbus__salt0 ChrisColumbus__salt1 ChrisColumbus__salt2 ChrisColumbus__salt3,LoganLerman LoganLerman__salt0 LoganLerman__salt1 LoganLerman__salt2 LoganLerman__salt3 LoganLerman__salt4 BrandonT.Jackson BrandonT.Jackson__salt0 BrandonT.Jackson__salt1 BrandonT.Jackson__salt2 AlexandraDaddario AlexandraDaddario__salt0 AlexandraDaddario__salt1 JakeAbel JakeAbel__salt0,percy god olympians deities erupting learns poseidon hades,percy percysalt0 percysalt1 jackson jacksonsalt0 olympian lightning loganlerman loganlermansalt0 loganlermansalt1 loganlermansalt2 loganlermansalt3 loganlermansalt4 brandontjackson brandontjacksonsalt0 brandontjacksonsalt1 brandontjacksonsalt2 alexandradaddario alexandradaddariosalt0 alexandradaddariosalt1 jakeabel jakeabelsalt0 chriscolumbus chriscolumbussalt0 chriscolumbussalt1 chriscolumbussalt2 chriscolumbussalt3 percy god olympian deity erupting learns poseidon hades pg
1,please please__salt0 please__salt1 give give__salt0,"Kate (Catherine Keener) and her husband Alex (Oliver Platt) are wealthy New Yorkers who prowl estate sales and make a tidy profit reselling items they bought cheaply. They buy the apartment next door and plan to remodel just as soon as its current occupant, a cranky old woman, dies. Kate is troubled by the way she and her husband earn a living, and tries to assuage her guilt by befriending her tenant and the woman's granddaughters, but her overtures lead to unexpected consequences.",R,comedy,NicoleHolofcener NicoleHolofcener__salt0 NicoleHolofcener__salt1 NicoleHolofcener__salt2 NicoleHolofcener__salt3,CatherineKeener CatherineKeener__salt0 CatherineKeener__salt1 CatherineKeener__salt2 CatherineKeener__salt3 CatherineKeener__salt4 AmandaPeet AmandaPeet__salt0 AmandaPeet__salt1 AmandaPeet__salt2 OliverPlatt OliverPlatt__salt0 OliverPlatt__salt1 RebeccaHall RebeccaHall__salt0,kate tidy reselling cheaply remodel occupant granddaughters assuage,please pleasesalt0 pleasesalt1 give givesalt0 catherinekeener catherinekeenersalt0 catherinekeenersalt1 catherinekeenersalt2 catherinekeenersalt3 catherinekeenersalt4 amandapeet amandapeetsalt0 amandapeetsalt1 amandapeetsalt2 oliverplatt oliverplattsalt0 oliverplattsalt1 rebeccahall rebeccahallsalt0 nicoleholofcener nicoleholofcenersalt0 nicoleholofcenersalt1 nicoleholofcenersalt2 nicoleholofcenersalt3 kate tidy reselling cheaply remodel occupant granddaughter assuage r
2,10 10__salt0 10__salt1,"A successful, middle-aged Hollywood songwriter falls hopelessly in love with the woman of his dreams, and even follows the girl and her new husband to their Mexican honeymoon resort. While his behavior seems sure to land him in trouble, out of the blue fate plays into his hands.",R,comedy romance,BlakeEdwards BlakeEdwards__salt0 BlakeEdwards__salt1 BlakeEdwards__salt2 BlakeEdwards__salt3,DudleyMoore DudleyMoore__salt0 DudleyMoore__salt1 DudleyMoore__salt2 DudleyMoore__salt3 DudleyMoore__salt4 BoDerek BoDerek__salt0 BoDerek__salt1 BoDerek__salt2 JulieAndrews JulieAndrews__salt0 JulieAndrews__salt1 RobertWebber RobertWebber__salt0,hopelessly honeymoon songwriter sure blue aged resort plays,10 10salt0 10salt1 dudleymoore dudleymooresalt0 dudleymooresalt1 dudleymooresalt2 dudleymooresalt3 dudleymooresalt4 boderek bodereksalt0 bodereksalt1 bodereksalt2 julieandrews julieandrewssalt0 julieandrewssalt1 robertwebber robertwebbersalt0 blakeedwards blakeedwardssalt0 blakeedwardssalt1 blakeedwardssalt2 blakeedwardssalt3 hopelessly honeymoon songwriter sure blue aged resort play r
3,12 12__salt0 12__salt1 angry angry__salt0 men twelve,"Following the closing arguments in a murder trial, the 12 members of the jury must deliberate, with a guilty verdict meaning death for the accused, an inner-city teen. As the dozen men try to reach a unanimous decision while sequestered in a room, one juror (Henry Fonda) casts considerable doubt on elements of the case. Personal issues soon rise to the surface, and conflict threatens to derail the delicate process that will decide one boy's fate.",NR,classic drama,SidneyLumet SidneyLumet__salt0 SidneyLumet__salt1 SidneyLumet__salt2 SidneyLumet__salt3,MartinBalsam MartinBalsam__salt0 MartinBalsam__salt1 MartinBalsam__salt2 MartinBalsam__salt3 MartinBalsam__salt4 JohnFiedler JohnFiedler__salt0 JohnFiedler__salt1 JohnFiedler__salt2 LeeJ.Cobb LeeJ.Cobb__salt0 LeeJ.Cobb__salt1 E.G.Marshall E.G.Marshall__salt0,unanimous sequestered arguments juror deliberate verdict delicate dozen,12 12salt0 12salt1 angry angrysalt0 men twelve martinbalsam martinbalsamsalt0 martinbalsamsalt1 martinbalsamsalt2 martinbalsamsalt3 martinbalsamsalt4 johnfiedler johnfiedlersalt0 johnfiedlersalt1 johnfiedlersalt2 leejcobb leejcobbsalt0 leejcobbsalt1 egmarshall egmarshallsalt0 sidneylumet sidneylumetsalt0 sidneylumetsalt1 sidneylumetsalt2 sidneylumetsalt3 unanimous sequestered argument juror deliberate verdict delicate dozen nr
4,20000 20000__salt0 20000__salt1 league league__salt0 sea,"In 1866, Professor Pierre M. Aronnax (Paul Lukas) and his assistant Conseil (Peter Lorre), stranded in San Francisco by reports of a giant sea monster attacking ships in the Pacific Ocean, are invited to join an expedition to search for the creature. During the search, they and harpooner Ned Land (Kirk Douglas) are thrown overboard during an attack, eventually discovering that the supposed monster is actually a submarine piloted by the brilliant but haunted Captain Nemo (James Mason).",G,actionadventure drama kidsfamily,RichardFleischer RichardFleischer__salt0 RichardFleischer__salt1 RichardFleischer__salt2 RichardFleischer__salt3,JamesMason JamesMason__salt0 JamesMason__salt1 JamesMason__salt2 JamesMason__salt3 JamesMason__salt4 KirkDouglas KirkDouglas__salt0 KirkDouglas__salt1 KirkDouglas__salt2 PaulLukas PaulLukas__salt0 PaulLukas__salt1 PeterLorre PeterLorre__salt0,monster search 1866 aronnax conseil harpooner nemo lorre,20000 20000salt0 20000salt1 league leaguesalt0 sea jamesmason jamesmasonsalt0 jamesmasonsalt1 jamesmasonsalt2 jamesmasonsalt3 jamesmasonsalt4 kirkdouglas kirkdouglassalt0 kirkdouglassalt1 kirkdouglassalt2 paullukas paullukassalt0 paullukassalt1 peterlorre peterlorresalt0 richardfleischer richardfleischersalt0 richardfleischersalt1 richardfleischersalt2 richardfleischersalt3 monster search 1866 aronnax conseil harpooner nemo lorre g


# Shingles Size is Fixed

In [27]:
# This function generates "genre-guided shingles" by combining each genre token
# with every (k-1)-word combination of the movie text tokens, so that each piece
# of movie content is tied to a genre before the text is MinHashed.
# e.g. genre_tokens = [comedy, drama] and movie_tokens = [a, b, c] with shingle size 2 give
# shingles = ["comedy a", "comedy b", "comedy c", "drama a", "drama b", "drama c"]

def create_genre_guided_shingles(genre_text, movie_text, k):
    """Pair every genre token with every (k-1)-token combination of the movie text.

    Args:
        genre_text: whitespace-separated genre tokens.
        movie_text: whitespace-separated movie tokens.
        k: shingle size (one genre token plus k-1 movie tokens).

    Returns:
        A list of strings "<genre> <movie tokens...>", ordered by genre token and
        then by combination. For k <= 1 the bare genre tokens are returned.
    """
    genre_tokens = genre_text.split()
    movie_tokens = movie_text.split()

    # With k <= 1 there is no room for movie tokens in a shingle
    if k <= 1:
        return genre_tokens

    group_size = k - 1
    if len(movie_tokens) >= group_size:
        # Every (k-1)-token combination, in the order the tokens appear
        movie_groups = [
            ' '.join(group)
            for group in itertools.combinations(movie_tokens, group_size)
        ]
    elif movie_tokens:
        # Too few tokens for a full combination: use the whole text as one group
        movie_groups = [' '.join(movie_tokens)]
    else:
        movie_groups = []

    return [
        f"{genre_token} {movie_group}"
        for genre_token in genre_tokens
        for movie_group in movie_groups
    ]


# Function to generate MinHash Forest (using fixed shingle size)

In [28]:
# Worker function: builds the MinHash signature of a single movie
def minhash_worker(genre_text, movie_text, permutations, k):
    """Return the MinHash of the genre-guided shingles of one movie.

    Args:
        genre_text: genre tokens of the movie, whitespace-separated.
        movie_text: combined movie tokens, whitespace-separated.
        permutations: `num_perm` passed to MinHash.
        k: shingle size passed to create_genre_guided_shingles.
    """
    movie_shingles = create_genre_guided_shingles(genre_text, movie_text, k)

    signature = MinHash(num_perm=permutations)
    # MinHash.update takes bytes, so each shingle is UTF-8 encoded first
    for piece in movie_shingles:
        signature.update(piece.encode('utf8'))

    return signature


# Function to build a MinHashLSHForest from genre/movie metadata
def generate_forest(genre_texts, movie_texts, permutations, k):
    """
    Build and index a MinHashLSHForest holding one MinHash signature per movie.

    Args:
        genre_texts: list with the genre tokens of every movie
        movie_texts: list with the corresponding movie tokens (same order)
        permutations: number of MinHash permutations (controls accuracy)
        k: number of tokens per genre-guided shingle (shingle size)

    Returns:
        The indexed forest; each signature is stored under the position of its
        movie in the input lists.
    """
    start_time = time.time()

    # Use every CPU core but one, and never fewer than one
    num_cores = max(1, multiprocessing.cpu_count() - 1)
    print(f"Using {num_cores} cores for parallel MinHash creation...")

    # One delayed minhash_worker call per movie, executed by joblib in parallel
    signature_jobs = (
        delayed(minhash_worker)(genre_texts[position], movie_texts[position], permutations, k)
        for position in range(len(genre_texts))
    )
    signatures = Parallel(n_jobs=num_cores)(signature_jobs)

    # The forest must be created with the same number of permutations as the signatures
    forest = MinHashLSHForest(num_perm=permutations)
    for key, signature in enumerate(signatures):
        forest.add(key, signature)

    # Keys only become searchable once index() has been called
    forest.index()

    print('It took %.2f seconds to build forest.' % (time.time() - start_time))

    return forest


# Function to query MinHash Forest (using fixed shingle size)

In [29]:
def predict(idx, df_cleaned, permutations, num_results, forest, k, display_df=None):
    """
    Find the movies most similar to the movie at index `idx` using the MinHash LSH Forest.

    Args:
        idx: index label of the query movie in `df_cleaned`.
        df_cleaned: frame holding the cleaned 'genres' and 'movie_combined' text.
        permutations: number of MinHash permutations; must match the forest.
        num_results: maximum number of similar movies to return.
        forest: indexed MinHashLSHForest whose keys are index labels of `df_cleaned`.
        k: shingle size used when the forest was built.
        display_df: frame with the same index as `df_cleaned` from which the
            readable output columns are taken. Defaults to the notebook-level
            `df_cleaned_english`.

    Returns:
        A DataFrame with at most `num_results` rows (the query movie excluded),
        sorted by estimated Jaccard similarity in descending order, with an
        added 'similarity' column. None if the forest returns no candidates.
    """
    if display_df is None:
        # Fall back to the readable copy created earlier in the notebook
        display_df = df_cleaned_english

    # Signature of the query movie, built exactly like the ones stored in the forest
    query_minhash = minhash_worker(
        df_cleaned.loc[idx, 'genres'],
        df_cleaned.loc[idx, 'movie_combined'],
        permutations,
        k,
    )

    # Ask for one extra candidate because the query movie is usually among the hits
    candidate_ids = forest.query(query_minhash, num_results + 1)
    if len(candidate_ids) == 0:
        return None

    # Estimate the Jaccard similarity of every candidate except the query itself
    similarities = []
    for neighbor_idx in candidate_ids:
        if neighbor_idx == idx:
            continue
        neighbor_minhash = minhash_worker(
            df_cleaned.loc[neighbor_idx, 'genres'],
            df_cleaned.loc[neighbor_idx, 'movie_combined'],
            permutations,
            k,
        )
        similarities.append((neighbor_idx, query_minhash.jaccard(neighbor_minhash)))

    # Sort by similarity descending (nearest to furthest)
    similarities.sort(key=lambda pair: pair[1], reverse=True)

    # If the query movie was not among the candidates, nothing was removed above
    # and there is one candidate too many; keep only the best `num_results`.
    similarities = similarities[:num_results]

    output_columns = ['movie_title', 'content_rating', 'genres', 'directors', 'actors']
    neighbor_ids = [neighbor_idx for neighbor_idx, _ in similarities]

    # .copy() so that adding the similarity column never touches display_df
    result = display_df.loc[neighbor_ids, output_columns].copy()
    result['similarity'] = [sim for _, sim in similarities]

    return result


# Creating Forest (Using Fixed Shingle Size)

In [30]:
# Set number of Permutations (used for every MinHash signature and for the forest)
permutations = 1024
# Shingle size: each shingle is one genre token plus (k - 1) movie tokens
k = 2

In [31]:
forest = generate_forest(
    genre_texts = df_cleaned['genres'].tolist(),
    movie_texts = df_cleaned['movie_combined'].tolist(),
    permutations = permutations,
    k = k
)

Using 31 cores for parallel MinHash creation...
It took 5.41 seconds to build forest.


## Testing out the model 

In [None]:
# used to find the id of a movie by its name
def search_movies(df, search_string):
    """
    Searches for movie titles containing the given search string (case-insensitive).

    The search string is matched literally, not as a regular expression, so
    titles containing characters such as '(', '*' or '?' can be searched safely.
    Rows whose title is missing never match.

    Returns a DataFrame with the 'movie_title', 'directors' and 'genres' columns
    of the matching rows, keeping their original indexes.
    """
    mask = df['movie_title'].str.contains(
        search_string, case=False, na=False, regex=False
    )
    return df.loc[mask, ['movie_title', 'directors', 'genres']].copy()


In [None]:
# generates the recommendations based on a movie index provided
def recommend_movies(idx, num_recommendations=10):
    """
    Print the details of the movie at index `idx` and return its recommendations.

    Uses the notebook-level `df_cleaned`, `df_cleaned_english`, `permutations`,
    `forest` and `k`.

    Args:
        idx: index label of the input movie.
        num_recommendations: how many recommendations to request (default 10).

    Returns:
        Whatever `predict` returns for this movie.
    """
    movie_info = df_cleaned_english.loc[
        idx, ['movie_title', 'content_rating', 'genres', 'directors', 'actors']
    ]
    input_title = movie_info['movie_title']

    results = predict(idx, df_cleaned, permutations, num_recommendations, forest, k)

    print("Input Movie:\n", movie_info)
    print(f'\nTop {num_recommendations} recommendations for [{input_title}]:')

    return results


In [None]:
# search for the movie here to get its id 
search_list = search_movies(df_cleaned_english, "la la land")
search_list

Unnamed: 0,movie_title,directors,genres
8781,La La Land,Damien Chazelle,"Comedy, Drama, Musical & Performing Arts"


In [None]:
# run the code below to test a specific movie; make sure to input the correct id

# recommendations = recommend_movies(idx=2934)
# recommendations

Now that we know which movie we want to search for, let's look up its top 10 recommendations.

In [35]:
recommendations = recommend_movies(idx=2934)
recommendations

Input Movie:
 movie_title                                             Avengers: Age of Ultron
content_rating                                                            PG-13
genres                            Action & Adventure, Science Fiction & Fantasy
directors                                                           Joss Whedon
actors            Robert Downey Jr., Chris Evans, Mark Ruffalo, Chris Hemsworth
Name: 2934, dtype: object

Top 10 recommendations for [Avengers: Age of Ultron]:


Unnamed: 0,movie_title,content_rating,genres,directors,actors,similarity
2936,Avengers: Infinity War,PG-13,"Action & Adventure, Science Fiction & Fantasy",Anthony Russo,"Robert Downey Jr., Chris Hemsworth, Mark Ruffalo, Chris Evans",0.314453
4136,Captain America: Civil War,PG-13,"Action & Adventure, Science Fiction & Fantasy",Anthony Russo,"Chris Evans, Robert Downey Jr., Scarlett Johansson, Sebastian Stan",0.121094
15477,Thor: The Dark World,PG-13,"Action & Adventure, Science Fiction & Fantasy",Alan Taylor,"Chris Hemsworth, Natalie Portman, Tom Hiddleston, Stellan Skarsgård",0.048828
4137,Captain America: The First Avenger,PG-13,"Action & Adventure, Mystery & Suspense",Joe Johnston,"Chris Evans, Tommy Lee Jones, Hugo Weaving, Hayley Atwell",0.041992
15475,Thor,PG-13,"Action & Adventure, Drama, Science Fiction & Fantasy",Kenneth Branagh,"Chris Hemsworth, Natalie Portman, Anthony Hopkins, Tom Hiddleston",0.039062
14538,The Huntsman: Winter's War,PG-13,"Action & Adventure, Drama, Science Fiction & Fantasy",Cedric Nicolas-Troyan,"Chris Hemsworth, Charlize Theron, Emily Blunt, Jessica Chastain",0.035156
9872,Men in Black International,PG-13,"Action & Adventure, Comedy, Science Fiction & Fantasy",F. Gary Gray,"Chris Hemsworth, Tessa Thompson, Rebecca Ferguson, Kumail Nanjiani",0.032227
7977,In the Heart of the Sea,PG-13,"Action & Adventure, Drama",Ron Howard,"Chris Hemsworth, Benjamin Walker, Cillian Murphy, Ben Whishaw",0.016602
3556,Blackhat,R,"Action & Adventure, Drama",Michael Mann,"Chris Hemsworth, Viola Davis, Tang Wei, Leehom Wang",0.012695
5970,Extraction,R,"Action & Adventure, Drama, Mystery & Suspense",Sam Hargrave,"Chris Hemsworth, David Harbour, Derek Luke, Nico Pimparé Gaetan",0.009766


In [36]:
recommendations = recommend_movies(idx=209)
recommendations

Input Movie:
 movie_title                                                     The Hangover
content_rating                                                             R
genres                                                                Comedy
directors                                                      Todd Phillips
actors            Bradley Cooper, Ed Helms, Zach Galifianakis, Justin Bartha
Name: 209, dtype: object

Top 10 recommendations for [The Hangover]:


Unnamed: 0,movie_title,content_rating,genres,directors,actors,similarity
14448,The Hangover Part II,R,Comedy,Todd Phillips,"Bradley Cooper, Ed Helms, Zach Galifianakis, Justin Bartha",0.683594
14449,The Hangover Part III,R,Comedy,Todd Phillips,"Bradley Cooper, Ed Helms, Zach Galifianakis, Ken Jeong",0.539062
1267,Road Trip,R,Comedy,Todd Phillips,"Breckin Meyer, Seann William Scott, Amy Smart, Paulo Costanzo",0.125
4018,Burnt,R,Comedy,John Wells,"Bradley Cooper, Sienna Miller, Lily James, Uma Thurman",0.110352
12761,Silver Linings Playbook,R,Comedy,David O. Russell,"Bradley Cooper, Jennifer Lawrence, Robert De Niro, Jacki Weaver",0.109375
6100,Father Figures,R,Comedy,Lawrence Sher,"Owen Wilson, Ed Helms, J.K. Simmons, Katt Williams",0.092773
13594,Tag,R,Comedy,Jeff Tomsic,"Ed Helms, Jake Johnson (XVI), Hannibal Buress, Jon Hamm",0.082031
2449,All About Steve,PG-13,Comedy,Phil Traill,"Sandra Bullock, Bradley Cooper, Thomas Haden Church, Ken Jeong",0.06543
2508,Aloha,PG-13,"Action & Adventure, Comedy",Cameron Crowe,"Bradley Cooper, Emma Stone, Rachel McAdams, Danny McBride (IV)",0.057617
2177,The A-Team,PG-13,"Action & Adventure, Comedy",Joe Carnahan,"Liam Neeson, Bradley Cooper, Sharlto Copley, Jessica Biel",0.030273


In [37]:
recommendations = recommend_movies(idx=8781)
recommendations

Input Movie:
 movie_title                                                La La Land
content_rating                                                  PG-13
genres                       Comedy, Drama, Musical & Performing Arts
directors                                             Damien Chazelle
actors            Ryan Gosling, Emma Stone, John Legend, J.K. Simmons
Name: 8781, dtype: object

Top 10 recommendations for [La La Land]:


Unnamed: 0,movie_title,content_rating,genres,directors,actors,similarity
13020,Song to Song,R,"Comedy, Drama",Terrence Malick,"Ryan Gosling, Rooney Mara, Michael Fassbender, Natalie Portman",0.101562
6241,First Man,PG-13,Drama,Damien Chazelle,"Ryan Gosling, Claire Foy, Jason Clarke, Kyle Chandler",0.094727
7099,Guy and Madeline on a Park Bench,NR,"Drama, Musical & Performing Arts",Damien Chazelle,"Jason Palmer, Desiree Garcia, Sandha Khin, Frank Garvin",0.068359
14940,The Place Beyond The Pines,R,Drama,Derek Cianfrance,"Bradley Cooper, Ryan Gosling, Eva Mendes, Rose Byrne",0.045898
12842,The Slaughter Rule,R,Drama,Alex Smith,"Ryan Gosling, David Morse, Clea DuVall, Eddie Spears",0.044922
10735,The Notebook,PG-13,"Drama, Romance",Nick Cassavetes,"Ryan Gosling, Rachel McAdams, James Garner, Gena Rowlands",0.039062
15985,The United States of Leland,R,"Drama, Mystery & Suspense",Matthew Ryan Hoge,"Don Cheadle, Ryan Gosling, Chris Klein, Jena Malone",0.036133
3565,Blade Runner 2049,R,"Action & Adventure, Drama, Science Fiction & Fantasy",Denis Villeneuve,"Ryan Gosling, Harrison Ford, Ana de Armas, Sylvia Hoeks",0.03418
5558,Drive,R,"Action & Adventure, Drama, Mystery & Suspense",Nicolas Winding Refn,"Ryan Gosling, Carey Mulligan, Albert Brooks, Bryan Cranston",0.032227
13253,Stay,R,"Drama, Mystery & Suspense, Science Fiction & Fantasy",Marc Forster,"Ewan McGregor, Ryan Gosling, Naomi Watts, Kate Burton",0.026367
