In [1]:
%%time

! pip install datasketch

CPU times: user 20.2 ms, sys: 12.6 ms, total: 32.9 ms
Wall time: 1.94 s


In [2]:
import numpy as np
import pandas as pd
import time
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from datasketch import MinHash
from datasketch import MinHashLSHForest
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Fetch the NLTK resources this notebook relies on. clean_text reads the
# 'stopwords' corpus and lemmatizes with WordNet ('wordnet').
# NOTE(review): 'punkt' is not referenced by any cell visible here - confirm it is still needed.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/laertxhumari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/laertxhumari/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/laertxhumari/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Load the movies CSV directly from the project's GitHub repository (requires network access)
df = pd.read_csv("https://raw.githubusercontent.com/LaertXh/LLM-Project/refs/heads/main/Data/rotten_tomatoes_movies.csv")

In [5]:
df.shape

(17712, 22)

In [6]:
df.head()

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,...,Sony Pictures Classics,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19
2,m/10,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,...,Waner Bros.,Fresh,67.0,24.0,Spilled,53.0,14684.0,2,16,8
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,...,Criterion Collection,Certified-Fresh,100.0,54.0,Upright,97.0,105386.0,6,54,0
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,...,Disney,Fresh,89.0,27.0,Upright,74.0,68918.0,5,24,3


### Dataset Cleaning

In [7]:
#Drop unneeded columns
# Reassign instead of using inplace=True, and ignore labels that are already gone,
# so this cell can be re-run on the same kernel without raising KeyError.
df = df.drop(
    columns=[
        'rotten_tomatoes_link',
        'critics_consensus',
        'original_release_date',
        'streaming_release_date',
        'runtime',
        'authors',
        'production_company',
        'tomatometer_status',
        'tomatometer_rating',
        'tomatometer_count',
        'audience_status',
        'audience_rating',
        'audience_count',
        'tomatometer_top_critics_count',
        'tomatometer_fresh_critics_count',
        'tomatometer_rotten_critics_count',
    ],
    errors='ignore',
)


In [8]:
df.head()

Unnamed: 0,movie_title,movie_info,content_rating,genres,directors,actors
0,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Logan Lerman, Brandon T. Jackson, Alexandra Da..."
1,Please Give,Kate (Catherine Keener) and her husband Alex (...,R,Comedy,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R..."
2,10,"A successful, middle-aged Hollywood songwriter...",R,"Comedy, Romance",Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ..."
3,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,NR,"Classics, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G...."
4,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,"James Mason, Kirk Douglas, Paul Lukas, Peter L..."


#### Keep only the first director, and the first 4 actors

In [9]:
# Keep only the first name of each comma-separated director list
first_director = df['directors'].str.split(',').str.get(0)
df['directors'] = first_director

In [10]:
df['directors'].head()

0       Chris Columbus
1    Nicole Holofcener
2        Blake Edwards
3         Sidney Lumet
4    Richard Fleischer
Name: directors, dtype: object

In [11]:
# Keep at most the first four entries of each comma-separated actor list
actor_lists = df['actors'].str.split(',')
df['actors'] = actor_lists.str[:4].str.join(',')

In [12]:
df['actors'].head()

0    Logan Lerman, Brandon T. Jackson, Alexandra Da...
1    Catherine Keener, Amanda Peet, Oliver Platt, R...
2    Dudley Moore, Bo Derek, Julie Andrews, Robert ...
3    Martin Balsam, John Fiedler, Lee J. Cobb, E.G....
4    James Mason, Kirk Douglas, Paul Lukas, Peter L...
Name: actors, dtype: object

In [13]:
df.head()

Unnamed: 0,movie_title,movie_info,content_rating,genres,directors,actors
0,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Logan Lerman, Brandon T. Jackson, Alexandra Da..."
1,Please Give,Kate (Catherine Keener) and her husband Alex (...,R,Comedy,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R..."
2,10,"A successful, middle-aged Hollywood songwriter...",R,"Comedy, Romance",Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ..."
3,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,NR,"Classics, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G...."
4,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,"James Mason, Kirk Douglas, Paul Lukas, Peter L..."


#### Check Duplicates and drop all null values

In [14]:
# Collect rows that repeat an earlier row in every column
is_duplicate = df.duplicated()
duplicate_rows = df[is_duplicate]

In [15]:
print(duplicate_rows)

Empty DataFrame
Columns: [movie_title, movie_info, content_rating, genres, directors, actors]
Index: []


In [16]:
#Identify null values
df.isnull().sum()

movie_title         0
movie_info        321
content_rating      0
genres             19
directors         194
actors            352
dtype: int64

In [17]:
# Discard any row missing one of the text fields used to build the recommender
required_columns = ['movie_info', 'genres', 'directors', 'actors']
df_cleaned = df.dropna(subset=required_columns)

In [18]:
df_cleaned.isnull().sum()

movie_title       0
movie_info        0
content_rating    0
genres            0
directors         0
actors            0
dtype: int64

In [19]:
df_cleaned.head()

Unnamed: 0,movie_title,movie_info,content_rating,genres,directors,actors
0,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Logan Lerman, Brandon T. Jackson, Alexandra Da..."
1,Please Give,Kate (Catherine Keener) and her husband Alex (...,R,Comedy,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R..."
2,10,"A successful, middle-aged Hollywood songwriter...",R,"Comedy, Romance",Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ..."
3,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,NR,"Classics, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G...."
4,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,"James Mason, Kirk Douglas, Paul Lukas, Peter L..."


#### Reset the index; we will use this dataset moving forward

In [20]:
# Reset the index so row labels are contiguous again after dropna removed rows
df_cleaned = df_cleaned.reset_index(drop=True)
# Snapshot taken before later cells overwrite these columns with tokenized text;
# predict() and the title search read the human-readable values from this copy.
df_cleaned_english = df_cleaned.copy()
df_cleaned

Unnamed: 0,movie_title,movie_info,content_rating,genres,directors,actors
0,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Logan Lerman, Brandon T. Jackson, Alexandra Da..."
1,Please Give,Kate (Catherine Keener) and her husband Alex (...,R,Comedy,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R..."
2,10,"A successful, middle-aged Hollywood songwriter...",R,"Comedy, Romance",Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ..."
3,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,NR,"Classics, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G...."
4,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,"James Mason, Kirk Douglas, Paul Lukas, Peter L..."
...,...,...,...,...,...,...
16903,Zoom,"Capt. Zoom, or Jack (Tim Allen), as he is now ...",PG,"Action & Adventure, Comedy, Kids & Family",Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Spencer..."
16904,Zoot Suit,Mexican-American gangster Henry Reyna (Daniel ...,R,"Drama, Musical & Performing Arts",Luis Valdez,"Daniel Valdez, Edward James Olmos, Charles Aid..."
16905,Zootopia,From the largest elephant to the smallest shre...,PG,"Action & Adventure, Animation, Comedy",Byron Howard,"J.K. Simmons, Kristen Bell, Octavia Spencer, A..."
16906,Zulu,"In 1879, the Zulu nation hands colonial Britis...",PG,"Classics, Drama",Cy Endfield,"Stanley Baker, Jack Hawkins, Ulla Jacobsson, J..."


In [21]:
# Display the full contents of each cell 
pd.set_option('display.max_colwidth', None)

In [22]:
df_cleaned['movie_info'].head()

0                                       Always trouble-prone, the life of teenager Percy Jackson (Logan Lerman) gets a lot more complicated when he learns he's the son of the Greek god Poseidon. At a training ground for the children of deities, Percy learns to harness his divine powers and prepare for the adventure of a lifetime: he must prevent a feud among the Olympians from erupting into a devastating war on Earth, and rescue his mother from the clutches of Hades, god of the underworld.
1       Kate (Catherine Keener) and her husband Alex (Oliver Platt) are wealthy New Yorkers who prowl estate sales and make a tidy profit reselling items they bought cheaply. They buy the apartment next door and plan to remodel just as soon as its current occupant, a cranky old woman, dies. Kate is troubled by the way she and her husband earn a living, and tries to assuage her guilt by befriending her tenant and the woman's granddaughters, but her overtures lead to unexpected consequences.
2         

In [23]:
df_cleaned.shape

(16908, 6)

In [24]:
# Collapse each run of carriage returns, newlines and tabs in the description into one space
newline_runs = r'[\r\n\t]+'
df_cleaned['movie_info'] = df_cleaned['movie_info'].str.replace(newline_runs, ' ', regex=True)

In [25]:
df_cleaned['movie_info'].head()

0                                       Always trouble-prone, the life of teenager Percy Jackson (Logan Lerman) gets a lot more complicated when he learns he's the son of the Greek god Poseidon. At a training ground for the children of deities, Percy learns to harness his divine powers and prepare for the adventure of a lifetime: he must prevent a feud among the Olympians from erupting into a devastating war on Earth, and rescue his mother from the clutches of Hades, god of the underworld.
1       Kate (Catherine Keener) and her husband Alex (Oliver Platt) are wealthy New Yorkers who prowl estate sales and make a tidy profit reselling items they bought cheaply. They buy the apartment next door and plan to remodel just as soon as its current occupant, a cranky old woman, dies. Kate is troubled by the way she and her husband earn a living, and tries to assuage her guilt by befriending her tenant and the woman's granddaughters, but her overtures lead to unexpected consequences.
2         

### Remove Spaces from names of actors and directors

In [26]:
#Function to clean_text
def clean_text(text):
    """Normalize free text into a space-separated string of lemmatized tokens.

    Steps: collapse whitespace runs, delete every character in
    string.punctuation, lower-case, split on non-word characters, drop English
    stop words and empty tokens, then pass each remaining token through
    WordNetLemmatizer.lemmatize with its default arguments.

    Parameters
    ----------
    text : str
        Raw text to normalize.

    Returns
    -------
    str
        Surviving tokens joined by single spaces; '' when nothing survives.
    """
    # Build the lemmatizer, stop-word set and punctuation table on first use and
    # keep them on the function object, instead of rebuilding them on every call.
    tools = getattr(clean_text, '_tools', None)
    if tools is None:
        tools = (
            WordNetLemmatizer(),
            frozenset(stopwords.words('english')),
            str.maketrans('', '', string.punctuation),
        )
        clean_text._tools = tools
    lem, stop, punct_table = tools

    text = re.sub(r'\s+', ' ', text)
    text = text.translate(punct_table).lower()
    # re.split returns '' at the edges when the text starts or ends with a
    # non-word character (e.g. a leading '¡' or a trailing space left behind by
    # removed punctuation); skip those so the result has no stray spaces.
    tokens = [
        lem.lemmatize(word)
        for word in re.split(r'\W+', text)
        if word and word not in stop
    ]
    return ' '.join(tokens)

In [27]:
def _strip_spaces_within_names(csv_names):
    """Remove every space inside each comma-separated entry and re-join with ', '."""
    return ', '.join(entry.replace(' ', '') for entry in csv_names.split(','))

# Directors hold a single name at this point, so a plain replace is enough
df_cleaned['directors'] = df_cleaned['directors'].str.replace(' ', '')
# Actors and genres are comma-separated lists: squash each entry into one token
for list_column in ('actors', 'genres'):
    df_cleaned[list_column] = df_cleaned[list_column].apply(_strip_spaces_within_names)

In [28]:
def weight_and_salt_names(name_string, n):
    """Repeat the first four comma-separated names with decreasing weights.

    Each name is emitted once unchanged and then (weight - 1) more times with a
    distinct '__salt<i>' suffix, so every repetition is a different token.
    The weights by position are n, n // 2 + 1, n // 2 and n // 3; names after
    the fourth are dropped, and a name whose weight is <= 0 is skipped.

    Parameters
    ----------
    name_string : str
        Comma-separated names. Spaces inside a name are removed.
    n : int
        Weight of the first name; the other weights are derived from it.

    Returns
    -------
    str
        The weighted tokens joined by single spaces ('' if there are no names).
    """
    # Drop empty entries (from '', leading/trailing or doubled commas). Without
    # this they would be emitted as bare '__salt<i>' tokens shared by unrelated rows.
    names = [name.strip().replace(' ', '') for name in name_string.split(',')]
    names = [name for name in names if name]

    weights = (n, (n // 2) + 1, n // 2, n // 3)

    salted_names = []
    # zip stops at the shorter sequence, so at most four names are used.
    for name, weight in zip(names, weights):
        if weight <= 0:
            continue
        salted_names.append(name)  # first occurrence without salt
        salted_names.extend(f"{name}__salt{i}" for i in range(weight - 1))

    return ' '.join(salted_names)


# Apply to actors and directors
# NOTE: this cell overwrites its input columns with space-separated salted tokens,
# so it is meant to run exactly once per kernel session.
for name_column, first_weight in (('actors', 6), ('directors', 5)):
    df_cleaned[name_column] = df_cleaned[name_column].apply(weight_and_salt_names, n=first_weight)

# Titles: normalize, turn each word into its own comma-separated "name", then weight
df_cleaned['movie_title'] = (
    df_cleaned['movie_title']
    .apply(clean_text)
    .str.replace(' ', ',', regex=False)
    .apply(weight_and_salt_names, n=3)
)


#### Extract keywords from movie_info

In [29]:
# One document per movie; fillna('') replaces any missing description with an empty string
corpus = df_cleaned['movie_info'].fillna('')

# Fit TF-IDF over all descriptions. X and features are module-level names that
# extract_keywords reads directly.
vectorizer = TfidfVectorizer(stop_words='english')  # No max_features now
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names_out()

def extract_keywords(row_index, top_n=8):
    """Return the top_n highest-scoring terms of one row of X, space-separated.

    Reads the module-level TF-IDF matrix ``X`` and vocabulary ``features``.
    Terms are ordered by descending score; ties keep the order in which the
    row stores them.
    """
    row = X[row_index]
    ranked = sorted(zip(row.indices, row.data), key=lambda pair: pair[1], reverse=True)
    return " ".join(features[column] for column, _ in ranked[:top_n])

# One keyword string per row of the TF-IDF matrix, in row order
df_cleaned['movie_info_reduced'] = list(map(extract_keywords, range(X.shape[0])))


In [30]:
# Combining relevant columns for cleaning and then LSH model training
combined_columns = ['movie_title', 'actors', 'directors', 'movie_info_reduced', 'content_rating']
movie_combined = df_cleaned[combined_columns[0]]
for extra_column in combined_columns[1:]:
    # Same left-to-right "text + ' ' + next" concatenation, one column at a time
    movie_combined = movie_combined + ' ' + df_cleaned[extra_column]
df_cleaned['movie_combined'] = movie_combined

In [31]:
#Function to clean_text
def clean_text(text):
    """Normalize free text into a space-separated string of lemmatized tokens.

    NOTE(review): clean_text is also defined in an earlier cell; this later
    definition replaces it once the cell runs. Consolidate to one definition.

    Steps: collapse whitespace runs, delete every character in
    string.punctuation, lower-case, split on non-word characters, drop English
    stop words and empty tokens, then pass each remaining token through
    WordNetLemmatizer.lemmatize with its default arguments.

    Parameters
    ----------
    text : str
        Raw text to normalize.

    Returns
    -------
    str
        Surviving tokens joined by single spaces; '' when nothing survives.
    """
    # Build the lemmatizer, stop-word set and punctuation table on first use and
    # keep them on the function object, instead of rebuilding them on every call.
    tools = getattr(clean_text, '_tools', None)
    if tools is None:
        tools = (
            WordNetLemmatizer(),
            frozenset(stopwords.words('english')),
            str.maketrans('', '', string.punctuation),
        )
        clean_text._tools = tools
    lem, stop, punct_table = tools

    text = re.sub(r'\s+', ' ', text)
    text = text.translate(punct_table).lower()
    # re.split returns '' at the edges when the text starts or ends with a
    # non-word character (e.g. a leading '¡' or a trailing space left behind by
    # removed punctuation); skip those so the result has no stray spaces.
    tokens = [
        lem.lemmatize(word)
        for word in re.split(r'\W+', text)
        if word and word not in stop
    ]
    return ' '.join(tokens)

In [32]:
# Run clean_text over the combined metadata string and over the genre list
for text_column in ('movie_combined', 'genres'):
    df_cleaned[text_column] = df_cleaned[text_column].apply(clean_text)
df_cleaned.head(5)

Unnamed: 0,movie_title,movie_info,content_rating,genres,directors,actors,movie_info_reduced,movie_combined
0,percy percy__salt0 percy__salt1 jackson jackson__salt0 olympian lightning,"Always trouble-prone, the life of teenager Percy Jackson (Logan Lerman) gets a lot more complicated when he learns he's the son of the Greek god Poseidon. At a training ground for the children of deities, Percy learns to harness his divine powers and prepare for the adventure of a lifetime: he must prevent a feud among the Olympians from erupting into a devastating war on Earth, and rescue his mother from the clutches of Hades, god of the underworld.",PG,actionadventure comedy drama sciencefictionfantasy,ChrisColumbus ChrisColumbus__salt0 ChrisColumbus__salt1 ChrisColumbus__salt2 ChrisColumbus__salt3,LoganLerman LoganLerman__salt0 LoganLerman__salt1 LoganLerman__salt2 LoganLerman__salt3 LoganLerman__salt4 BrandonT.Jackson BrandonT.Jackson__salt0 BrandonT.Jackson__salt1 BrandonT.Jackson__salt2 AlexandraDaddario AlexandraDaddario__salt0 AlexandraDaddario__salt1 JakeAbel JakeAbel__salt0,percy god olympians deities erupting learns poseidon hades,percy percysalt0 percysalt1 jackson jacksonsalt0 olympian lightning loganlerman loganlermansalt0 loganlermansalt1 loganlermansalt2 loganlermansalt3 loganlermansalt4 brandontjackson brandontjacksonsalt0 brandontjacksonsalt1 brandontjacksonsalt2 alexandradaddario alexandradaddariosalt0 alexandradaddariosalt1 jakeabel jakeabelsalt0 chriscolumbus chriscolumbussalt0 chriscolumbussalt1 chriscolumbussalt2 chriscolumbussalt3 percy god olympian deity erupting learns poseidon hades pg
1,please please__salt0 please__salt1 give give__salt0,"Kate (Catherine Keener) and her husband Alex (Oliver Platt) are wealthy New Yorkers who prowl estate sales and make a tidy profit reselling items they bought cheaply. They buy the apartment next door and plan to remodel just as soon as its current occupant, a cranky old woman, dies. Kate is troubled by the way she and her husband earn a living, and tries to assuage her guilt by befriending her tenant and the woman's granddaughters, but her overtures lead to unexpected consequences.",R,comedy,NicoleHolofcener NicoleHolofcener__salt0 NicoleHolofcener__salt1 NicoleHolofcener__salt2 NicoleHolofcener__salt3,CatherineKeener CatherineKeener__salt0 CatherineKeener__salt1 CatherineKeener__salt2 CatherineKeener__salt3 CatherineKeener__salt4 AmandaPeet AmandaPeet__salt0 AmandaPeet__salt1 AmandaPeet__salt2 OliverPlatt OliverPlatt__salt0 OliverPlatt__salt1 RebeccaHall RebeccaHall__salt0,kate tidy reselling cheaply remodel occupant granddaughters assuage,please pleasesalt0 pleasesalt1 give givesalt0 catherinekeener catherinekeenersalt0 catherinekeenersalt1 catherinekeenersalt2 catherinekeenersalt3 catherinekeenersalt4 amandapeet amandapeetsalt0 amandapeetsalt1 amandapeetsalt2 oliverplatt oliverplattsalt0 oliverplattsalt1 rebeccahall rebeccahallsalt0 nicoleholofcener nicoleholofcenersalt0 nicoleholofcenersalt1 nicoleholofcenersalt2 nicoleholofcenersalt3 kate tidy reselling cheaply remodel occupant granddaughter assuage r
2,10 10__salt0 10__salt1,"A successful, middle-aged Hollywood songwriter falls hopelessly in love with the woman of his dreams, and even follows the girl and her new husband to their Mexican honeymoon resort. While his behavior seems sure to land him in trouble, out of the blue fate plays into his hands.",R,comedy romance,BlakeEdwards BlakeEdwards__salt0 BlakeEdwards__salt1 BlakeEdwards__salt2 BlakeEdwards__salt3,DudleyMoore DudleyMoore__salt0 DudleyMoore__salt1 DudleyMoore__salt2 DudleyMoore__salt3 DudleyMoore__salt4 BoDerek BoDerek__salt0 BoDerek__salt1 BoDerek__salt2 JulieAndrews JulieAndrews__salt0 JulieAndrews__salt1 RobertWebber RobertWebber__salt0,hopelessly honeymoon songwriter sure blue aged resort plays,10 10salt0 10salt1 dudleymoore dudleymooresalt0 dudleymooresalt1 dudleymooresalt2 dudleymooresalt3 dudleymooresalt4 boderek bodereksalt0 bodereksalt1 bodereksalt2 julieandrews julieandrewssalt0 julieandrewssalt1 robertwebber robertwebbersalt0 blakeedwards blakeedwardssalt0 blakeedwardssalt1 blakeedwardssalt2 blakeedwardssalt3 hopelessly honeymoon songwriter sure blue aged resort play r
3,12 12__salt0 12__salt1 angry angry__salt0 men twelve,"Following the closing arguments in a murder trial, the 12 members of the jury must deliberate, with a guilty verdict meaning death for the accused, an inner-city teen. As the dozen men try to reach a unanimous decision while sequestered in a room, one juror (Henry Fonda) casts considerable doubt on elements of the case. Personal issues soon rise to the surface, and conflict threatens to derail the delicate process that will decide one boy's fate.",NR,classic drama,SidneyLumet SidneyLumet__salt0 SidneyLumet__salt1 SidneyLumet__salt2 SidneyLumet__salt3,MartinBalsam MartinBalsam__salt0 MartinBalsam__salt1 MartinBalsam__salt2 MartinBalsam__salt3 MartinBalsam__salt4 JohnFiedler JohnFiedler__salt0 JohnFiedler__salt1 JohnFiedler__salt2 LeeJ.Cobb LeeJ.Cobb__salt0 LeeJ.Cobb__salt1 E.G.Marshall E.G.Marshall__salt0,unanimous sequestered arguments juror deliberate verdict delicate dozen,12 12salt0 12salt1 angry angrysalt0 men twelve martinbalsam martinbalsamsalt0 martinbalsamsalt1 martinbalsamsalt2 martinbalsamsalt3 martinbalsamsalt4 johnfiedler johnfiedlersalt0 johnfiedlersalt1 johnfiedlersalt2 leejcobb leejcobbsalt0 leejcobbsalt1 egmarshall egmarshallsalt0 sidneylumet sidneylumetsalt0 sidneylumetsalt1 sidneylumetsalt2 sidneylumetsalt3 unanimous sequestered argument juror deliberate verdict delicate dozen nr
4,20000 20000__salt0 20000__salt1 league league__salt0 sea,"In 1866, Professor Pierre M. Aronnax (Paul Lukas) and his assistant Conseil (Peter Lorre), stranded in San Francisco by reports of a giant sea monster attacking ships in the Pacific Ocean, are invited to join an expedition to search for the creature. During the search, they and harpooner Ned Land (Kirk Douglas) are thrown overboard during an attack, eventually discovering that the supposed monster is actually a submarine piloted by the brilliant but haunted Captain Nemo (James Mason).",G,actionadventure drama kidsfamily,RichardFleischer RichardFleischer__salt0 RichardFleischer__salt1 RichardFleischer__salt2 RichardFleischer__salt3,JamesMason JamesMason__salt0 JamesMason__salt1 JamesMason__salt2 JamesMason__salt3 JamesMason__salt4 KirkDouglas KirkDouglas__salt0 KirkDouglas__salt1 KirkDouglas__salt2 PaulLukas PaulLukas__salt0 PaulLukas__salt1 PeterLorre PeterLorre__salt0,monster search 1866 aronnax conseil harpooner nemo lorre,20000 20000salt0 20000salt1 league leaguesalt0 sea jamesmason jamesmasonsalt0 jamesmasonsalt1 jamesmasonsalt2 jamesmasonsalt3 jamesmasonsalt4 kirkdouglas kirkdouglassalt0 kirkdouglassalt1 kirkdouglassalt2 paullukas paullukassalt0 paullukassalt1 peterlorre peterlorresalt0 richardfleischer richardfleischersalt0 richardfleischersalt1 richardfleischersalt2 richardfleischersalt3 monster search 1866 aronnax conseil harpooner nemo lorre g


# Shingles Size is Fixed

In [33]:
import itertools

def create_genre_guided_shingles(genre_text, movie_text, k):
    """Build shingles that pair every genre token with (k - 1) movie tokens.

    Both inputs are split on whitespace. For k <= 1 the genre tokens are
    returned as-is. Otherwise each shingle is "<genre> <combination>", where
    the combinations are all order-preserving (k - 1)-token selections from
    the movie tokens. If there are fewer than (k - 1) movie tokens, the single
    combination is all of them joined together (or none when there are no
    movie tokens). Output is grouped by genre token, in input order.
    """
    genres = genre_text.split()
    if k <= 1:
        return genres

    words = movie_text.split()
    width = k - 1
    if len(words) >= width:
        suffixes = list(map(' '.join, itertools.combinations(words, width)))
    elif words:
        suffixes = [' '.join(words)]
    else:
        suffixes = []

    return [f"{genre} {suffix}" for genre, suffix in itertools.product(genres, suffixes)]


# Creating function to generate MinHash Forest (using fixed shingle size)

- Initializing number of permutations in MinHash
- MinHashing the string on all shingles in each document
- Storing the MinHash of the string
- Generating a forest of all MinHashed strings
- Indexing the forest to make it searchable

In [34]:
import time
from joblib import Parallel, delayed
from datasketch import MinHash, MinHashLSHForest
import multiprocessing

def minhash_worker(genre_text, movie_text, permutations, k):
    """Return a MinHash (num_perm=permutations) updated with every genre-guided shingle of one record."""
    signature = MinHash(num_perm=permutations)
    for token in create_genre_guided_shingles(genre_text, movie_text, k):
        signature.update(token.encode('utf8'))
    return signature

def generate_forest(genre_texts, movie_texts, permutations, k):
    """
    Build an indexed MinHash LSH Forest with one entry per (genre, movie) text pair.

    genre_texts: list of genre strings (same length as movie_texts)
    movie_texts: list of combined movie metadata strings
    permutations: number of MinHash permutations
    k: shingle size (total words per shingle)

    Returns the MinHashLSHForest after index() has been called. Each record is
    keyed by its position in the input lists.

    Raises ValueError if genre_texts and movie_texts differ in length.
    """
    # Fail fast on unpaired inputs instead of silently ignoring extra movie
    # texts or hitting an IndexError part-way through the parallel run.
    if len(genre_texts) != len(movie_texts):
        raise ValueError(
            "genre_texts and movie_texts must have the same length "
            f"(got {len(genre_texts)} and {len(movie_texts)})"
        )

    # perf_counter is the clock intended for measuring elapsed time
    start_time = time.perf_counter()

    # Leave one core free for the notebook itself, but always use at least one
    num_cores = max(1, multiprocessing.cpu_count() - 1)
    print(f"Using {num_cores} cores for parallel MinHash creation...")

    # Parallel MinHash creation, one task per paired genre/movie record
    minhash_list = Parallel(n_jobs=num_cores)(
        delayed(minhash_worker)(genre_text, movie_text, permutations, k)
        for genre_text, movie_text in zip(genre_texts, movie_texts)
    )

    # Build the LSH Forest; keys are the positions of the records
    forest = MinHashLSHForest(num_perm=permutations)

    for idx, m in enumerate(minhash_list):
        forest.add(idx, m)

    # index() is called once after all records have been added
    forest.index()

    print('It took %.2f seconds to build forest.' % (time.perf_counter() - start_time))

    return forest


# Creating a function to query the MinHash Forest (using fixed shingle size)

- Preprocessing input text into fixed size shingles
- Using the same number of permutations for the MinHash as was used to build the forest
- Creating a MinHash on the input text using all shingles
- Querying the forest with MinHash and return the number of requested recommendations
- Returning the title and details of each recommended movie

In [35]:
def predict(idx, df_cleaned, permutations, num_results, forest, k, display_df=None):
    """
    Find up to num_results nearest neighbors for the movie at index idx,
    excluding the movie itself from the recommendations.
    Also returns the shingles used for the query.

    idx: row label of the query movie in df_cleaned (also its key in the forest)
    df_cleaned: DataFrame with the cleaned 'genres' and 'movie_combined' columns
    permutations: number of MinHash permutations; must match the forest's
    num_results: maximum number of recommendations to return
    forest: indexed MinHashLSHForest to query
    k: shingle size passed to create_genre_guided_shingles
    display_df: DataFrame the returned rows are taken from; defaults to the
        module-level df_cleaned_english

    Returns (result, shingles). result is None when the forest returns no hits
    at all, otherwise a DataFrame with the movie_title, content_rating, genres,
    directors and actors columns of the neighbors.
    """
    if display_df is None:
        display_df = df_cleaned_english

    genre_text = df_cleaned.loc[idx, 'genres']
    movie_text = df_cleaned.loc[idx, 'movie_combined']

    shingles = create_genre_guided_shingles(genre_text, movie_text, k)

    m = MinHash(num_perm=permutations)
    for shingle in shingles:
        m.update(shingle.encode('utf8'))

    # Query for num_results + 1 to account for the movie itself
    idx_array = np.array(forest.query(m, num_results + 1))

    if len(idx_array) == 0:
        return None, shingles

    # Remove the movie itself, then cap at num_results: when the query movie is
    # not among the hits, the extra requested neighbor would otherwise leak through.
    idx_array = idx_array[idx_array != idx][:num_results]

    result = display_df.loc[idx_array, ['movie_title', 'content_rating', 'genres', 'directors', 'actors']]
    return result, shingles


# Using forest (with fixed shingle size) to make recommendations for one/more movies using 2 approaches. First is index and second is using Movie title

## Testing with dynamic approach 

First, let's locate the movie record based on the title.

In [36]:
def search_movies(df, search_string):
    """
    Searches for movie titles containing the given search string (case-insensitive).
    The string is matched literally, so characters such as '(', '?' or '+' are
    not interpreted as regular-expression syntax. Rows with a missing title never match.
    Returns a new DataFrame with the 'movie_title' and 'directors' columns of the
    matching rows, keeping their original indexes.
    """
    mask = df['movie_title'].str.contains(search_string, case=False, na=False, regex=False)
    return df.loc[mask, ['movie_title', 'directors']].copy()


In [37]:
search_list = search_movies(df_cleaned_english, "avenger")
search_list

Unnamed: 0,movie_title,directors
1179,The Avengers,Jeremiah S. Chechik
2934,Avengers: Age of Ultron,Joss Whedon
2935,Avengers: Endgame,Anthony Russo
2936,Avengers: Infinity War,Anthony Russo
4137,Captain America: The First Avenger,Joe Johnston
15211,The Toxic Avenger: Part II,Michael Herz
15670,The Toxic Avenger,Michael Herz
15671,Citizen Toxie: The Toxic Avenger IV,Lloyd Kaufman


# Creating Forest (Using Fixed Shingle Size)

In [38]:
# Set number of Permutations and the shingle size
permutations = 512  # MinHash permutations; the same value is passed to generate_forest and predict
k = 2  # shingle size: one genre token plus (k - 1) metadata tokens per shingle

In [39]:
# Build the forest from the cleaned genre and combined-metadata columns; the
# lists are in df_cleaned row order, so forest keys line up with the reset index.
forest = generate_forest(
    genre_texts = df_cleaned['genres'].tolist(),
    movie_texts = df_cleaned['movie_combined'].tolist(),
    permutations = permutations,
    k = k
)

Using 11 cores for parallel MinHash creation...
It took 17.44 seconds to build forest.


Now that we know which movie we want to search for, let's search for the top 10 recommendations.

In [40]:
idx = 2934
num_recommendations = 10
display_columns = ['movie_title', 'content_rating', 'genres', 'directors', 'actors']

# Human-readable title comes from the untouched snapshot, the query text from the cleaned frame
input_title = df_cleaned_english.loc[idx, 'movie_title']
input_text = df_cleaned.loc[idx, 'movie_combined']

results, test_shingles = predict(idx, df_cleaned, permutations, num_recommendations, forest, k)

movie_info = df_cleaned_english.loc[idx, display_columns]
print("Input Movie:\n", movie_info)
print(f'\nTop {num_recommendations} recommendations for [{input_title}]:')
results

Input Movie:
 movie_title                                             Avengers: Age of Ultron
content_rating                                                            PG-13
genres                            Action & Adventure, Science Fiction & Fantasy
directors                                                           Joss Whedon
actors            Robert Downey Jr., Chris Evans, Mark Ruffalo, Chris Hemsworth
Name: 2934, dtype: object

Top 10 recommendations for [Avengers: Age of Ultron]:


Unnamed: 0,movie_title,content_rating,genres,directors,actors
4136,Captain America: Civil War,PG-13,"Action & Adventure, Science Fiction & Fantasy",Anthony Russo,"Chris Evans, Robert Downey Jr., Scarlett Johansson, Sebastian Stan"
4137,Captain America: The First Avenger,PG-13,"Action & Adventure, Mystery & Suspense",Joe Johnston,"Chris Evans, Tommy Lee Jones, Hugo Weaving, Hayley Atwell"
7977,In the Heart of the Sea,PG-13,"Action & Adventure, Drama",Ron Howard,"Chris Hemsworth, Benjamin Walker, Cillian Murphy, Ben Whishaw"
14538,The Huntsman: Winter's War,PG-13,"Action & Adventure, Drama, Science Fiction & Fantasy",Cedric Nicolas-Troyan,"Chris Hemsworth, Charlize Theron, Emily Blunt, Jessica Chastain"
9872,Men in Black International,PG-13,"Action & Adventure, Comedy, Science Fiction & Fantasy",F. Gary Gray,"Chris Hemsworth, Tessa Thompson, Rebecca Ferguson, Kumail Nanjiani"
5970,Extraction,R,"Action & Adventure, Drama, Mystery & Suspense",Sam Hargrave,"Chris Hemsworth, David Harbour, Derek Luke, Nico Pimparé Gaetan"
15475,Thor,PG-13,"Action & Adventure, Drama, Science Fiction & Fantasy",Kenneth Branagh,"Chris Hemsworth, Natalie Portman, Anthony Hopkins, Tom Hiddleston"
15477,Thor: The Dark World,PG-13,"Action & Adventure, Science Fiction & Fantasy",Alan Taylor,"Chris Hemsworth, Natalie Portman, Tom Hiddleston, Stellan Skarsgård"
2936,Avengers: Infinity War,PG-13,"Action & Adventure, Science Fiction & Fantasy",Anthony Russo,"Robert Downey Jr., Chris Hemsworth, Mark Ruffalo, Chris Evans"
12504,Serenity,PG-13,"Action & Adventure, Science Fiction & Fantasy",Joss Whedon,"Nathan Fillion, Gina Torres, Alan Tudyk, Morena Baccarin"


In [41]:
test_shingles

['actionadventure avenger',
 'actionadventure avengersalt0',
 'actionadventure avengersalt1',
 'actionadventure age',
 'actionadventure agesalt0',
 'actionadventure ultron',
 'actionadventure robertdowneyjr',
 'actionadventure robertdowneyjrsalt0',
 'actionadventure robertdowneyjrsalt1',
 'actionadventure robertdowneyjrsalt2',
 'actionadventure robertdowneyjrsalt3',
 'actionadventure robertdowneyjrsalt4',
 'actionadventure chrisevans',
 'actionadventure chrisevanssalt0',
 'actionadventure chrisevanssalt1',
 'actionadventure chrisevanssalt2',
 'actionadventure markruffalo',
 'actionadventure markruffalosalt0',
 'actionadventure markruffalosalt1',
 'actionadventure chrishemsworth',
 'actionadventure chrishemsworthsalt0',
 'actionadventure josswhedon',
 'actionadventure josswhedonsalt0',
 'actionadventure josswhedonsalt1',
 'actionadventure josswhedonsalt2',
 'actionadventure josswhedonsalt3',
 'actionadventure ultron',
 'actionadventure maximoff',
 'actionadventure peacekeeping',
 'actio