In [1]:
%%time

! pip install datasketch

Defaulting to user installation because normal site-packages is not writeable
CPU times: total: 0 ns
Wall time: 1.25 s



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
import pandas as pd
import time
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from datasketch import MinHash
from datasketch import MinHashLSHForest
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Main\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Main\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Main\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
#Import the CSV file
df = pd.read_csv("https://raw.githubusercontent.com/LaertXh/LLM-Project/refs/heads/main/Data/rotten_tomatoes_movies.csv")

In [5]:
df.shape

(17712, 22)

In [6]:
df.head()

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,...,Sony Pictures Classics,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19
2,m/10,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,...,Waner Bros.,Fresh,67.0,24.0,Spilled,53.0,14684.0,2,16,8
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,...,Criterion Collection,Certified-Fresh,100.0,54.0,Upright,97.0,105386.0,6,54,0
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,...,Disney,Fresh,89.0,27.0,Upright,74.0,68918.0,5,24,3


### Dataset Cleaning

In [7]:
#Drop unneeded columns
df.drop(['rotten_tomatoes_link','critics_consensus', 'original_release_date', 'streaming_release_date', 'runtime', 'authors','production_company', 'tomatometer_status', 'tomatometer_rating', 'tomatometer_count', 'audience_status', 'audience_rating', 'audience_count', 'tomatometer_top_critics_count', 'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count'], axis=1, inplace = True)


In [8]:
df.head()

Unnamed: 0,movie_title,movie_info,content_rating,genres,directors,actors
0,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Logan Lerman, Brandon T. Jackson, Alexandra Da..."
1,Please Give,Kate (Catherine Keener) and her husband Alex (...,R,Comedy,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R..."
2,10,"A successful, middle-aged Hollywood songwriter...",R,"Comedy, Romance",Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ..."
3,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,NR,"Classics, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G...."
4,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,"James Mason, Kirk Douglas, Paul Lukas, Peter L..."


#### Keep only the first director, and the first 4 actors

In [9]:
#Limit the number of directors to the first listed director
df['directors'] = df['directors'].str.split(',').str[0]

In [10]:
df['directors'].head()

0       Chris Columbus
1    Nicole Holofcener
2        Blake Edwards
3         Sidney Lumet
4    Richard Fleischer
Name: directors, dtype: object

In [11]:
#Limit the number of actors to the first two listed actors
df['actors'] = df['actors'].str.split(',').str[:2].str.join(',')

In [12]:
df['actors'].head()

0    Logan Lerman, Brandon T. Jackson
1       Catherine Keener, Amanda Peet
2              Dudley Moore, Bo Derek
3         Martin Balsam, John Fiedler
4           James Mason, Kirk Douglas
Name: actors, dtype: object

In [13]:
df.head()

Unnamed: 0,movie_title,movie_info,content_rating,genres,directors,actors
0,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Logan Lerman, Brandon T. Jackson"
1,Please Give,Kate (Catherine Keener) and her husband Alex (...,R,Comedy,Nicole Holofcener,"Catherine Keener, Amanda Peet"
2,10,"A successful, middle-aged Hollywood songwriter...",R,"Comedy, Romance",Blake Edwards,"Dudley Moore, Bo Derek"
3,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,NR,"Classics, Drama",Sidney Lumet,"Martin Balsam, John Fiedler"
4,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,"James Mason, Kirk Douglas"


#### Check Duplicates and drop all null values

In [14]:
#Check for duplicate rows
duplicate_rows = df[df.duplicated()]

In [15]:
print(duplicate_rows)

Empty DataFrame
Columns: [movie_title, movie_info, content_rating, genres, directors, actors]
Index: []


In [16]:
#Identify null values
df.isnull().sum()

movie_title         0
movie_info        321
content_rating      0
genres             19
directors         194
actors            352
dtype: int64

In [17]:
#Drop rows with null values
df_cleaned = df.dropna(subset=['movie_info', 'genres', 'directors', 'actors'])

In [18]:
df_cleaned.isnull().sum()

movie_title       0
movie_info        0
content_rating    0
genres            0
directors         0
actors            0
dtype: int64

In [19]:
df_cleaned.head()

Unnamed: 0,movie_title,movie_info,content_rating,genres,directors,actors
0,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Logan Lerman, Brandon T. Jackson"
1,Please Give,Kate (Catherine Keener) and her husband Alex (...,R,Comedy,Nicole Holofcener,"Catherine Keener, Amanda Peet"
2,10,"A successful, middle-aged Hollywood songwriter...",R,"Comedy, Romance",Blake Edwards,"Dudley Moore, Bo Derek"
3,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,NR,"Classics, Drama",Sidney Lumet,"Martin Balsam, John Fiedler"
4,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,"James Mason, Kirk Douglas"


#### Reset index, we will use this dataset moving forward 

In [20]:
# reset index 
df_cleaned = df_cleaned.reset_index(drop=True)
df_cleaned

Unnamed: 0,movie_title,movie_info,content_rating,genres,directors,actors
0,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Logan Lerman, Brandon T. Jackson"
1,Please Give,Kate (Catherine Keener) and her husband Alex (...,R,Comedy,Nicole Holofcener,"Catherine Keener, Amanda Peet"
2,10,"A successful, middle-aged Hollywood songwriter...",R,"Comedy, Romance",Blake Edwards,"Dudley Moore, Bo Derek"
3,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,NR,"Classics, Drama",Sidney Lumet,"Martin Balsam, John Fiedler"
4,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,"James Mason, Kirk Douglas"
...,...,...,...,...,...,...
16903,Zoom,"Capt. Zoom, or Jack (Tim Allen), as he is now ...",PG,"Action & Adventure, Comedy, Kids & Family",Peter Hewitt,"Tim Allen, Courteney Cox"
16904,Zoot Suit,Mexican-American gangster Henry Reyna (Daniel ...,R,"Drama, Musical & Performing Arts",Luis Valdez,"Daniel Valdez, Edward James Olmos"
16905,Zootopia,From the largest elephant to the smallest shre...,PG,"Action & Adventure, Animation, Comedy",Byron Howard,"J.K. Simmons, Kristen Bell"
16906,Zulu,"In 1879, the Zulu nation hands colonial Britis...",PG,"Classics, Drama",Cy Endfield,"Stanley Baker, Jack Hawkins"


In [21]:
# Display the full contents of each cell 
pd.set_option('display.max_colwidth', None)

In [22]:
df_cleaned['movie_info'].head()

0                                       Always trouble-prone, the life of teenager Percy Jackson (Logan Lerman) gets a lot more complicated when he learns he's the son of the Greek god Poseidon. At a training ground for the children of deities, Percy learns to harness his divine powers and prepare for the adventure of a lifetime: he must prevent a feud among the Olympians from erupting into a devastating war on Earth, and rescue his mother from the clutches of Hades, god of the underworld.
1       Kate (Catherine Keener) and her husband Alex (Oliver Platt) are wealthy New Yorkers who prowl estate sales and make a tidy profit reselling items they bought cheaply. They buy the apartment next door and plan to remodel just as soon as its current occupant, a cranky old woman, dies. Kate is troubled by the way she and her husband earn a living, and tries to assuage her guilt by befriending her tenant and the woman's granddaughters, but her overtures lead to unexpected consequences.
2         

In [23]:
df_cleaned.shape

(16908, 6)

In [24]:
#df_cleaned[df_cleaned['movie_info'].apply(lambda x: isinstance(x, str) and x.strip() == '')]
df_cleaned['movie_info'] = df_cleaned['movie_info'].replace({r'[\r\n\t]+': ' '}, regex=True)

In [25]:
df_cleaned['movie_info'].head()

0                                       Always trouble-prone, the life of teenager Percy Jackson (Logan Lerman) gets a lot more complicated when he learns he's the son of the Greek god Poseidon. At a training ground for the children of deities, Percy learns to harness his divine powers and prepare for the adventure of a lifetime: he must prevent a feud among the Olympians from erupting into a devastating war on Earth, and rescue his mother from the clutches of Hades, god of the underworld.
1       Kate (Catherine Keener) and her husband Alex (Oliver Platt) are wealthy New Yorkers who prowl estate sales and make a tidy profit reselling items they bought cheaply. They buy the apartment next door and plan to remodel just as soon as its current occupant, a cranky old woman, dies. Kate is troubled by the way she and her husband earn a living, and tries to assuage her guilt by befriending her tenant and the woman's granddaughters, but her overtures lead to unexpected consequences.
2         

### Remove Spaces from names of actors and directors

In [26]:
df_cleaned['directors'] = df_cleaned['directors'].str.replace(' ', '')
df_cleaned['actors'] = df_cleaned['actors'].apply(
    lambda x: ', '.join(name.replace(' ', '') for name in x.split(','))
)

Get key words from movie_info

In [27]:
corpus = df_cleaned['movie_info'].fillna('')

vectorizer = TfidfVectorizer(stop_words='english')  # No max_features now
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names_out()

def extract_keywords(row_index, top_n=8):
    row_vector = X[row_index]
    sorted_items = zip(row_vector.indices, row_vector.data)
    sorted_items = sorted(sorted_items, key=lambda x: -x[1])
    keywords = [features[idx] for idx, score in sorted_items[:top_n]]
    return " ".join(keywords)

df_cleaned['movie_info_reduced'] = [extract_keywords(i) for i in range(X.shape[0])]


In [28]:
# Combining relevant columns for cleaning and then LSH model training
df_cleaned['movie_combined'] = df_cleaned['movie_title']+ ' ' + df_cleaned['content_rating']+ ' ' + df_cleaned['genres']+ ' ' + df_cleaned['actors']+ ' ' + df_cleaned['directors'] + ' ' + df_cleaned['movie_info_reduced']

In [29]:
df_cleaned.head(5)

Unnamed: 0,movie_title,movie_info,content_rating,genres,directors,actors,movie_info_reduced,movie_combined
0,Percy Jackson & the Olympians: The Lightning Thief,"Always trouble-prone, the life of teenager Percy Jackson (Logan Lerman) gets a lot more complicated when he learns he's the son of the Greek god Poseidon. At a training ground for the children of deities, Percy learns to harness his divine powers and prepare for the adventure of a lifetime: he must prevent a feud among the Olympians from erupting into a devastating war on Earth, and rescue his mother from the clutches of Hades, god of the underworld.",PG,"Action & Adventure, Comedy, Drama, Science Fiction & Fantasy",ChrisColumbus,"LoganLerman, BrandonT.Jackson",percy god olympians deities erupting learns poseidon hades,"Percy Jackson & the Olympians: The Lightning Thief PG Action & Adventure, Comedy, Drama, Science Fiction & Fantasy LoganLerman, BrandonT.Jackson ChrisColumbus percy god olympians deities erupting learns poseidon hades"
1,Please Give,"Kate (Catherine Keener) and her husband Alex (Oliver Platt) are wealthy New Yorkers who prowl estate sales and make a tidy profit reselling items they bought cheaply. They buy the apartment next door and plan to remodel just as soon as its current occupant, a cranky old woman, dies. Kate is troubled by the way she and her husband earn a living, and tries to assuage her guilt by befriending her tenant and the woman's granddaughters, but her overtures lead to unexpected consequences.",R,Comedy,NicoleHolofcener,"CatherineKeener, AmandaPeet",kate tidy reselling cheaply remodel occupant granddaughters assuage,"Please Give R Comedy CatherineKeener, AmandaPeet NicoleHolofcener kate tidy reselling cheaply remodel occupant granddaughters assuage"
2,10,"A successful, middle-aged Hollywood songwriter falls hopelessly in love with the woman of his dreams, and even follows the girl and her new husband to their Mexican honeymoon resort. While his behavior seems sure to land him in trouble, out of the blue fate plays into his hands.",R,"Comedy, Romance",BlakeEdwards,"DudleyMoore, BoDerek",hopelessly honeymoon songwriter sure blue aged resort plays,"10 R Comedy, Romance DudleyMoore, BoDerek BlakeEdwards hopelessly honeymoon songwriter sure blue aged resort plays"
3,12 Angry Men (Twelve Angry Men),"Following the closing arguments in a murder trial, the 12 members of the jury must deliberate, with a guilty verdict meaning death for the accused, an inner-city teen. As the dozen men try to reach a unanimous decision while sequestered in a room, one juror (Henry Fonda) casts considerable doubt on elements of the case. Personal issues soon rise to the surface, and conflict threatens to derail the delicate process that will decide one boy's fate.",NR,"Classics, Drama",SidneyLumet,"MartinBalsam, JohnFiedler",unanimous sequestered arguments juror deliberate verdict delicate dozen,"12 Angry Men (Twelve Angry Men) NR Classics, Drama MartinBalsam, JohnFiedler SidneyLumet unanimous sequestered arguments juror deliberate verdict delicate dozen"
4,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Lukas) and his assistant Conseil (Peter Lorre), stranded in San Francisco by reports of a giant sea monster attacking ships in the Pacific Ocean, are invited to join an expedition to search for the creature. During the search, they and harpooner Ned Land (Kirk Douglas) are thrown overboard during an attack, eventually discovering that the supposed monster is actually a submarine piloted by the brilliant but haunted Captain Nemo (James Mason).",G,"Action & Adventure, Drama, Kids & Family",RichardFleischer,"JamesMason, KirkDouglas",monster search 1866 aronnax conseil harpooner nemo lorre,"20,000 Leagues Under The Sea G Action & Adventure, Drama, Kids & Family JamesMason, KirkDouglas RichardFleischer monster search 1866 aronnax conseil harpooner nemo lorre"


In [30]:
#Function to clean_text
def clean_text(text):
    lem = WordNetLemmatizer()
    stop = set(stopwords.words('english'))
    punct = string.punctuation
    text = re.sub(r'\s+', ' ', text)
    text = text.translate(str.maketrans('', '', punct)).lower()
    tokens = re.split(r'\W+', text)
    tokens = [lem.lemmatize(word) for word in tokens if word not in stop]
    return ' '.join(tokens)

In [31]:
#Use function to clean the movie_combined column
df_cleaned['movie_combined'] = df_cleaned['movie_combined'].apply(clean_text)
df_cleaned['movie_combined'].head()

0    percy jackson olympian lightning thief pg action adventure comedy drama science fiction fantasy loganlerman brandontjackson chriscolumbus percy god olympian deity erupting learns poseidon hades
1                                                                  please give r comedy catherinekeener amandapeet nicoleholofcener kate tidy reselling cheaply remodel occupant granddaughter assuage
2                                                                                      10 r comedy romance dudleymoore boderek blakeedwards hopelessly honeymoon songwriter sure blue aged resort play
3                                           12 angry men twelve angry men nr classic drama martinbalsam johnfiedler sidneylumet unanimous sequestered argument juror deliberate verdict delicate dozen
4                                                20000 league sea g action adventure drama kid family jamesmason kirkdouglas richardfleischer monster search 1866 aronnax conseil harpooner nemo lorre
Name:

# Shingles Size is Fixed

In [32]:
# Creating function to create shingles
def create_shingles(text, shingle_size=7):
    return [text[i:i+shingle_size] for i in range(len(text)-shingle_size+1)]

In [33]:
# Set number of Permutations
permutations = 512

# Creating function to generate MinHash Forest (using fixed shingle size)

- Initializing number of permutations in MinHash
- MinHashing the string on all shingles in each document
- Storing the MinHash of the string
- Generating a forest of all MinHashed strings
- Indexing the forest to make it searchable

In [34]:
import time
from joblib import Parallel, delayed
from datasketch import MinHash, MinHashLSHForest
import multiprocessing

def minhash_worker(doc, permutations):
    shingles = create_shingles(doc)
    m = MinHash(num_perm=permutations)
    for shingle in shingles:
        m.update(shingle.encode('utf8'))
    return m

def generate_forest(docs, permutations):
    start_time = time.time()

    num_cores = max(1, multiprocessing.cpu_count() - 1)
    print(f"Using {num_cores} cores for parallel MinHash creation...")

    # Step 1: Parallel MinHash creation (safe version)
    minhash_list = Parallel(n_jobs=num_cores)(
        delayed(minhash_worker)(doc, permutations) for doc in docs
    )

    # Step 2: Build LSH forest sequentially
    forest = MinHashLSHForest(num_perm=permutations)

    for idx, m in enumerate(minhash_list):
        forest.add(idx, m)

    forest.index()

    print('It took %.2f seconds to build forest.' % (time.time() - start_time))

    return forest

#Creating a  function to query MinHash Forest (using fixed shingle size)

- Preprocessing input text into fixed size shingles
- Using the same number of permutations for the MinHash as was used to build the forest
- Creating a MinHash on the input text using all shingles
- Querying the forest with MinHash and return the number of requested recommendations
- Providing the titles of each conference paper recommended

In [35]:
def predict(idx, df_cleaned, permutations, num_results, forest):
    """
    Find num_results nearest neighbors for the movie at index idx,
    excluding the movie itself from the recommendations.
    """
    text = df_cleaned.loc[idx, 'movie_combined']  # or any text field you used for creating shingles
    shingles = create_shingles(text)
    m = MinHash(num_perm=permutations)
    for shingle in shingles:
        m.update(shingle.encode('utf8'))

    # Query for num_results + 1 because the movie itself will be returned
    idx_array = np.array(forest.query(m, num_results + 1))
    if len(idx_array) == 0:
        return None


    # Remove the movie itself from the results
    idx_array = idx_array[idx_array != idx]

    result = df_cleaned.loc[idx_array][['movie_title', 'content_rating', 'genres', 'directors', 'actors', 'movie_combined']]
    return result


# Creating Forest (Using Fixed Shingle Size)

In [36]:
# Creating forest (using fixed shingle size)
forest = generate_forest(df_cleaned['movie_combined'], permutations)

Using 31 cores for parallel MinHash creation...
It took 4.19 seconds to build forest.


# Using forest (with fixed shingle size) to make recommendations for one/more movies using 2 approaches. First is index and second is using Movie title

## Testing with dynamic approach 

First lets locate the move record based on the title 

In [37]:
def search_movies(df, search_string):
    """
    Searches for movie titles containing the given search string (case-insensitive).
    Returns a DataFrame with the matching rows and their original indexes.
    """
    mask = df['movie_title'].str.contains(search_string, case=False, na=False)
    result_df = df[mask].copy()
    return result_df[['movie_title', 'directors']]


In [38]:
search_list = search_movies(df_cleaned, "avengers")
search_list

Unnamed: 0,movie_title,directors
1179,The Avengers,JeremiahS.Chechik
2934,Avengers: Age of Ultron,JossWhedon
2935,Avengers: Endgame,AnthonyRusso
2936,Avengers: Infinity War,AnthonyRusso


Now that we know what movie we want to search for, lets do a search for the top 5 recommendations 

In [None]:
idx = 2934
num_recommendations = 10
input_title = df_cleaned.loc[idx]['movie_title']
input_text = df_cleaned.loc[idx]['movie_combined']
results = predict(idx, df_cleaned, permutations, num_recommendations, forest)
movie_info = df_cleaned.loc[idx, ['movie_title', 'content_rating', 'genres', 'directors', 'actors', 'movie_combined']]
print("Input Movie:\n", movie_info)
print(f'\nTop {num_recommendations} recommendations for [{input_title}]:')
results

Input Movie:
 movie_title                                                                                                                                                           Avengers: Age of Ultron
content_rating                                                                                                                                                                          PG-13
genres                                                                                                                                          Action & Adventure, Science Fiction & Fantasy
directors                                                                                                                                                                          JossWhedon
actors                                                                                                                                                            RobertDowneyJr., ChrisEvans
movie_combined    avenger age ultron

Unnamed: 0,movie_title,content_rating,genres,directors,actors,movie_combined
16389,Westworld,PG,"Action & Adventure, Classics, Science Fiction & Fantasy, Western",MichaelCrichton,"RichardBenjamin, JamesBrolin",westworld pg action adventure classic science fiction fantasy western richardbenjamin jamesbrolin michaelcrichton blane paying martin westworld saloon escapist android unwind
6,The 39 Steps,NR,"Action & Adventure, Classics, Mystery & Suspense",AlfredHitchcock,"RobertDonat, MadeleineCarroll",39 step nr action adventure classic mystery suspense robertdonat madeleinecarroll alfredhitchcock hannay mannheim tearle lucie donat 39 godfrey annabella
8200,"It's a Mad, Mad, Mad, Mad World",G,"Action & Adventure, Classics, Comedy, Drama",StanleyKramer,"SpencerTracy, MiltonBerle",mad mad mad mad world g action adventure classic comedy drama spencertracy miltonberle stanleykramer fortune smiler cryptically durante bucket motorist grogan kicking
12299,Samson,PG-13,"Action & Adventure, Drama",BruceMacDonald,"JacksonRathbone, BillyZane",samson pg13 action adventure drama jacksonrathbone billyzane brucemacdonald philistine samson final temptress surrender direct mount tribal
12300,Samurai Marathon,NR,"Action & Adventure, Drama",BernardRose,"TakeruSatoh, NanaKomatsu",samurai marathon nr action adventure drama takerusatoh nanakomatsu bernardrose shogunate wrecking alliance prevent spy race win
...,...,...,...,...,...,...
12276,Saints and Soldiers,PG-13,"Action & Adventure, Drama",RyanLittle,"CorbinAllred, AlexanderPolinsky",saint soldier pg13 action adventure drama corbinallred alexanderpolinsky ryanlittle allied oberon soldier heyborne bagby allred deacon pvt
8181,It Happened at the World's Fair,G,"Action & Adventure, Classics, Musical & Performing Arts",NormanTaurog,"ElvisPresley, JoanO'Brien",happened world fair g action adventure classic musical performing art elvispresley joanobrien normantaurog plane mike tiu duster attraction thumb lockwood vicky
12279,Salmon Fishing in the Yemen,PG-13,"Comedy, Drama",LasseHallstrom,"EwanMcGregor, EmilyBlunt",salmon fishing yemen pg13 comedy drama ewanmcgregor emilyblunt lassehallstrom sheik jones harriet request chetwode amr waked fishery
14328,The Fountain,PG-13,"Drama, Science Fiction & Fantasy",DarrenAronofsky,"HughJackman, RachelWeisz",fountain pg13 drama science fiction fantasy hughjackman rachelweisz darrenaronofsky century 26th conquistador grasp fountain mystery tomas immortality
