In [42]:
import numpy as np 
import pandas as pd

In [43]:
movies = pd.read_csv('movies_metadata.csv', low_memory=False)
credits = pd.read_csv('credits.csv')
keyword = pd.read_csv('keywords.csv')

In [44]:
movies.head()


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [45]:
# 1. Force the 'id' column to be numeric
# 'errors='coerce'' turns bad data (like dates) into NaN instead of crashing
movies['id'] = pd.to_numeric(movies['id'], errors='coerce')

# 2. Drop the rows where the ID turned out to be bad (NaN)
movies = movies.dropna(subset=['id'])

# 3. Convert the clean column to integers (to match the credits dataset)
movies['id'] = movies['id'].astype(int)

# 4. NOW you can merge them safely
movies = movies.merge(credits, on='id')
movies = movies.merge(keyword, on='id')

# Check the result
print("Merge successful!")
print(movies.head())

Merge successful!
   adult                              belongs_to_collection    budget  \
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1  False                                                NaN  65000000   
2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
3  False                                                NaN  16000000   
4  False  {'id': 96871, 'name': 'Father of the Bride Col...         0   

                                              genres  \
0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   
2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   
3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   
4                     [{'id': 35, 'name': 'Comedy'}]   

                               homepage     id    imdb_id original_language  \
0  http://toystory.disney.com/toy-story    862  tt0114709                en   
1                                   NaN   8844  

In [46]:
movies.shape

(46628, 27)

In [47]:
credits.shape


(45476, 3)

In [48]:
keyword.shape

(46419, 2)

In [49]:
movies.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."


In [50]:
# Filter rows where original_language is 'en' and select specific columns
movies = movies.loc[movies['original_language'] == 'en', 
                    ['id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'imdb_id', 'release_date', 'runtime']]

In [51]:
movies.info(
    
)

<class 'pandas.core.frame.DataFrame'>
Index: 32937 entries, 0 to 46627
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            32937 non-null  int64  
 1   title         32934 non-null  object 
 2   overview      32865 non-null  object 
 3   genres        32937 non-null  object 
 4   keywords      32937 non-null  object 
 5   cast          32937 non-null  object 
 6   crew          32937 non-null  object 
 7   imdb_id       32924 non-null  object 
 8   release_date  32869 non-null  object 
 9   runtime       32928 non-null  float64
dtypes: float64(1), int64(1), object(8)
memory usage: 2.8+ MB


In [52]:
movies.head()


Unnamed: 0,id,title,overview,genres,keywords,cast,crew,imdb_id,release_date,runtime
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",tt0114709,1995-10-30,81.0
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",tt0113497,1995-12-15,104.0
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",tt0113228,1995-12-22,101.0
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",tt0114885,1995-12-22,127.0
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",tt0113041,1995-02-10,106.0


In [53]:
movies.isnull().sum()

id               0
title            3
overview        72
genres           0
keywords         0
cast             0
crew             0
imdb_id         13
release_date    68
runtime          9
dtype: int64

In [54]:
movies.duplicated().sum()

np.int64(682)

In [55]:
movies = movies.drop_duplicates()

In [56]:
movies.duplicated().sum()

np.int64(0)

In [57]:
movies.iloc[0].genres

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [58]:
import ast
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L    

In [59]:
movies['genres'].apply(convert)

0         [Animation, Comedy, Family]
1        [Adventure, Fantasy, Family]
2                   [Romance, Comedy]
3            [Comedy, Drama, Romance]
4                            [Comedy]
                     ...             
46621               [Science Fiction]
46622        [Drama, Action, Romance]
46625       [Action, Drama, Thriller]
46626                              []
46627                              []
Name: genres, Length: 32255, dtype: object

In [60]:
import ast
ast.literal_eval("[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]")

[{'id': 16, 'name': 'Animation'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 10751, 'name': 'Family'}]

In [61]:
# Assign the output back to the column
movies['genres'] = movies['genres'].apply(convert)

# Now check the head again
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,imdb_id,release_date,runtime
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",tt0114709,1995-10-30,81.0
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]","[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",tt0113497,1995-12-15,104.0
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",tt0113228,1995-12-22,101.0
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]","[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",tt0114885,1995-12-22,127.0
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",tt0113041,1995-02-10,106.0


In [62]:
movies['keywords'] =movies['keywords'].apply(convert)

In [63]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,imdb_id,release_date,runtime
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]","[jealousy, toy, boy, friendship, friends, riva...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",tt0114709,1995-10-30,81.0


In [64]:
import ast

# 1. Extract only the first 3 actor names from the 'cast' dictionary
def convert_cast(obj):
    L = []
    counter = 0
    # If it's already a list of dicts, iterate. If it's a string, use literal_eval.
    try:
        data = ast.literal_eval(obj) if isinstance(obj, str) else obj
        for i in data:
            if counter != 3:
                L.append(i['name'])
                counter += 1
            else:
                break
        return L
    except:
        return []



# Apply the cleaning
movies['cast'] = movies['cast'].apply(convert_cast)

In [65]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,imdb_id,release_date,runtime
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]","[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",tt0114709,1995-10-30,81.0


In [66]:
import ast

def get_director(obj):
    try:
        # Convert string to list of dictionaries
        data = ast.literal_eval(obj)
        for i in data:
            if i.get('job') == 'Director':
                # We return the name in a list so it matches your 'cast' and 'genres' format
                return [i['name']]
        return []
    except (ValueError, SyntaxError, TypeError):
        return []

# 1. Apply the fix
movies['crew'] = movies['crew'].apply(get_director)

# 2. PROOF CHECK: Run this immediately to verify
print("--- Director Extraction Check ---")
print(movies[['title', 'crew']].head())

--- Director Extraction Check ---
                         title               crew
0                    Toy Story    [John Lasseter]
1                      Jumanji     [Joe Johnston]
2             Grumpier Old Men    [Howard Deutch]
3            Waiting to Exhale  [Forest Whitaker]
4  Father of the Bride Part II    [Charles Shyer]


In [67]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,imdb_id,release_date,runtime
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]","[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles]",[John Lasseter],tt0114709,1995-10-30,81.0


In [68]:
# This will show us the REAL structure of your crew data
print("Raw Crew Data for first movie:")
print(type(movies['crew'].iloc[0]))
print(movies['crew'].iloc[0])

Raw Crew Data for first movie:
<class 'list'>
['John Lasseter']


In [69]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,imdb_id,release_date,runtime
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]","[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles]",[John Lasseter],tt0114709,1995-10-30,81.0
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]","[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst]",[Joe Johnston],tt0113497,1995-12-15,104.0
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]","[fishing, best friend, duringcreditsstinger, o...","[Walter Matthau, Jack Lemmon, Ann-Margret]",[Howard Deutch],tt0113228,1995-12-22,101.0
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]","[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devine]",[Forest Whitaker],tt0114885,1995-12-22,127.0
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],"[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short]",[Charles Shyer],tt0113041,1995-02-10,106.0


In [70]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,imdb_id,release_date,runtime
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]","[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles]",[John Lasseter],tt0114709,1995-10-30,81.0
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]","[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst]",[Joe Johnston],tt0113497,1995-12-15,104.0
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]","[fishing, best friend, duringcreditsstinger, o...","[Walter Matthau, Jack Lemmon, Ann-Margret]",[Howard Deutch],tt0113228,1995-12-22,101.0
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]","[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devine]",[Forest Whitaker],tt0114885,1995-12-22,127.0
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],"[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short]",[Charles Shyer],tt0113041,1995-02-10,106.0


In [71]:
# 1. Ensure everything is a string first (this handles lists or NaNs)
movies['overview'] = movies['overview'].apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))

# 2. Fill missing values
movies['overview'] = movies['overview'].fillna('')

# 3. NOW you can split them into a list of words safely
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [72]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,imdb_id,release_date,runtime
0,862,Toy Story,"[Led, by, Woody,, Andy's, toys, live, happily,...","[Animation, Comedy, Family]","[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles]",[John Lasseter],tt0114709,1995-10-30,81.0
1,8844,Jumanji,"[When, siblings, Judy, and, Peter, discover, a...","[Adventure, Fantasy, Family]","[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst]",[Joe Johnston],tt0113497,1995-12-15,104.0
2,15602,Grumpier Old Men,"[A, family, wedding, reignites, the, ancient, ...","[Romance, Comedy]","[fishing, best friend, duringcreditsstinger, o...","[Walter Matthau, Jack Lemmon, Ann-Margret]",[Howard Deutch],tt0113228,1995-12-22,101.0
3,31357,Waiting to Exhale,"[Cheated, on,, mistreated, and, stepped, on,, ...","[Comedy, Drama, Romance]","[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devine]",[Forest Whitaker],tt0114885,1995-12-22,127.0
4,11862,Father of the Bride Part II,"[Just, when, George, Banks, has, recovered, fr...",[Comedy],"[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short]",[Charles Shyer],tt0113041,1995-02-10,106.0


In [73]:
# 1. Helper to handle lists, strings, and NaNs safely
def force_string(x):
    if isinstance(x, list):
        return " ".join(map(str, x))
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return ""
    return str(x)

import numpy as np

# 2. Force every column into a clean string Series
s_overview = movies['overview'].apply(force_string)
s_title    = movies['title'].apply(force_string)
s_genres   = movies['genres'].apply(force_string)
s_cast     = movies['cast'].apply(force_string)
s_crew     = movies['crew'].apply(force_string)
s_keywords = movies['keywords'].apply(force_string) # Adding keywords for better context

# 3. Concatenate using ONLY these string series
movies['tag'] = (
    s_overview + " " +               
    s_title + " " +            
    s_genres + " "+           
    s_cast + " "+             
    s_crew + " "+
    s_keywords 
)



In [74]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,imdb_id,release_date,runtime,tag
0,862,Toy Story,"[Led, by, Woody,, Andy's, toys, live, happily,...","[Animation, Comedy, Family]","[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles]",[John Lasseter],tt0114709,1995-10-30,81.0,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,"[When, siblings, Judy, and, Peter, discover, a...","[Adventure, Fantasy, Family]","[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst]",[Joe Johnston],tt0113497,1995-12-15,104.0,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,"[A, family, wedding, reignites, the, ancient, ...","[Romance, Comedy]","[fishing, best friend, duringcreditsstinger, o...","[Walter Matthau, Jack Lemmon, Ann-Margret]",[Howard Deutch],tt0113228,1995-12-22,101.0,A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"[Cheated, on,, mistreated, and, stepped, on,, ...","[Comedy, Drama, Romance]","[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devine]",[Forest Whitaker],tt0114885,1995-12-22,127.0,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,"[Just, when, George, Banks, has, recovered, fr...",[Comedy],"[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short]",[Charles Shyer],tt0113041,1995-02-10,106.0,Just when George Banks has recovered from his ...


In [75]:
new_df = movies[['id','title','tag']]

In [76]:
new_df

Unnamed: 0,id,title,tag
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...
...,...,...,...
46621,222848,Caged Heat 3000,It's the year 3000 AD. The world's most danger...
46622,30840,Robin Hood,"Yet another version of the classic epic, with ..."
46625,67758,Betrayal,"When one of her hits goes wrong, a professiona..."
46626,227506,Satan Triumphant,"In a small town live two brothers, one a minis..."


In [77]:
new_df['tag'][0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences. Toy Story Animation Comedy Family Tom Hanks Tim Allen Don Rickles John Lasseter jealousy toy boy friendship friends rivalry boy next door new toy toy comes to life"

In [78]:
new_df['tag'] = new_df['tag'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tag'] = new_df['tag'].apply(lambda x:x.lower())


In [79]:
new_df['tag'][5000]

'five bored, occasionally high and always ineffective vermont state troopers must prove their worth to the governor or lose their jobs. after stumbling on a drug ring, they plan to make a bust, but a rival police force is out to steal the glory. super troopers comedy crime mystery jay chandrasekhar steve lemme kevin heffernan jay chandrasekhar alcohol radio police chief highway cops broken lizard marijuana drug humor police corruption aftercreditsstinger duringcreditsstinger shenanigans'

In [80]:
pip install sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [81]:
pip install tf-keras

Note: you may need to restart the kernel to use updated packages.


In [82]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
print("Embedding model loaded successfully!")



Embedding model loaded successfully!


In [83]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
import random

# === STEP 1: Use FULL DATASET ===
full_df = movies.reset_index(drop=True)
print("Total movies in dataset:", len(full_df))

# === STEP 2: Safe keyword-dense tag builder ===
def build_clean_tags(row):
    tokens = []

    def add_tokens(value):
        if not value or (isinstance(value, float) and np.isnan(value)):
            return
        if isinstance(value, list):
            for v in value:
                if isinstance(v, str):
                    tokens.append(v)
        elif isinstance(value, str):
            tokens.extend(value.split())

    add_tokens(row.get('title'))
    add_tokens(row.get('genres'))
    add_tokens(row.get('cast'))
    add_tokens(row.get('crew'))
    add_tokens(row.get('keywords'))

    clean = [t.lower().strip() for t in tokens if isinstance(t, str)]
    return ", ".join(sorted(set(clean)))

full_df['tag'] = full_df.apply(build_clean_tags, axis=1)

# === STEP 3: Encode tags into embeddings ===
print("\nEncoding full dataset tags into vectors…")
vectors = model.encode(full_df['tag'].tolist(), show_progress_bar=True)

# === STEP 4: Fit K-NN on vectors ===
nn_model = NearestNeighbors(n_neighbors=15, metric='cosine')
nn_model.fit(vectors)

# === STEP 5: Store vectors globally so next cell can access ===
new_df = full_df  # optional alias if you still want `new_df` name
print("\nModel is ready. You can now call `final_recommend()` in the next cell.")

# === Recommendation function (defined once, reusable later) ===
def final_recommend(movie_title, top_k=5):
    try:
        idx = full_df[full_df['title'].str.lower() == movie_title.lower()].index[0]
        distances, indices = nn_model.kneighbors([vectors[idx]])

        print(f"\nTop {top_k} Recommendations for '{movie_title}':")
        print("-" * 30)
        for i in indices[0][1:top_k+1]:
            print(full_df.iloc[i]['title'])

    except IndexError:
        print("Movie not found.")


Total movies in dataset: 32255

Encoding full dataset tags into vectors…


Batches:   0%|          | 0/1008 [00:00<?, ?it/s]


Model is ready. You can now call `final_recommend()` in the next cell.


In [None]:
final_recommend('Batman Begins')