In [3]:
import numpy as np
import pandas as pd
import kagglehub

# Download latest version
path = kagglehub.dataset_download("tmdb/tmdb-movie-metadata")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/tmdb-movie-metadata


In [4]:
movies_df = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv')
credits_df = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv')

In [5]:
credits_df.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [43]:
df = movies_df.merge(credits_df,on='title')

In [45]:
df = df[['id', 'title', 'genres', 'keywords', 'overview', 'cast', 'crew']].copy()

In [46]:
import ast
def parse_json(column):
    try:
        return ast.literal_eval(column)
    except (ValueError, SyntaxError):
        return []

In [47]:
for col in ['genres', 'keywords', 'cast', 'crew']:
    df[col] = df[col].fillna('[]').apply(parse_json)

In [48]:
df.head(5)

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[{'id': 1463, 'name': 'culture clash'}, {'id':...","In the 22nd century, a paraplegic Marine is di...","[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de..."
1,285,Pirates of the Caribbean: At World's End,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 270, 'name': 'ocean'}, {'id': 726, 'na...","Captain Barbossa, long believed to be dead, ha...","[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de..."
2,206647,Spectre,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[{'id': 470, 'name': 'spy'}, {'id': 818, 'name...",A cryptic message from Bond’s past sends him o...,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '54805967c3a36829b5002c41', 'de..."
3,49026,The Dark Knight Rises,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...","[{'id': 849, 'name': 'dc comics'}, {'id': 853,...",Following the death of District Attorney Harve...,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de..."
4,49529,John Carter,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[{'id': 818, 'name': 'based on novel'}, {'id':...","John Carter is a war-weary, former military ca...","[{'cast_id': 5, 'character': 'John Carter', 'c...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de..."


In [49]:
def get_list_of_names(obj_list, key='name', top_n=None):
    names = []
    for i, elem in enumerate(obj_list):
        if top_n and i >= top_n:
            break
        name = elem.get(key, '')
        # replace spaces with no‐space (so "Science Fiction" → "sciencefiction")
        cleaned = name.replace(" ", "").lower()
        if cleaned:
            names.append(cleaned)
    return names

In [50]:
df['genres_list'] = df['genres'].apply(lambda x: get_list_of_names(x, key='name'))
df['keywords_list'] = df['keywords'].apply(lambda x: get_list_of_names(x, key='name'))
df['cast_list'] = df['cast'].apply(lambda x: get_list_of_names(x, key='name', top_n=3))

In [51]:
def get_director(crew_list):
    for person in crew_list:
        if person.get('job', '').lower() == 'director':
            return person.get('name', '').replace(" ", "").lower()
    return ''

df['director'] = df['crew'].apply(get_director)

In [52]:
def clean_overview(text):
    text = text.lower()
    words = text.split()
    return [w for w in words if w]

df['overview_list'] = df['overview'].fillna('').apply(clean_overview)

In [53]:
def merge(row):
    components = []
    components += row['genres_list']
    components += row['keywords_list']
    components += row['cast_list']
    if row['director']:
        components.append(row['director'])
    components += row['overview_list']
    # return one long string
    return " ".join(components)

df['merged_details'] = df.apply(merge, axis=1)

In [54]:
df['merged_details'].head(10)

Unnamed: 0,merged_details
0,action adventure fantasy sciencefiction cultur...
1,adventure fantasy action ocean drugabuse exoti...
2,action adventure crime spy basedonnovel secret...
3,action crime drama thriller dccomics crimefigh...
4,action adventure sciencefiction basedonnovel m...
5,fantasy action adventure dualidentity amnesia ...
6,animation family hostage magic horse fairytale...
7,action adventure sciencefiction marvelcomic se...
8,adventure fantasy family witch magic broom sch...
9,action adventure fantasy dccomics vigilante su...


In [59]:
df_new = df[['id', 'title', 'merged_details']].copy()

In [57]:
df_new['merged_details'] = df_new['merged_details'].apply(lambda word: " ".join(word))

In [64]:
df_new

Unnamed: 0,id,title,merged_details
0,19995,Avatar,action adventure fantasy sciencefiction cultur...
1,285,Pirates of the Caribbean: At World's End,adventure fantasy action ocean drugabuse exoti...
2,206647,Spectre,action adventure crime spy basedonnovel secret...
3,49026,The Dark Knight Rises,action crime drama thriller dccomics crimefigh...
4,49529,John Carter,action adventure sciencefiction basedonnovel m...
...,...,...,...
4804,9367,El Mariachi,action crime thriller unitedstates–mexicobarri...
4805,72766,Newlyweds,comedy romance edwardburns kerrybishé marshadi...
4806,231617,"Signed, Sealed, Delivered",comedy drama romance tvmovie date loveatfirsts...
4807,126186,Shanghai Calling,danielhenney elizacoupe billpaxton danielhsia ...


In [22]:
from sklearn.feature_extraction.text import CountVectorizer

In [65]:
vectorizer = CountVectorizer(max_features=5000, stop_words='english')

In [66]:
vector = vectorizer.fit_transform(df_new['merged_details']).toarray()

In [67]:
vector.shape

(4809, 5000)

In [68]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vector)

In [69]:
similarity

array([[1.        , 0.09107651, 0.06071767, ..., 0.02492224, 0.02672612,
        0.        ],
       [0.09107651, 1.        , 0.06451613, ..., 0.02648136, 0.        ,
        0.        ],
       [0.06071767, 0.06451613, 1.        , ..., 0.02648136, 0.        ,
        0.        ],
       ...,
       [0.02492224, 0.02648136, 0.02648136, ..., 1.        , 0.06993786,
        0.04783649],
       [0.02672612, 0.        , 0.        , ..., 0.06993786, 1.        ,
        0.05129892],
       [0.        , 0.        , 0.        , ..., 0.04783649, 0.05129892,
        1.        ]], shape=(4809, 4809))

In [71]:
df_new[df_new['title'] == 'The Lego Movie'].index[0]

np.int64(744)

In [73]:
def recommend(movie):
    index = df_new[df_new['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(df_new.iloc[i[0]].title)

In [90]:
recommend('Se7en')

Zodiac
The Bone Collector
2:13
Murder by Numbers
Kiss the Girls
