In [1]:
import ast
import json
from difflib import get_close_matches

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [4]:
# Read both CSV files from the working directory into DataFrames.
movies, credits = (
    pd.read_csv(csv_path)
    for csv_path in ('tmdb_5000_movies.csv', 'tmdb_5000_credits.csv')
)

# Report the dimensions of each frame as a quick sanity check.
for label, frame in (("movies shape:", movies), ("credits shape:", credits)):
    print(label, frame.shape)

# Preview the first rows of the movies frame (rendered as the cell output).
movies.head()


movies shape: (4803, 20)
credits shape: (4803, 4)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [5]:
# Align the credits key column with the movies key column so the frames can be joined.
if 'movie_id' in credits.columns:
    credits = credits.rename(columns={'movie_id': 'id'})

# ensure id columns are of same dtype
if 'id' in credits.columns and 'id' in movies.columns:
    credits['id'] = credits['id'].astype(int)
    movies['id'] = movies['id'].astype(int)
    # Keep only the needed columns from credits. One row per id guarantees the
    # left join cannot multiply movie rows if credits ever repeats an id.
    credits_small = credits[['id', 'cast', 'crew']].drop_duplicates(subset='id')
    # Drop any cast/crew columns left over from a previous run of this cell;
    # otherwise merge would emit cast_x/cast_y (and crew_x/crew_y) and the
    # later movies['cast'] / movies['crew'] lookups would raise KeyError.
    movies = (
        movies
        .drop(columns=['cast', 'crew'], errors='ignore')
        .merge(credits_small, on='id', how='left')
    )
else:
    # if merge not possible, create empty cast/crew
    movies['cast'] = movies.get('cast', '')
    movies['crew'] = movies.get('crew', '')

movies = movies.reset_index(drop=True)
print("After merge movies shape:", movies.shape)


After merge movies shape: (4803, 22)


In [6]:
def safe_literal_eval(x):
    """Safely parse a stringified list-of-dicts, return [] if parsing fails.

    Parameters
    ----------
    x : object
        Usually a string holding a Python literal or JSON document. A value
        that is already a ``list`` is returned unchanged, so the function can
        be applied again to an already-parsed column.

    Returns
    -------
    object
        The parsed value (whatever the literal/JSON encodes), or ``[]`` when
        ``x`` is not a string/list (None, NaN, numbers, ...) or cannot be
        parsed by either parser.
    """
    # Already parsed: hand it back instead of letting a NaN check on a
    # sequence raise "truth value of an array is ambiguous".
    if isinstance(x, list):
        return x
    # Missing values and other non-text inputs have nothing to parse.
    if not isinstance(x, str):
        return []
    # Python-literal syntax first, so every input that parsed before still
    # yields the same value.
    try:
        return ast.literal_eval(x)
    except Exception:  # deliberate best-effort: fall through to JSON
        pass
    # JSON-only tokens (null/true/false) are not Python literals; without this
    # fallback such a cell would silently collapse to an empty list.
    try:
        return json.loads(x)
    except Exception:  # deliberate best-effort: unparseable text -> []
        return []

def extract_names_from_list_of_dicts(x, key='name', n_top=None):
    """Given a string/list-of-dicts, return list of names (no spaces) optionally truncated.

    Parameters
    ----------
    x : str or list
        Either a stringified list of dicts (parsed with ``safe_literal_eval``)
        or an already-parsed list. Anything else yields ``[]``.
    key : str, default 'name'
        Dict key whose string value is collected from each item.
    n_top : int or None, default None
        When not None, only the first ``n_top`` collected names are returned
        (``n_top=0`` returns an empty list). None means no truncation.

    Returns
    -------
    list of str
        Collected values with spaces removed, in input order. Items that are
        not dicts, lack ``key``, or hold a non-string value are skipped.
    """
    parsed = safe_literal_eval(x) if isinstance(x, str) else x
    if not isinstance(parsed, list):
        return []
    names = [
        item[key].replace(" ", "")  # remove spaces to treat as tokens
        for item in parsed
        if isinstance(item, dict) and isinstance(item.get(key), str)
    ]
    # Compare against None rather than truthiness so that n_top=0 truncates
    # to nothing instead of being treated as "no limit".
    if n_top is not None:
        return names[:n_top]
    return names

def get_director_from_crew(x):
    """Return the names (spaces removed) of crew members whose job is 'Director'.

    Parameters
    ----------
    x : object
        Raw crew value; it is passed through ``safe_literal_eval`` first.

    Returns
    -------
    list of str
        Director names in input order. Returns ``[]`` when the parsed value is
        not a list. Entries that are not dicts, have a different ``job``, or
        have an empty/non-string ``name`` are skipped.
    """
    crew = safe_literal_eval(x)
    # The parser can return any literal (e.g. a number); only a list can be
    # walked safely, mirroring the guard in extract_names_from_list_of_dicts.
    if not isinstance(crew, list):
        return []
    directors = []
    for member in crew:
        if not isinstance(member, dict) or member.get('job') != 'Director':
            continue
        name = member.get('name', '')
        # Skip missing or non-text names instead of failing on .replace().
        if isinstance(name, str) and name:
            directors.append(name.replace(" ", ""))
    return directors


In [7]:
# genres and keywords hold lists of dicts; collect each item's 'name' value
for source_col in ('genres', 'keywords'):
    movies[f'{source_col}_parsed'] = movies[source_col].apply(
        extract_names_from_list_of_dicts, key='name'
    )

# cast & crew came from the merged credits file: keep the first three cast
# names and every crew member whose job is 'Director'
movies['cast_parsed'] = movies['cast'].apply(
    extract_names_from_list_of_dicts, key='name', n_top=3
)
movies['director_parsed'] = movies['crew'].apply(get_director_from_crew)

# overview is free text: leave it untouched apart from replacing NaN with ''
movies['overview'] = movies['overview'].fillna('')

# quick check of the derived columns next to the title
preview_columns = [
    'title', 'genres_parsed', 'keywords_parsed',
    'cast_parsed', 'director_parsed', 'overview',
]
movies[preview_columns].head()


Unnamed: 0,title,genres_parsed,keywords_parsed,cast_parsed,director_parsed,overview
0,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"In the 22nd century, a paraplegic Marine is di..."
1,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"Captain Barbossa, long believed to be dead, ha..."
2,Spectre,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],A cryptic message from Bond’s past sends him o...
3,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],Following the death of District Attorney Harve...
4,John Carter,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],"John Carter is a war-weary, former military ca..."


In [8]:
def list_to_string(x):
    """Turn a cell value into a single string.

    Lists are joined with single spaces, missing values (as reported by
    ``pd.isna``) become the empty string, and anything else is converted
    with ``str``.
    """
    if not isinstance(x, list):
        return "" if pd.isna(x) else str(x)
    return " ".join(x)

# Concatenate the token columns (each flattened to text) and the overview,
# separated by single spaces, into one text field per movie.
soup = movies['genres_parsed'].apply(list_to_string)
for token_col in ('keywords_parsed', 'cast_parsed', 'director_parsed'):
    soup = soup + " " + movies[token_col].apply(list_to_string)
soup = soup + " " + movies['overview']

# small cleaning (optional): collapse whitespace runs and trim both ends
movies['soup'] = soup.str.replace(r'\s+', ' ', regex=True).str.strip()

# show example
movies[['title', 'soup']].head(3)


Unnamed: 0,title,soup
0,Avatar,Action Adventure Fantasy ScienceFiction cultur...
1,Pirates of the Caribbean: At World's End,Adventure Fantasy Action ocean drugabuse exoti...
2,Spectre,Action Adventure Crime spy basedonnovel secret...


In [9]:
# Vectorize the soup text; English stop words are dropped and the vocabulary
# is capped at 20000 features.
tfidf = TfidfVectorizer(max_features=20000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['soup'])

# Pairwise cosine similarity of every movie against every other movie
# (dense square matrix; for ~5k items this is fine). Omitting the second
# argument compares the matrix with itself.
cosine_sim = cosine_similarity(tfidf_matrix)

# sanity
print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (4803, 20000)


In [14]:
# create mapping from title (lowercase) to index — keep only the first occurrence of each title.
# Series.drop_duplicates() compares the *values* (row positions, which are all unique),
# so it would leave repeated titles in place; de-duplicate on the index labels instead.
indices = pd.Series(movies.index, index=movies['title'].str.lower())
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(title, top_n=5, show_scores=False):
    """
    Return top_n most similar movie titles to the given title.
    If title not found, suggest close matches.

    Parameters
    ----------
    title : str
        Movie title to look up; matching is case-insensitive.
    top_n : int, default 5
        Maximum number of recommendations; values <= 0 give an empty list.
    show_scores : bool, default False
        When True, return ``(title, similarity)`` tuples instead of titles.

    Returns
    -------
    list
        Recommended titles (or ``(title, score)`` tuples) ordered by
        decreasing similarity. An empty list when the title is unknown; in
        that case close matches, if any, are printed.

    Notes
    -----
    Reads the notebook-level ``indices`` (lowercase title -> row position),
    ``cosine_sim`` (square similarity matrix) and ``movies`` objects.
    """
    title_l = title.lower()
    if title_l not in indices:
        # try fuzzy matches against the distinct text titles only
        candidates = [t for t in indices.index.unique() if isinstance(t, str)]
        close = get_close_matches(title_l, candidates, n=5, cutoff=0.6)
        if close:
            print(f"Title not found exactly. Did you mean one of: {close}")
        else:
            print("Title not found and no close matches. Try a different title.")
        return []

    idx = indices[title_l]
    # A title that occurs more than once makes the lookup return a Series of
    # row positions; use the first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Drop the query movie by position instead of assuming it sorts first:
    # a tie with a row at a lower position, or an all-zero row, would
    # otherwise let the movie recommend itself.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    top_scores = sim_scores[:max(top_n, 0)]

    movie_indices = [pair[0] for pair in top_scores]
    scores = [pair[1] for pair in top_scores]
    recommendations = movies['title'].iloc[movie_indices].tolist()
    if show_scores:
        return list(zip(recommendations, scores))
    return recommendations


In [11]:
# Draw three non-null titles with a fixed seed so the selection is repeatable.
sample_titles = movies['title'].dropna().sample(3, random_state=42).tolist()
print("Sample titles chosen:", sample_titles)

for sample_title in sample_titles:
    print("\n=== Recommendations for:", sample_title)
    # An empty result simply produces no ranked lines.
    ranked = enumerate(
        get_recommendations(sample_title, top_n=5, show_scores=True), start=1
    )
    for rank, (title, score) in ranked:
        print(f"{rank}. {title}  (sim={score:.4f})")


Sample titles chosen: ['I Spy', 'Split Second', 'Gossip']

=== Recommendations for: I Spy
1. True Lies  (sim=0.1051)
2. Street Fighter  (sim=0.1019)
3. Fascination  (sim=0.0961)
4. Stormbreaker  (sim=0.0955)
5. Doctor Dolittle  (sim=0.0948)

=== Recommendations for: Split Second
1. Surrogates  (sim=0.1251)
2. Fortress  (sim=0.1178)
3. Predator 2  (sim=0.1134)
4. Code 46  (sim=0.1076)
5. The Blood of Heroes  (sim=0.1049)

=== Recommendations for: Gossip
1. The World Is Mine  (sim=0.1242)
2. Mean Girls  (sim=0.1168)
3. Steppin: The Movie  (sim=0.1116)
4. Doubt  (sim=0.1101)
5. College  (sim=0.1078)


In [13]:
# Print the five nearest titles, with similarity scores, for a few known movies.
for query in ("The Dark Knight", "Toy Story", "Avatar"):
    print(get_recommendations(query, top_n=5, show_scores=True))


[('The Dark Knight Rises', np.float64(0.3841497215631734)), ('Batman Returns', np.float64(0.32430942009647495)), ('Batman Begins', np.float64(0.2814778290503691)), ('Batman: The Dark Knight Returns, Part 2', np.float64(0.27736803729444504)), ('Batman Forever', np.float64(0.2523442340488521))]
[('Toy Story 2', np.float64(0.45835194764483067)), ('Toy Story 3', np.float64(0.4552533383478408)), ('The 40 Year Old Virgin', np.float64(0.26351010073740133)), ('For Your Consideration', np.float64(0.1218345861521909)), ('Heartbeeps', np.float64(0.11420466695872705))]
[('Aliens', np.float64(0.14323872540783544)), ('Battle: Los Angeles', np.float64(0.13229060261760284)), ('Falcon Rising', np.float64(0.12501199553525266)), ('Apollo 18', np.float64(0.12297686646821665)), ('Star Trek Into Darkness', np.float64(0.12181406499956088))]
