## Data Loading

In [89]:
import ast
import json
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# Load the two raw tables: per-movie metadata and per-movie cast/crew credits.
# NOTE: the name `credits` shadows Python's interactive `credits` builtin.
movies = pd.read_csv("../data/tmdb_5000_movies.csv")
credits = pd.read_csv("../data/tmdb_5000_credits.csv")

In [10]:
movies.shape

(4803, 20)

In [13]:
credits.shape

(4803, 4)

In [15]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [23]:
credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [25]:
movies.isnull().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64

## Data Preprocessing

In [36]:
# Attach each movie's cast/crew record via the movie id. The left join keeps
# every movie even when no credits row matches. ``validate`` makes pandas raise
# if an id is duplicated on either side, which would otherwise silently
# multiply rows. Both inputs have a ``title`` column, so the merged frame
# carries ``title_x`` / ``title_y``.
df = movies.merge(
    credits,
    left_on='id',
    right_on='movie_id',
    how='left',
    validate='one_to_one',
)

In [38]:
# Keep only the columns that feed the text model. ``.copy()`` gives ``df`` its
# own data, so the later fills and column assignments do not act on a slice of
# the merged frame (the cause of pandas' SettingWithCopyWarning).
df = df[['original_title', 'overview', 'genres', 'keywords', 'cast', 'crew']].copy()

In [40]:
df.columns

Index(['original_title', 'overview', 'genres', 'keywords', 'cast', 'crew'], dtype='object')

In [42]:
df.isnull().sum()

original_title    0
overview          3
genres            0
keywords          0
cast              0
crew              0
dtype: int64

In [44]:
# Replace missing values with empty strings so every text helper receives a
# str. Re-binding the result instead of using ``inplace=True`` avoids mutating
# what may be a view of the merged frame.
df = df.fillna('')

In [46]:
df.isnull().sum()

original_title    0
overview          0
genres            0
keywords          0
cast              0
crew              0
dtype: int64

In [48]:
def clean_text(text):
    """Normalise free text before vectorisation.

    Lower-cases the input, deletes every character that is neither a word
    character nor whitespace, collapses each whitespace run to one space and
    trims both ends.

    Parameters
    ----------
    text : str or missing value
        Raw text. ``None`` / NaN / ``pd.NA`` (how pandas represents missing
        cells) are treated as empty text; any other non-string scalar is
        converted with ``str`` first.

    Returns
    -------
    str
        The cleaned text, possibly empty.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise crash on ``.lower()``.
        if text is None or pd.isna(text):
            return ''
        text = str(text)
    lowered = text.lower()
    # Punctuation is deleted, not replaced: "war-weary" becomes "warweary".
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()

In [50]:
# Normalise every overview element-wise with the shared text cleaner.
df['overview'] = df['overview'].map(clean_text)

In [52]:
df['overview'].head()

0    in the 22nd century a paraplegic marine is dis...
1    captain barbossa long believed to be dead has ...
2    a cryptic message from bonds past sends him on...
3    following the death of district attorney harve...
4    john carter is a warweary former military capt...
Name: overview, dtype: object

In [54]:
def parse_genres(genres):
    """Flatten a ``genres`` cell into a space-separated string of genre names.

    Parameters
    ----------
    genres : str
        JSON array of objects that each carry a ``"name"`` key, e.g.
        ``'[{"id": 28, "name": "Action"}]``.

    Returns
    -------
    str
        The names in their stored order joined by single spaces. ``''`` is
        returned for a blank or non-string cell and for an empty array.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank cell is not valid JSON.
    """
    # Blank cells (e.g. produced by ``fillna('')``) have nothing to parse.
    if not isinstance(genres, str) or not genres.strip():
        return ''
    # The cell is JSON, so parse it as JSON: ``ast.literal_eval`` rejects the
    # valid JSON literals ``true`` / ``false`` / ``null``.
    entries = json.loads(genres)
    # Entries without a usable name are skipped rather than breaking the join.
    return ' '.join(entry['name'] for entry in entries if entry.get('name'))

In [56]:
# Turn each raw genres cell into a plain string of genre names.
df['genres'] = df['genres'].map(parse_genres)

In [58]:
df['genres'].head()

0    Action Adventure Fantasy Science Fiction
1                    Adventure Fantasy Action
2                      Action Adventure Crime
3                 Action Crime Drama Thriller
4            Action Adventure Science Fiction
Name: genres, dtype: object

In [60]:
def parse_keywords(keywords):
    """Flatten a ``keywords`` cell into a space-separated string of keywords.

    Parameters
    ----------
    keywords : str
        JSON array of objects that each carry a ``"name"`` key, e.g.
        ``'[{"id": 470, "name": "spy"}]``.

    Returns
    -------
    str
        The keyword names in their stored order joined by single spaces.
        ``''`` is returned for a blank or non-string cell and for an empty
        array.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank cell is not valid JSON.
    """
    # Blank cells (e.g. produced by ``fillna('')``) have nothing to parse.
    if not isinstance(keywords, str) or not keywords.strip():
        return ''
    # JSON parser instead of ``ast.literal_eval``, which cannot read the JSON
    # literals ``true`` / ``false`` / ``null``.
    tags = json.loads(keywords)
    # Entries without a usable name are skipped rather than breaking the join.
    return ' '.join(tag['name'] for tag in tags if tag.get('name'))

In [62]:
# Turn each raw keywords cell into a plain string of keyword names.
df['keywords'] = df['keywords'].map(parse_keywords)

In [64]:
df['keywords'].head()

0    culture clash future space war space colony so...
1    ocean drug abuse exotic island east india trad...
2    spy based on novel secret agent sequel mi6 bri...
3    dc comics crime fighter terrorist secret ident...
4    based on novel mars medallion space travel pri...
Name: keywords, dtype: object

In [66]:
def parse_cast(cast, top_n=5):
    """Flatten a ``cast`` cell into a space-separated string of performer names.

    Parameters
    ----------
    cast : str
        JSON array of objects that each carry a ``"name"`` key.
    top_n : int or None, default 5
        How many entries to keep, counted from the start of the array.
        ``None`` keeps all of them.

    Returns
    -------
    str
        Names of the kept entries in their stored order, joined by single
        spaces. ``''`` is returned for a blank or non-string cell and for an
        empty array.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank cell is not valid JSON.
    """
    # Blank cells (e.g. a movie without a credits row after ``fillna('')``).
    if not isinstance(cast, str) or not cast.strip():
        return ''
    # JSON parser instead of ``ast.literal_eval``, which cannot read the JSON
    # literals ``true`` / ``false`` / ``null``.
    members = json.loads(cast)
    leading = members[:top_n]
    # Entries without a usable name are skipped rather than breaking the join.
    return ' '.join(member['name'] for member in leading if member.get('name'))

In [68]:
# Reduce each raw cast cell to the names of its leading entries.
df['cast'] = df['cast'].map(parse_cast)

In [70]:
df['cast'].head()

0    Sam Worthington Zoe Saldana Sigourney Weaver S...
1    Johnny Depp Orlando Bloom Keira Knightley Stel...
2    Daniel Craig Christoph Waltz Léa Seydoux Ralph...
3    Christian Bale Michael Caine Gary Oldman Anne ...
4    Taylor Kitsch Lynn Collins Samantha Morton Wil...
Name: cast, dtype: object

In [72]:
def get_director(crew):
    """Return the name of the first crew entry whose job is ``'Director'``.

    Parameters
    ----------
    crew : str
        JSON array of objects; the entries of interest carry ``"job"`` and
        ``"name"`` keys.

    Returns
    -------
    str
        The first director's name, or ``''`` when the cell is blank or
        non-string, or when no entry has the job ``'Director'``.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank cell is not valid JSON.
    """
    # Blank cells (e.g. a movie without a credits row after ``fillna('')``).
    if not isinstance(crew, str) or not crew.strip():
        return ''
    # JSON parser instead of ``ast.literal_eval``, which cannot read the JSON
    # literals ``true`` / ``false`` / ``null``.
    for member in json.loads(crew):
        # ``.get`` tolerates entries that lack a "job" key.
        if member.get('job') == 'Director':
            return member.get('name') or ''
    return ''

In [74]:
# Derive a director column from the raw crew cell of every movie.
df['director'] = df['crew'].map(get_director)

In [76]:
df['director'].head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4       Andrew Stanton
Name: director, dtype: object

In [78]:
# The raw crew column is no longer needed once the director is extracted.
# Re-bind the result instead of mutating the frame in place.
df = df.drop(columns=['crew'])

In [80]:
# Give the parsed feature columns the same normalisation as the overview.
for col in ('genres', 'keywords', 'cast', 'director'):
    df[col] = df[col].map(clean_text)

In [82]:
# One text document per movie: overview, genres, keywords, cast and director,
# in that order, separated by single spaces.
df['content'] = df['overview'].str.cat(
    [df['genres'], df['keywords'], df['cast'], df['director']],
    sep=' ',
)

In [84]:
# Persist the cleaned frame for the modelling section; ``index=False`` keeps
# the row index out of the file.
df.to_csv('../data/movies_cleaned.csv' , index=False)

In [86]:
# Count movies that have no usable text at all. ``content`` always contains the
# four separator spaces, so its length is never 0; test for whitespace-only
# documents instead.
df['content'].str.strip().eq('').sum()

0

## Modeling — TF-IDF Vectorization

In [91]:
# TF-IDF vectoriser for the combined per-movie text.
tfidf = TfidfVectorizer(
    # Upper bound on the vocabulary size, i.e. on the number of matrix columns.
    max_features = 5000,
    # Use scikit-learn's built-in English stop-word list.
    stop_words = 'english'
)

In [93]:
# Reload the cleaned data. Empty strings are written to CSV as empty fields,
# which ``read_csv`` turns back into NaN by default; ``keep_default_na=False``
# keeps them (and literal text such as "NA" or "null") as strings.
df = pd.read_csv('../data/movies_cleaned.csv', keep_default_na=False)

In [97]:
df['content'].head()

0    in the 22nd century a paraplegic marine is dis...
1    captain barbossa long believed to be dead has ...
2    a cryptic message from bonds past sends him on...
3    following the death of district attorney harve...
4    john carter is a warweary former military capt...
Name: content, dtype: object

In [99]:
# Learn the vocabulary and IDF weights from ``content`` and encode every movie
# in the same pass; row i of the result corresponds to row i of ``df``.
tfidf_matrix = tfidf.fit_transform(df['content'])

In [101]:
tfidf_matrix.shape

(4803, 5000)

In [103]:
type(tfidf_matrix)

scipy.sparse._csr.csr_matrix

In [106]:
tfidf.get_feature_names_out()

array(['007', '10', '10yearold', ..., 'zoo', 'zooey', 'zucker'],
      dtype=object)

In [109]:
 tfidf_matrix[0]

<1x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 49 stored elements in Compressed Sparse Row format>

In [111]:
# Descriptive aliases for the fitted artefacts. Plain assignment binds a second
# name to the same object; nothing is copied.
movie_vectors = tfidf_matrix
vectorizer = tfidf

## Similarity Engine

In [114]:
from sklearn.metrics.pairwise import cosine_similarity

In [116]:
# Pairwise cosine similarity between all movie vectors: entry [i, j] scores
# movie i against movie j. The result is square in the number of movies, so
# memory grows quadratically with the catalogue size.
# NOTE(review): presumably a dense float64 array (about 185 MB at 4803 rows) - confirm.
similarity_matrix = cosine_similarity(tfidf_matrix)

In [118]:
similarity_matrix.shape

(4803, 4803)

In [120]:
# Lookup table: title -> row label of that movie in ``df``, used to index the
# similarity matrix.
movie_indices = pd.Series(
    df.index,
    index=df['original_title'],
)
# Several movies can share a title. With a duplicated label the lookup would
# return a Series instead of a single row number, so keep only the first
# occurrence of each title.
movie_indices = movie_indices[~movie_indices.index.duplicated(keep='first')]

In [122]:
movie_indices['Avatar']

0

## Inference — Recommendation Function

In [125]:
def recommend(movie_title, top_n=10, *, indices=None, similarity=None, titles=None):
    """Return the titles most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact title to look up (case-sensitive match against the index of
        ``indices``).
    top_n : int, default 10
        Maximum number of recommendations; values below 1 give an empty result.
    indices : pandas.Series, optional
        Title -> row-number lookup. Defaults to the notebook's
        ``movie_indices``.
    similarity : 2-D array-like, optional
        Square similarity matrix whose row ``i`` scores movie ``i`` against
        every movie. Defaults to the notebook's ``similarity_matrix``.
    titles : pandas.Series, optional
        Titles in the same row order as ``similarity``. Defaults to
        ``df['original_title']``.

    Returns
    -------
    numpy.ndarray or str
        Array of recommended titles ordered from most to least similar, or
        the message ``"Movie '<title>' not found in database"`` when the
        title is unknown.
    """
    # Fall back to the notebook-level objects only when nothing was passed in.
    indices = movie_indices if indices is None else indices
    similarity = similarity_matrix if similarity is None else similarity
    titles = df['original_title'] if titles is None else titles

    if movie_title not in indices.index:
        return f"Movie '{movie_title}' not found in database"

    # A title shared by several movies yields a Series of row numbers; use the
    # first match so one row is always selected.
    idx = int(np.atleast_1d(indices[movie_title])[0])

    scores = np.asarray(similarity[idx], dtype=float).ravel()
    # Stable sort on the negated scores = descending order with ties kept in
    # row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query movie by row number, not by assuming it sorts first:
    # another movie with an identical vector can tie with it at the top.
    ranked = ranked[ranked != idx][:max(top_n, 0)]

    return titles.iloc[ranked].values

## Testing

In [129]:
recommend('Avatar')

array(['Aliens', 'Alien', 'Moonraker', 'Alien³', 'Lifeforce', 'Lockout',
       'Silent Running', 'Lost in Space', 'The Thing', 'Mission to Mars'],
      dtype=object)

In [131]:
recommend("Random Movie")

"Movie 'Random Movie' not found in database"

In [133]:
recommend("Inception", top_n=7)

array(['Heartbreakers', 'Cypher', 'The Helix... Loaded', 'Hesher',
       'Pitch Perfect 2', 'Much Ado About Nothing', 'Don Jon'],
      dtype=object)

In [135]:
recommend("Titanic", top_n=5)

array(['Captain Phillips', 'Ghost Ship', 'Poseidon', 'Triangle',
       'Life of Pi'], dtype=object)