In [131]:
import pandas as pd 
import numpy as np 
import ast

In [132]:
credits = pd.read_csv('tmdb_5000_credits.csv')
movies = pd.read_csv('tmdb_5000_movies.csv')


In [133]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [134]:
movies = movies.merge(credits, left_on='title', right_on='title')

In [135]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [136]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [137]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [138]:
def convert(obj):
    L =[]
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [139]:
movies['genres'] = movies['genres'].apply(convert)

In [140]:
movies['genres']

0       [Action, Adventure, Fantasy, Science Fiction]
1                        [Adventure, Fantasy, Action]
2                          [Action, Adventure, Crime]
3                    [Action, Crime, Drama, Thriller]
4                [Action, Adventure, Science Fiction]
                            ...                      
4804                        [Action, Crime, Thriller]
4805                                [Comedy, Romance]
4806               [Comedy, Drama, Romance, TV Movie]
4807                                               []
4808                                    [Documentary]
Name: genres, Length: 4809, dtype: object

In [141]:
movies['keywords'] = movies['keywords'].apply(convert)
movies['keywords']

0       [culture clash, future, space war, space colon...
1       [ocean, drug abuse, exotic island, east india ...
2       [spy, based on novel, secret agent, sequel, mi...
3       [dc comics, crime fighter, terrorist, secret i...
4       [based on novel, mars, medallion, space travel...
                              ...                        
4804    [united states–mexico barrier, legs, arms, pap...
4805                                                   []
4806    [date, love at first sight, narration, investi...
4807                                                   []
4808            [obsession, camcorder, crush, dream girl]
Name: keywords, Length: 4809, dtype: object

In [142]:
movies['cast'] = movies['cast'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)[:3]])  #which is only 3 actors

movies['crew'] = movies['crew'].apply(lambda x: [i['name'] for i in ast.literal_eval(x) if i['job'] == 'Director'])

movies['tags'] = movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

movies['tags']

0       [Action, Adventure, Fantasy, Science Fiction, ...
1       [Adventure, Fantasy, Action, ocean, drug abuse...
2       [Action, Adventure, Crime, spy, based on novel...
3       [Action, Crime, Drama, Thriller, dc comics, cr...
4       [Action, Adventure, Science Fiction, based on ...
                              ...                        
4804    [Action, Crime, Thriller, united states–mexico...
4805    [Comedy, Romance, Edward Burns, Kerry Bishé, M...
4806    [Comedy, Drama, Romance, TV Movie, date, love ...
4807    [Daniel Henney, Eliza Coupe, Bill Paxton, Dani...
4808    [Documentary, obsession, camcorder, crush, dre...
Name: tags, Length: 4809, dtype: object

In [143]:
movies = movies[['movie_id', 'title', 'overview', 'tags']]
movies['tags'] = movies['tags'].apply(lambda x: [tag.lower() for tag in x])
movies.head()


Unnamed: 0,movie_id,title,overview,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[action, adventure, fantasy, science fiction, ..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[adventure, fantasy, action, ocean, drug abuse..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[action, adventure, crime, spy, based on novel..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[action, crime, drama, thriller, dc comics, cr..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[action, adventure, science fiction, based on ..."


In [144]:
from sklearn.feature_extraction.text import TfidfVectorizer
movies['tags'] = movies['tags'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
tfidf = TfidfVectorizer(stop_words= 'english')
tfidf_matrix = tfidf.fit_transform(movies['tags'])

In [145]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim= cosine_similarity(tfidf_matrix, tfidf_matrix)


In [146]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = movies[movies['title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Get top 10 similar movies
    movie_indices = [i[0] for i in sim_scores]
    return movies['title'].iloc[movie_indices]

print(get_recommendations('The Dark Knight Rises'))

65               The Dark Knight
119                Batman Begins
1360                      Batman
210               Batman & Robin
428               Batman Returns
1361                      Batman
1197                The Prestige
303                     Catwoman
4644    Amidst the Devil's Wings
72                 Suicide Squad
Name: title, dtype: object


In [147]:
import pickle
with open('movie_data.pkl', 'wb') as file:
    pickle.dump((movies,cosine_sim,),file)