In [366]:
import os 
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval 

import torch
import torch.nn as nn
import torch.nn.functional as F

pd.set_option("display.max_columns", 100)

In [367]:
os.listdir("dataset")

['tmdb_5000_credits.csv', 'tmdb_5000_movies.csv']

In [368]:
movie_df = pd.read_csv("dataset/tmdb_5000_movies.csv")
credit_df = pd.read_csv("dataset/tmdb_5000_credits.csv")

In [369]:
movie_df.head(3)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466


In [370]:
credit_df.head(3)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."


In [371]:
credit_df.rename(columns = {"movie_id": "id"}, inplace = True)
credit_df.drop(columns = ["title"], inplace = True)

In [372]:
full_df = movie_df.merge(credit_df, on="id")

In [373]:
full_df.head(3)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."


In [374]:
list_features = ["genres", "keywords", "cast", "crew"]
for feature in list_features:
    full_df[feature] = full_df[feature].apply(literal_eval)

In [375]:
def extract_job(x, title):
    for data in x:
        if data["job"] == title:
            return data["name"]
    return np.nan

In [376]:
job_title = ["Director"]
for title in job_title:
    full_df[title] = full_df.crew.apply(lambda x: extract_job(x, title))

In [377]:
def extract_data(x, num_data = 3):
    name_list = []
    for data in x:
        name_list.append(data["name"])
    if num_names < len(name_list):
        return name_list[: num_names - 1]
    return name_list
            

In [378]:
def normalize(x):
    return [data.strip() for data in x]

In [379]:
for feature in ["cast", "keywords", "genres"]:
    full_df[feature] = full_df[feature].apply(lambda x: extract_cast(x, 4))
    full_df[feature] = full_df[feature].apply(normalize)

In [380]:
columns = ["cast", "genres", "title", "Director"]

In [381]:
# to focus on recommender system, drop null values
droped_df = full_df[columns].dropna()
droped_df.drop_duplicates(subset=['title'], inplace = True)

In [382]:
def create_text(x):
    return f"{','.join(x['cast'])},{','.join(x['genres'])},{x['title']},{x['Director']}"

In [383]:
droped_df["important"] = droped_df.apply(create_text, axis = 1).drop_duplicates()

In [384]:
droped_df["important"][10]

'Brandon Routh,Kevin Spacey,Kate Bosworth,Adventure,Fantasy,Action,Science Fiction,Superman Returns,Bryan Singer'

In [385]:
cv = CountVectorizer(stop_words = "english", token_pattern=r'[^,]+')
X = cv.fit_transform(droped_df["important"])

In [405]:
#cv.get_feature_names()

In [388]:
similarity = cosine_similarity(X, X)

In [420]:
len(X.toarray()[0])

12691

In [414]:
len(similarity[0])

4770

In [417]:
droped_df["important"]

0       Sam Worthington,Zoe Saldana,Sigourney Weaver,A...
1       Johnny Depp,Orlando Bloom,Keira Knightley,Adve...
2       Daniel Craig,Christoph Waltz,Léa Seydoux,Actio...
3       Christian Bale,Michael Caine,Gary Oldman,Actio...
4       Taylor Kitsch,Lynn Collins,Samantha Morton,Act...
                              ...                        
4798    Carlos Gallardo,Jaime de Hoyos,Peter Marquardt...
4799    Edward Burns,Kerry Bishé,Marsha Dietlein,Comed...
4800    Eric Mabius,Kristin Booth,Crystal Lowe,Comedy,...
4801    Daniel Henney,Eliza Coupe,Bill Paxton,,Shangha...
4802    Drew Barrymore,Brian Herzlinger,Corey Feldman,...
Name: important, Length: 4770, dtype: object

In [415]:
len(cv.get_feature_names())

12691

In [406]:
def get_recommend(title):
    index = droped_df[droped_df["title"] == title].index[0]
    top10 = pd.DataFrame(similarity[0], columns=["scores"]).sort_values(by="scores", ascending = False).iloc[1: 11]
    return droped_df.iloc[top10.index]

In [407]:
top10_similar_movies = get_recommend("Avatar")

In [408]:
top10_similar_movies

Unnamed: 0,cast,genres,title,Director,important
47,"[Chris Pine, Zachary Quinto, Zoe Saldana]","[Action, Adventure, Science Fiction]",Star Trek Into Darkness,J.J. Abrams,"Chris Pine,Zachary Quinto,Zoe Saldana,Action,A..."
206,"[Sam Worthington, Liam Neeson, Ralph Fiennes]","[Adventure, Fantasy, Action]",Clash of the Titans,Louis Leterrier,"Sam Worthington,Liam Neeson,Ralph Fiennes,Adve..."
94,"[Chris Pratt, Zoe Saldana, Dave Bautista]","[Action, Science Fiction, Adventure]",Guardians of the Galaxy,James Gunn,"Chris Pratt,Zoe Saldana,Dave Bautista,Action,S..."
46,"[Hugh Jackman, James McAvoy, Michael Fassbender]","[Action, Adventure, Fantasy, Science Fiction]",X-Men: Days of Future Past,Bryan Singer,"Hugh Jackman,James McAvoy,Michael Fassbender,A..."
10,"[Brandon Routh, Kevin Spacey, Kate Bosworth]","[Adventure, Fantasy, Action, Science Fiction]",Superman Returns,Bryan Singer,"Brandon Routh,Kevin Spacey,Kate Bosworth,Adven..."
813,"[Christopher Reeve, Marlon Brando, Margot Kidder]","[Action, Adventure, Fantasy, Science Fiction]",Superman,Richard Donner,"Christopher Reeve,Marlon Brando,Margot Kidder,..."
232,"[Hugh Jackman, Hiroyuki Sanada, Famke Janssen]","[Action, Science Fiction, Adventure, Fantasy]",The Wolverine,James Mangold,"Hugh Jackman,Hiroyuki Sanada,Famke Janssen,Act..."
870,"[Gene Hackman, Christopher Reeve, Ned Beatty]","[Action, Adventure, Fantasy, Science Fiction]",Superman II,Richard Lester,"Gene Hackman,Christopher Reeve,Ned Beatty,Acti..."
3494,"[Marc Singer, Kari Wuhrer, Sarah Douglas]","[Action, Adventure, Fantasy, Science Fiction]",Beastmaster 2: Through the Portal of Time,Sylvio Tabet,"Marc Singer,Kari Wuhrer,Sarah Douglas,Action,A..."
14,"[Henry Cavill, Amy Adams, Michael Shannon]","[Action, Adventure, Fantasy, Science Fiction]",Man of Steel,Zack Snyder,"Henry Cavill,Amy Adams,Michael Shannon,Action,..."
