In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load dataset
movies = pd.read_csv('Dataset/tmdb_5000_movies.csv')
credit = pd.read_csv('Dataset/tmdb_5000_credits.csv')

### Understand your data

In [6]:
# How big data is?
print(f"Movies: {movies.shape}")
print(f"Credit: {credit.shape}")

Movies: (4803, 20)
Credit: (4803, 4)


In [21]:
# How data look like?
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [22]:
credit.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [23]:
# Merge both datasets
movies_data = movies.merge(credit, on="title")

In [24]:
# Get info of all columns 
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [25]:
# check null values
movies_data.isnull().sum()

budget                     0
genres                     0
homepage                3096
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
movie_id                   0
cast                       0
crew                       0
dtype: int64

In [28]:
# Check duplicates entries
movies_data.duplicated().sum()

0

In [26]:
# Drop not required columns
not_required_features = ['production_companies', 'popularity', 'status', 'id', 'original_language',
                         'tagline', 'vote_count', 'revenue', 'runtime', 'production_countries',
                          'homepage', 'budget', 'release_date', 'vote_average', 'spoken_languages', 'original_title']
movies_data1 = movies_data.drop(not_required_features, axis='columns')
movies_data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   genres    4809 non-null   object
 1   keywords  4809 non-null   object
 2   overview  4806 non-null   object
 3   title     4809 non-null   object
 4   movie_id  4809 non-null   int64 
 5   cast      4809 non-null   object
 6   crew      4809 non-null   object
dtypes: int64(1), object(6)
memory usage: 300.6+ KB


In [29]:
movies_data1.sample(1)

Unnamed: 0,genres,keywords,overview,title,movie_id,cast,crew
2837,"[{""id"": 27, ""name"": ""Horror""}, {""id"": 878, ""na...","[{""id"": 6898, ""name"": ""quarantine""}, {""id"": 97...",A television reporter and her cameraman are tr...,Quarantine,13812,"[{""cast_id"": 1, ""character"": ""Angela Vidal"", ""...","[{""credit_id"": ""5834decbc3a36829d900d7a5"", ""de..."


In [34]:
movies_data1['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [35]:
movies_data1['keywords'][0]

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [37]:
# movies_data1['cast'][0]

In [49]:
import ast
def get_name_data(obj):
    keyword_list = []
    for data in ast.literal_eval(obj):
        keyword_list.append(data['name'].replace(' ',''))
    return keyword_list

movies_data1['genres'] = movies_data1['genres'].apply(get_name_data)
movies_data1['keywords'] = movies_data1['keywords'].apply(get_name_data)

In [51]:
# get first 5 cast name
def get_cast_data(obj):
    cast_list = []
    for index, data in enumerate(ast.literal_eval(obj)):
        if index > 5:
            break
        cast_list.append(data['name'].replace(' ',''))
    return cast_list
    
movies_data1['cast'] = movies_data1['cast'].apply(get_name_data)

In [56]:
# get crew data (Producer, Director)
def get_crew_data(obj):
    crew_data = []
    i = 0
    find_crew = ['Director','Producer']
    crew_len = len(find_crew)

    for data in ast.literal_eval(obj):
        if data['job'] in find_crew:
            i += 1
            crew_data.append(data['name'].replace(' ',''))
        
        if i == crew_len: break
    return crew_data

movies_data1['crew'] = movies_data1['crew'].apply(get_crew_data)

In [61]:
movies_data1['overview'] = movies_data1['overview'].apply(lambda x: str(x).split())

In [65]:
movies_data1.sample(1)

Unnamed: 0,genres,keywords,overview,title,movie_id,cast,crew,tags
292,"[Fantasy, Action, Adventure, Family]","[basedonnovel, mythicalcreature, dragon, fanta...","[In, his, homeland, of, Alagaesia,, a, farm, b...",Eragon,2486,"[EdSpeleers, JeremyIrons, SiennaGuillory, Robe...","[JohnDavis, WyckGodfrey]","[In, his, homeland, of, Alagaesia,, a, farm, b..."


In [66]:
movies_data1['tags'] = movies_data1['overview'] + movies_data1['genres'] + movies_data1['keywords'] + movies_data1['cast'] + movies_data1['crew']

In [67]:
movies_data1.columns

Index(['genres', 'keywords', 'overview', 'title', 'movie_id', 'cast', 'crew',
       'tags'],
      dtype='object')

In [68]:
movies_data2 = movies_data1[['movie_id', 'title', 'tags']]

In [69]:
movies_data2.sample()

Unnamed: 0,movie_id,title,tags
3691,16337,The Texas Chainsaw Massacre 2,"[A, radio, host, is, victimized, by, the, cann..."


In [70]:
movies_data2['tags'] = movies_data2['tags'].apply(lambda x: " ".join(x))
movies_data2['tags'] = movies_data2['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_data2['tags'] = movies_data2['tags'].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_data2['tags'] = movies_data2['tags'].apply(lambda x:x.lower())


In [77]:
movies_data2.sample(1)

Unnamed: 0,movie_id,title,tags
2305,25195,Leap Year,when yet another anniversary passes without a ...


Perform stemming

In [79]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem(text):
    y = []

    for i in text.split():
        y.append(ps.stem(i))

    return " ".join(y)

In [None]:
# Apply stemming on tag field
movies_data2['tags'] = movies_data2['tags'].apply(stem)

Perform vectorization

In [102]:
# Performing vectorization
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words="english")

In [103]:
# cv.get_stop_words()

In [104]:
vectors = cv.fit_transform(movies_data2['tags']).toarray()

In [106]:
cv.get_feature_names_out()

array(['000', '10', '11', ..., 'zoo', 'zooeydeschanel', 'zoëkravitz'],
      dtype=object)

In [113]:
# find distance of one movie with all other movies
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [117]:
import pickle
pickle.dump(movies_data2.to_dict(), open('movies.pkl', 'wb'))
pickle.dump(similarity, open('similarity_movie_metric.pkl', 'wb'))