In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import json
import matplotlib.pyplot as plt
import scipy.stats as stats

# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

# 1. Load Data

In [2]:
# Both TMDB CSV exports sit in the same dataset directory (relative path).
tmdb_dataset_dir = 'Capstons_datasets/tmdb-5000-movie-dataset'
tmdb_credits_path = tmdb_dataset_dir + '/tmdb_5000_credits.csv'
tmdb_movies_path = tmdb_dataset_dir + '/tmdb_5000_movies.csv'

In [3]:
# Read the raw credits and movies tables, in that order, into the "origin" frames.
origin_credits_df, origin_movies_df = (
    pd.read_csv(csv_path)
    for csv_path in (tmdb_credits_path, tmdb_movies_path)
)

# 2. Clean Data

In [4]:
# Join credits to movies on the TMDB id (credits.movie_id == movies.id).
# validate='one_to_one' makes pandas raise a MergeError if either key column
# contains duplicates, instead of silently multiplying rows in the merged frame.
cleaning_df = pd.merge(
    origin_credits_df,
    origin_movies_df,
    left_on='movie_id',
    right_on='id',
    validate='one_to_one',
)

In [5]:
# Summarise the merged frame: number of entries, columns, dtypes and non-null counts.
cleaning_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4803 entries, 0 to 4802
Data columns (total 24 columns):
movie_id                4803 non-null int64
title_x                 4803 non-null object
cast                    4803 non-null object
crew                    4803 non-null object
budget                  4803 non-null int64
genres                  4803 non-null object
homepage                1712 non-null object
id                      4803 non-null int64
keywords                4803 non-null object
original_language       4803 non-null object
original_title          4803 non-null object
overview                4800 non-null object
popularity              4803 non-null float64
production_companies    4803 non-null object
production_countries    4803 non-null object
release_date            4802 non-null object
revenue                 4803 non-null int64
runtime                 4801 non-null float64
spoken_languages        4803 non-null object
status                  4803 non-null ob

In [6]:
# Preview the first rows of the merged frame.
cleaning_df.head()

Unnamed: 0,movie_id,title_x,cast,crew,budget,genres,homepage,id,keywords,original_language,...,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title_y,vote_average,vote_count
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,...,"[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...",250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...",260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


## Clean duplicate data

In [7]:
# Are 'movie_id' and 'id' identical on every row (i.e. is one of them redundant)?
cleaning_df['movie_id'].eq(cleaning_df['id']).all()

True

In [8]:
# Are 'title_x' and 'title_y' identical on every row (i.e. is one of them redundant)?
cleaning_df['title_x'].eq(cleaning_df['title_y']).all()

True

In [9]:
# Is 'original_title' also a copy of 'title_x'? A False result means at least
# one row differs, so 'original_title' carries its own information.
cleaning_df['title_x'].eq(cleaning_df['original_title']).all()

False

In [10]:
# Preview the frame with the duplicated key/title columns removed.
# NOTE(review): the result is only displayed, not assigned, so `cleaning_df`
# still contains 'id' and 'title_y' after this cell. Confirm whether the drop
# was meant to persist before relying on those columns being gone.
cleaning_df.drop(labels=['id', 'title_y'], axis='columns').head()

Unnamed: 0,movie_id,title_x,cast,crew,budget,genres,homepage,keywords,original_language,original_title,...,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,vote_average,vote_count
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,...,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,7.2,11800
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,...,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",6.9,4500
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,...,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,6.3,4466
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...",250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,...,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,7.6,9106
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...",260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,...,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",6.1,2124


## Clean useless data, fix missing data

In [11]:
# Count missing values per column, columns with the most gaps first.
cleaning_df.isna().sum().sort_values(ascending=False)

homepage                3091
tagline                  844
overview                   3
runtime                    2
release_date               1
original_language          0
title_x                    0
cast                       0
crew                       0
budget                     0
genres                     0
id                         0
keywords                   0
vote_count                 0
original_title             0
vote_average               0
popularity                 0
production_companies       0
production_countries       0
revenue                    0
spoken_languages           0
status                     0
title_y                    0
movie_id                   0
dtype: int64

In [12]:
# 'homepage' is missing for most rows, so the column is discarded.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone is a no-op instead of raising KeyError.
cleaning_df = cleaning_df.drop(columns=['homepage'], errors='ignore')

In [13]:
# Show a few of the rows whose 'tagline' is missing.
cleaning_df.loc[cleaning_df['tagline'].isnull()].head()

Unnamed: 0,movie_id,title_x,cast,crew,budget,genres,id,keywords,original_language,original_title,...,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title_y,vote_average,vote_count
10,1452,Superman Returns,"[{""cast_id"": 3, ""character"": ""Superman / Clark...","[{""credit_id"": ""553bef6a9251416874003c8f"", ""de...",270000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",1452,"[{""id"": 83, ""name"": ""saving the world""}, {""id""...",en,Superman Returns,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2006-06-28,391081192,154.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,,Superman Returns,5.4,1400
56,188927,Star Trek Beyond,"[{""cast_id"": 9, ""character"": ""James T. Kirk"", ...","[{""credit_id"": ""52fe4d489251416c751360bb"", ""de...",185000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",188927,"[{""id"": 9663, ""name"": ""sequel""}, {""id"": 9743, ...",en,Star Trek Beyond,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2016-07-07,343471816,122.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,Star Trek Beyond,6.6,2568
66,14160,Up,"[{""cast_id"": 4, ""character"": ""Carl Fredricksen...","[{""credit_id"": ""5683b5f8c3a3684be90168e1"", ""de...",175000000,"[{""id"": 16, ""name"": ""Animation""}, {""id"": 35, ""...",14160,"[{""id"": 965, ""name"": ""age difference""}, {""id"":...",en,Up,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-05-13,735099082,96.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,Up,7.7,6870
78,278927,The Jungle Book,"[{""cast_id"": 12, ""character"": ""Mowgli"", ""credi...","[{""credit_id"": ""571507b692514105a80032f0"", ""de...",175000000,"[{""id"": 10751, ""name"": ""Family""}, {""id"": 12, ""...",278927,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,The Jungle Book,...,"[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2016-04-07,966550600,106.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,The Jungle Book,6.7,2892
128,13448,Angels & Demons,"[{""cast_id"": 4, ""character"": ""Robert Langdon"",...","[{""credit_id"": ""52fe456a9251416c750559ef"", ""de...",150000000,"[{""id"": 53, ""name"": ""Thriller""}, {""id"": 9648, ...",13448,"[{""id"": 588, ""name"": ""rome""}, {""id"": 716, ""nam...",en,Angels & Demons,...,"[{""iso_3166_1"": ""IT"", ""name"": ""Italy""}, {""iso_...",2009-05-13,356613439,138.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,,Angels & Demons,6.5,2129


In [14]:
# Show every row whose 'overview' is missing.
cleaning_df.loc[cleaning_df['overview'].isnull()]

Unnamed: 0,movie_id,title_x,cast,crew,budget,genres,id,keywords,original_language,original_title,...,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title_y,vote_average,vote_count
2656,370980,Chiamatemi Francesco - Il Papa della gente,"[{""cast_id"": 5, ""character"": ""Jorge Mario Berg...","[{""credit_id"": ""5660019ac3a36875f100252b"", ""de...",15000000,"[{""id"": 18, ""name"": ""Drama""}]",370980,"[{""id"": 717, ""name"": ""pope""}, {""id"": 5565, ""na...",it,Chiamatemi Francesco - Il Papa della gente,...,"[{""iso_3166_1"": ""IT"", ""name"": ""Italy""}]",2015-12-03,0,,"[{""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,,Chiamatemi Francesco - Il Papa della gente,7.3,12
4140,459488,"To Be Frank, Sinatra at 100","[{""cast_id"": 0, ""character"": ""Narrator"", ""cred...","[{""credit_id"": ""592b25e4c3a368783e065a2f"", ""de...",2,"[{""id"": 99, ""name"": ""Documentary""}]",459488,"[{""id"": 6027, ""name"": ""music""}, {""id"": 225822,...",en,"To Be Frank, Sinatra at 100",...,"[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""}]",2015-12-12,0,,[],Released,,"To Be Frank, Sinatra at 100",0.0,0
4431,292539,Food Chains,[],"[{""credit_id"": ""5470c3b1c3a368085e000abd"", ""de...",913000,"[{""id"": 99, ""name"": ""Documentary""}]",292539,[],de,Food Chains,...,[],2014-04-26,0,83.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,Food Chains,7.4,8


Replace null `tagline` and `overview` values with an empty string (`''`).

In [15]:
# Replace missing 'tagline' / 'overview' values with an empty string.
# The boolean mask is passed to .loc directly: this avoids building a filtered
# copy of the frame just to read its index, and it only touches rows that are
# actually null even if index labels were ever duplicated.
for text_col in ('tagline', 'overview'):
    cleaning_df.loc[cleaning_df[text_col].isnull(), text_col] = ''

In [16]:
# Show every row whose 'runtime' is missing.
cleaning_df.loc[cleaning_df['runtime'].isnull()]

Unnamed: 0,movie_id,title_x,cast,crew,budget,genres,id,keywords,original_language,original_title,...,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title_y,vote_average,vote_count
2656,370980,Chiamatemi Francesco - Il Papa della gente,"[{""cast_id"": 5, ""character"": ""Jorge Mario Berg...","[{""credit_id"": ""5660019ac3a36875f100252b"", ""de...",15000000,"[{""id"": 18, ""name"": ""Drama""}]",370980,"[{""id"": 717, ""name"": ""pope""}, {""id"": 5565, ""na...",it,Chiamatemi Francesco - Il Papa della gente,...,"[{""iso_3166_1"": ""IT"", ""name"": ""Italy""}]",2015-12-03,0,,"[{""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,,Chiamatemi Francesco - Il Papa della gente,7.3,12
4140,459488,"To Be Frank, Sinatra at 100","[{""cast_id"": 0, ""character"": ""Narrator"", ""cred...","[{""credit_id"": ""592b25e4c3a368783e065a2f"", ""de...",2,"[{""id"": 99, ""name"": ""Documentary""}]",459488,"[{""id"": 6027, ""name"": ""music""}, {""id"": 225822,...",en,"To Be Frank, Sinatra at 100",...,"[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""}]",2015-12-12,0,,[],Released,,"To Be Frank, Sinatra at 100",0.0,0


In [17]:
# Hand-entered runtimes for the two movies whose 'runtime' is missing.
# Rows are selected by TMDB movie_id rather than by hard-coded row label:
# row labels depend on the merge order, and `.loc[label, col] = value` with a
# label that does not exist would silently append a new row.
manual_runtimes = {
    370980: 98.0,  # Chiamatemi Francesco - Il Papa della gente
    459488: 81.0,  # To Be Frank, Sinatra at 100
}
for fix_movie_id, fix_runtime in manual_runtimes.items():
    cleaning_df.loc[cleaning_df['movie_id'] == fix_movie_id, 'runtime'] = fix_runtime

In [18]:
# Show every row whose 'release_date' is missing.
cleaning_df.loc[cleaning_df['release_date'].isnull()]

Unnamed: 0,movie_id,title_x,cast,crew,budget,genres,id,keywords,original_language,original_title,...,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title_y,vote_average,vote_count
4553,380097,America Is Still the Place,[],[],0,[],380097,[],en,America Is Still the Place,...,[],,0,0.0,[],Released,,America Is Still the Place,0.0,0


In [19]:
# Discard rows that have no release date and keep the result as `movies`.
# Selecting by the null condition instead of a hard-coded row label keeps the
# cell correct if row order changes, and it cannot raise KeyError on a
# missing label.
movies = cleaning_df.dropna(subset=['release_date'])

In [20]:
# Re-inspect the cleaned frame: entry count, dtypes and non-null counts per column.
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4802 entries, 0 to 4802
Data columns (total 23 columns):
movie_id                4802 non-null int64
title_x                 4802 non-null object
cast                    4802 non-null object
crew                    4802 non-null object
budget                  4802 non-null int64
genres                  4802 non-null object
id                      4802 non-null int64
keywords                4802 non-null object
original_language       4802 non-null object
original_title          4802 non-null object
overview                4802 non-null object
popularity              4802 non-null float64
production_companies    4802 non-null object
production_countries    4802 non-null object
release_date            4802 non-null object
revenue                 4802 non-null int64
runtime                 4802 non-null float64
spoken_languages        4802 non-null object
status                  4802 non-null object
tagline                 4802 non-null ob

In [21]:
# Final check: missing-value count per column of the cleaned frame, largest first.
movies.isna().sum().sort_values(ascending=False)

vote_count              0
overview                0
title_x                 0
cast                    0
crew                    0
budget                  0
genres                  0
id                      0
keywords                0
original_language       0
original_title          0
popularity              0
vote_average            0
production_companies    0
production_countries    0
release_date            0
revenue                 0
runtime                 0
spoken_languages        0
status                  0
tagline                 0
title_y                 0
movie_id                0
dtype: int64

3. 