In [1]:
import json
import pandas as pd

In [2]:
def load_tmdb_movies(path):
    """Load the TMDB movies CSV and decode its serialized columns.

    Parameters
    ----------
    path : str, path-like or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents where ``release_date`` holds ``datetime.date``
        objects (missing dates remain ``NaT``) and every JSON-encoded column
        has been parsed with ``json.loads``.

    Raises
    ------
    KeyError
        If ``release_date`` or one of the JSON columns is absent.
    TypeError, json.JSONDecodeError
        If a JSON column contains a missing or malformed cell.
    """
    df = pd.read_csv(path)
    # Vectorized ``.dt`` accessor instead of a per-row Python lambda; the
    # column still ends up as plain ``datetime.date`` objects (object dtype).
    df['release_date'] = pd.to_datetime(df['release_date']).dt.date
    json_columns = ['genres', 'keywords', 'production_countries', 'production_companies', 'spoken_languages']
    for column in json_columns:
        # Each cell is a JSON string; replace it with the decoded Python value.
        df[column] = df[column].apply(json.loads)
    return df


def load_tmdb_credits(path):
    """Load the TMDB credits CSV and decode its JSON-encoded columns.

    Parameters
    ----------
    path : str, path-like or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with the ``cast`` and ``crew`` columns parsed by
        ``json.loads``; all other columns are left as read.
    """
    credits_raw = pd.read_csv(path)
    # Decode every JSON string column up front, then swap them in together.
    decoded = {
        name: credits_raw[name].apply(json.loads)
        for name in ('cast', 'crew')
    }
    return credits_raw.assign(**decoded)

In [27]:
# Load both TMDB CSVs through the loader helpers defined in an earlier cell.
movies = load_tmdb_movies("./upload/tmdb_5000_movies.csv")
credits = load_tmdb_credits("./upload/tmdb_5000_credits.csv")

In [28]:
# Summarize the movies frame: column names, non-null counts and dtypes.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [29]:
# Summarize the credits frame: column names, non-null counts and dtypes.
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


In [31]:
# Read the working CSV, using its first column as the index, then summarize it.
working = pd.read_csv("./upload/working_tmdb_5000_movies.csv", index_col=0)
working.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4803 entries, 0 to 4802
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        4803 non-null   int64 
 1   genres    4803 non-null   object
 2   overview  4800 non-null   object
 3   title     4803 non-null   object
 4   cast      4803 non-null   object
 5   crew      4803 non-null   object
 6   combine   4803 non-null   object
dtypes: int64(1), object(6)
memory usage: 300.2+ KB


In [32]:
# Show the movies rows whose overview is missing.
movies[movies['overview'].isnull()]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
2656,15000000,"[{'id': 18, 'name': 'Drama'}]",,370980,"[{'id': 717, 'name': 'pope'}, {'id': 5565, 'na...",it,Chiamatemi Francesco - Il Papa della gente,,0.738646,"[{'name': 'Taodue Film', 'id': 45724}]","[{'iso_3166_1': 'IT', 'name': 'Italy'}]",2015-12-03,0,,"[{'iso_639_1': 'es', 'name': 'Español'}]",Released,,Chiamatemi Francesco - Il Papa della gente,7.3,12
4140,2,"[{'id': 99, 'name': 'Documentary'}]",,459488,"[{'id': 6027, 'name': 'music'}, {'id': 225822,...",en,"To Be Frank, Sinatra at 100",,0.050625,"[{'name': 'Eyeline Entertainment', 'id': 60343}]","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]",2015-12-12,0,,[],Released,,"To Be Frank, Sinatra at 100",0.0,0
4431,913000,"[{'id': 99, 'name': 'Documentary'}]",,292539,[],de,Food Chains,,0.795698,[],[],2014-04-26,0,83.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Food Chains,7.4,8


In [33]:
# Take the ids of working rows whose crew is null, then show the credits rows
# whose movie_id matches one of them.
credits[credits['movie_id'].isin(working[working['crew'].isnull()]['id'])]

Unnamed: 0,movie_id,title,cast,crew


In [34]:
# Inspect the row labelled 0 of the loaded movies frame.
movies.loc[0]

budget                                                          237000000
genres                  [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...
homepage                                      http://www.avatarmovie.com/
id                                                                  19995
keywords                [{'id': 1463, 'name': 'culture clash'}, {'id':...
original_language                                                      en
original_title                                                     Avatar
overview                In the 22nd century, a paraplegic Marine is di...
popularity                                                     150.437577
production_companies    [{'name': 'Ingenious Film Partners', 'id': 289...
production_countries    [{'iso_3166_1': 'US', 'name': 'United States o...
release_date                                                   2009-12-10
revenue                                                        2787965087
runtime                               

In [35]:
# Inspect the row labelled 0 of the working frame.
working.loc[0]

id                                                      19995
genres                     ['Action', 'Adventure', 'Fantasy']
overview    In the 22nd century, a paraplegic Marine is di...
title                                                  Avatar
cast        ['Sam Worthington', 'Zoe Saldana', 'Sigourney ...
crew        [{"credit_id": "52fe48009251416c750aca23", "de...
combine     ['Sam Worthington', 'Zoe Saldana', 'Sigourney ...
Name: 0, dtype: object

In [41]:
# Collect the 'name' of each decoded keyword entry for the movies row labelled 0.
[key['name'] for key in movies.loc[0, 'keywords']]

['culture clash',
 'future',
 'space war',
 'space colony',
 'society',
 'space travel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alien planet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'love affair',
 'anti war',
 'power relations',
 'mind and soul',
 '3d']