In [1]:
# basic python data handling analysis modules
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
sns.set()
import pickle, os, gc, re
# small utility functions
from utility import *

# interactive jupyter widgets!
# https://towardsdatascience.com/interactive-controls-for-jupyter-notebooks-f5c94829aee6
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

%matplotlib inline

In [2]:
import shutil
import tempfile
import urllib.request
import gzip
import datetime
import json
import time

# Root folder holding all datasets, relative to the notebook. The trailing
# empty component makes os.path.join append the final path separator, so
# file names can be concatenated straight onto it.
Dataset_directory = os.path.join("..", "Datasets", "")
# create_subdir is not defined in this notebook - presumably it comes from
# `from utility import *`; verify there what it returns.
tmdb_dir = create_subdir(Dataset_directory, "themoviedb")

In [3]:
# Directory holding the large TMDB dumps. The historical location is a
# machine-specific Windows path, so allow it to be overridden through the
# TMDB_SECONDARY_DIR environment variable without editing the notebook.
_DEFAULT_SECONDARY_TMDB_DIR = "e:\\Datasets\\themoviedb\\"
secondary_tmdb_dir = os.environ.get("TMDB_SECONDARY_DIR", _DEFAULT_SECONDARY_TMDB_DIR)
# Later cells build file paths by plain string concatenation, so the
# directory must always end with a path separator.
if not secondary_tmdb_dir.endswith(("\\", "/")):
    secondary_tmdb_dir += os.sep

In [6]:
# Display the directory the TMDB dumps will be read from.
secondary_tmdb_dir

'e:\\Datasets\\themoviedb\\'

In [None]:
## Let's have a look at the data returned by the person_id API

In [8]:
# Load the person-level TMDB dump into a DataFrame.
# NOTE: pd.read_msgpack was deprecated in pandas 0.25 and removed in 1.0, so
# this cell only runs on pandas < 1.0.
person_ids_file = secondary_tmdb_dir + "person_ids_df_as_of_10_07_2019_final_part" + ".msgpack"
tmdb_df = pd.read_msgpack(person_ids_file)

In [None]:
# List the column names of the person-level frame.
tmdb_df.columns

In [8]:
# adult: bool, complete, mostly False
# also_known_as: complete, but mostly "()", string, brackets comma separated list - read as tuple
# biography: complete, but mostly "", string modal length 0, then 27
# birthday: incomplete
    # mode ="2018-05-01" - doesn't seem likely! (maybe check the people??)
    # the next most frequent values look realistic but are all 19XX-01-01 (placeholder day/month?) -> break into day/month/year and set those as NaN
# credits - obscenely large dictionary listing all movies in as 'cast' or 'crew'
# deathday: incomplete
    # mode = "2018-05-01"
    # similar pattern of 19XX-01-01
# external_ids: 'freebase_id', 'instagram_id', 'tvrage_id', 'twitter_id', 'freebase_mid', 'imdb_id', 'facebook_id'
# gender: complete, but 2/3 "unknown" category
# homepage: incomplete (obviously!), webpage strings (format inconsistent - with and without http/https)
# id: complete and unique
# imdb_id: incomplete (has some blanks) and 2 pairs of entries with same imdb_id
# known_for_department: incomplete (but mostly filled in), categorical, reasonable set of categories (except for one value "Actors" with only 7 instances!)
# name: complete, strings but some silly values ("Someone",".","~")
# place_of_birth: incomplete, mostly "city, state, country", sometimes just country
# popularity: complete, float; 0.6 is by far the most common value and appears to be the default
# profile_path: incomplete, string filepath, some overlap (~10 values)


# new/useful information:
    # birthday/deathday -> age (can check for people who star in things after their death! or before birth?)
    # known_for_department -> easy to use
    # place_of_birth -> hassle but potentially interesting

adult                                                               False
also_known_as           (George Walton Lucas Jr. , 乔治·卢卡斯, Джордж Лука...
biography               Arguably the most important film innovator in ...
birthday                                                       1944-05-14
credits                 {'cast': ({'popularity': 10.908, 'vote_count':...
deathday                                                             None
external_ids            {'freebase_id': '/en/george_lucas', 'instagram...
gender                                                                  2
homepage                                                             None
id                                                                      1
images                  {'profiles': ({'iso_639_1': None, 'aspect_rati...
imdb_id                                                         nm0000184
known_for_department                                           Production
name                                  

In [9]:
# Missing-value count per column (isna is the same check as isnull).
tmdb_df.isna().sum()

adult                        0
also_known_as                0
biography                    0
birthday                392951
credits                      0
deathday                406301
external_ids                 0
gender                       0
homepage                405664
id                           0
images                       0
imdb_id                      0
known_for_department       162
name                         0
place_of_birth          394060
popularity                   0
profile_path            376998
dtype: int64

In [13]:
# Ten most frequent alias tuples.
aka_counts = tmdb_df["also_known_as"].value_counts()
aka_counts.head(10)

()                                266745
(The Grateful Dead,)                   8
(Evanescence,)                         7
(The Kelly Family,)                    6
(Les Luthiers,)                        6
(Steps Ahead,)                         6
(Def Leppard,)                         6
(Nightwish,)                           6
(The Beach Boys,)                      6
(Eric Burdon and The Animals,)         5
Name: also_known_as, dtype: int64

In [16]:
# Distribution of the number of aliases per person (ten most common sizes).
aka_sizes = tmdb_df["also_known_as"].apply(len)
aka_sizes.value_counts().head(10)

0    266745
1     22071
2      9502
3      4725
4      2686
5      1533
6       897
7       569
8       411
9       319
Name: also_known_as, dtype: int64

In [19]:
# Five most frequent biography texts.
biography_counts = tmdb_df["biography"].value_counts()
biography_counts.head(5)

                                                                                      273308
Professional mixed martial artist formerly competing in the UFC.                          23
Professional mixed martial artist competing in the UFC Welterweight division.             10
Professional mixed martial artist competing in the UFC Light Heavyweight division.         9
Professional mixed martial artist competing in the UFC Middleweight division.              8
Name: biography, dtype: int64

In [20]:
# Distribution of biography length in characters (five most common lengths).
biography_lengths = tmdb_df["biography"].apply(len)
biography_lengths.value_counts().head(5)

0     273308
27       121
26       120
25       116
28       111
Name: biography, dtype: int64

In [23]:
# Ten most frequent birthday strings.
birthday_counts = tmdb_df["birthday"].value_counts()
birthday_counts.head(10)

2018-05-01    217
1964-01-01    129
1970-01-01    118
1967-01-01    116
1968-01-01    116
1971-01-01    115
1965-01-01    109
1974-01-01    108
1969-01-01    103
1966-01-01     99
Name: birthday, dtype: int64

In [31]:
# Fifty most frequent deathday strings.
deathday_counts = tmdb_df["deathday"].value_counts()
deathday_counts.head(n=50)

2018-05-01    99
1995-01-01    18
1993-01-01    16
1982-01-01    14
2006-01-01    14
1999-01-01    14
1989-01-01    14
1977-01-01    13
2003-01-01    13
2002-01-01    12
2009-01-01    12
1983-01-01    12
1997-01-01    11
1971-01-01    11
1996-01-01    11
1987-01-01    11
1990-01-01    11
1972-01-01    10
2004-01-01    10
2012-01-01    10
2001-01-01    10
1998-01-01     9
1988-01-01     9
2005-01-01     9
1976-01-01     9
1986-01-01     9
1981-01-01     8
1992-01-01     8
1978-01-01     8
2003-04-01     8
1994-01-01     8
2002-12-22     8
1980-01-01     8
2008-10-11     7
1975-01-01     7
1968-01-01     7
1967-01-01     7
1974-01-01     7
1961-01-01     7
1984-01-01     7
1992-11-04     7
2012-07-27     7
1985-01-01     7
1991-01-01     7
2013-01-01     7
1979-01-01     7
1979-11-30     7
1942-01-01     6
2016-03-11     6
2013-03-05     6
Name: deathday, dtype: int64

In [34]:
# Frequency of each gender code.
gender_counts = tmdb_df["gender"].value_counts()
gender_counts



0    201068
2     72825
1     36410
Name: gender, dtype: int64

In [39]:
# Ten most frequent imdb_id values - anything above 1 is a blank or a duplicate.
imdb_id_counts = tmdb_df["imdb_id"].value_counts()
imdb_id_counts.head(10)

             36385
nm2604321        2
nm2214307        2
nm0024825        1
nm0005137        1
nm0387075        1
nm0474340        1
nm3316569        1
nm4055169        1
nm0052186        1
Name: imdb_id, dtype: int64

In [40]:
# Frequency of every known_for_department category.
department_counts = tmdb_df["known_for_department"].value_counts()
department_counts

Acting               195870
Writing               29188
Directing             26256
Production            22021
Sound                  9509
Editing                6165
Art                    5359
Camera                 5246
Crew                   4689
Costume & Make-Up      3727
Visual Effects         1548
Lighting                502
Creator                 128
Actors                    7
Name: known_for_department, dtype: int64

In [42]:
# Five most frequent person names.
name_counts = tmdb_df["name"].value_counts()
name_counts.head(5)

Someone         10
.                9
~                8
Justin Smith     7
Robert Smith     7
Name: name, dtype: int64

In [45]:
# Ten most frequent place_of_birth strings.
birthplace_counts = tmdb_df["place_of_birth"].value_counts()
birthplace_counts.head(10)

New York City, New York, USA              1293
Los Angeles, California, USA               860
London, England, UK                        677
Berlin, Germany                            668
Paris, France                              627
Chicago, Illinois, USA                     466
Rio de Janeiro, Rio de Janeiro, Brazil     332
USA                                        303
Tokyo, Japan                               283
Brooklyn, New York, USA                    282
Name: place_of_birth, dtype: int64

In [48]:
# Five most frequent popularity scores.
popularity_counts = tmdb_df["popularity"].value_counts()
popularity_counts.head(5)

0.60    252809
0.98     10033
1.40      3768
0.84      2976
1.38      2953
Name: popularity, dtype: int64

In [50]:
# Frequency of every profile image path; counts above 1 mean the same image
# path is attached to more than one row.
profile_path_counts = tmdb_df["profile_path"].value_counts()
profile_path_counts

/azRn7U2RKTkB9cHBO4GwJZm2jxy.jpg    3
/zUqyn3aQXTzeP1n8yd8Udt1twYA.jpg    3
/67ezLWzb4bV3Z6IO3iCEgAFacqW.jpg    2
/o8WiseagTKA5oEsFpM3rqhLDGsB.jpg    2
/j9kWyurXkp7pimeAeL2WLbRhBsB.jpg    2
/knXPd2RBIqCUWx6fpF2K07MpAK7.jpg    2
/uS4a3epqXVtjTUGRR37zG0yOQFS.jpg    2
/pqe7uwiUrPXruIuYefjO5MOtEFI.jpg    2
/wli1LESBUnNeDLVxroDsGwXRyus.jpg    2
/iMsbRept9OMnKB6zLHYHgED6iyv.jpg    2
/4ANjlx0mGA62JMsBGnwkk4fJQwx.jpg    2
/9MRjrm1nmPxHpWVjaEiG6529TGn.jpg    2
/l1A084aEevlg7OB2iLZUnfshIzu.jpg    2
/sZEItOgZ2kC0cWgCf9Q1LHPgN6p.jpg    2
/4sz8raVJXylbix14CQTyXXhHXa7.jpg    2
/7MLLqCXPFEF9PwnV34CsngPyPnq.jpg    2
/8ZbiPeahSuO81Hzm50azfRWQNWY.jpg    2
/kZHs1MFzLSHiiDAA1lVBZkGSWko.jpg    2
/qTXPRqbpcVgBYiCiRhplaIaIqlq.jpg    2
/hNtCcQHLnZvgREL6vc04CUCiiS3.jpg    2
/bNKAo8IdtMP6Fd85MfXsKoED7Bb.jpg    2
/cQDfW2prR2JKnq5XSTHMEFnxAY4.jpg    2
/n37JEygdq9JMB64hwnDiSOalWTY.jpg    2
/73a9ypM4CS08rL4v3YwazKOaRIR.jpg    2
/xpRC1aZcvYkpvoDat8cdlcDYE6L.jpg    2
/4cTskb3GFtQ2uuS75Fs1kyeYCO8.jpg    2
/vXC9xIRPuI6

In [11]:
# Missing-value count per column. Same expression as an earlier cell, but the
# saved counts differ - NOTE(review): presumably run against a different
# version of tmdb_df; confirm which load each output belongs to.
tmdb_df.isnull().sum()

adult                        0
also_known_as                0
biography                    0
birthday                230229
credits                      0
deathday                284814
external_ids                 0
gender                       0
homepage                302709
id                           0
images                       0
imdb_id                      0
known_for_department        88
name                         0
place_of_birth          247206
popularity                   0
profile_path            218227
dtype: int64

In [9]:
# Column dtypes of the person-level frame.
tmdb_df.dtypes

adult                      bool
also_known_as            object
biography                object
birthday                 object
credits                  object
deathday                 object
external_ids             object
gender                    int64
homepage                 object
id                        int64
images                   object
imdb_id                  object
known_for_department     object
name                     object
place_of_birth           object
popularity              float64
profile_path             object
dtype: object

In [28]:
# Number of 'cast' and 'crew' credit entries for the row labelled 1.
credits_row1 = tmdb_df.loc[1, "credits"]
len(credits_row1['cast']), len(credits_row1['crew'])

(74, 135)

In [33]:
# Keys of the external_ids dictionary for the row labelled 1.
tmdb_df.loc[1,"external_ids"].keys()

dict_keys(['freebase_id', 'instagram_id', 'tvrage_id', 'twitter_id', 'freebase_mid', 'imdb_id', 'facebook_id'])

In [None]:
#######################################################

In [None]:
# AWARDS

In [52]:
# Display the folder path that the awards CSV files are read from below.
Dataset_directory + "350-000-movies-from-themoviedborg" + os.sep

'..\\Datasets\\350-000-movies-from-themoviedborg\\'

In [55]:
# Load the awards-by-director table from the awards dataset folder.
awards_dir = Dataset_directory + "350-000-movies-from-themoviedborg" + os.sep
df = pd.read_csv(awards_dir + "220k_awards_by_directors.csv")

In [57]:
# Summary statistics for every column of the awards-by-director table
# (include='all' covers the text columns as well as the numeric `year`).
df.describe(include='all')
# director_name: complete, 29503 distinct directors
# ceremony: complete, 3428 distinct ceremonies
# year: 1925-2018
# category: 8286 distinct values; 1 missing (count is 225674 vs 225675 for the other columns)
# outcome: Nominated, Won, 2nd place, 3rd place (~110k, ~110k, ~3k, ~1k - weird distribution!)
# original_language: 93 distinct languages, English ('en') is the most frequent

Unnamed: 0,director_name,ceremony,year,category,outcome,original_language
count,225675,225675,225675.0,225674,225675,225675
unique,29503,3428,,8286,4,93
top,Steven Spielberg,Primetime Emmy Awards,,Audience Award,Nominated,en
freq,331,5660,,8002,110694,154317
mean,,,2002.827327,,,
std,,,14.054848,,,
min,,,1925.0,,,
25%,,,1998.0,,,
50%,,,2007.0,,,
75%,,,2013.0,,,


In [58]:
# Frequency of each award outcome.
outcome_counts = df["outcome"].value_counts()
outcome_counts

Nominated    110694
Won          110456
2nd place      3226
3rd place      1299
Name: outcome, dtype: int64

In [61]:
# Ten most frequent award categories.
category_counts = df["category"].value_counts()
category_counts.head(10)

Audience Award      8002
Primetime Emmy      5653
Jury Award          3819
Oscar               3455
Grand Jury Prize    2648
Festival Prize      2287
Golden Globe        2272
Best Film           2200
Gold Hugo           2074
Grand Prix          1968
Name: category, dtype: int64

In [66]:
# Load the acclaimed-directors awards table; this file is semicolon-separated.
# NOTE: this rebinds `df`, replacing the awards-by-director frame loaded earlier.
acclaimed_csv = Dataset_directory + "350-000-movies-from-themoviedborg" + os.sep + "900_acclaimed_directors_awards.csv"
df = pd.read_csv(acclaimed_csv, sep=";")


In [68]:
# Summary statistics for every column of the acclaimed-directors table
# (text and numeric columns alike).
df.describe(include='all')

Unnamed: 0,name,tmdbID,imdbID,Total awards,Academy Fellowship,Academy Fellowship_won,Academy Fellowship_nominated,Gold Hugo,Gold Hugo_won,Gold Hugo_nominated,...,Orpheus Award_nominated,Arte France Cinéma Award,Arte France Cinéma Award_won,Arte France Cinéma Award_nominated,Citizens Award,Citizens Award_won,Citizens Award_nominated,Cowboy Award,Cowboy Award_won,Cowboy Award_nominated
count,893,893.0,893,893.0,893.0,893.0,893.0,893.0,893.0,893.0,...,893.0,893.0,893.0,893.0,893.0,893.0,893.0,893.0,893.0,893.0
unique,893,,893,,,,,,,,...,,,,,,,,,,
top,Roy Rowland,,nm0896533,,,,,,,,...,,,,,,,,,,
freq,1,,1,,,,,,,,...,,,,,,,,,,
mean,,167334.1,,32.447928,0.025756,0.025756,0.0,0.303471,0.023516,0.279955,...,0.0,0.00112,0.00112,0.0,0.00112,0.00112,0.0,0.00112,0.00112,0.0
std,,339837.3,,43.371128,0.165417,0.165417,0.0,0.774278,0.158843,0.73051,...,0.0,0.033464,0.033464,0.0,0.033464,0.033464,0.0,0.033464,0.033464,0.0
min,,7.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,11720.0,,6.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,40199.0,,17.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,,102561.0,,44.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
# ~900 top directors (893 rows), one row per director, with columns:
# name, tmdbID, imdbID, Total awards
# 2224 triplets (<award>(total),<award>_won,<award>_nominated)
# total sum, not just dummied (you can be nominated for two Gold Hugos)

Unnamed: 0,name,tmdbID,imdbID,Total awards,Academy Fellowship,Academy Fellowship_won,Academy Fellowship_nominated,Gold Hugo,Gold Hugo_won,Gold Hugo_nominated,...,Orpheus Award_nominated,Arte France Cinéma Award,Arte France Cinéma Award_won,Arte France Cinéma Award_nominated,Citizens Award,Citizens Award_won,Citizens Award_nominated,Cowboy Award,Cowboy Award_won,Cowboy Award_nominated
0,Ingmar Bergman,6648,nm0000005,101,1,1,0,2,0,2,...,0,0,0,0,0,0,0,0,0,0
1,Marlon Brando,3084,nm0000008,58,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Federico Fellini,4415,nm0000019,90,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,Alfred Hitchcock,2636,nm0000033,53,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Buster Keaton,8635,nm0000036,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Stanley Kubrick,240,nm0000040,80,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Akira Kurosawa,5026,nm0000041,80,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Bruce Lee,19429,nm0000045,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Laurence Olivier,3359,nm0000059,70,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,François Truffaut,1650,nm0000076,54,0,0,0,3,0,3,...,0,0,0,0,0,0,0,0,0,0


In [71]:
# Column names of the acclaimed-directors table.
df.columns

Index(['name', 'tmdbID', 'imdbID', 'Total awards', 'Academy Fellowship',
       'Academy Fellowship_won', 'Academy Fellowship_nominated', 'Gold Hugo',
       'Gold Hugo_won', 'Gold Hugo_nominated',
       ...
       'Orpheus Award_nominated', 'Arte France Cinéma Award',
       'Arte France Cinéma Award_won', 'Arte France Cinéma Award_nominated',
       'Citizens Award', 'Citizens Award_won', 'Citizens Award_nominated',
       'Cowboy Award', 'Cowboy Award_won', 'Cowboy Award_nominated'],
      dtype='object', length=6676)

In [72]:
# Column count reported by df.columns for the acclaimed-directors table.
n_total_columns = 6676
# Non-award columns: name, tmdbID, imdbID, Total awards.
n_id_columns = 4
# Each award contributes three columns (<award>, <award>_won, <award>_nominated).
(n_total_columns - n_id_columns) / 3

2224.0