Jamison Hunter

June 29, 2023

# IMDB Data Analysis & Recommendations

In [1]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import tmdbsimple as tmdb
import json
import time
from tqdm import tqdm_notebook

In [2]:
def get_movie_with_rating(movie_id):
    """Copied from Coding Dojo Learning Platform"""
    # Get the movie object for the current id
    movie = tmdb.Movies(movie_id)
    # save the .info .releases dictionaries
    info = movie.info()
    releases = movie.releases()
    # Loop through countries in releases
    for c in releases['countries']:
        # if the country abbreviation==US
        if c['iso_3166_1' ] =='US':
            ## save a "certification" key in the info dict with the certification
            info['certification'] = c['certification']
    
    return info

In [3]:
def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""  
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [4]:
# defining urls as variables
# Source: https://datasets.imdbws.com/
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"
ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"

In [5]:
dfb = pd.read_csv(basics_url, sep='\t', low_memory=False)

In [6]:
dfa = pd.read_csv(akas_url, sep='\t', low_memory=False)

In [7]:
dfr = pd.read_csv(ratings_url, sep='\t', low_memory=False)

In [8]:
dfb.info()
dfb.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9995235 entries, 0 to 9995234
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 686.3+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [9]:
dfa.info()
dfa.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36508033 entries, 0 to 36508032
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.2+ GB


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [10]:
dfr.info()
dfr.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1328584 entries, 0 to 1328583
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1328584 non-null  object 
 1   averageRating  1328584 non-null  float64
 2   numVotes       1328584 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.4+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1986
1,tt0000002,5.8,265
2,tt0000003,6.5,1845
3,tt0000004,5.5,178
4,tt0000005,6.2,2627


In [11]:
# replacing \N values with np.nan for all data frames loaded so far
dfb = dfb.replace({'\\N':np.nan})
dfa = dfa.replace({'\\N':np.nan})
dfr = dfr.replace({'\\N':np.nan})

In [12]:
# Eliminate movies that are null for runtimeMinute, genres, and startYear
dfb = dfb.dropna(subset=['runtimeMinutes','genres','startYear'])
dfb['startYear'] = dfb['startYear'].astype(float)
# keeping startYear 2000-2022
dfb = dfb[(dfb['startYear']>=2000)&(dfb['startYear']<2022)]
# allowing only Movies in the data frame
dfb.drop(dfb.loc[dfb['titleType']!="movie"].index, inplace=True)

In [13]:
# Exclude movies that are included in the documentary category.
is_documentary = dfb['genres'].str.contains('documentary',case=False)
dfb = dfb[~is_documentary]

In [14]:
# Filter the basics table down to only include the US by using the filter akas dataframe
df_us = dfa.drop(columns = ["ordering", "title", "language", "types", "attributes", "isOriginalTitle"])
df_us = df_us[~(df_us["region"] != "US")]
keepers = dfb['tconst'].isin(df_us['titleId'])
dfb = dfb[keepers]
# stating the number of rows and columns in the filtered data set
print(f"This data set has {dfb.shape[0]} rows and {dfb.shape[1]} columns.")
dfb.head()

This data set has 81869 rows and 9 columns.


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61115,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67667,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86794,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93931,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


Next I will filter the akas data frame in order to only include US movies.

In [15]:
# filtering akas frame to only include US films
dfa = dfa[~(dfa["region"] != "US")]
dfa["region"].unique()

array(['US'], dtype=object)

I will do the same with ratings so that only US movies have ratings available.

In [16]:
df_us = dfa.drop(columns = ["ordering", "title", "language", "types", "attributes", "isOriginalTitle"])
df_us = df_us[~(df_us["region"] != "US")]
keepers = dfr['tconst'].isin(df_us['titleId'])
dfr = dfr[keepers]
# stating the number of rows and columns in the filtered data set
print(f"This data set has {dfr.shape[0]} rows and {dfr.shape[1]} columns.")
dfr.head()

This data set has 503181 rows and 3 columns.


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1986
1,tt0000002,5.8,265
4,tt0000005,6.2,2627
5,tt0000006,5.1,182
6,tt0000007,5.4,820


In [17]:
# example making new folder with os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

['.ipynb_checkpoints',
 'final_results_indianapolis_pizza.csv.gz',
 'hero_information.csv',
 'results_in_progress_indianapolis_pizza.json',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz']

In [18]:
dfb.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61115,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67667,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86794,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93931,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [19]:
## Save current dataframe to file.
dfb.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [20]:
dfa.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [21]:
## Save current dataframe to file.
dfa.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [22]:
dfr.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1986
1,tt0000002,5.8,265
4,tt0000005,6.2,2627
5,tt0000006,5.1,182
6,tt0000007,5.4,820


In [23]:
## Save current dataframe to file.
dfr.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

I will confirm that the data frames saved properly and can be accessed. 

In [24]:
# Open saved file and preview again
dfb = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
dfb.info()
dfb.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81869 entries, 0 to 81868
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          81869 non-null  object 
 1   titleType       81869 non-null  object 
 2   primaryTitle    81869 non-null  object 
 3   originalTitle   81869 non-null  object 
 4   isAdult         81869 non-null  int64  
 5   startYear       81869 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  81869 non-null  int64  
 8   genres          81869 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 5.6+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [25]:
# Open saved file and preview again
dfa = pd.read_csv("Data/title_akas.csv.gz", low_memory = False)
dfa.info()
dfa.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1450671 entries, 0 to 1450670
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1450671 non-null  object 
 1   ordering         1450671 non-null  int64  
 2   title            1450671 non-null  object 
 3   region           1450671 non-null  object 
 4   language         3997 non-null     object 
 5   types            981125 non-null   object 
 6   attributes       46954 non-null    object 
 7   isOriginalTitle  1449329 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 88.5+ MB


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [26]:
# Open saved file and preview again
dfr = pd.read_csv("Data/title_ratings.csv.gz", low_memory = False)
dfr.info()
dfr.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503181 entries, 0 to 503180
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         503181 non-null  object 
 1   averageRating  503181 non-null  float64
 2   numVotes       503181 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 11.5+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1986
1,tt0000002,5.8,265
2,tt0000005,6.2,2627
3,tt0000006,5.1,182
4,tt0000007,5.4,820


# Gathering Revenue With TMDB

In [27]:
with open('/Users/Jamison/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
# displaying json dictionary keys for the TMDB API
login.keys()

dict_keys(['api-token', 'api-key'])

In [28]:
tmdb.API_KEY =  login['api-key']

In [29]:
# referencing previous Data folder
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['.ipynb_checkpoints',
 'final_results_indianapolis_pizza.csv.gz',
 'hero_information.csv',
 'results_in_progress_indianapolis_pizza.json',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz']

In [30]:
# Load in the dataframe from project part 1 as basics:
basics = pd.read_csv("Data/title_basics.csv.gz")

In [31]:
YEARS_TO_GET = [2000,2001]
errors = [ ]
# Start of OUTER loop
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):

    #Defining the JSON file to store results for year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'

    # Check if file exists
    file_exists = os.path.isfile(JSON_FILE)

    # If it does not exist: create it
    if file_exists == False:
    # save an empty dict with just "imdb_id" to the new json file.
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)

    #Saving new year as the current df
    df = basics.loc[ basics['startYear']==YEAR].copy()
    # saving movie ids to list
    movie_ids = df['tconst'].copy()

    # Load existing data from json into a dataframe called "previous_df"
    previous_df = pd.read_json(JSON_FILE)

    # filter out any ids that are already in the JSON_FILE
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    #Get index and movie id from list
    # INNER Loop
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            # Retrieve then data for the movie id
            temp = get_movie_with_rating(movie_id)  
            # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
            
        except Exception as e:
            errors.append([movie_id, e])

    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

print(f"- Total errors: {len(errors)}")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):


YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for movie_id in tqdm_notebook(movie_ids_to_get,


Movies from 2000:   0%|          | 0/1456 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for movie_id in tqdm_notebook(movie_ids_to_get,


Movies from 2001:   0%|          | 0/1577 [00:00<?, ?it/s]

- Total errors: 441


In [34]:
pd.read_csv(r"Data/final_tmdb_data_2000.csv.gz").head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,
3,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,4.0,1.0,
4,tt0118694,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,"{'id': 1131062, 'name': 'Wong Kar-Wai’s Love T...",150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843.0,cn,花樣年華,...,14204632.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.112,2252.0,PG


In [35]:
pd.read_csv(r"Data/final_tmdb_data_2001.csv.gz").head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0035423,0.0,/hfeiSfWYujh6MKhtGTXyK3DD4nN.jpg,,48000000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 14, ...",,11232.0,en,Kate & Leopold,...,76019048.0,118.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,"If they lived in the same century, they'd be p...",Kate & Leopold,0.0,6.33,1201.0,PG-13
2,tt0114447,0.0,,,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",,151007.0,en,The Silent Force,...,0.0,90.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,They left him for dead... They should have fin...,The Silent Force,0.0,5.0,3.0,
3,tt0116916,0.0,/rFpHBidSlhjflmnLu7BZilyKeQR.jpg,,0.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,73549.0,en,The Dark Mist,...,0.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,The Dark Mist,0.0,3.5,2.0,PG
4,tt0118589,0.0,/9NZAirJahVilTiDNCHLFcdkwkiy.jpg,,22000000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",,10696.0,en,Glitter,...,5271666.0,104.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,"In music she found her dream, her love, herself.",Glitter,0.0,4.449,128.0,PG-13
