# Data Enrichment Project 3 

# Part 1: Preprocessing

In [14]:
# Impors
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
basics_url= "https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url= "https://datasets.imdbws.com/title.ratings.tsv.gz"
aka_url= "https://datasets.imdbws.com/title.akas.tsv.gz"

In [3]:
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)
aka = pd.read_csv(aka_url, sep='\t', low_memory=False)

In [4]:
# Mount and loading: Akas Dataset

In [5]:
aka.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [6]:
# Checking for duplicates
aka.duplicated().sum()

0

In [7]:
# Identify and address missing values 
aka.isna().sum()

titleId              0
ordering             0
title                5
region             104
language             0
types                0
attributes           0
isOriginalTitle      0
dtype: int64

In [8]:
aka = aka[(aka['region']=='US')]

In [15]:
## Replace "\N" with np.nan.
aka.replace({'\\N':np.nan},inplace=True)

In [16]:
aka.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [17]:
# Mount and loading: Basics Dataset

In [18]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [19]:
# Checking for duplicates
basics.duplicated().sum()

0

In [20]:
# Identify and address missing values 
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

In [21]:
 # Replace "\N" with np.nan.
basics.replace({'\\N':np.nan},inplace=True)

In [22]:
# Eliminate movies that are null for runtimeMinute & genres.
basics = basics.dropna(subset=['runtimeMinutes', 'genres'])

In [23]:
# Keep only titleType==Movie.
basics = basics[(basics['titleType']=='movie')]

In [24]:
# Convert startyear to numeric for slicing.
basics['startYear'] = basics['startYear'].astype(float)

In [25]:
basics = basics[(basics['startYear']>=2000)&(basics['startYear']<2022)]

In [26]:
# Eliminate movies that include "Documentary" in genre.
is_documentary = basics['genres'].str.contains('Documentary',case=False)
basics = basics[~is_documentary]

In [27]:
# Keep only US movies.
# Create the filter.
keep_US_movies = basics['tconst'].isin(aka['titleId'])
# Apply the filter to the dataset.
basics = basics[keep_US_movies]

In [28]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61094,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
67640,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86771,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93907,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [29]:
# Mount and loading: Ratings Dataset

In [30]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1910
1,tt0000002,5.8,257
2,tt0000003,6.5,1715
3,tt0000004,5.6,169
4,tt0000005,6.2,2530


In [31]:
# Checking for duplicates
ratings.duplicated().sum()

0

In [32]:
# Identify and address missing values 
ratings.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [33]:
# Replace "\N" with np.nan
ratings.replace({'\\N':np.nan}, inplace=True)

In [34]:
# Keep only US movies.
# Create the filter.
keep_US_movies = ratings['tconst'].isin(aka['titleId'])
# Apply the filter to the dataset.
ratings = ratings[keep_US_movies]

In [35]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1910
1,tt0000002,5.8,257
4,tt0000005,6.2,2530
5,tt0000006,5.1,173
6,tt0000007,5.4,789


In [36]:
aka.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1353273 entries, 5 to 33280687
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1353273 non-null  object
 1   ordering         1353273 non-null  int64 
 2   title            1353273 non-null  object
 3   region           1353273 non-null  object
 4   language         3707 non-null     object
 5   types            964056 non-null   object
 6   attributes       45061 non-null    object
 7   isOriginalTitle  1351898 non-null  object
dtypes: int64(1), object(7)
memory usage: 92.9+ MB


In [37]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79671 entries, 34792 to 9248215
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          79671 non-null  object 
 1   titleType       79671 non-null  object 
 2   primaryTitle    79671 non-null  object 
 3   originalTitle   79671 non-null  object 
 4   isAdult         79671 non-null  object 
 5   startYear       79671 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  79671 non-null  object 
 8   genres          79671 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.1+ MB


In [38]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 477310 entries, 0 to 1261713
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         477310 non-null  object 
 1   averageRating  477310 non-null  float64
 2   numVotes       477310 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 14.6+ MB


In [39]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [40]:
## Save current dataframe to file.
aka.to_csv("Data/title_aka.csv.gz",compression='gzip',index=False)



In [41]:
## Save current dataframe to file.
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)



#  Part 2 Extract from TMDB

In [4]:
# Use an API to extract box office revenue and profit data to add to your IMDB data
# Perform exploratory data analysis.

In [5]:
import json
with open('/Users/lbodden/Documents/Data Enrichment/Project-3/Movie_APY.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()


dict_keys(['api-key'])

In [6]:
import tmdbsimple as tmdb
tmdb.API_KEY =  login['api-key']

In [18]:
## make a movie object using the .Movies function from tmdb
movie = tmdb.Movies()


In [19]:
## movie objects have a .info dictionary 
info = movie.info()
info

HTTPError: 404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/0?api_key=e6d443552e5193ad3e0a25f685877089

In [9]:
info['budget']

NameError: name 'info' is not defined

In [119]:
info['revenue']

132000000

In [120]:
info['imdb_id']

'tt1361336'

In [125]:
movie = tmdb.Movies('tt0848228')
info = movie.info()
info['budget']

220000000

In [129]:
response = movie.releases()
for c in movie.countries:
    if c['iso_3166_1'] == 'US':
        print(c['certification'])

PG-13


In [137]:
# Get the movie object for the current id
movie = tmdb.Movies('tt0848228')
# save the .info .releases dictionaries
info = movie.info()
releases = movie.releases()
# Loop through countries in releases
for c in releases['countries']:
    # if the country abbreviation==US
    if c['iso_3166_1' ] =='US':
        ## save a "certification" key in the info dict with the certification
       info['certification'] = c['certification']

In [135]:
movie2 = tmdb.Movies('tt0332280')
info2 = movie.info()
info2['budget']

29000000

In [130]:
# Get the movie object for the current id
movie = tmdb.Movies('tt0332280')
# save the .info .releases dictionaries
info = movie.info()
releases = movie.releases()
# Loop through countries in releases
for c in releases['countries']:
    # if the country abbreviation==US
    if c['iso_3166_1' ] =='US':
        ## save a "certification" key in the info dict with the certification
       info['certification'] = c['certification']

In [132]:
# Summary between both movies.
test_ids = ['tt0848228', 'tt0332280']
results = []

for movie_id in test_ids:  
    try:
        movie_info = get_movie_certification(movie_id)
        results.append(movie_info)        
    except: 
        pass
    
pd.DataFrame(results)

In [106]:
import os, time,json
import tmdbsimple as tmdb 
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['title_basics.csv.gz', 'title_aka.csv.gz', 'title_ratings.csv.gz']

In [107]:
def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""  
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)