# Movie Success Analysis

Author: Lerato Matlala

## Data Loading

***Import Libraries***

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import os

***Load Data***

In [None]:
# Read in title Basics.tsv
basics_url ="https://datasets.imdbws.com/title.basics.tsv.gz"

title_basic_df = pd.read_csv(basics_url, sep='\t', low_memory=False)
title_basic_df.head()

In [None]:
# Read in title.akas.tsv
title_akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
title_akas_df = pd.read_csv(title_akas_url, sep='\t', low_memory=False)
title_akas_df.head()

In [None]:
# Read in title.akas.tsv
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
title_ratings_df = pd.read_csv(ratings_url, sep='\t', low_memory=False)
title_ratings_df.head()

## Data Cleaning

### ***Handling \N Placeholder Values***

In [None]:
#Count null values in title basic before handling \N placeholder values
title_basic_df.isnull().sum()

In [None]:
#Count null values in title akas before handling \N placeholder values
title_akas_df.isnull().sum()

In [None]:
#Count null values in title ratings before handling \N placeholder values
title_ratings_df.isnull().sum()

In [None]:
# Replace '\N' with np.nan for each DataFrame 
title_basic_df.replace({'\\N': np.nan}, inplace=True)
title_akas_df.replace({'\\N': np.nan}, inplace=True)
title_ratings_df.replace({'\\N': np.nan}, inplace=True)

In [None]:
#Count null values in title basic after handling \N placeholder values
title_basic_df.isnull().sum()

In [None]:
#Count null values in title akas after handling \N placeholder values
title_akas_df.isnull().sum()

In [None]:
#Count null values in title ratings after handling \N placeholder values
title_ratings_df.isnull().sum()

### ***Filter Out Movies***

In [None]:
# title_basic_df Dataframe shape before filtering out movies
title_basic_df.shape

In [None]:
#title_akas_df Dataframe shape before filtering out movies
title_akas_df.shape

In [None]:
# title_ratings_df Dataframe shape before filtering out movies
title_ratings_df.shape

-  **Exclude any movie with missing values for genre or runtime**

In [None]:
# Remove rows where 'genres' or 'runtimeMinutes' is missing
filtered_title_basic_df = title_basic_df.dropna(subset=['genres', 'runtimeMinutes'])

# Collect the IDs of the filtered rows
filtered_tconst = filtered_title_basic_df['tconst'].tolist()

# Filter corresponding rows in title_akas_df and title_ratings_df based on the collected IDs
filtered_title_akas_df = title_akas_df[title_akas_df['titleId'].isin(filtered_tconst)]
filtered_title_ratings_df = title_ratings_df[title_ratings_df['tconst'].isin(filtered_tconst)]

In [None]:
# title_basic_df Dataframe shape after filtering out movies with missing values for genre or runtime
filtered_title_basic_df.shape

In [None]:
# title_akas_df Dataframe shape after filtering out movies with missing values for genre or runtime
filtered_title_akas_df.shape

In [None]:
#title_ratings_df Dataframe shape after filtering out movies with missing values for genre or runtime
filtered_title_ratings_df.shape

- **Include only full-length movies (titleType = "movie")**

In [None]:
# Include only rows where 'titleType' is 'movie'
filtered_title_basic_df = filtered_title_basic_df[filtered_title_basic_df['titleType'] == 'movie']

# Collect the IDs of the filtered rows
filtered_tconst = filtered_title_basic_df['tconst'].tolist()

# Filter corresponding rows in title_akas_df and title_ratings_df based on the collected IDs
filtered_title_akas_df = title_akas_df[title_akas_df['titleId'].isin(filtered_tconst)]
filtered_title_ratings_df = title_ratings_df[title_ratings_df['tconst'].isin(filtered_tconst)]


In [None]:
# filtered_title_basic_df Dataframe shape after including only rows where 'titleType' is 'movie'
filtered_title_basic_df.shape

In [None]:
#filtered_title_akas_df Dataframe shape after including only rows where 'titleType' is 'movie'
filtered_title_akas_df.shape

In [None]:
#filtered_title_ratings_df Dataframe shape after including only rows where 'titleType' is 'movie'
filtered_title_ratings_df.shape

- **Include only fictional movies (not from documentary genre)**

In [None]:
# Include only rows where 'genres' does not contain 'Documentary'
filtered_title_basic_df = filtered_title_basic_df[(~filtered_title_basic_df['genres'].str.contains('Documentary',case=False))]

# Collect the IDs of the filtered rows
filtered_tconst = filtered_title_basic_df['tconst'].tolist()

# Filter corresponding rows in title_akas_df and title_ratings_df based on the collected IDs
filtered_title_akas_df = title_akas_df[title_akas_df['titleId'].isin(filtered_tconst)]
filtered_title_ratings_df = title_ratings_df[title_ratings_df['tconst'].isin(filtered_tconst)]


In [None]:
#filtered_title_basic_df Dataframe shape after including only rows where 'genres' does not contain 'Documentary'
filtered_title_basic_df.shape

In [None]:
#filtered_title_akas_df Dataframe shape after including only rows where 'genres' does not contain 'Documentary'
filtered_title_akas_df.shape

In [None]:
#filtered_title_ratings_df Dataframe shape after including only rows where 'genres' does not contain 'Documentary'
filtered_title_ratings_df.shape

- **Include only movies that were released 2000 - 2021 (include 2000 and 2021)**

In [None]:
# Convert 'startYear' to numeric (integers) if it's in object format
filtered_title_basic_df['startYear'] = pd.to_numeric(filtered_title_basic_df['startYear'], errors='coerce')

# Filter 'startYear' values between 2000 and 2022
filtered_title_basic_df = filtered_title_basic_df[(filtered_title_basic_df['startYear'] >= 2000) & (filtered_title_basic_df['startYear'] <= 2022)]


# Include only rows where 'startYear' is either 2000 or 2001
#filtered_title_basic_df = filtered_title_basic_df[((filtered_title_basic_df['startYear'] == '2000') | (filtered_title_basic_df['startYear'] == '2001'))].copy()

# Collect the IDs of the filtered rows
filtered_tconst = filtered_title_basic_df['tconst'].tolist()

# Filter corresponding rows in title_akas_df and title_ratings_df based on the collected IDs
filtered_title_akas_df = title_akas_df[title_akas_df['titleId'].isin(filtered_tconst)]
filtered_title_ratings_df = title_ratings_df[title_ratings_df['tconst'].isin(filtered_tconst)]

In [None]:
#filtered_title_basic_df Dataframe shape after including only rows where 'startYear' is between 2000 and 2022
filtered_title_basic_df.shape

In [None]:
#filtered_title_akas_df Dataframe shape after including only rows where 'startYear' is between 2000 and 2022
filtered_title_akas_df.shape

In [None]:
#filtered_title_ratings_df Dataframe shape after including only rows where 'startYear' is between 2000 and 2022
filtered_title_ratings_df.shape

- **Include only movies that were released in the United States**

In [None]:
# Include only rows where 'country' is 'USA'
filtered_title_akas_df = filtered_title_akas_df[filtered_title_akas_df['region'] == 'US']

# Collect the IDs of the filtered rows
filtered_titleId = filtered_title_akas_df['titleId'].tolist()

# Filter corresponding rows in filtered_title_basic_df and filtered_title_ratings_df based on the collected IDs
filtered_title_basic_df = filtered_title_basic_df[filtered_title_basic_df['tconst'].isin(filtered_titleId)]
filtered_title_ratings_df = filtered_title_ratings_df[filtered_title_ratings_df['tconst'].isin(filtered_titleId)]


In [None]:
#filtered_title_basic_df Dataframe shape after filtering out movies that were released in the US
filtered_title_basic_df.shape

In [None]:
#filtered_title_akas_df Dataframe shape after filtering out movies that were released in the US
filtered_title_akas_df.shape

In [None]:
#filtered_title_ratings_df Dataframe shape after filtering out movies that were released in the US
filtered_title_ratings_df.shape

### ***Dataframes Summaries***

**title_basic_df**

In [None]:
filtered_title_basic_df.info()

**title_akas_df**

In [None]:
filtered_title_akas_df.info()

**title_ratings_df**

In [None]:
filtered_title_ratings_df.info()

### ***Save Dataframes***

In [None]:
#Create Data folder
os.makedirs('Data/',exist_ok=True) 

# Confirm folder created
os.listdir("Data/")

In [None]:
# Save filtered_title_basic_df DataFrame to a compressed CSV file in the "Data/" folder
filtered_title_basic_df.to_csv('Data/filtered_title_basic.csv.gz', index=False, compression='gzip')

In [None]:
# Save filtered_title_akas_df DataFrame to a compressed CSV file in the "Data/" folder
filtered_title_akas_df.to_csv('Data/filtered_title_akas.csv.gz', index=False, compression='gzip')

In [None]:
# Save filtered_title_ratings_df DataFrame to a compressed CSV file in the "Data/" folder
filtered_title_ratings_df.to_csv('Data/filtered_title_ratings.csv.gz', index=False, compression='gzip')

***Project 3 - Part 2 Extract from TMDB***

In [10]:
#Import Libraries
import os, time,json
import tmdbsimple as tmdb 
from tqdm.notebook import tqdm_notebook
import pandas as pd
import glob
import locale

In [11]:
#Loading TMDB API Key from JSON File and Setting it for TMDb API Access
with open('/Users/leratomatlala/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

tmdb.API_KEY =  login['api-key']

In [13]:
def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""  
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)


***TMDB API Calls***

In [14]:
#Function to Retrieve Movie Information with Certification Rating for a Given Movie ID
def get_movie_with_rating(movie_id):
    """Adapted from source = https://github.com/celiao/tmdbsimple"""
    movie  = tmdb.Movies(movie_id)
    info = movie.info()
    releases = movie.releases()
    for c in releases['countries']:
        if c['iso_3166_1'] == 'US':
            info['certification'] = c['certification']
            
            return info

In [15]:
# Load in the dataframe from project part 1 as basics:
basics_url ="Data/filtered_title_basic.csv.gz"
basics = pd.read_csv(basics_url,  low_memory=False)

# Create Required Lists for the Loop
YEARS_TO_GET = [2000, 2001]
errors = []

# Start OUTER loop
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    
    # Define the folder to store results
    FOLDER = "Data/"
    os.makedirs(FOLDER, exist_ok=True)
    os.listdir(FOLDER)

    # Define the JSON file to store results for the year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    
    # Check if the file exists
    file_exists = os.path.isfile(JSON_FILE)
    
    # If it does not exist: create it
    if not file_exists:
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id': 0}], f)
    
    # Define/filter the IDs to call
    df = basics.loc[basics['startYear'] == YEAR].copy()
    movie_ids = df['tconst'].copy()
    
    # Load existing data from json into a dataframe called "previous_df"
    previous_df = pd.read_json(JSON_FILE)
    
    # Check for and filter out movie IDs that already exist
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]
    
    # Start INNER Loop
    for movie_id in tqdm_notebook(movie_ids_to_get,
                              desc=f'Movies from {YEAR}',
                              position=1,
                              leave=True):
        try:
            temp = get_movie_with_rating(movie_id)
            temp_filtered = {
                    'imdb_id' : temp.get('imdb_id'),
                    'revenue': temp.get('revenue'),
                    'budget': temp.get('budget'),
                    'certification': temp.get('certification')
            }
            write_json(temp_filtered, JSON_FILE)
            time.sleep(0.02)
        except Exception as e:
            errors.append([movie_id, e])
     # After the Inner Loop
# Read the JSON file into a dataframe
    print(f"Contents of {JSON_FILE}:")
    with open(JSON_FILE, 'r') as f:
        print(f.read())

    #final_year_df = pd.read_json(JSON_FILE)
       

    # After the Inner Loop
    # Read the JSON file into a dataframe
    final_year_df = pd.read_json(JSON_FILE)
    
    # Save the year's results as csv.gz file
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)


YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/1464 [00:00<?, ?it/s]

Contents of Data/tmdb_api_results_2000.json:
[{"imdb_id": 0}, {"imdb_id": "tt0113026", "revenue": 0, "budget": 10000000, "certification": ""}, {"imdb_id": "tt0113092", "revenue": 0, "budget": 0, "certification": ""}, {"imdb_id": "tt0118694", "revenue": 14204632, "budget": 150000, "certification": "PG"}, {"imdb_id": "tt0118852", "revenue": 0, "budget": 0, "certification": "R"}, {"imdb_id": "tt0119273", "revenue": 0, "budget": 15000000, "certification": "R"}, {"imdb_id": "tt0119495", "revenue": 0, "budget": 0, "certification": "R"}, {"imdb_id": "tt0120202", "revenue": 9206279, "budget": 0, "certification": ""}, {"imdb_id": "tt0120263", "revenue": 80334, "budget": 0, "certification": ""}, {"imdb_id": "tt0120467", "revenue": 14904, "budget": 120000, "certification": "R"}, {"imdb_id": "tt0120630", "revenue": 224834564, "budget": 45000000, "certification": "G"}, {"imdb_id": "tt0120698", "revenue": 0, "budget": 0, "certification": ""}, {"imdb_id": "tt0120733", "revenue": 0, "budget": 0, "cert

Movies from 2001:   0%|          | 0/1584 [00:00<?, ?it/s]

Contents of Data/tmdb_api_results_2001.json:
[{"imdb_id": 0}, {"imdb_id": "tt0035423", "revenue": 76019048, "budget": 48000000, "certification": "PG-13"}, {"imdb_id": "tt0116916", "revenue": 0, "budget": 0, "certification": "PG"}, {"imdb_id": "tt0118589", "revenue": 5271666, "budget": 22000000, "certification": "PG-13"}, {"imdb_id": "tt0118652", "revenue": 0, "budget": 1000000, "certification": "R"}, {"imdb_id": "tt0120166", "revenue": 0, "budget": 0, "certification": "NR"}, {"imdb_id": "tt0120681", "revenue": 74558115, "budget": 35000000, "certification": "R"}, {"imdb_id": "tt0120737", "revenue": 871368364, "budget": 93000000, "certification": "PG-13"}, {"imdb_id": "tt0120807", "revenue": 0, "budget": 0, "certification": ""}, {"imdb_id": "tt0120824", "revenue": 24690441, "budget": 38000000, "certification": "R"}, {"imdb_id": "tt0123581", "revenue": 0, "budget": 0, "certification": ""}, {"imdb_id": "tt0124889", "revenue": 0, "budget": 0, "certification": ""}, {"imdb_id": "tt0125022", "

***Combine Data***

In [16]:
# Define the folder where CSV files are stored
folder = "Data/"

# Find all CSV files in the folder
csv_files = glob.glob(f"{folder}final_tmdb_data_*.csv.gz")

# Initialize an empty list to store dataframes
dfs = []

# Load and concatenate each CSV file
for csv_file in csv_files:
    df = pd.read_csv(csv_file, compression="gzip")
    dfs.append(df)

# Concatenate all dataframes into one
combined_df = pd.concat(dfs, ignore_index=True)

# Display the combined dataframe
combined_df.head()

Unnamed: 0,imdb_id,revenue,budget,certification
0,0,,,
1,tt0113026,0.0,10000000.0,
2,tt0113092,0.0,0.0,
3,tt0118694,14204632.0,150000.0,PG
4,tt0118852,0.0,0.0,R


***Save Combined Data***

In [22]:
combined_df.to_csv(f"{folder}combined_tmdb_data.csv.gz", compression="gzip", index=False)

***Exploratory Data Analysis***

In [17]:
# Task 1: Count movies with valid financial information
valid_financial_info_movies = combined_df[(combined_df['budget'] > 0) | (combined_df['revenue'] > 0)]
num_movies_with_valid_financial_info = len(valid_financial_info_movies)
print(f"Number of movies with valid financial information: {num_movies_with_valid_financial_info}")

Number of movies with valid financial information: 569


In [19]:
# Task 2: Count movies in each certification category
certification_counts = combined_df['certification'].value_counts()
print("Number of movies in each certification category:")
print(certification_counts)

Number of movies in each certification category:
R          449
PG-13      173
NR          68
PG          61
G           25
NC-17        5
Unrated      1
Name: certification, dtype: int64


In [20]:
# Task 3: Average revenue per certification category (excluding 0 values)
average_revenue_per_certification = valid_financial_info_movies.groupby('certification')['revenue'].mean()
locale.setlocale(locale.LC_NUMERIC, 'en_US.UTF-8')

# Format the numbers for better readability
average_revenue_per_certification_formatted = average_revenue_per_certification.apply(lambda x: locale.format_string("%d", x, grouping=True))

# Print the formatted results
print("Average revenue per certification category:")
print(average_revenue_per_certification_formatted)

Average revenue per certification category:
certification
G        110,957,400
NR         5,311,700
PG        98,197,389
PG-13     92,298,695
R         31,755,247
Name: revenue, dtype: object


In [21]:
# Task 4: Average budget per certification category (excluding 0 values)
average_budget_per_certification = valid_financial_info_movies.groupby('certification')['budget'].mean()
average_budget_per_certification_formatted = average_budget_per_certification.apply(lambda x: locale.format_string("%d", x, grouping=True))

print("Average budget per certification category:")
print(average_budget_per_certification_formatted)

Average budget per certification category:
certification
G        35,750,000
NR        2,259,649
PG       40,514,206
PG-13    42,407,788
R        19,060,474
Name: budget, dtype: object
