In [1]:
# Our primary dataset can be found at:
# https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies
# The file is over 500MB - too big for github!

In [2]:
# import dependencies
import pandas as pd
# import requests
# import os
# from dotenv import load_dotenv
# import json
# import subprocess

In [3]:
# Read the csv and store as a dataframe
tmdb_full_df = pd.read_csv("D://tmdb_database/TMDB_movie_dataset_v11.csv")

# tmdb_full_df.head()
# tmdb_full_df.dtypes
print(tmdb_full_df.columns.unique())

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords'],
      dtype='object')


In [4]:
# Select the columns required
tmdb_df = tmdb_full_df[['vote_average',
                        'vote_count',
                        'release_date',
                        'revenue',
                        'runtime',
                        'budget',
                        'imdb_id',
                        'overview',
                        'popularity',
                        'tagline',
                        'genres',
                        'production_companies',
                        'spoken_languages',
                        'keywords']]

In [5]:
tmdb_df.head()

Unnamed: 0,vote_average,vote_count,release_date,revenue,runtime,budget,imdb_id,overview,popularity,tagline,genres,production_companies,spoken_languages,keywords
0,8.364,34495,2010-07-15,825532764,148,160000000,tt1375666,"Cobb, a skilled thief who commits corporate es...",83.952,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,8.417,32571,2014-11-05,701729206,169,165000000,tt0816692,The adventures of a group of explorers who mak...,140.241,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...",English,"rescue, future, spacecraft, race against time,..."
2,8.512,30619,2008-07-16,1004558444,152,185000000,tt0468569,Batman raises the stakes in his war on crime. ...,130.643,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."
3,7.573,29815,2009-12-15,2923706026,162,237000000,tt0499549,"In the 22nd century, a paraplegic Marine is di...",79.932,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","English, Spanish","future, society, culture clash, space travel, ..."
4,7.71,29166,2012-04-25,1518815515,143,220000000,tt0848228,When an unexpected enemy emerges and threatens...,98.082,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,"English, Hindi, Russian","new york city, superhero, shield, based on com..."


In [6]:
# Convert "release_date" to datetime
tmdb_df.loc[:, 'release_date'] = pd.to_datetime(tmdb_df['release_date'])
print(tmdb_df.dtypes)


vote_average            float64
vote_count                int64
release_date             object
revenue                   int64
runtime                   int64
budget                    int64
imdb_id                  object
overview                 object
popularity              float64
tagline                  object
genres                   object
production_companies     object
spoken_languages         object
keywords                 object
dtype: object


In [7]:
# Split the genre strings into lists for manipulation
tmdb_df.loc[:, 'genres'] = tmdb_df['genres'].str.split(', ')
tmdb_df.head()

Unnamed: 0,vote_average,vote_count,release_date,revenue,runtime,budget,imdb_id,overview,popularity,tagline,genres,production_companies,spoken_languages,keywords
0,8.364,34495,2010-07-15 00:00:00,825532764,148,160000000,tt1375666,"Cobb, a skilled thief who commits corporate es...",83.952,Your mind is the scene of the crime.,"[Action, Science Fiction, Adventure]","Legendary Pictures, Syncopy, Warner Bros. Pict...","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,8.417,32571,2014-11-05 00:00:00,701729206,169,165000000,tt0816692,The adventures of a group of explorers who mak...,140.241,Mankind was born on Earth. It was never meant ...,"[Adventure, Drama, Science Fiction]","Legendary Pictures, Syncopy, Lynda Obst Produc...",English,"rescue, future, spacecraft, race against time,..."
2,8.512,30619,2008-07-16 00:00:00,1004558444,152,185000000,tt0468569,Batman raises the stakes in his war on crime. ...,130.643,Welcome to a world without rules.,"[Drama, Action, Crime, Thriller]","DC Comics, Legendary Pictures, Syncopy, Isobel...","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."
3,7.573,29815,2009-12-15 00:00:00,2923706026,162,237000000,tt0499549,"In the 22nd century, a paraplegic Marine is di...",79.932,Enter the world of Pandora.,"[Action, Adventure, Fantasy, Science Fiction]","Dune Entertainment, Lightstorm Entertainment, ...","English, Spanish","future, society, culture clash, space travel, ..."
4,7.71,29166,2012-04-25 00:00:00,1518815515,143,220000000,tt0848228,When an unexpected enemy emerges and threatens...,98.082,Some assembly required.,"[Science Fiction, Action, Adventure]",Marvel Studios,"English, Hindi, Russian","new york city, superhero, shield, based on com..."


In [8]:
# Using code sourced from GTP 4o, we are verifying the genre column is cleaned up
# This was done due to some un expected 'NoneType' errors
# Drop rows with NaN values in the 'genres' column
tmdb_df = tmdb_df.dropna(subset=['genres'])

# Ensure all entries in 'genres' are lists
tmdb_df['genres'] = tmdb_df['genres'].apply(lambda x: x if isinstance(x, list) else [x])

# Flatten the lists in the 'genres' column
all_genres = [genre for sublist in tmdb_df['genres'] for genre in sublist]

# Get unique values
unique_genres = set(all_genres)

# Display the unique values
unique_genres

{'Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western'}

In [9]:
# Build a list of tuples that assigns a value to each genre
# By using tuples the values are immutable
genre_map = [
    ('Action', 1),
    ('Adventure', 2),
    ('Animation', 3),
    ('Comedy', 4),
    ('Crime', 5),
    ('Documentary', 6),
    ('Drama', 7),
    ('Family', 8),
    ('Fantasy', 9),
    ('History', 10),
    ('Horror', 11),
    ('Music', 12),
    ('Mystery', 13),
    ('Romance', 14),
    ('Science Fiction', 15),
    ('TV Movie', 16),
    ('Thriller', 17),
    ('War', 18),
    ('Western', 19),
]


In [10]:
# Exploding the df into additional rows will significanty increase df size.
# df_genres_exploded = df_genres.explode('genres')
# df_genres_exploded.head()

In [11]:
# Use comprehensions and lamdba to convert genres to their corresponding values

tmdb_df['genres'] = tmdb_df['genres'].apply(lambda genres: [value for genre in genres for key, value in genre_map if genre == key])

# Display the result
print(tmdb_df['genres'].head())


0       [1, 15, 2]
1       [2, 7, 15]
2    [7, 1, 5, 17]
3    [1, 2, 9, 15]
4       [15, 1, 2]
Name: genres, dtype: object


In [12]:
tmdb_df.head()

Unnamed: 0,vote_average,vote_count,release_date,revenue,runtime,budget,imdb_id,overview,popularity,tagline,genres,production_companies,spoken_languages,keywords
0,8.364,34495,2010-07-15 00:00:00,825532764,148,160000000,tt1375666,"Cobb, a skilled thief who commits corporate es...",83.952,Your mind is the scene of the crime.,"[1, 15, 2]","Legendary Pictures, Syncopy, Warner Bros. Pict...","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,8.417,32571,2014-11-05 00:00:00,701729206,169,165000000,tt0816692,The adventures of a group of explorers who mak...,140.241,Mankind was born on Earth. It was never meant ...,"[2, 7, 15]","Legendary Pictures, Syncopy, Lynda Obst Produc...",English,"rescue, future, spacecraft, race against time,..."
2,8.512,30619,2008-07-16 00:00:00,1004558444,152,185000000,tt0468569,Batman raises the stakes in his war on crime. ...,130.643,Welcome to a world without rules.,"[7, 1, 5, 17]","DC Comics, Legendary Pictures, Syncopy, Isobel...","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."
3,7.573,29815,2009-12-15 00:00:00,2923706026,162,237000000,tt0499549,"In the 22nd century, a paraplegic Marine is di...",79.932,Enter the world of Pandora.,"[1, 2, 9, 15]","Dune Entertainment, Lightstorm Entertainment, ...","English, Spanish","future, society, culture clash, space travel, ..."
4,7.71,29166,2012-04-25 00:00:00,1518815515,143,220000000,tt0848228,When an unexpected enemy emerges and threatens...,98.082,Some assembly required.,"[15, 1, 2]",Marvel Studios,"English, Hindi, Russian","new york city, superhero, shield, based on com..."


In [13]:
# Set index to 'release_date' for analysis
tmdb_df.set_index('release_date', inplace=True)
tmdb_df.head()

  return Index(sequences[0], name=names)


Unnamed: 0_level_0,vote_average,vote_count,revenue,runtime,budget,imdb_id,overview,popularity,tagline,genres,production_companies,spoken_languages,keywords
release_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2010-07-15,8.364,34495,825532764,148,160000000,tt1375666,"Cobb, a skilled thief who commits corporate es...",83.952,Your mind is the scene of the crime.,"[1, 15, 2]","Legendary Pictures, Syncopy, Warner Bros. Pict...","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
2014-11-05,8.417,32571,701729206,169,165000000,tt0816692,The adventures of a group of explorers who mak...,140.241,Mankind was born on Earth. It was never meant ...,"[2, 7, 15]","Legendary Pictures, Syncopy, Lynda Obst Produc...",English,"rescue, future, spacecraft, race against time,..."
2008-07-16,8.512,30619,1004558444,152,185000000,tt0468569,Batman raises the stakes in his war on crime. ...,130.643,Welcome to a world without rules.,"[7, 1, 5, 17]","DC Comics, Legendary Pictures, Syncopy, Isobel...","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."
2009-12-15,7.573,29815,2923706026,162,237000000,tt0499549,"In the 22nd century, a paraplegic Marine is di...",79.932,Enter the world of Pandora.,"[1, 2, 9, 15]","Dune Entertainment, Lightstorm Entertainment, ...","English, Spanish","future, society, culture clash, space travel, ..."
2012-04-25,7.71,29166,1518815515,143,220000000,tt0848228,When an unexpected enemy emerges and threatens...,98.082,Some assembly required.,"[15, 1, 2]",Marvel Studios,"English, Hindi, Russian","new york city, superhero, shield, based on com..."
