# TMDB Box Office Prediction

This data is from https://www.kaggle.com/competitions/tmdb-box-office-prediction/data

## Libraries

In [6]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Read data

In [8]:
# Load the training split from the local data directory.
train = pd.read_csv(filepath_or_buffer="data/train.csv")

In [9]:
# Load the test split from the local data directory.
test = pd.read_csv(filepath_or_buffer="data/test.csv")

## Preprocess

In [22]:
# Preview the first two training rows, transposed so each column reads as a row.
train.iloc[:2].transpose()

Unnamed: 0,0,1
id,1,2
belongs_to_collection,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...","[{'id': 107674, 'name': 'The Princess Diaries ..."
budget,14000000,40000000
genres,"[{'id': 35, 'name': 'Comedy'}]","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
homepage,,
imdb_id,tt2637294,tt0368933
original_language,en,en
original_title,Hot Tub Time Machine 2,The Princess Diaries 2: Royal Engagement
overview,"When Lou, who has become the ""father of the In...",Mia Thermopolis is now a college graduate and ...
popularity,6.575393,8.248895


In [24]:
# Preview the last ten test rows, transposed so each column reads as a row.
test.iloc[-10:].transpose()

Unnamed: 0,4388,4389,4390,4391,4392,4393,4394,4395,4396,4397
id,7389,7390,7391,7392,7393,7394,7395,7396,7397,7398
belongs_to_collection,,,,,"[{'id': 528, 'name': 'The Terminator Collectio...",,"[{'id': 146534, 'name': 'Without a Paddle Coll...",,,
budget,1000000,0,0,10000000,155000000,42000000,19000000,16000000,2000000,64000
genres,"[{'id': 18, 'name': 'Drama'}, {'id': 14, 'name...","[{'id': 35, 'name': 'Comedy'}]","[{'id': 18, 'name': 'Drama'}]","[{'id': 18, 'name': 'Drama'}]","[{'id': 878, 'name': 'Science Fiction'}, {'id'...","[{'id': 53, 'name': 'Thriller'}]","[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[{'id': 18, 'name': 'Drama'}]","[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...","[{'id': 18, 'name': 'Drama'}]"
homepage,http://www.specialthemovie.com/,http://www.inappropriatecomedy.com,,http://www.weareallprecious.com/,http://www.terminatormovie.com/,,,,,
imdb_id,tt0479162,tt1754811,tt0103253,tt0929632,tt1340138,tt0218922,tt0364751,tt0084855,tt3235888,tt0056663
original_language,en,en,en,en,en,en,en,en,en,fr
original_title,Special,InAPPropriate Comedy,Whore,Precious,Terminator Genisys,Original Sin,Without a Paddle,The Verdict,It Follows,Vivre sa vie: film en douze tableaux
overview,A lonely metermaid has a psychotic reaction to...,A no-nonsense cop has a flair for fashion and ...,This melodrama investigates the life of a pros...,"Set in Harlem in 1987, Claireece ""Precious"" Jo...","The year is 2029. John Connor, leader of the r...",A young man is plunged into a life of subterfu...,"Three friends, whose lives have been drifting ...","Frank Galvin is a down-on-his luck lawyer, red...","For 19-year-old Jay, fall should be about scho...",Twelve episodic tales in the life of a Parisia...
popularity,2.003309,3.097025,6.404093,8.293548,30.188198,9.970359,6.046516,9.596883,20.359336,11.30591


In [14]:
# Summary statistics for the numeric columns (quartiles spelled out explicitly).
train.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,id,budget,popularity,runtime,revenue
count,3000.0,3000.0,3000.0,2998.0,3000.0
mean,1500.5,22531330.0,8.463274,107.856571,66725850.0
std,866.169729,37026090.0,12.104,22.086434,137532300.0
min,1.0,0.0,1e-06,0.0,1.0
25%,750.75,0.0,4.018053,94.0,2379808.0
50%,1500.5,8000000.0,7.374861,104.0,16807070.0
75%,2250.25,29000000.0,10.890983,118.0,68919200.0
max,3000.0,380000000.0,294.337037,338.0,1519558000.0


In [15]:
# Count the missing values in every training column.
train.isna().sum(axis="index")


id                          0
belongs_to_collection    2396
budget                      0
genres                      7
homepage                 2054
imdb_id                     0
original_language           0
original_title              0
overview                    8
popularity                  0
poster_path                 1
production_companies      156
production_countries       55
release_date                0
runtime                     2
spoken_languages           20
status                      0
tagline                   597
title                       0
Keywords                  276
cast                       13
crew                       16
revenue                     0
dtype: int64

### release_date

In [19]:
def parse_release_date(raw_dates, latest_year=2019):
    """Parse 'm/d/yy' release dates and repair the two-digit-year century.

    The ``%y`` directive maps 00-68 to 2000-2068, so a film released in 1962
    would be parsed as 2062. Every parsed date whose year is later than
    ``latest_year`` is therefore shifted back by 100 years. Missing dates
    stay missing (NaT).

    Parameters
    ----------
    raw_dates : pandas.Series
        Release dates as 'month/day/two-digit-year' strings (or values that
        are already datetimes, which makes the cell safe to re-run).
    latest_year : int, default 2019
        Latest release year expected in the data.
        NOTE(review): 2019 is an assumed upper bound — confirm against the data.

    Returns
    -------
    pandas.Series
        Datetime series with the century corrected.
    """
    parsed = pd.to_datetime(raw_dates, format='%m/%d/%y')
    # NaT years compare as False here, so missing dates are left untouched.
    wrong_century = parsed.dt.year > latest_year
    return parsed.where(~wrong_century, parsed - pd.DateOffset(years=100))


train['release_date'] = parse_release_date(train['release_date'])
test['release_date'] = parse_release_date(test['release_date'])

In [21]:
# Derive calendar features from release_date for both splits.
for frame in (train, test):
    released = frame['release_date'].dt
    frame['year'] = released.year
    frame['month'] = released.month
    # weekday: 0 = Monday, 6 = Sunday
    frame['weekday'] = released.weekday

### homepage

In [26]:
# Replace the homepage URL with a 0/1 flag: 1 when a homepage is listed.
# NOTE: the column is overwritten, so running this cell a second time would
# turn every row into 1 (the integer flags are never null).
for frame in (train, test):
    frame['homepage'] = (~frame['homepage'].isna()).astype(int)

### belongs_to_collection

In [28]:
# Replace missing belongs_to_collection values with the sentinel string 'None'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that form is chained in-place
# mutation, which warns on recent pandas and leaves the frame unchanged under
# Copy-on-Write.
train['belongs_to_collection'] = train['belongs_to_collection'].fillna('None')
test['belongs_to_collection'] = test['belongs_to_collection'].fillna('None')


In [40]:
# Inspect one raw collection entry (row label 60) to see its structure.
train.loc[60, 'belongs_to_collection']

"[{'id': 645, 'name': 'James Bond Collection', 'poster_path': '/HORpg5CSkmeQlAolx3bKMrKgfi.jpg', 'backdrop_path': '/6VcVl48kNKvdXOZfJPdarlUGOsk.jpg'}]"

In [36]:
# Show the last two training rows.
train.iloc[-2:]

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,status,tagline,title,Keywords,cast,crew,revenue,year,month,weekday
2998,2999,,42000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",1,tt0343135,en,Along Came Polly,Reuben Feffer is a guy who's spent his entire ...,15.725542,...,Released,"For the most cautious man on Earth, life is ab...",Along Came Polly,"[{'id': 966, 'name': 'beach'}, {'id': 2676, 'n...","[{'cast_id': 8, 'character': 'Reuben Feffer', ...","[{'credit_id': '556f817b9251410866000a63', 'de...",171963386,2004,1,4
2999,3000,,35000000,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",1,tt1600195,en,Abduction,A young man sets out to uncover the truth abou...,10.512109,...,Released,They stole his life. He's taking it back.,Abduction,"[{'id': 591, 'name': 'cia'}, {'id': 822, 'name...","[{'cast_id': 2, 'character': 'Nathan Harper', ...","[{'credit_id': '5391990d0e0a260fb5001629', 'de...",82087155,2011,9,3


In [41]:
# Flag movies that are part of a collection: 1 = in a collection, 0 = standalone.
# The flag is computed from the untouched belongs_to_collection column. The
# previous version first overwrote that column with 0, so the comparison with
# 'None' was true for no row and every movie ended up flagged as 1.
# Both a real missing value and the 'None' sentinel count as "no collection".
for frame in (train, test):
    collection = frame['belongs_to_collection']
    in_collection = collection.notna() & (collection != 'None')
    frame['has_collection'] = in_collection.astype(int)

In [44]:
# List the column labels to confirm which engineered features are present.
train.columns

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue',
       'year', 'month', 'weekday', 'has_collection'],
      dtype='object')

### genres

In [56]:
# Inspect one raw genres entry (row label 1000) to see its structure.
train.loc[1000, 'genres']

"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]"

In [57]:
def genre_names(raw_genres):
    """Return the genre names from one raw ``genres`` cell.

    Parameters
    ----------
    raw_genres : str
        Either the sentinel ``'none'`` (no genres recorded) or the text of a
        Python list of dicts, e.g. "[{'id': 35, 'name': 'Comedy'}]".

    Returns
    -------
    list of str, or the string ``'none'``
        The ``name`` of every listed genre, or ``'none'`` for the sentinel.
    """
    if raw_genres == 'none':
        return 'none'
    # literal_eval only accepts Python literals, so unlike eval() it cannot
    # execute arbitrary code embedded in the CSV text.
    return [entry['name'] for entry in ast.literal_eval(raw_genres)]


for frame in (train, test):
    # Assign back instead of fillna(inplace=True) on a column selection, which
    # is chained in-place mutation and is not reliable on recent pandas.
    frame['genres'] = frame['genres'].fillna('none')
    frame['genre_lst'] = frame['genres'].apply(genre_names)

In [62]:
# Count how often each genre combination occurs.
# value_counts hashes its values and Python lists are not hashable, so each
# list is converted to a tuple first; the 'none' sentinel string is kept as is.
train['genre_lst'].map(
    lambda names: tuple(names) if isinstance(names, list) else names
).value_counts()

[Drama]                                         266
[Comedy]                                        186
[Drama, Romance]                                108
[Comedy, Romance]                                88
[Comedy, Drama]                                  85
                                               ... 
[Adventure, Action, War]                          1
[Comedy, Drama, Foreign, Romance]                 1
[Adventure, Drama, Fantasy, Family, Mystery]      1
[Comedy, Drama, History, Mystery]                 1
[Thriller, Action, Mystery]                       1
Name: genre_lst, Length: 873, dtype: int64