In [19]:
import json
import pandas as pd

path = 'MoviesDataSet/'

# Load Data from the Kaggle dataset
movies = pd.read_csv(path + 'movies_metadata.csv')

# Drop junk columns
movies = movies.drop('belongs_to_collection', axis=1)
movies = movies.drop('homepage', axis=1)
movies = movies.drop('original_language', axis=1)
movies = movies.drop('original_title', axis=1)
movies = movies.drop('spoken_languages', axis=1)
movies = movies.drop('overview', axis=1)
movies = movies.drop('tagline', axis=1)
movies = movies.drop('poster_path', axis=1)
movies = movies.drop('production_companies', axis=1)
movies = movies.drop('production_countries', axis=1)
movies = movies.drop('video', axis=1)

# Filter out any unreleased movies
movies = movies[movies['status'] == 'Released']
movies = movies.drop('status', axis=1)

# Filter down the columns to those we really care about (losing belongs_to_collection, homepage, original_language, production_companies, production_countries, poster_path, spoken_languages, original_title, video, status
# Note: budget and revenue are interesting, but are mostly 0's so are unusable here, so those are lost too
#movies = movies[['id', 'imdb_id',  'title', 'budget', 'revenue', 'adult', 'genres', 'popularity', 'overview', 'release_date', 'runtime', 'tagline', 'vote_average', 'vote_count']]

# Sort movies by title
movies = movies.sort_values('title')

movies.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,adult,budget,genres,id,imdb_id,popularity,release_date,revenue,runtime,title,vote_average,vote_count
18757,False,0,"[{'id': 99, 'name': 'Documentary'}]",55245,tt1699720,0.077485,2010-01-01,0.0,83.0,!Women Art Revolution,4.3,2.0
30961,False,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",41371,tt1637976,1.190427,2010-07-27,0.0,95.0,#1 Cheerleader Camp,3.4,23.0
36153,False,1500000,"[{'id': 18, 'name': 'Drama'}, {'id': 9648, 'na...",301325,tt3526286,2.451342,2015-11-20,0.0,90.0,#Horror,3.4,53.0
23501,False,0,"[{'id': 99, 'name': 'Documentary'}]",267752,tt3060338,0.108531,2013-11-21,0.0,74.0,#chicagoGirl,7.0,1.0
28042,False,0,"[{'id': 37, 'name': 'Western'}]",143747,tt0060697,0.634857,1966-12-18,0.0,104.0,"$1,000 on the Black",6.0,2.0


In [20]:
# Detect Genres
def extract_genres(row):
    data = row['genres']
    arr = json.loads(data.replace("'", "\""))
    for genre in arr:
        row['Is ' + genre['name']] = 1

    return row

movies = movies.apply(extract_genres, axis=1)

# Set our NA values for non-genre membership to 0
movies = movies.fillna(0)

# Drop the no longer needed source column
movies = movies.drop('genres', axis=1)
movies.head()

Unnamed: 0,Is Action,Is Adventure,Is Animation,Is Comedy,Is Crime,Is Documentary,Is Drama,Is Family,Is Fantasy,Is Foreign,...,budget,id,imdb_id,popularity,release_date,revenue,runtime,title,vote_average,vote_count
18757,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,55245,tt1699720,0.077485,2010-01-01,0.0,83.0,!Women Art Revolution,4.3,2.0
30961,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,41371,tt1637976,1.190427,2010-07-27,0.0,95.0,#1 Cheerleader Camp,3.4,23.0
36153,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1500000,301325,tt3526286,2.451342,2015-11-20,0.0,90.0,#Horror,3.4,53.0
23501,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,267752,tt3060338,0.108531,2013-11-21,0.0,74.0,#chicagoGirl,7.0,1.0
28042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,143747,tt0060697,0.634857,1966-12-18,0.0,104.0,"$1,000 on the Black",6.0,2.0


In [21]:
# Split the date into year / quarter / month components for easier analysis
movies['release_year'] = pd.DatetimeIndex(movies['release_date']).year
movies['release_quarter'] = pd.DatetimeIndex(movies['release_date']).quarter
movies['release_month'] = pd.DatetimeIndex(movies['release_date']).month

# Remove the core release date column
movies = movies.drop('release_date', axis=1)

movies.head()

Unnamed: 0,Is Action,Is Adventure,Is Animation,Is Comedy,Is Crime,Is Documentary,Is Drama,Is Family,Is Fantasy,Is Foreign,...,imdb_id,popularity,revenue,runtime,title,vote_average,vote_count,release_year,release_quarter,release_month
18757,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,tt1699720,0.077485,0.0,83.0,!Women Art Revolution,4.3,2.0,2010,1,1
30961,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,tt1637976,1.190427,0.0,95.0,#1 Cheerleader Camp,3.4,23.0,2010,3,7
36153,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,tt3526286,2.451342,0.0,90.0,#Horror,3.4,53.0,2015,4,11
23501,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,tt3060338,0.108531,0.0,74.0,#chicagoGirl,7.0,1.0,2013,4,11
28042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,tt0060697,0.634857,0.0,104.0,"$1,000 on the Black",6.0,2.0,1966,4,12


In [22]:
# Convert our adult column from true / false to 1 / 0.
# Note we move from String -> Bool -> Int here
movies['adult'] = movies['adult'].astype(bool).astype(int)

movies.head()

Unnamed: 0,Is Action,Is Adventure,Is Animation,Is Comedy,Is Crime,Is Documentary,Is Drama,Is Family,Is Fantasy,Is Foreign,...,imdb_id,popularity,revenue,runtime,title,vote_average,vote_count,release_year,release_quarter,release_month
18757,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,tt1699720,0.077485,0.0,83.0,!Women Art Revolution,4.3,2.0,2010,1,1
30961,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,tt1637976,1.190427,0.0,95.0,#1 Cheerleader Camp,3.4,23.0,2010,3,7
36153,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,tt3526286,2.451342,0.0,90.0,#Horror,3.4,53.0,2015,4,11
23501,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,tt3060338,0.108531,0.0,74.0,#chicagoGirl,7.0,1.0,2013,4,11
28042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,tt0060697,0.634857,0.0,104.0,"$1,000 on the Black",6.0,2.0,1966,4,12


In [23]:
# These are IMDB IDs of movies known to be Christmas movies or classic Christmas movies. See other experiments for details.
labelled_movies = ['tt0993789','tt6433880','tt2402927','tt0081793','tt2709692','tt0064349','tt0041473','tt7736496','tt0032981','tt0043733','tt0338348','tt0117372','tt1430607','tt0099487','tt1401143','tt0086465','tt3530002','tt0044008','tt1067106','tt0037595','tt0096061','tt0116705','tt0457939','tt0034862','tt0087363','tt0039628','tt0110527','tt0033045','tt2083355','tt0039190','tt0319343','tt0085936','tt0047673','tt0104940','tt0095016','tt0058536','tt0097958','tt0107688','tt0114924','tt0111070','tt0085334','tt1268799','tt0037059','tt0314331','tt0071222','tt0060345','tt0099785','tt0373469','tt3850590','tt0059026','tt0038650','tt3824458','tt0307987']

# We need to label all rows based on if they're a Christmas movie or not so we can train a model
def set_label(row):
    imdb = row['imdb_id']
    if imdb in labelled_movies:
        row['Is Christmas Movie'] = 1
    else:
        row['Is Christmas Movie'] = 0
    return row

movies = movies.apply(set_label, axis=1)

movies.head()

Unnamed: 0,Is Action,Is Adventure,Is Animation,Is Comedy,Is Crime,Is Documentary,Is Drama,Is Family,Is Fantasy,Is Foreign,...,popularity,revenue,runtime,title,vote_average,vote_count,release_year,release_quarter,release_month,Is Christmas Movie
18757,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.077485,0.0,83.0,!Women Art Revolution,4.3,2.0,2010,1,1,0
30961,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.190427,0.0,95.0,#1 Cheerleader Camp,3.4,23.0,2010,3,7,0
36153,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,2.451342,0.0,90.0,#Horror,3.4,53.0,2015,4,11,0
23501,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.108531,0.0,74.0,#chicagoGirl,7.0,1.0,2013,4,11,0
28042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.634857,0.0,104.0,"$1,000 on the Black",6.0,2.0,1966,4,12,0


In [26]:
# Export the full data set
movies = movies.set_index('id')
movies.to_csv('Processed.csv')

In [27]:
# These columns are helpful, but bias the ML algorithms too much given the quantity of tiny movies
movies = movies.drop('budget', axis=1)
movies = movies.drop('revenue', axis=1)
movies = movies.drop('popularity', axis=1)
movies = movies.drop('vote_count', axis=1)
movies = movies.drop('vote_average', axis=1)

movies.head()

Unnamed: 0_level_0,Is Action,Is Adventure,Is Animation,Is Comedy,Is Crime,Is Documentary,Is Drama,Is Family,Is Fantasy,Is Foreign,...,Is War,Is Western,adult,imdb_id,runtime,title,release_year,release_quarter,release_month,Is Christmas Movie
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
55245,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,tt1699720,83.0,!Women Art Revolution,2010,1,1,0
41371,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1,tt1637976,95.0,#1 Cheerleader Camp,2010,3,7,0
301325,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1,tt3526286,90.0,#Horror,2015,4,11,0
267752,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,tt3060338,74.0,#chicagoGirl,2013,4,11,0
143747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1,tt0060697,104.0,"$1,000 on the Black",1966,4,12,0


In [29]:
id_columns = ['title', 'imdb_id']

dieHard = movies[movies['title'] == 'Die Hard']
dieHard = dieHard.drop(id_columns, axis=1) # Remove the title
dieHard.to_csv('DieHard.csv')

train_test = movies[movies['title'] != 'Die Hard']
train_test = train_test.drop(id_columns, axis=1) # Remove the title
train_test.to_csv('traintest.csv')