In [137]:
import json
import pandas as pd

path = 'MoviesDataSet/'

# Load Data from the Kaggle dataset
movies = pd.read_csv(path + 'movies_metadata.csv')

movies = movies.set_index('id')

# Drop junk columns
movies = movies.drop('belongs_to_collection', axis=1)
movies = movies.drop('homepage', axis=1)
movies = movies.drop('original_language', axis=1)
movies = movies.drop('original_title', axis=1)
movies = movies.drop('overview', axis=1)
movies = movies.drop('tagline', axis=1)
movies = movies.drop('poster_path', axis=1)
movies = movies.drop('production_companies', axis=1)
movies = movies.drop('production_countries', axis=1)
movies = movies.drop('video', axis=1)

# These columns might be helpful, but they're not reliable in this dataset
movies = movies.drop('revenue', axis=1)
movies = movies.drop('budget', axis=1)

# Filter out any unreleased movies
movies = movies[movies['status'] == 'Released']
movies = movies.drop('status', axis=1)

# Only evaluate movies with ratings similar to our Christmas movies.
# This is important since classic Christmas movies are popular and the minimum rating is a 5.4
movies = movies[movies['vote_average'] >= 5.4]
movies = movies[movies['vote_average'] <= 8.0]

# Filter out extremely popular movies. Max popularity for a Christmas movie is 17.3
movies['popularity'] = movies['popularity'].astype(float)
movies = movies[movies['popularity'] <= 18]

# Split the date into year / quarter / month components for easier analysis
movies['release_year'] = pd.DatetimeIndex(movies['release_date']).year
movies['release_quarter'] = pd.DatetimeIndex(movies['release_date']).quarter
movies['release_month'] = pd.DatetimeIndex(movies['release_date']).month

# Filter out movies from outside of the year range where our labelled data lives
movies = movies[movies['release_year'] >= 1940 ]
movies = movies[movies['release_year'] <= 2015 ]

# Filter out extremely long movies
movies['runtime'] = movies['runtime'].astype(int, errors='ignore')
movies = movies[movies['runtime'] <= 150]

# Sort movies by title
movies = movies.sort_values('title')

movies.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0_level_0,adult,genres,imdb_id,popularity,release_date,runtime,spoken_languages,title,vote_average,vote_count,release_year,release_quarter,release_month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
267752,False,"[{'id': 99, 'name': 'Documentary'}]",tt3060338,0.108531,2013-11-21,74.0,[],#chicagoGirl,7.0,1.0,2013.0,4.0,11.0
143747,False,"[{'id': 37, 'name': 'Western'}]",tt0060697,0.634857,1966-12-18,104.0,"[{'iso_639_1': 'it', 'name': 'Italiano'}, {'is...","$1,000 on the Black",6.0,2.0,1966.0,4.0,12.0
4204,False,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",tt1024733,2.326842,2008-01-01,98.0,"[{'iso_639_1': 'en', 'name': 'English'}]",$5 a Day,6.0,24.0,2008.0,1.0,1.0
248268,False,"[{'id': 18, 'name': 'Drama'}]",tt2106284,0.984593,2014-01-10,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",$50K and a Call Girl: A Love Story,6.3,11.0,2014.0,1.0,1.0
19311,False,"[{'id': 16, 'name': 'Animation'}, {'id': 18, '...",tt0790799,3.350555,2008-09-04,78.0,"[{'iso_639_1': 'en', 'name': 'English'}]",$9.99,6.0,28.0,2008.0,3.0,9.0


In [138]:
def extract_language(row):
    data = row['spoken_languages']
    row['Is English'] = 'English' in data
    return row

movies = movies.apply(extract_language, axis=1)

# Filter down to only English movies since all labelled movies are English
movies = movies[movies['Is English'] == True]

movies = movies.drop('spoken_languages', axis=1)
movies = movies.drop('Is English', axis=1)
movies.head()

Unnamed: 0_level_0,adult,genres,imdb_id,popularity,release_date,runtime,title,vote_average,vote_count,release_year,release_quarter,release_month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4204,False,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",tt1024733,2.326842,2008-01-01,98.0,$5 a Day,6.0,24.0,2008.0,1.0,1.0
248268,False,"[{'id': 18, 'name': 'Drama'}]",tt2106284,0.984593,2014-01-10,90.0,$50K and a Call Girl: A Love Story,6.3,11.0,2014.0,1.0,1.0
19311,False,"[{'id': 16, 'name': 'Animation'}, {'id': 18, '...",tt0790799,3.350555,2008-09-04,78.0,$9.99,6.0,28.0,2008.0,3.0,9.0
95383,False,"[{'id': 99, 'name': 'Documentary'}]",tt2258233,0.891024,2013-01-11,89.0,$ellebrity,5.5,9.0,2013.0,1.0,1.0
252178,False,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",tt2614684,7.08055,2014-10-10,99.0,'71,6.7,414.0,2014.0,4.0,10.0


In [139]:
# Detect Genres
def extract_genres(row):
    data = row['genres']
    arr = json.loads(data.replace("'", "\""))
    for genre in arr:
        row['Is ' + genre['name']] = 1

    return row

movies = movies.apply(extract_genres, axis=1)

# Set our NA values for non-genre membership to 0
movies = movies.fillna(0)

# Drop the no longer needed source column
movies = movies.drop('genres', axis=1)
movies.head()

Unnamed: 0_level_0,Is Action,Is Adventure,Is Animation,Is Comedy,Is Crime,Is Documentary,Is Drama,Is Family,Is Fantasy,Is Foreign,...,imdb_id,popularity,release_date,release_month,release_quarter,release_year,runtime,title,vote_average,vote_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4204,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,tt1024733,2.326842,2008-01-01,1.0,1.0,2008.0,98.0,$5 a Day,6.0,24.0
248268,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,tt2106284,0.984593,2014-01-10,1.0,1.0,2014.0,90.0,$50K and a Call Girl: A Love Story,6.3,11.0
19311,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,tt0790799,3.350555,2008-09-04,9.0,3.0,2008.0,78.0,$9.99,6.0,28.0
95383,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,tt2258233,0.891024,2013-01-11,1.0,1.0,2013.0,89.0,$ellebrity,5.5,9.0
252178,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,tt2614684,7.08055,2014-10-10,10.0,4.0,2014.0,99.0,'71,6.7,414.0


In [140]:
# Convert our adult column from true / false to 1 / 0.
# Note we move from String -> Bool -> Int here
movies['adult'] = movies['adult'].astype(bool).astype(int)

movies.head()

Unnamed: 0_level_0,Is Action,Is Adventure,Is Animation,Is Comedy,Is Crime,Is Documentary,Is Drama,Is Family,Is Fantasy,Is Foreign,...,imdb_id,popularity,release_date,release_month,release_quarter,release_year,runtime,title,vote_average,vote_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4204,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,tt1024733,2.326842,2008-01-01,1.0,1.0,2008.0,98.0,$5 a Day,6.0,24.0
248268,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,tt2106284,0.984593,2014-01-10,1.0,1.0,2014.0,90.0,$50K and a Call Girl: A Love Story,6.3,11.0
19311,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,tt0790799,3.350555,2008-09-04,9.0,3.0,2008.0,78.0,$9.99,6.0,28.0
95383,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,tt2258233,0.891024,2013-01-11,1.0,1.0,2013.0,89.0,$ellebrity,5.5,9.0
252178,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,tt2614684,7.08055,2014-10-10,10.0,4.0,2014.0,99.0,'71,6.7,414.0


In [141]:
# These are IMDB IDs of movies known to be Christmas movies or classic Christmas movies. See other experiments for details.
labelled_movies = ['tt0993789','tt6433880','tt2402927','tt0081793','tt2709692','tt0064349','tt0041473','tt7736496','tt0032981','tt0043733','tt0338348','tt0117372','tt1430607','tt0099487','tt1401143','tt0086465','tt3530002','tt0044008','tt1067106','tt0037595','tt0096061','tt0116705','tt0457939','tt0034862','tt0087363','tt0039628','tt0110527','tt0033045','tt2083355','tt0039190','tt0319343','tt0085936','tt0047673','tt0104940','tt0095016','tt0058536','tt0097958','tt0107688','tt0114924','tt0111070','tt0085334','tt1268799','tt0037059','tt0314331','tt0071222','tt0060345','tt0099785','tt0373469','tt3850590','tt0059026','tt0038650','tt3824458','tt0307987']

# We need to label all rows based on if they're a Christmas movie or not so we can train a model
def set_label(row):
    imdb = row['imdb_id']
    if imdb in labelled_movies:
        row['Is Christmas Movie'] = 1
    else:
        row['Is Christmas Movie'] = 0
    return row

movies = movies.apply(set_label, axis=1)

movies.head()

Unnamed: 0_level_0,Is Action,Is Adventure,Is Animation,Is Comedy,Is Crime,Is Documentary,Is Drama,Is Family,Is Fantasy,Is Foreign,...,popularity,release_date,release_month,release_quarter,release_year,runtime,title,vote_average,vote_count,Is Christmas Movie
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4204,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,2.326842,2008-01-01,1.0,1.0,2008.0,98.0,$5 a Day,6.0,24.0,0
248268,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.984593,2014-01-10,1.0,1.0,2014.0,90.0,$50K and a Call Girl: A Love Story,6.3,11.0,0
19311,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,3.350555,2008-09-04,9.0,3.0,2008.0,78.0,$9.99,6.0,28.0,0
95383,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.891024,2013-01-11,1.0,1.0,2013.0,89.0,$ellebrity,5.5,9.0,0
252178,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,7.08055,2014-10-10,10.0,4.0,2014.0,99.0,'71,6.7,414.0,0


In [142]:
# Export the full data set
movies.to_csv('Processed.csv')

In [143]:
# These columns are helpful, but bias the ML algorithms too much given the quantity of tiny movies
movies = movies.drop('popularity', axis=1)
movies = movies.drop('vote_count', axis=1)
movies = movies.drop('vote_average', axis=1)

movies.head()

Unnamed: 0_level_0,Is Action,Is Adventure,Is Animation,Is Comedy,Is Crime,Is Documentary,Is Drama,Is Family,Is Fantasy,Is Foreign,...,Is Western,adult,imdb_id,release_date,release_month,release_quarter,release_year,runtime,title,Is Christmas Movie
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4204,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1,tt1024733,2008-01-01,1.0,1.0,2008.0,98.0,$5 a Day,0
248268,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1,tt2106284,2014-01-10,1.0,1.0,2014.0,90.0,$50K and a Call Girl: A Love Story,0
19311,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1,tt0790799,2008-09-04,9.0,3.0,2008.0,78.0,$9.99,0
95383,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1,tt2258233,2013-01-11,1.0,1.0,2013.0,89.0,$ellebrity,0
252178,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1,tt2614684,2014-10-10,10.0,4.0,2014.0,99.0,'71,0


In [144]:
id_columns = ['title', 'imdb_id']

labelled = movies[movies['Is Christmas Movie'] == 1]
labelled.to_csv('Labelled.csv')

dieHard = movies[movies['title'] == 'Die Hard']
dieHard = dieHard.drop(id_columns, axis=1) # Remove the title
dieHard.to_csv('DieHard.csv')

train_test = movies[movies['title'] != 'Die Hard']
train_test = train_test.drop(id_columns, axis=1) # Remove the title
train_test.to_csv('traintest.csv')