In [64]:
import json
import pandas as pd

path = 'MoviesDataSet/'

# Load Data from the Kaggle dataset
movies = pd.read_csv(path + 'movies_metadata.csv')

# Drop junk columns
movies = movies.drop('belongs_to_collection', axis=1)
movies = movies.drop('homepage', axis=1)
movies = movies.drop('original_language', axis=1)
movies = movies.drop('original_title', axis=1)
#movies = movies.drop('overview', axis=1) # Keeping this in - Neural nets may find some use for them
#movies = movies.drop('tagline', axis=1) # Keeping this in - Neural nets may find some use for them
movies = movies.drop('poster_path', axis=1)
movies = movies.drop('production_companies', axis=1)
movies = movies.drop('production_countries', axis=1)
movies = movies.drop('video', axis=1)

# These columns might be helpful, but they're not reliable in this dataset
movies = movies.drop('revenue', axis=1)
movies = movies.drop('budget', axis=1)

# Filter out any unreleased movies
movies = movies[movies['status'] == 'Released']
movies = movies.drop('status', axis=1)

# Only evaluate movies with ratings similar to our Christmas movies.
# This is important since classic Christmas movies are popular and the minimum rating is a 5.4
movies = movies[movies['vote_average'] >= 5.4]
movies = movies[movies['vote_average'] <= 8.0]

# Filter out extremely popular movies. Max popularity for a Christmas movie is 17.3
movies['popularity'] = movies['popularity'].astype(float)
movies = movies[movies['popularity'] <= 18]

# Split the date into year / quarter / month components for easier analysis
movies['release_year'] = pd.DatetimeIndex(movies['release_date']).year
movies['release_quarter'] = pd.DatetimeIndex(movies['release_date']).quarter
movies['release_month'] = pd.DatetimeIndex(movies['release_date']).month

# Filter out movies from outside of the year range where our labelled data lives
movies = movies[movies['release_year'] >= 1940 ]
movies = movies[movies['release_year'] <= 2015 ]

# Filter out extremely long movies
movies['runtime'] = movies['runtime'].astype(int, errors='ignore')
movies = movies[movies['runtime'] <= 150]

# Convert ID to an int. Because why wouldn't it be?
movies['id'] = movies['id'].astype(int)

movies.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,adult,genres,id,imdb_id,overview,popularity,release_date,runtime,spoken_languages,tagline,title,vote_average,vote_count,release_year,release_quarter,release_month
1,False,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,tt0113497,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,1995.0,4.0,12.0
2,False,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,tt0113228,A family wedding reignites the ancient feud be...,11.7129,1995-12-22,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0,1995.0,4.0,12.0
3,False,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,tt0114885,"Cheated on, mistreated and stepped on, the wom...",3.859495,1995-12-22,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0,1995.0,4.0,12.0
4,False,"[{'id': 35, 'name': 'Comedy'}]",11862,tt0113041,Just when George Banks has recovered from his ...,8.387519,1995-02-10,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0,1995.0,1.0,2.0
6,False,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",11860,tt0114319,An ugly duckling having undergone a remarkable...,6.677277,1995-12-15,127.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",You are cordially invited to the most surprisi...,Sabrina,6.2,141.0,1995.0,4.0,12.0


In [65]:
def extract_language(row):
    data = row['spoken_languages']
    row['Is English'] = 'English' in data
    return row

movies = movies.apply(extract_language, axis=1)

# Filter down to only English movies since all labelled movies are English
movies = movies[movies['Is English'] == True]

movies = movies.drop('spoken_languages', axis=1)
movies = movies.drop('Is English', axis=1)
movies.head()

Unnamed: 0,adult,genres,id,imdb_id,overview,popularity,release_date,runtime,tagline,title,vote_average,vote_count,release_year,release_quarter,release_month
1,False,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,tt0113497,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,104.0,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,1995.0,4.0,12.0
2,False,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,tt0113228,A family wedding reignites the ancient feud be...,11.7129,1995-12-22,101.0,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0,1995.0,4.0,12.0
3,False,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,tt0114885,"Cheated on, mistreated and stepped on, the wom...",3.859495,1995-12-22,127.0,Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0,1995.0,4.0,12.0
4,False,"[{'id': 35, 'name': 'Comedy'}]",11862,tt0113041,Just when George Banks has recovered from his ...,8.387519,1995-02-10,106.0,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0,1995.0,1.0,2.0
6,False,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",11860,tt0114319,An ugly duckling having undergone a remarkable...,6.677277,1995-12-15,127.0,You are cordially invited to the most surprisi...,Sabrina,6.2,141.0,1995.0,4.0,12.0


In [66]:
# Detect Genres
def extract_genres(row):
    data = row['genres']
    arr = json.loads(data.replace("'", "\""))
    for genre in arr:
        row['Is ' + genre['name']] = 1

    return row

movies = movies.apply(extract_genres, axis=1)

# Set our NA values for non-genre membership to 0
#movies = movies.fillna(0)

# Drop the no longer needed source column
movies = movies.drop('genres', axis=1)
movies.head()

Unnamed: 0,Is Action,Is Adventure,Is Animation,Is Comedy,Is Crime,Is Documentary,Is Drama,Is Family,Is Fantasy,Is Foreign,...,popularity,release_date,release_month,release_quarter,release_year,runtime,tagline,title,vote_average,vote_count
1,,1.0,,,,,,1.0,1.0,,...,17.015539,1995-12-15,12.0,4.0,1995.0,104.0,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0
2,,,,1.0,,,,,,,...,11.7129,1995-12-22,12.0,4.0,1995.0,101.0,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0
3,,,,1.0,,,1.0,,,,...,3.859495,1995-12-22,12.0,4.0,1995.0,127.0,Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0
4,,,,1.0,,,,,,,...,8.387519,1995-02-10,2.0,1.0,1995.0,106.0,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0
6,,,,1.0,,,,,,,...,6.677277,1995-12-15,12.0,4.0,1995.0,127.0,You are cordially invited to the most surprisi...,Sabrina,6.2,141.0


In [67]:
# Convert our adult column from true / false to 1 / 0.
# Note we move from String -> Bool -> Int here
movies['adult'] = movies['adult'].astype(bool).astype(int)

In [68]:
# These are IMDB IDs of movies known to be Christmas movies or classic Christmas movies. See other experiments for details.
labelled_movies = ['tt0993789','tt6433880','tt2402927','tt0081793','tt2709692','tt0064349','tt0041473','tt7736496','tt0032981','tt0043733','tt0338348','tt0117372','tt1430607','tt0099487','tt1401143','tt0086465','tt3530002','tt0044008','tt1067106','tt0037595','tt0096061','tt0116705','tt0457939','tt0034862','tt0087363','tt0039628','tt0110527','tt0033045','tt2083355','tt0039190','tt0319343','tt0085936','tt0047673','tt0104940','tt0095016','tt0058536','tt0097958','tt0107688','tt0114924','tt0111070','tt0085334','tt1268799','tt0037059','tt0314331','tt0071222','tt0060345','tt0099785','tt0373469','tt3850590','tt0059026','tt0038650','tt3824458','tt0307987']

# We need to label all rows based on if they're a Christmas movie or not so we can train a model
def set_label(row):
    imdb = row['imdb_id']
    if imdb in labelled_movies:
        row['Is Christmas Movie'] = 1
    else:
        row['Is Christmas Movie'] = 0
    return row

movies = movies.apply(set_label, axis=1)

movies.head()

Unnamed: 0,Is Action,Is Adventure,Is Animation,Is Comedy,Is Crime,Is Documentary,Is Drama,Is Family,Is Fantasy,Is Foreign,...,release_date,release_month,release_quarter,release_year,runtime,tagline,title,vote_average,vote_count,Is Christmas Movie
1,,1.0,,,,,,1.0,1.0,,...,1995-12-15,12.0,4.0,1995.0,104.0,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,0
2,,,,1.0,,,,,,,...,1995-12-22,12.0,4.0,1995.0,101.0,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0,0
3,,,,1.0,,,1.0,,,,...,1995-12-22,12.0,4.0,1995.0,127.0,Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0,0
4,,,,1.0,,,,,,,...,1995-02-10,2.0,1.0,1995.0,106.0,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0,0
6,,,,1.0,,,,,,,...,1995-12-15,12.0,4.0,1995.0,127.0,You are cordially invited to the most surprisi...,Sabrina,6.2,141.0,0


In [69]:
# Load our Keywords dataset
keywords = pd.read_csv(path + "keywords.csv")
keywords['id'] = keywords['id'].astype(int)

In [70]:
# Merge the keywords column into our dataset
movies = pd.merge(movies, keywords, on='id')
movies.head()

Unnamed: 0,Is Action,Is Adventure,Is Animation,Is Comedy,Is Crime,Is Documentary,Is Drama,Is Family,Is Fantasy,Is Foreign,...,release_month,release_quarter,release_year,runtime,tagline,title,vote_average,vote_count,Is Christmas Movie,keywords
0,,1.0,,,,,,1.0,1.0,,...,12.0,4.0,1995.0,104.0,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,0,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
1,,,,1.0,,,,,,,...,12.0,4.0,1995.0,101.0,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0,0,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
2,,,,1.0,,,1.0,,,,...,12.0,4.0,1995.0,127.0,Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0,0,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
3,,,,1.0,,,,,,,...,2.0,1.0,1995.0,106.0,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0,0,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
4,,,,1.0,,,,,,,...,12.0,4.0,1995.0,127.0,You are cordially invited to the most surprisi...,Sabrina,6.2,141.0,0,"[{'id': 90, 'name': 'paris'}, {'id': 380, 'nam..."


In [71]:
# Filter rows without keywords
movies = movies[movies['keywords'] != '[]']

In [72]:
# Detect Keywords
def extract_keywords(row):
    data = row['keywords']

    # Sorry, I suck at RegEx right now. The source data doesn't truly use JSON and doesn't encode apostrophes either
    data = data.replace("n's","ns") # Strip apostrophes not for serialization
    data = data.replace("r's","rs") # Strip apostrophes not for serialization
    data = data.replace("ladies'","ladies") # Strip apostrophes not for serialization
    data = data.replace("one's","ones") # Strip apostrophes not for serialization
    data = data.replace("d's","ds") # Strip apostrophes not for serialization
    data = data.replace("t's","ts") # Strip apostrophes not for serialization
    data = data.replace("l's","ls") # Strip apostrophes not for serialization
    data = data.replace("e's","es") # Strip apostrophes not for serialization
    data = data.replace("d'e","de") # Strip apostrophes not for serialization
    data = data.replace("girls' ","girls ") # Strip apostrophes not for serialization
    data = data.replace("boys' ","boys ") # Strip apostrophes not for serialization
    data = data.replace("y's","ys") # Strip apostrophes not for serialization
    data = data.replace("'s ","s") # Strip apostrophes not for serialization
    data = data.replace("ers' ","ers ") # Strip apostrophes not for serialization
    data = data.replace("s' ","s ") # Strip apostrophes not for serialization
    data = data.replace("'n'","n") # Strip apostrophes not for serialization
    data = data.replace("\\xa0","") # Weird data
    data = data.replace("p's", "ps")
    data = data.replace("o'h", "oh")
    data = data.replace("a'u", "au")
    data = data.replace("u'v", "uv")
    data = data.replace("d'a", "da")

    # The source data is only quasi-JSON so I need to change ' to " so it will parse
    data = data.replace("'", "\"")

    # These tags have to do with the labelled (Christmas movies) data
    # The full set of tags results in 14,400 columns and is too large to handle in this experiment
    # Note that a few of the things in this list are surprising and are likely due to the inclusion of Bad Santa and things like that
    labelled_tags = ['Angel','Bars and Restaurants','Burglar','Based on Novel','Chicago','Christmas','Christmas Party','Christmas Eve','Christmas Carol','Department Store','Female Nudity','Ghost','Gift','Holiday','Hoodlum','Hospital','Lawyer','LGBT','London England','Los Angeles','Love at First Sight','Monster','Multiple Storylines','Murder','Musical','New Years Eve','North Pole','Puppet','Road Trip','Santa Claus','Scrooge','Sex','Snow','Victorian England','Winter','Woman Director']
    
    # Because people can define Christmas movies by the things they're NOT, add in other common tags
    # this allows the algorithm to penalize movies with these common tags
    unlablled_tags = ['aftercreditsstinger','alien','based on a true story','biography','blood','corruption','dark comedy','death','detective','dog','drug','dystopia','england','escape','family','father','film noir','friendship','gangster','gay','high school','independent film','investigation','kidnapping','love','marriage','martial arts','money','music','new york','nudity','paris','party','prison','revenge','robbery','romance','secret','remake','sequel','serial killer','small town','sport','spy','stand-up comedy','suicide','suspense','teacher','teenager','vampire','violence','wedding','world war ii']

    try:
        arr = json.loads(data)
        for item in arr:
            name = item['name']
            if name in labelled_tags or name in unlablled_tags:
                row['Tag: ' + name] = 1
    except:
        print('Could not parse: ' + data)

    return row

movies = movies.apply(extract_keywords, axis=1)

# Set our NA values for non-keyword membership to 0
#movies = movies.fillna(0)

# Drop the no longer needed source column
movies = movies.drop('keywords', axis=1)
movies.head()

Unnamed: 0,Is Action,Is Adventure,Is Animation,Is Christmas Movie,Is Comedy,Is Crime,Is Documentary,Is Drama,Is Family,Is Fantasy,...,popularity,release_date,release_month,release_quarter,release_year,runtime,tagline,title,vote_average,vote_count
0,,1.0,,0,,,,,1.0,1.0,...,17.015539,1995-12-15,12.0,4.0,1995.0,104.0,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0
1,,,,0,1.0,,,,,,...,11.7129,1995-12-22,12.0,4.0,1995.0,101.0,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0
2,,,,0,1.0,,,1.0,,,...,3.859495,1995-12-22,12.0,4.0,1995.0,127.0,Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0
3,,,,0,1.0,,,,,,...,8.387519,1995-02-10,2.0,1.0,1995.0,106.0,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0
4,,,,0,1.0,,,,,,...,6.677277,1995-12-15,12.0,4.0,1995.0,127.0,You are cordially invited to the most surprisi...,Sabrina,6.2,141.0


In [73]:
# These pieces of logic were used to analyzed present tags
#stats = movies.describe(include='all')
#stats.to_csv('Stats.csv')

#stats = movies[movies['Is Christmas Movie'] == 1].describe(include='all')
#stats.to_csv('LabelledStats.csv')

In [74]:
# Set the index for clarity
movies_export = movies.set_index('id')

# Export the full data set
movies_export.to_csv('Processed.csv')

In [75]:
# These columns are helpful, but bias the ML algorithms too much given the quantity of tiny movies
movies_export = movies_export.drop('popularity', axis=1)
movies_export = movies_export.drop('vote_count', axis=1)
movies_export = movies_export.drop('vote_average', axis=1)

# Use 0 instead of NA
movies_export = movies_export.fillna(0)

movies_export.head()

Unnamed: 0_level_0,Is Action,Is Adventure,Is Animation,Is Christmas Movie,Is Comedy,Is Crime,Is Documentary,Is Drama,Is Family,Is Fantasy,...,adult,imdb_id,overview,release_date,release_month,release_quarter,release_year,runtime,tagline,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8844,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,1.0,1.0,...,1,tt0113497,When siblings Judy and Peter discover an encha...,1995-12-15,12.0,4.0,1995.0,104.0,Roll the dice and unleash the excitement!,Jumanji
15602,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,...,1,tt0113228,A family wedding reignites the ancient feud be...,1995-12-22,12.0,4.0,1995.0,101.0,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men
31357,0.0,0.0,0.0,0,1.0,0.0,0.0,1.0,0.0,0.0,...,1,tt0114885,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,12.0,4.0,1995.0,127.0,Friends are the people who let you be yourself...,Waiting to Exhale
11862,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,...,1,tt0113041,Just when George Banks has recovered from his ...,1995-02-10,2.0,1.0,1995.0,106.0,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II
11860,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,...,1,tt0114319,An ugly duckling having undergone a remarkable...,1995-12-15,12.0,4.0,1995.0,127.0,You are cordially invited to the most surprisi...,Sabrina


In [76]:
dieHard = movies_export[movies_export['title'] == 'Die Hard']
dieHard = dieHard.drop('imdb_id', axis=1) # Remove the title
dieHard.to_csv('DieHard.csv')

# Make Sure Die Hard doesn't make it into the training or verification data
movies_export = movies_export[movies_export['title'] != 'Die Hard']

# Save our labelled data
labelled = movies_export[movies_export['Is Christmas Movie'] == 1]
labelled.to_csv('Labelled.csv')

# Get our unlabelled data
unlabelled = movies_export[movies_export['Is Christmas Movie'] == 0]
unlabelled.to_csv('Unlabelled.csv')

In [78]:
# We have FAR more movies that are not Christmas movies than movies that are. So let's take every 57th row and make that our sample of the larger dataset
# 57 was chosen to reduce the dataset to a size where 1 out of 5 items would be a Christmas Movie.
sampled = unlabelled.iloc[::57, :]

unioned = pd.concat([labelled, sampled])
unioned = unioned.sort_values('release_date')

train_test = unioned.drop('imdb_id', axis=1)
train_test.to_csv('traintest.csv')