In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import datetime as dt
import holidays

from joblib import load, dump
from fastai.tabular import transform
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
data = '../data/'

In [4]:
showings = load(data+'showings_dropped_NaN.joblib')
showings.replace(np.inf, 0, inplace=True)
showings.reset_index(drop=True, inplace=True)
genres = load(data+'genres.joblib')

# Extract features realted to the showtime

Features extracted:
- day of week (mon-sun as 0-6)
- month of year (jan-des as 0-11)
- year
- hour of day (0-23)
- minutes after hour (0-59)
- days since release
  - Because of inaccuracies and anomalies I have set some restriction to this metric. If the days since release is more than 150 days, roughly half a year, than it is set to 150 days. If the showtime is set before the release date then I view it as a preshowing and it is set to -1 no matter the actual difference.

In [7]:
showings.showtime.head()

0   2017-12-14 11:30:00
1   2018-07-19 20:00:00
2   2019-12-25 16:30:00
3   2018-07-21 19:45:00
4   2019-12-14 16:00:00
Name: showtime, dtype: datetime64[ns]

In [8]:
transform.add_datepart(showings, 'showtime', drop=False, time=True)
transform.add_cyclic_datepart(showings, 'showtime', drop=False, time=True)
showtimes = showings.showtime.tolist()
norway_holidays = holidays.Norway(include_sundays=False)
showings_holidays = [t in norway_holidays for t in showtimes]
showings['is_holiday'] = showings_holidays

In [9]:
times   = showings.showtime.tolist()
release = showings.tmdb_release_date.tolist()

In [10]:
%%time
since_release = list()
for t, r in zip(times, release):
    if r != '':
        days_since_release = (t - dt.datetime.strptime(r, '%Y-%m-%d')).days
    else:
        days_since_release = 100
    if days_since_release < 0: days_since_release = -1
    if days_since_release > 150: days_since_release = 150
    since_release.append(days_since_release)

Wall time: 33.4 s


In [11]:
showings['days_since_release'] = since_release

# One-Hot encoding of the genre feature

In [8]:
%%time

genre_list = showings.tmdb_genre_ids.tolist()

one_hot_genres = {i: list() for _, i in genres.items()}

for i, j in enumerate(genre_list):
    i += 1
    for k in j:
        one_hot_genres[genres[k]].append(True)  # True if the movie belonged to the genre
    for _, n in one_hot_genres.items():
        if len(n) != i: n.append(False)         # False if the movie did not belong to the genre

for i, j in one_hot_genres.items():
    showings[i] = j

Wall time: 6.07 s


# This was discarded for use in the final model, but the code is kept for future use if needed


# Getting TF-IDF score on text based fields

- title
- stripped_title
- tmdb_original_title
- tmdb_overview

In [15]:
full_titles     = showings.full_title.tolist()
stripped_titles = showings.stripped_title.tolist()
tmdb_titles     = showings.tmdb_original_title.tolist()
overviews       = showings.tmdb_overview.tolist()

## TF-IDF of overviews

In [16]:
overveiw_vectorizer = TfidfVectorizer()
tfidf_overview = overveiw_vectorizer.fit_transform(overviews)

sum_tfidf_overview = np.asarray(tfidf_overview.sum(axis=1).T)[0]

tfidf_overview.shape, sum_tfidf_overview.shape

((1134202, 23242), (1134202,))

## TF-IDF of full_titles

In [17]:
full_titles_vectorizer = TfidfVectorizer()
tfidf_full_titles = full_titles_vectorizer.fit_transform(full_titles)

sum_tfidf_full_titles = np.asarray(tfidf_full_titles.sum(axis=1).T)[0]

tfidf_full_titles.shape, sum_tfidf_full_titles.shape

((1134202, 6025), (1134202,))

## TF-IDF of |stripped_titles

In [18]:
stripped_titles_vectorizer = TfidfVectorizer()
tfidf_stripped_titles = stripped_titles_vectorizer.fit_transform(stripped_titles)

sum_tfidf_stripped_titles = np.asarray(tfidf_stripped_titles.sum(axis=1).T)[0]

tfidf_stripped_titles.shape, sum_tfidf_stripped_titles.shape

((1134202, 5873), (1134202,))

## TF-IDF of tmdb_titles

In [19]:
tmdb_titles_vectorizer = TfidfVectorizer()
tfidf_tmdb_titles = tmdb_titles_vectorizer.fit_transform(tmdb_titles)

sum_tfidf_tmdb_titles = np.asarray(tfidf_tmdb_titles.sum(axis=1).T)[0]

tfidf_tmdb_titles.shape, sum_tfidf_tmdb_titles.shape

((1134202, 6654), (1134202,))

## Setting the features into the dataset

In [20]:
showings['tfidf_overview'] = sum_tfidf_overview
showings['tfidf_full_title'] = sum_tfidf_full_titles
showings['tfidf_stripped_title'] = sum_tfidf_stripped_titles
showings['tfidf_tmdb_title'] = sum_tfidf_tmdb_titles

In [10]:
dump(showings, data+'showings_extra_feature.joblib')

['../data/showings_extra_feature.joblib']

In [4]:
showings = load(data+'showings_extra_feature.joblib')