# Test of some sort

In [1]:
# Allows imported code to be updated without the need of running the import statement after each update
%load_ext autoreload
%autoreload 2

In [2]:
import os
import torch
import numpy as np
import pandas as pd

from fastai.torch_core import *
from joblib import load, dump
from sklearn import tree, preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [3]:
data = '../data/'
showings = load(data+'showings_extra_feature.joblib')
showings.replace(np.inf, 0, inplace=True)
showings.sort_values(by='showtime', inplace=True)
showings.reset_index(drop=True, inplace=True)

In [4]:
showings.loc[showings.tmdb_runtime.isna(), 'tmdb_runtime'] = round(showings.tmdb_runtime.mean())

### Functions modified from the fastai library to implement embedding of categorical features for use outside of fastai

In [6]:
def ifnone(a, b):
    """
    'a' if 'a' is not None, otherwise 'b'.
    """
    return b if a is None else a

def emb_sz_rule(n_cat:int)->int: return min(600, round(1.6 * n_cat**0.56))

def def_emb_sz(classes, n, sz_dict=None):
    """
    Pick an embedding size for `n` depending on `classes` if not given in `sz_dict`.
    """
    sz_dict = ifnone(sz_dict, {})
    n_cat = len(classes[n])
    sz = sz_dict.get(n, int(emb_sz_rule(n_cat)))  # rule of thumb
    return n_cat,sz

def get_emb_szs(classes, cat_names, sz_dict=None):
    """
    Return the default embedding sizes suitable for this data or takes the ones in `sz_dict`.
    """
    return [def_emb_sz(classes, n, sz_dict) for n in cat_names]

def trunc_normal_(x:Tensor, mean:float=0., std:float=1.) -> Tensor:
    """
    Truncated normal initialization.
    """
    return x.normal_().fmod_(2).mul_(std).add_(mean)

def embedding(ni:int,nf:int) -> nn.Module:
    """
    Create an embedding layer.
    """
    emb = nn.Embedding(ni, nf)
    with torch.no_grad(): trunc_normal_(emb.weight, std=0.01)
    return emb

## Embedder

Adding the embeddings of the categorical features to the dataframe

In [7]:
def embedder(data:pd.DataFrame, cat_names:list):
    """
    A custom embedder based on the built-in functionality from fastai's TabularModel
    """
    classes = dict()
    values = dict()
    label_encoders = dict()
    for cat in cat_names:
        classes[cat] = data.loc[:, cat].unique().tolist()
        values[cat] = data.loc[:, cat].tolist()
        label_encoders[cat] = {'encoder': preprocessing.LabelEncoder()}
        label_encoders[cat]['targets'] = torch.LongTensor(label_encoders[cat]['encoder'].fit_transform(classes[cat]))
    emb_szs = get_emb_szs(classes, cat_names)
    embeds = nn.ModuleList([embedding(ni, nf) for ni,nf in emb_szs])
    for i, j in enumerate(label_encoders.values()):
        j['embeddings'] = dict()
        for target in j['targets']:
            j['embeddings'][target.item()] = embeds[i](target).tolist()
    
    test = {cat: emb(torch.LongTensor(label_encoders[cat]['encoder'].transform(values[cat]))).detach().numpy() for cat, emb in zip(cat_names, embeds)}
    
    for cat, emb_sz in zip(cat_names, emb_szs):
        sz = emb_sz[1]
        for i in range(sz):
            data[cat+'_emb_'+str(i)] = test[cat][:, i]

In [8]:
# Every categorical feature
cat_names = ['room',
             'location',
             'year',
             'month',
             'week',
             'day',
             'dayofweek',
             'dayofyear',
             'is_month_end',
             'is_month_start',
             'is_quarter_end',
             'is_quarter_start',
             'is_year_end',
             'is_year_start',
             'hour',
             'minute',
             'second',
             'is_holiday',
             'days_since_release',
             'Action',
             'Adventure',
             'Animation',
             'Comedy',
             'Crime',
             'Documentary',
             'Drama',
             'Family',
             'Fantasy',
             'History',
             'Horror',
             'Music',
             'Mystery',
             'Romance',
             'Science Fiction',
             'TV Movie',
             'Thriller',
             'War',
             'Western']

In [9]:
embedder(showings, cat_names)

Wall time: 6.67 s


In [10]:
train = showings.loc[(showings.year >= 2001) & (showings.year <= 2016)]
valid = showings.loc[(showings.year == 2017) | (showings.year == 2018)]
test  = showings.loc[showings.year == 2019]

train.shape, valid.shape, test.shape

((736080, 367), (257940, 367), (132328, 367))

In [11]:
dep_var = 'coverage'
# Every categorical embedding needs to be defined
cont_names = ['room_emb_0', 'room_emb_1', 'room_emb_2', 'room_emb_3', 'room_emb_4', 'room_emb_5', 'room_emb_6', 'room_emb_7', 'room_emb_8', 'room_emb_9', 'room_emb_10', 'room_emb_11', 'room_emb_12', 'room_emb_13', 'room_emb_14', 'room_emb_15', 'room_emb_16', 'room_emb_17', 'room_emb_18', 'room_emb_19', 'room_emb_20', 'room_emb_21', 'room_emb_22', 'room_emb_23', 'room_emb_24', 'room_emb_25', 'room_emb_26', 'room_emb_27', 'room_emb_28', 'room_emb_29', 'room_emb_30', 'room_emb_31', 'room_emb_32', 'room_emb_33', 'room_emb_34', 'room_emb_35', 'room_emb_36', 'room_emb_37', 'room_emb_38', 'room_emb_39', 'room_emb_40', 'room_emb_41', 'room_emb_42', 'room_emb_43', 'room_emb_44', 'room_emb_45', 'room_emb_46', 'room_emb_47',
              'location_emb_0', 'location_emb_1', 'location_emb_2', 'location_emb_3', 'location_emb_4', 'location_emb_5', 'location_emb_6', 'location_emb_7', 'location_emb_8', 'location_emb_9', 'location_emb_10', 'location_emb_11', 'location_emb_12', 'location_emb_13', 'location_emb_14', 'location_emb_15', 'location_emb_16', 'location_emb_17', 'location_emb_18', 'location_emb_19', 'location_emb_20', 'location_emb_21', 'location_emb_22', 'location_emb_23', 'location_emb_24', 'location_emb_25',
              'year_emb_0', 'year_emb_1', 'year_emb_2', 'year_emb_3', 'year_emb_4', 'year_emb_5', 'year_emb_6', 'year_emb_7', 'year_emb_8',
              'month_emb_0', 'month_emb_1', 'month_emb_2', 'month_emb_3', 'month_emb_4', 'month_emb_5',
              'week_emb_0', 'week_emb_1', 'week_emb_2', 'week_emb_3', 'week_emb_4', 'week_emb_5', 'week_emb_6', 'week_emb_7', 'week_emb_8', 'week_emb_9', 'week_emb_10', 'week_emb_11', 'week_emb_12', 'week_emb_13', 'week_emb_14',
              'day_emb_0', 'day_emb_1', 'day_emb_2', 'day_emb_3', 'day_emb_4', 'day_emb_5', 'day_emb_6', 'day_emb_7', 'day_emb_8', 'day_emb_9', 'day_emb_10',
              'dayofweek_emb_0', 'dayofweek_emb_1', 'dayofweek_emb_2', 'dayofweek_emb_3', 'dayofweek_emb_4',
              'dayofyear_emb_0', 'dayofyear_emb_1', 'dayofyear_emb_2', 'dayofyear_emb_3', 'dayofyear_emb_4', 'dayofyear_emb_5', 'dayofyear_emb_6', 'dayofyear_emb_7', 'dayofyear_emb_8', 'dayofyear_emb_9', 'dayofyear_emb_10', 'dayofyear_emb_11', 'dayofyear_emb_12', 'dayofyear_emb_13', 'dayofyear_emb_14', 'dayofyear_emb_15', 'dayofyear_emb_16', 'dayofyear_emb_17', 'dayofyear_emb_18', 'dayofyear_emb_19', 'dayofyear_emb_20', 'dayofyear_emb_21', 'dayofyear_emb_22', 'dayofyear_emb_23', 'dayofyear_emb_24', 'dayofyear_emb_25', 'dayofyear_emb_26', 'dayofyear_emb_27', 'dayofyear_emb_28', 'dayofyear_emb_29', 'dayofyear_emb_30', 'dayofyear_emb_31', 'dayofyear_emb_32', 'dayofyear_emb_33', 'dayofyear_emb_34', 'dayofyear_emb_35', 'dayofyear_emb_36', 'dayofyear_emb_37', 'dayofyear_emb_38', 'dayofyear_emb_39', 'dayofyear_emb_40', 'dayofyear_emb_41', 'dayofyear_emb_42', 'dayofyear_emb_43',
              'is_month_end_emb_0', 'is_month_end_emb_1', 'is_month_start_emb_0', 'is_month_start_emb_1', 'is_quarter_end_emb_0', 'is_quarter_end_emb_1', 'is_quarter_start_emb_0', 'is_quarter_start_emb_1', 'is_year_end_emb_0', 'is_year_end_emb_1', 'is_year_start_emb_0', 'is_year_start_emb_1',
              'hour_emb_0', 'hour_emb_1', 'hour_emb_2', 'hour_emb_3', 'hour_emb_4', 'hour_emb_5', 'hour_emb_6', 'hour_emb_7', 'hour_emb_8',
              'minute_emb_0', 'minute_emb_1', 'minute_emb_2', 'minute_emb_3', 'minute_emb_4', 'minute_emb_5', 'minute_emb_6', 'minute_emb_7', 'minute_emb_8', 'minute_emb_9', 'minute_emb_10', 'minute_emb_11', 'minute_emb_12', 'minute_emb_13', 'minute_emb_14',
              'is_holiday_emb_0', 'is_holiday_emb_1',
              'days_since_release_emb_0', 'days_since_release_emb_1', 'days_since_release_emb_2', 'days_since_release_emb_3', 'days_since_release_emb_4', 'days_since_release_emb_5', 'days_since_release_emb_6', 'days_since_release_emb_7', 'days_since_release_emb_8', 'days_since_release_emb_9', 'days_since_release_emb_10', 'days_since_release_emb_11', 'days_since_release_emb_12', 'days_since_release_emb_13', 'days_since_release_emb_14', 'days_since_release_emb_15', 'days_since_release_emb_16', 'days_since_release_emb_17', 'days_since_release_emb_18', 'days_since_release_emb_19', 'days_since_release_emb_20', 'days_since_release_emb_21', 'days_since_release_emb_22', 'days_since_release_emb_23', 'days_since_release_emb_24', 'days_since_release_emb_25', 'days_since_release_emb_26',
              'Action_emb_0', 'Action_emb_1',
              'Adventure_emb_0', 'Adventure_emb_1',
              'Animation_emb_0', 'Animation_emb_1',
              'Comedy_emb_0', 'Comedy_emb_1',
              'Crime_emb_0', 'Crime_emb_1',
              'Documentary_emb_0', 'Documentary_emb_1',
              'Drama_emb_0', 'Drama_emb_1',
              'Family_emb_0', 'Family_emb_1',
              'Fantasy_emb_0', 'Fantasy_emb_1',
              'History_emb_0', 'History_emb_1',
              'Horror_emb_0', 'Horror_emb_1',
              'Music_emb_0', 'Music_emb_1',
              'Mystery_emb_0', 'Mystery_emb_1',
              'Romance_emb_0', 'Romance_emb_1',
              'Science Fiction_emb_0', 'Science Fiction_emb_1',
              'TV Movie_emb_0', 'TV Movie_emb_1',
              'Thriller_emb_0', 'Thriller_emb_1',
              'War_emb_0', 'War_emb_1',
              'Western_emb_0', 'Western_emb_1',
              'seats',
              'tmdb_popularity',
              'tmdb_vote_average',
              'tmdb_vote_count',
              'tmdb_runtime',
              'tmdb_budget',
              'weekday_cos',
              'weekday_sin',
              'day_month_cos',
              'day_month_sin',
              'month_year_cos',
              'month_year_sin',
              'day_year_cos',
              'day_year_sin',
              'hour_cos',
              'hour_sin',
              'clock_cos',
              'clock_sin',
              'min_cos',
              'min_sin',
              'sec_cos',
              'sec_sin']

# Random Forest Regression

In [12]:
%%time
forest = RandomForestRegressor(n_estimators=10,
                               max_depth=2,
                               random_state=0,
                               criterion='mae',
                               max_features='sqrt')
train_X, train_y = train.loc[:, cont_names], train.coverage
forest.fit(train_X, train_y)

Wall time: 13h 20min 25s


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=2, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [13]:
valid_X, valid_y = valid.loc[:, cont_names], valid.coverage
test_X,  test_y  = test.loc[:, cont_names],  test.coverage

train_preds = forest.predict(train_X)
valid_preds = forest.predict(valid_X)
test_preds  = forest.predict(test_X)

In [14]:
(
    round(mean_absolute_error(train_y, train_preds), 4),
    round(mean_absolute_error(valid_y, valid_preds), 4),
    round(mean_absolute_error(test_y,  test_preds),  4)
)

(0.1526, 0.14, 0.134)

In [15]:
dump(forest, 'random_forest_mini.joblib')

['random_forest_mini.joblib']