## Feature Extraction

In [None]:
import numpy as np
import pandas as pd

In [None]:
def get_item_numeric_features(df):
    """Return the item-side numeric columns of *df* as a 2-D NumPy array.

    The columns are taken in a fixed order (rating, members, aired-from,
    aired-to), so column ``i`` of the result always refers to the same
    feature. A missing column raises ``KeyError`` from pandas.
    """
    item_columns = [
        'all_rating_min_max',
        'members_min_max',
        'aired_from_min_max',
        'aired_to_min_max',
    ]
    return df[item_columns].to_numpy()

def get_user_numeric_features(df):
    """Return the user-side numeric columns of *df* as a 2-D NumPy array.

    The columns are taken in a fixed order (rating average, rating std,
    aired-from average, aired-to average). A missing column raises
    ``KeyError`` from pandas.
    """
    user_columns = [
        'user_rating_ave_min_max',
        'user_rating_std_min_max',
        'user_aired_from_ave_min_max',
        'user_aired_to_ave_min_max',
    ]
    return df[user_columns].to_numpy()

In [None]:
def get_multihot_feature(df, feat_name):
    """Stack a column of per-row vectors into a single 2-D array.

    Args:
        df: DataFrame whose ``feat_name`` column holds one equal-length
            sequence (list or 1-D array) per row.
        feat_name: Name of the column to extract.

    Returns:
        ``numpy.ndarray`` with one row per DataFrame row and one column
        per vector element.

    Raises:
        KeyError: If ``feat_name`` is not a column of ``df``.
        ValueError: If ``df`` has no rows or the per-row vectors do not
            all have the same shape.
    """
    # np.stack joins all row vectors in one call and derives the result
    # dtype from every row, instead of calling a Python lambda per row
    # through np.apply_along_axis and taking the dtype from the first row.
    return np.stack(df[feat_name].tolist())

In [None]:
def get_label(df):
    """Return the ``label`` column of *df* as a 2-D array of shape (rows, 1)."""
    # The list selector keeps the result two-dimensional (one column).
    return df[['label']].to_numpy()

In [None]:
def get_all_features(df):
    """Extract the four model inputs from *df*.

    Returns:
        A 4-tuple ``(item genres, user liked genres, item numeric
        features, user numeric features)``, in that order.
    """
    item_genres = get_multihot_feature(df, 'genres_multihot')
    user_genres = get_multihot_feature(df, 'user_liked_genres_multihot')
    item_numeric = get_item_numeric_features(df)
    user_numeric = get_user_numeric_features(df)
    return item_genres, user_genres, item_numeric, user_numeric

## Load Parquet Dataset

In [None]:
import os

In [None]:
def data_files(root_dir='../data/dnn_feat_eng'):
    """Collect the paths of all ``.parquet`` files under *root_dir*.

    Args:
        root_dir: Directory to search recursively. Defaults to the
            feature-engineering output directory used by this notebook.

    Returns:
        Sorted list of file paths, each prefixed with ``root_dir``. The
        list is empty when the directory does not exist or holds no
        parquet files.
    """
    # os.walk yields entries in filesystem-dependent order; sorting keeps
    # the list (and any slice taken from it) reproducible across machines.
    return sorted(
        os.path.join(parent, name)
        for parent, _dirs, names in os.walk(root_dir)
        for name in names
        if name.endswith('.parquet')
    )

## Define Model

In [None]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorboard.plugins.hparams import api as hp

### HParam

In [None]:
# Search space for the sweep: number of hidden layers, units per hidden
# layer, and Adam learning rate.
HP_LAYERS = hp.HParam("layers", hp.IntInterval(2, 3))
HP_LAYER_SIZE = hp.HParam("layer_size", hp.Discrete([64, 128, 256]))
HP_LEARN_RATE = hp.HParam("learn_rate", hp.Discrete([0.001, 0.003, 0.01]))

HPARAMS = [
    HP_LAYERS,
    HP_LAYER_SIZE,
    HP_LEARN_RATE
]

# Metrics shown in the HParams dashboard. Each tag must match a scalar
# summary tag that the Keras TensorBoard callback writes under the given
# run sub-directory (``group``).
METRICS = [
    hp.Metric(
        "batch_loss",
        group="train",
        display_name="loss (train)",
    ),
    hp.Metric(
        # The callback logs validation loss as "epoch_loss" in the
        # "validation" sub-directory; a plain "loss" tag never matches,
        # which leaves this dashboard column empty.
        "epoch_loss",
        group="validation",
        display_name="loss (val)",
    ),
]

In [None]:
def build_model(x1_shape, x2_shape, x3_shape, x4_shape, hparams):
    """Build and compile the four-input binary classifier.

    Args:
        x1_shape: Width of the first input vector.
        x2_shape: Width of the second input vector.
        x3_shape: Width of the third input vector.
        x4_shape: Width of the fourth input vector.
        hparams: Mapping keyed by ``HP_LAYERS``, ``HP_LAYER_SIZE`` and
            ``HP_LEARN_RATE``.

    Returns:
        A compiled ``keras.models.Model`` that takes the four inputs as a
        list and emits a single sigmoid output.
    """
    inputs = [
        keras.layers.Input(shape=(width,))
        for width in (x1_shape, x2_shape, x3_shape, x4_shape)
    ]
    x1_in, x2_in, x3_in, x4_in = inputs

    # Project the first two inputs down to 10 units each before merging.
    x1_compact = keras.layers.Dense(10)(x1_in)
    x2_compact = keras.layers.Dense(10)(x2_in)

    # Merge the compact projections with the third and fourth inputs.
    hidden = keras.layers.concatenate([x1_compact, x2_compact, x3_in, x4_in])

    # Stack of identically sized ReLU layers.
    for _ in range(hparams[HP_LAYERS]):
        hidden = keras.layers.Dense(
            hparams[HP_LAYER_SIZE], activation='relu'
        )(hidden)

    prediction = keras.layers.Dense(1, activation='sigmoid')(hidden)

    model = keras.models.Model(inputs=inputs, outputs=prediction)
    model.compile(
        loss='binary_crossentropy',
        optimizer=keras.optimizers.Adam(learning_rate=hparams[HP_LEARN_RATE]),
        metrics=['accuracy'],
    )
    return model


## Load Data and Train

In [None]:
# Accumulators for held-out test arrays (one list per model input plus
# labels); each name gets its own empty list. They are stacked with
# np.vstack in the "Test" section.
test_x1s, test_x2s, test_x3s, test_x4s, test_ys = ([] for _ in range(5))

In [None]:
def data_files(root_dir='../data/dnn_feat_eng'):
    """Collect the paths of all ``.parquet`` files under *root_dir*.

    Args:
        root_dir: Directory to search recursively. Defaults to the
            feature-engineering output directory used by this notebook.

    Returns:
        Sorted list of file paths, each prefixed with ``root_dir``. The
        list is empty when the directory does not exist or holds no
        parquet files.
    """
    # os.walk yields entries in filesystem-dependent order; sorting keeps
    # the list (and any slice taken from it) reproducible across machines.
    return sorted(
        os.path.join(parent, name)
        for parent, _dirs, names in os.walk(root_dir)
        for name in names
        if name.endswith('.parquet')
    )

# Parquet shards consumed by run_model.
filenames = data_files()

In [None]:
def run_model(model_id, hparams):
    """Train one model for a hyper-parameter combination and log the run.

    Args:
        model_id: Identifier used as the prefix of the run's log directory.
        hparams: Mapping from ``hp.HParam`` objects to the chosen values;
            passed to ``build_model`` and recorded for the HParams
            dashboard.

    Returns:
        The trained Keras model, so callers can evaluate or save it.
        Callers that ignore the return value are unaffected.
    """
    # build model
    # The two 4s match the four numeric columns per side; 43 is presumably
    # the width of the multi-hot genre vectors -- TODO confirm against the
    # data.
    model = build_model(43, 43, 4, 4, hparams)
    print(f"model id: {model_id}:")
    print({h.name: hparams[h] for h in hparams})

    # config hparam logs: one directory per run, named
    # "<model_id>_<hparam>-<value>_..."
    log_filename = "_".join(
        [str(model_id)] + [f"{h.name}-{hparams[h]}" for h in hparams]
    )

    log_dir = os.path.join("hparams", log_filename)
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=log_dir,
        update_freq=10,
        profile_batch=0
    )
    hparams_callback = hp.KerasCallback(log_dir, hparams)

    # train model (only the first parquet file is used for the sweep)
    for filename in filenames[:1]:
        df = pd.read_parquet(filename)

        # The whole file is passed to fit(); no separate test split is
        # made here. validation_split below holds out part of it for
        # validation.
        train_df = df

        # get features
        train_x1, train_x2, train_x3, train_x4 = get_all_features(train_df)

        # get label
        train_y = get_label(train_df)

        print('training on new dataset')

        model.fit(
            [train_x1, train_x2, train_x3, train_x4],
            train_y,
            validation_split=0.2,
            batch_size=16,
            epochs=4,
            callbacks=[tensorboard_callback, hparams_callback]
        )

    return model

In [None]:
def test_params():
    """Run a full grid search over the declared hyper-parameter domains.

    Writes the experiment-level HParams configuration to the ``hparams``
    log directory, then trains one model per combination of layer count,
    layer size and learning rate. Runs are numbered from 0 in iteration
    order.
    """
    with tf.summary.create_file_writer('hparams').as_default():
        hp.hparams_config(hparams=HPARAMS, metrics=METRICS)

    layer_counts = range(
        HP_LAYERS.domain.min_value, HP_LAYERS.domain.max_value + 1
    )
    grid = [
        {HP_LAYERS: n_layers, HP_LAYER_SIZE: width, HP_LEARN_RATE: lr}
        for n_layers in layer_counts
        for width in HP_LAYER_SIZE.domain.values
        for lr in HP_LEARN_RATE.domain.values
    ]

    for model_id, hparams in enumerate(grid):
        run_model(model_id, hparams)


In [None]:
# Delete the logs of any previous sweep, then run the grid search, so the
# `hparams` directory only contains runs from this execution.
%rm -rf hparams
test_params()

In [None]:
# Load the TensorBoard notebook extension and open it on the sweep logs
# written under `hparams`.
%load_ext tensorboard
%tensorboard --logdir hparams

## Test

In [None]:
# Fail with an actionable message, rather than NumPy's generic
# "need at least one array" error, when no test batches were collected.
if not (test_x1s and test_x2s and test_x3s and test_x4s and test_ys):
    raise ValueError(
        "test_x1s..test_ys are empty: collect held-out test batches "
        "before stacking them"
    )

# Concatenate the per-batch arrays row-wise into one array per model
# input, plus one for the labels.
test_x1 = np.vstack(test_x1s)
test_x2 = np.vstack(test_x2s)
test_x3 = np.vstack(test_x3s)
test_x4 = np.vstack(test_x4s)
test_y = np.vstack(test_ys)

In [None]:
# evaluate() yields the loss followed by the compiled metrics; the model
# is compiled with a single 'accuracy' metric, hence two values.
test_loss, test_accuracy = model.evaluate(
    x=[test_x1, test_x2, test_x3, test_x4],
    y=test_y,
)

print('\n\nTest Loss {}, Test Accuracy {}'.format(test_loss, test_accuracy))

In [19]:
# Hyper-parameter values used for the model that is built and saved below.
hparams = {}
hparams[HP_LAYERS] = 2
hparams[HP_LAYER_SIZE] = 128
hparams[HP_LEARN_RATE] = 0.003

In [20]:
# Build a fresh model with the selected hyper-parameters, using the same
# input widths as run_model (43, 43, 4, 4).
model = build_model(43, 43, 4, 4, hparams)

In [21]:
# Persist the model to the `mlp_model` directory.
# NOTE(review): `model` comes straight from build_model with no fit() call
# in the visible cells, so this appears to save untrained weights --
# confirm whether a training step is missing.
model.save('mlp_model')

INFO:tensorflow:Assets written to: mlp_model/assets
