# 04-001 : Features - Random Forest Baseline - [0.664] :: 001

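Attempt to replicate the results from `001_random-forest-baseline-0-664.ipynb`. This notebook engineers elapsed-time, count and numeric summary features from the raw event log, then runs a Keras Tuner search over a simple dense model.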

In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported modules
# before each cell runs, so edits to the local `competition` package take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import logging
from typing import Any, Dict, List, Tuple

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mlflow
from tqdm.auto import tqdm

import tensorflow_addons as tfa
import keras as k
from keras import optimizers
import keras_tuner
import keras_tuner as kt

from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.metrics import f1_score

from competition import data_preparation as dp
from competition import feature_engineering as fe
from competition import model_data as md
from competition import source_data as sd
import competition.models.simple_dense as sd_model
from competition.models.heatmap_covnet import HeatmapCovnetModel

from competition.model_training import mprint, mflush, mclear
from competition.predict import PredictionBase, Baseline, HeatmapPredictor

2023-04-21 14:09:02.000748: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Configure Logging

In [3]:
# Configure the root logger: INFO and above, written to stdout (so records appear
# in the cell output) as "<timestamp> <LEVEL padded to 8> <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[logging.StreamHandler(sys.stdout)],
)

# emit a first record to confirm the configuration is active
logging.info("Started")

2023-04-21 14:09:03 INFO     Started


## Data Collection

In [4]:
# Read the gzip-compressed training CSV through the project's `sd.read_csv`
# helper, using the column dtypes declared in `sd.source_dtype`.
df_source = sd.read_csv(
    '../data/train.csv.gz',
    compression='gzip',
    dtype=sd.source_dtype,
)

(13174211, 20)


Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,,,,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4


In [5]:
# Read the training labels through the project's `sd.read_csv` helper
# (uncompressed CSV, no explicit dtypes passed).
df_source_labels = sd.read_csv('../data/train_labels.csv')

(212022, 2)


Unnamed: 0,session_id,correct
0,20090312431273200_q1,1
1,20090312433251036_q1,0
2,20090314121766812_q1,1


## Data Preparation & Cleaning

In [6]:
# Run the project's preparation step on the raw event frame.
# NOTE: df_source is overwritten, so re-running this cell feeds the already
# prepared (and filtered) frame back into dp.prepare_main_dataset.
df_source = dp.prepare_main_dataset(df_source)

# Collect the sessions flagged as problematic; the same collection is reused
# when filtering the label frame.
problem_sessions = dp.find_problem_sessions(df_source)

# keep only the rows whose session_id is NOT among the flagged sessions
df_source = df_source.loc[
    ~df_source['session_id'].isin(problem_sessions)
]

In [7]:
# Run the project's preparation step on the raw label frame.
# NOTE(review): the filter below matches on a `session_id` column of the
# prepared labels - presumably prepare_label_dataset normalises the raw
# "<session>_q<n>" ids; confirm in data_preparation.
df_source_labels = dp.prepare_label_dataset(df_source_labels)

# drop the labels of every session that was flagged in `problem_sessions`
df_source_labels = df_source_labels.loc[
    ~df_source_labels['session_id'].isin(problem_sessions)
]

## Feature Engineering

In [8]:
# Build the starting feature frame from the cleaned events and labels; the
# fe.add_* calls that follow each take this frame and return an extended one.
df_features = fe.create_initial_features(df_source, df_source_labels)

In [9]:
# Extend the feature frame with the elapsed-time features computed from the events.
# NOTE: df_features is reassigned from itself, so re-running this cell applies
# the helper to a frame that already went through it.
df_features = fe.add_elapsed_time_features(features=df_features, X=df_source)

In [10]:
# Extend the feature frame with the total-count features computed from the events.
df_features = fe.add_count_total_features(features=df_features, X=df_source)

In [11]:
# Extend the feature frame with the unique-count features computed from the events
# (presumably the source of the `count_unique_*` columns selected later - confirm).
df_features = fe.add_count_unique_features(features=df_features, X=df_source)

In [12]:
# Extend the feature frame with the numeric features computed from the events
# (presumably the source of the `*_mean` / `*_std` columns selected later - confirm).
df_features = fe.add_numeric_features(features=df_features, X=df_source)

In [13]:
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     display(pd.DataFrame(df_features.dtypes))

## Data Selection

In [14]:
# seed handed to md.select_sessions for the train/validation/test split
random_state = 51

In [15]:
# Partition the sessions into train / validation / test using the fixed seed.
# test_size=0.60 and train_size=0.75 are passed straight to md.select_sessions;
# presumably 60% of sessions go to test and 75% of the remainder to train
# (the rest being validation) - confirm against the helper.
train, val, test = md.select_sessions(
    y=df_source_labels,
    random_state=random_state,
    test_size=0.60,
    train_size=0.75,
)

Train: 3495
Validation: 1165
Test: 6988


## Model Training

### Create Datasets

In [16]:
# Names of the feature columns fed to the model, in a fixed order:
#   1. `count_unique_<column>` for the five categorical event columns,
#   2. `<column>_mean` for the eight numeric event columns,
#   3. `<column>_std` for the same eight numeric columns.
feature_list = [
    f'count_unique_{column}'
    for column in ('event_name', 'name', 'fqid', 'room_fqid', 'text_fqid')
] + [
    f'{column}_{statistic}'
    for statistic in ('mean', 'std')
    for column in (
        'elapsed_time',
        'level',
        'page',
        'room_coor_x',
        'room_coor_y',
        'screen_coor_x',
        'screen_coor_y',
        'hover_duration',
    )
]

In [17]:
# Replace every missing value in the feature frame with 0. This applies to all
# columns of df_features, not only those named in feature_list.
df_features = df_features.fillna(value=0)

In [18]:
# Assemble the model dataset for the three session splits from the selected
# feature columns and the labels. The question is included as an input
# (include_question=True) without being expanded (expand_question=False);
# the exact encoding is decided inside md.get_feature_dataset.
simple_model_dataset = md.get_feature_dataset(
    features=df_features,
    y=df_source_labels,
    feature_list=feature_list,
    train=train,
    val=val,
    test=test,
    include_question=True,
    expand_question=False,
)

2023-04-21 14:10:22 INFO     -- Creating the train dataset
2023-04-21 14:10:22 INFO     Creating the dataset for all level groups


  0%|          | 0/62910 [00:00<?, ?it/s]

2023-04-21 14:11:45 INFO     -- Creating the val dataset
2023-04-21 14:11:45 INFO     Creating the dataset for all level groups


  0%|          | 0/20970 [00:00<?, ?it/s]

2023-04-21 14:12:13 INFO     -- Creating the test dataset
2023-04-21 14:12:13 INFO     Creating the dataset for all level groups


  0%|          | 0/125784 [00:00<?, ?it/s]

In [19]:
# Convert the labels to categorical form for the categorical cross-entropy loss.
# NOTE(review): the previous comment said "multi-label"; the 2-wide `y` read in
# the next cell and the 'categorical_crossentropy' loss suggest one-hot encoded
# binary labels instead - confirm in md.labels_to_categorical.
cat_features_dataset = md.labels_to_categorical(simple_model_dataset)

In [20]:
# Pull the training inputs and targets out of the categorical dataset.
# NOTE(review): an earlier comment called this the "question only" dataset, but
# it derives from simple_model_dataset, which was built from feature_list with
# include_question=True.
input_data = cat_features_dataset['train']['X']
output_data = cat_features_dataset['train']['y']

# the second axis of each array gives the model's input width / output width
features_dataset_shape = input_data.shape[1]
output_shape = output_data.shape[1]

print('features_dataset_shape:', features_dataset_shape)
print('output_shape', output_shape)

features_dataset_shape: 39
output_shape 2


### Training

In [21]:
# Make "04_001" the active MLflow experiment (MLflow creates it if it does not
# exist yet); the returned Experiment object is shown as the cell output.
mlflow.set_experiment("04_001")

<Experiment: artifact_location='file:///workspaces/dsm150-2022-oct/cw02/phase_04/mlruns/233069815971708149', creation_time=1682086073293, experiment_id='233069815971708149', last_update_time=1682086073293, lifecycle_stage='active', name='04_001', tags={}>

In [22]:
# End the currently active MLflow run, if any - presumably to clear a run left
# open by a previously interrupted tuning cell before starting a new search.
mlflow.end_run()

In [23]:
# declare the hyperparameter search space used by the tuning run
def define_tune_parameters(hp):
    """Register the tunable hyperparameters on ``hp``.

    The names registered here are the same strings that the tuning cell passes
    to ``sd_model.tune_model`` (``dense_layer_count``, ``dense_units``, ...).

    Args:
        hp: Object exposing ``Int``, ``Float`` and ``Choice`` registration
            methods - presumably a ``keras_tuner.HyperParameters`` instance;
            confirm against ``sd_model.tune_model``.

    Returns:
        None. The values returned by the registration calls are discarded.
    """
    # (registration method, positional args, keyword args), in registration order
    search_space = (
        ('Int', ('dense_layer_count',), {'min_value': 1, 'max_value': 6, 'step': 1}),
        ('Int', ('dense_units',), {'min_value': 512, 'max_value': 1700, 'step': 32}),
        ('Choice', ('dense_activation',), {'values': ['relu', 'tanh', 'LeakyReLU']}),
        ('Float', ('dense_l1_regularization',), {'min_value': 0.0, 'max_value': 0.0005, 'step': 0.00001}),
        ('Float', ('dense_l2_regularization',), {'min_value': 0.0, 'max_value': 0.001, 'step': 0.0001}),
        ('Float', ('dense_dropout',), {'min_value': 0.005, 'max_value': 0.1, 'step': 0.005}),
        ('Choice', ('learning_rate', [1e-2, 1e-3, 1e-4, 1e-5, 1e-6]), {}),
    )
    for method_name, positional, keywords in search_space:
        getattr(hp, method_name)(*positional, **keywords)

In [24]:
# Search for the best model: every (batch size, optimizer class) pair launches
# its own tuning run through sd_model.tune_model, using the search space from
# define_tune_parameters. Batch size varies slowest, optimizer fastest.
for batch_size, optimizer in (
    (size, optimizer_class)
    for size in [500, 1000, 2000, 3000, 4000]
    for optimizer_class in [optimizers.Adam, optimizers.RMSprop]
):
    sd_model.tune_model(
        # search space, data and trial budget
        define_tune_parameters=define_tune_parameters,
        dataset=cat_features_dataset,
        max_trials=50,
        # model input / output widths
        input_shape=features_dataset_shape,
        output_shape=output_shape,
        # the strings below match the hyperparameter names registered in
        # define_tune_parameters (presumably resolved per trial by tune_model)
        dense_layer_count='dense_layer_count',
        dense_units='dense_units',
        dense_activation='dense_activation',
        dense_l1_regularization='dense_l1_regularization',
        dense_l2_regularization='dense_l2_regularization',
        dense_dropout='dense_dropout',
        # training settings; batch size and optimizer come from the grid
        train_epochs=2000,
        train_batch_size=batch_size,
        train_optimizer=optimizer,
        train_learning_rate='learning_rate',
        train_loss='categorical_crossentropy',
        # macro-averaged F1 over the 2 classes at a 0.5 threshold, named
        # 'f1_score' so its validation value is reported as 'val_f1_score'
        train_metrics=[tfa.metrics.F1Score(name='f1_score', num_classes=2, threshold=0.5, average='macro')],
        train_class_weight=None,
        # tuner: maximise validation F1 with Bayesian optimisation
        tune_objective='val_f1_score',
        tune_direction='max',
        tuner_type=kt.tuners.BayesianOptimization,
        tune_patience=10,
    )

Trial 2 Complete [00h 00m 45s]
val_f1_score: 0.427545964717865

Best val_f1_score So Far: 0.427545964717865
Total elapsed time: 00h 01m 13s
2023-04-21 14:16:10 INFO     on_trial_begin

Search: Running Trial #3

Value             |Best Value So Far |Hyperparameter
5                 |3                 |dense_layer_count
992               |1216              |dense_units
relu              |LeakyReLU         |dense_activation
5e-05             |0.00035           |dense_l1_regularization
0.0006            |0.0003            |dense_l2_regularization
0.03              |0.095             |dense_dropout
0.0001            |1e-06             |learning_rate

2023-04-21 14:16:10 INFO     Creating simple dense model
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000

KeyboardInterrupt: 