# 03-203 : Question 10 Model

Train a model specifically for question 10, and then use it in conjunction with `simple_monkey` (and the question 5 model trained earlier) to predict the entire dataset and get a classification report.

In [1]:
%load_ext autoreload
%autoreload 2

In [94]:
import sys
import logging
from typing import Any, Dict, List, Tuple

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mlflow
from tqdm.auto import tqdm

import tensorflow_addons as tfa
import keras as k
from keras import optimizers
import keras_tuner
import keras_tuner as kt

from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.metrics import f1_score

from competition import data_preparation as dp
from competition import feature_engineering as fe
from competition import model_data as md
from competition import source_data as sd
import competition.models.simple_dense as sd_model
from competition.models.heatmap_covnet import HeatmapCovnetModel

from competition.model_training import mprint, mflush, mclear
from competition.predict import PredictionBase, Baseline

### Configure Logging

In [3]:
# send INFO-and-above records to stdout, prefixed with a timestamp and level
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[logging.StreamHandler(sys.stdout)],
)

logging.info("Started")

2023-04-10 07:07:39 INFO     Started


## Data Collection

In [4]:
# read the gzip-compressed training events with the project's column dtypes
df_source = sd.read_csv(
    '../data/train.csv.gz',
    dtype=sd.source_dtype,
    compression='gzip',
)

(13174211, 20)


Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,,,,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4


In [5]:
# load the source training labels; as the preview below shows, each row holds
# a '<session>_q<question>' identifier and its 'correct' flag
df_source_labels = sd.read_csv('../data/train_labels.csv')

(212022, 2)


Unnamed: 0,session_id,correct
0,20090312431273200_q1,1
1,20090312433251036_q1,0
2,20090314121766812_q1,1


## Data Preparation & Cleaning

In [6]:
# clean the raw events; the elapsed-time clip bounds are passed explicitly
df_source = dp.prepare_main_dataset(
    df_source,
    elapsed_time_min_clip=0,
    elapsed_time_max_clip=3691298,
)

# find the problematic sessions and drop all of their events
problem_sessions = dp.find_problem_sessions(df_source)
df_source = df_source.loc[~df_source['session_id'].isin(problem_sessions)]

In [7]:
# bring the labels into the project's prepared layout
df_source_labels = dp.prepare_label_dataset(df_source_labels)

# drop the labels of the problematic sessions removed from the events above
df_source_labels = df_source_labels.loc[
    ~df_source_labels['session_id'].isin(problem_sessions)
]

## Feature Engineering

Instead of duplicating the feature engineering workflow, we will use the same feature dataset created in notebook `03-123`.

In [None]:
# load the feature dataset produced by notebook 03-123
# NOTE(review): the file has a .parquet extension but is read with
# pd.read_pickle (gzip) — confirm the on-disk format. The path also lacks the
# '../' prefix used by every other data path in this notebook.
df_features = pd.read_pickle('data/features/03-123.parquet', compression='gzip')

# preview the first rows without truncating the column list
with pd.option_context('display.max_columns', None):
    display(df_features.head(3))

## Data Selection

We first combine the features with the labels because, unlike in all previous notebooks, the data selection is now based on the question number.

In [None]:
# attach the engineered features to every label row; labels are the left side
# of the join, so label rows without matching features are kept
df_combined = pd.merge(
    df_source_labels,
    df_features,
    how='left',
    on=['session_id', 'level_group'])

print(df_combined.shape)
with pd.option_context('display.max_columns', None):
    display(df_combined.head(6))

Use the `md.get_question_dataset` function, which combines the datasets like we just did above and then returns the dataset for the specified question.

In [None]:
# build the feature and label datasets for question 10
df_question_features, df_question_labels = md.get_question_dataset(
    features=df_features,
    labels=df_source_labels,
    question_num=10,
)

Split the data into training and test sets.

In [None]:
# seed passed to the session split below
random_state = 51

In [None]:
# split the sessions into train, validation and test sets
# NOTE(review): test_size=0.60 combined with train_size=0.75 is unusual —
# confirm how md.select_sessions interprets the two fractions
train, val, test = md.select_sessions(
    y=df_question_labels,
    train_size=0.75,
    test_size=0.60,
    random_state=random_state,
)

## Model Training

### Create Datasets

In [None]:
# elapsed-time summary columns used as the flat model inputs
feature_list = [
    'elapsed_time_sum',
    'elapsed_time_max',
    'elapsed_time_min',
    'elapsed_time_mean',
    'elapsed_time_mode',
]

# build the train/val/test dataset for the simple (flat features) model
features_dataset = md.get_feature_dataset(
    features=df_question_features,
    y=df_question_labels,
    feature_list=feature_list,
    train=train, val=val, test=test,
    include_question=True,
    expand_question=False,
)

In [None]:
# convert the labels to categorical form (presumably one column per class —
# verify in md.labels_to_categorical); this is single-label binary
# classification trained with categorical cross-entropy, not multi-label
cat_features_dataset = md.labels_to_categorical(features_dataset)

In [None]:
# width of the flat feature input (second axis of the training inputs)
input_data = cat_features_dataset['train']['X']
features_dataset_shape = input_data.shape[1]
print(f'features_dataset_shape: {features_dataset_shape}')

# width of the model output (second axis of the categorical labels)
output_data = cat_features_dataset['train']['y']
output_shape = output_data.shape[1]
print(f'output_shape {output_shape}')

### Flat Features Only

In [None]:
# configure mlflow: log the runs of this section under the
# "question-10-simple" experiment and enable Keras autologging
mlflow.set_experiment("question-10-simple")
mlflow.keras.autolog()

In [None]:
# end any MLflow run that is still active before the tuning starts
mlflow.end_run()

In [None]:
# register the dense model search space
def define_tune_parameters(hp):
    """
    Register the hyperparameter search space of the dense model on `hp`.

    Parameters
    ----------
    hp : object
        Container exposing `Int`, `Float` and `Choice` (presumably a
        keras_tuner HyperParameters instance — verify against the tuner).
        It is modified through those calls; nothing is returned.
    """
    # architecture of the dense stack
    hp.Int(
        'dense_layer_count',
        min_value=1,
        max_value=6,
        step=1)
    hp.Int(
        'dense_units',
        min_value=512,
        max_value=1700,
        step=32)
    hp.Choice(
        'dense_activation',
        values=['relu', 'tanh', 'LeakyReLU'])

    # regularization of the dense stack
    hp.Float(
        'dense_l1_regularization',
        min_value=0.0,
        max_value=0.0005,
        step=0.00001)
    hp.Float(
        'dense_l2_regularization',
        min_value=0.0,
        max_value=0.001,
        step=0.0001)
    hp.Float(
        'dense_dropout',
        min_value=0.005,
        max_value=0.1,
        step=0.005)

    # optimizer step size
    hp.Choice(
        'learning_rate',
        [1e-2, 1e-3, 1e-4, 1e-5, 1e-6])

In [None]:
# find the best model
# one Bayesian search (max 50 trials) is run for every combination of
# batch size and optimizer class
for batch_size in [500, 1000, 2000, 3000, 4000]:
    for optimizer in [optimizers.Adam, optimizers.RMSprop]:
        sd_model.tune_model(
            define_tune_parameters=define_tune_parameters,
            dataset=cat_features_dataset,
            max_trials=50,
            input_shape=features_dataset_shape,
            output_shape=output_shape,
            # the string values below name hyperparameters registered by
            # define_tune_parameters
            dense_layer_count='dense_layer_count',
            dense_units='dense_units',
            dense_activation='dense_activation',
            dense_l1_regularization='dense_l1_regularization',
            dense_l2_regularization='dense_l2_regularization',
            dense_dropout='dense_dropout',
            train_epochs=2000,
            train_batch_size=batch_size,
            train_optimizer=optimizer,
            train_learning_rate='learning_rate',
            train_loss='categorical_crossentropy',
            train_metrics=[tfa.metrics.F1Score(name='f1_score', num_classes=2, threshold=0.5, average='macro')],
            train_class_weight=None,
            tune_objective='val_f1_score',
            tune_direction='max',
            tuner_type=kt.tuners.BayesianOptimization,
            tune_patience=10)

        # close any MLflow run left active by this search so the next
        # combination starts clean (the heatmap tuning loop does the same)
        mlflow.end_run()

### Combined Features

In [None]:
# configure mlflow: log the runs of this section under the
# "question-10-heatmap" experiment and enable Keras autologging
mlflow.set_experiment("question-10-heatmap")
mlflow.keras.autolog()

In [None]:
# build the train/val/test dataset that holds only the screen heatmap feature
# (same sessions and labels as the flat dataset, no question column)
heatmap_dataset = md.get_feature_dataset(
    features=df_question_features,
    y=df_question_labels,
    feature_list=['screen_heatmap_feature'],
    train=train, val=val, test=test,
    include_question=False,
    expand_question=False,
)

In [None]:
# shape of a single heatmap sample: axes 1..3 of the training inputs
input_data = heatmap_dataset['train']['X']
heatmap_shape = (input_data.shape[1], input_data.shape[2], input_data.shape[3])
print(f'heatmap_shape: {heatmap_shape}')

# width of the flat feature input (second axis of the training inputs)
input_data = cat_features_dataset['train']['X']
features_dataset_shape = input_data.shape[1]
print(f'features_dataset_shape: {features_dataset_shape}')

# width of the model output (second axis of the categorical labels)
output_data = cat_features_dataset['train']['y']
output_shape = output_data.shape[1]
print(f'output_shape {output_shape}')

In [None]:
# define the tuner parameters
def define_heatmap_tune_parameters(hp):
    """
    Register the search space of the combined heatmap + dense model on `hp`.

    The dense-model hyperparameters are registered first through
    `define_tune_parameters`, followed by the convolutional ones.

    Parameters
    ----------
    hp : object
        Container exposing `Int`, `Float` and `Choice` (presumably a
        keras_tuner HyperParameters instance — verify against the tuner).
    """
    # dense part of the search space
    define_tune_parameters(hp)

    # convolutional part of the search space
    hp.Int(
        'covnet_block_count',
        min_value=1,
        max_value=3,
        step=1)
    hp.Choice(
        'covnet_activation',
        values=['relu', 'tanh', 'LeakyReLU'])
    hp.Int(
        'covnet_cov_count',
        min_value=1,
        max_value=3,
        step=1)
    hp.Int(
        'covnet_channels',
        min_value=32,
        max_value=64,
        step=16)

    # kernel and pool sizes are given as strings, each with a single option
    hp.Choice(
        'covnet_kernel_size',
        values=['(3, 3)'])
    hp.Choice(
        'covnet_pool_size',
        values=['(2, 2)'])

In [None]:
# create the combined model wrapper: flat features plus heatmap as inputs,
# trained with categorical cross-entropy and tracked with the macro F1 score
f1_metric = tfa.metrics.F1Score(name='f1_score', num_classes=2, threshold=0.5, average='macro')
heatmap_model = HeatmapCovnetModel(
    input_shape=features_dataset_shape,
    heatmap_shape=heatmap_shape,
    output_shape=output_shape,
    loss='categorical_crossentropy',
    metrics=[f1_metric],
)

In [None]:
# end any MLflow run that is still active before the tuning starts
mlflow.end_run()

In [None]:
# find the best model: one Bayesian search (max 50 trials) is run for every
# combination of batch size and optimizer class
for batch_size in [500, 1000, 2000, 3000, 4000]:
    for optimizer in [optimizers.Adam, optimizers.RMSprop]:
        # `model` is overwritten on every pass and holds the last result only
        model = heatmap_model.tune_model(
            define_tune_parameters=define_heatmap_tune_parameters,
            heatmap_dataset=heatmap_dataset,
            feature_dataset=cat_features_dataset,
            tuner_type=kt.tuners.BayesianOptimization,
            tune_objective='val_f1_score',
            tune_direction='max',
            tune_patience=10,
            max_trials=50,
            train_epochs=1000,
            train_batch_size=batch_size,
            train_optimizer=optimizer,
            train_class_weight=None)

        # close any MLflow run left active by this search
        mlflow.end_run()

### Save the best model

In [None]:
# load the best model from mlflow
# the run id is hard-coded: it identifies the run picked from the tuning
# experiments above and must be updated whenever the tuning is repeated
model_uri = "runs:/98db2ce003464d77a6a836c74d6a3b54/model"
q10_model = mlflow.keras.load_model(model_uri)

# save the model to disk so the evaluation below can run without mlflow
k.models.save_model(q10_model, "../data/interim/model_03-203.h5")

## Model Evaluation

In [8]:
# load the test session ids
# NOTE(review): this replaces the `test` split created earlier in this notebook
# with ids from a file named after notebook 03-202 — confirm these sessions do
# not overlap with the sessions the question 10 model was trained on
test = np.load('../data/interim/test_03-202.npy')

# load the models (the question 5 model file is named after notebook 03-202,
# the question 10 model is the one saved above)
q05_model = k.models.load_model('../data/interim/model_03-202.h5')
q10_model = k.models.load_model('../data/interim/model_03-203.h5')

2023-04-10 07:08:12.705587: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-10 07:08:12.715899: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-10 07:08:12.716013: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-10 07:08:12.716501: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

In [9]:
# select all the source records that belong to the test sessions
df_test = df_source[df_source.session_id.isin(test)]

# select the labels that belong to the test sessions
df_test_labels = df_source_labels[df_source_labels.session_id.isin(test)]

### Create Predictor

In [26]:
class Predictor(Baseline):
    """
    Predict the answers with question-specific models.

    Every question that has an entry in `models` is predicted with its own
    model and threshold; all other questions fall back to the `Baseline`
    prediction.
    """

    def __init__(self, models:Dict[int, Dict[str, Any]]):
        """
        Parameters
        ----------
        models : Dict[int, Dict[str, Any]]
            Maps a question number to a dictionary with the keys 'model'
            (object exposing `predict`) and 'threshold' (float compared
            against the second output column of the model).
        """
        # call the base class constructor
        super().__init__()

        # not read by this class; kept so existing attribute access keeps working
        self.thresholds = {}

        # assign the models (an empty collection means "baseline only")
        self.models = {} if models is None else models

    def feature_engineering(self, data:pd.DataFrame, labels:pd.DataFrame) -> pd.DataFrame:
        """
        Create the feature dataset for the given data.

        The initial, elapsed time, total count, unique count and screen
        heatmap features are all computed from `data`.

        Parameters
        ----------
        data : pd.DataFrame
            The source events to create the features from.
        labels : pd.DataFrame
            The labels used to create the initial features.

        Returns
        -------
        pd.DataFrame
            The features dataset.
        """
        logging.info('Performing feature engineering:')
        # create the initial features
        df_features = fe.create_initial_features(data, labels)

        # add the elapsed time feature to the features dataset
        logging.info('\tAdding elapsed time features...')
        df_features = fe.add_elapsed_time_features(
            features=df_features,
            X=data)

        # add the total count features to the features dataset
        logging.info('\tAdding total count features...')
        df_features = fe.add_count_total_features(
            features=df_features,
            X=data)

        # add the unique count features to the features dataset
        logging.info('\tAdding unique count features...')
        df_features = fe.add_count_unique_features(
            features=df_features,
            X=data)

        # add the heatmap features to the features dataset; they must be
        # built from the events handed to this method (not from the notebook
        # global `df_source`) so the predictor only sees the data it is given
        logging.info('\tAdding heatmap features...')
        df_features = fe.add_screen_heatmap_feature(
            features=df_features,
            X=data,
            verbose=True)

        return df_features

    def create_feature_dataset(self,
                               features:pd.DataFrame,
                               labels:pd.DataFrame) -> List[np.ndarray]:
        """
        Create the model inputs for the sessions found in `labels`.

        Parameters
        ----------
        features : pd.DataFrame
            The features dataset created by `feature_engineering`.
        labels : pd.DataFrame
            The labels; their unique `session_id` values select the sessions.

        Returns
        -------
        List[np.ndarray]
            The heatmap dataset followed by the flat features dataset, in the
            order `[heatmap_dataset, features_dataset]`.
        """
        # both datasets are built for the same sessions
        session_list = labels.session_id.unique()

        # create the flat features dataset
        features_dataset = md.create_feature_dataset(
            df_features=features,
            df_source_labels=labels,
            session_list=session_list,
            feature_list=['elapsed_time_sum', 'elapsed_time_max',
                          'elapsed_time_min', 'elapsed_time_mean',
                          'elapsed_time_mode'],
            include_question=True,
            expand_question=False,
            verbose=False)

        # create the heatmap features dataset
        heatmap_dataset = md.create_feature_dataset(
            df_features=features,
            df_source_labels=labels,
            session_list=session_list,
            feature_list=['screen_heatmap_feature'],
            include_question=False,
            expand_question=False,
            verbose=False)

        return [heatmap_dataset, features_dataset]


    def predict_question(self, feature_set:List[np.ndarray], question_num:int) -> int:
        """
        Predict the correct answer for the given question.

        Parameters
        ----------
        feature_set : List[np.ndarray]
            The model inputs as returned by `create_feature_dataset`.
        question_num : int
            The question number to predict.

        Returns
        -------
        int
            The predicted answer for the question: 1 when the second output
            column of the model exceeds the threshold, otherwise 0. Questions
            without a model return the base class prediction.
        """
        # if no model is defined for the question, use the base class
        model_data = self.models.get(question_num, None)
        if model_data is None:
            return super().predict_question(feature_set, question_num)

        # get the model and threshold
        model = model_data['model']
        threshold = model_data['threshold']

        # use the model for prediction
        y_pred_model = model.predict(feature_set, verbose=0)
        y_pred_model = (y_pred_model[:, 1] > threshold).astype(int)

        # only the first row is used; return a plain int as annotated
        return int(y_pred_model[0])

# create the predictor object with one model/threshold pair per question
models = {
    5: dict(model=q05_model, threshold=0.52),
    10: dict(model=q10_model, threshold=0.50),
}

predictor = Predictor(models)


### Question 10

In [11]:
# keep only the test labels of question 10
df_q_labels = df_test_labels.loc[df_test_labels['question_num'] == 10]

# level groups that those labels refer to
q_level_groups = df_q_labels['level_group'].unique()

# keep only the test events recorded in those level groups
df_q = df_test.loc[df_test['level_group'].isin(q_level_groups)]

In [12]:
# get the test data labels (ground truth for question 10 only)
y_true = df_q_labels['correct']

In [17]:
# perform the predictions with the baseline predictor, used as the
# reference for the question 10 model below
base_model:Baseline = Baseline()
df_q_baseline = base_model.predict(data=df_q, labels=df_q_labels)
y_pred_baseline = df_q_baseline['correct']

2023-04-10 07:09:39 INFO     Predicting the target variable


  0%|          | 0/6988 [00:00<?, ?it/s]

In [27]:
# predict question 10 with its dedicated model (threshold 0.50)
q_predictor:Predictor = Predictor(
    models={10: dict(model=q10_model, threshold=0.50)})

df_q_model = q_predictor.predict(labels=df_q_labels, data=df_q)
y_pred_model = df_q_model['correct']

2023-04-10 07:24:25 INFO     Performing feature engineering:
2023-04-10 07:24:25 INFO     	Adding elapsed time features...
2023-04-10 07:24:26 INFO     	Adding total count features...
2023-04-10 07:24:26 INFO     	Adding unique count features...
2023-04-10 07:24:26 INFO     	Adding heatmap features...


  0%|          | 0/6988 [00:00<?, ?it/s]

2023-04-10 07:27:55 INFO     Predicting the target variable


  0%|          | 0/6988 [00:00<?, ?it/s]

In [28]:
# show the classification report of the baseline and of the model, each
# under its own heading and wrapped in a fenced block
for report_title, report_pred in (('#### Baseline', y_pred_baseline),
                                  ('#### Model', y_pred_model)):
    mprint(report_title)
    mprint('```')
    mprint(classification_report(y_true, report_pred, zero_division=0))
    mprint('```')

mflush()

#### Baseline
```
              precision    recall  f1-score   support

           0       0.50      1.00      0.67      3512
           1       0.00      0.00      0.00      3476

    accuracy                           0.50      6988
   macro avg       0.25      0.50      0.33      6988
weighted avg       0.25      0.50      0.34      6988

```
#### Model
```
              precision    recall  f1-score   support

           0       0.61      0.59      0.60      3512
           1       0.60      0.62      0.61      3476

    accuracy                           0.60      6988
   macro avg       0.60      0.60      0.60      6988
weighted avg       0.60      0.60      0.60      6988

```

### All Questions

In [29]:
# get the test data labels (ground truth for all questions)
y_true = df_test_labels['correct']

In [30]:
# perform the baseline predictions for every question of the test sessions
base_model:Baseline = Baseline()
df_baseline = base_model.predict(data=df_test, labels=df_test_labels)
y_pred_baseline = df_baseline['correct']

2023-04-10 08:45:48 INFO     Predicting the target variable


  0%|          | 0/125784 [00:00<?, ?it/s]

In [31]:
# predict all questions: question 10 uses its dedicated model (threshold
# 0.50), every other question falls back to the baseline prediction
q_predictor:Predictor = Predictor(
    models={10: dict(model=q10_model, threshold=0.50)})

df_model = q_predictor.predict(labels=df_test_labels, data=df_test)
y_pred_model = df_model['correct']

2023-04-10 08:53:58 INFO     Performing feature engineering:
2023-04-10 08:53:58 INFO     	Adding elapsed time features...
2023-04-10 08:54:00 INFO     	Adding total count features...
2023-04-10 08:54:00 INFO     	Adding unique count features...
2023-04-10 08:54:02 INFO     	Adding heatmap features...


  0%|          | 0/6988 [00:00<?, ?it/s]

2023-04-10 09:02:36 INFO     Predicting the target variable


  0%|          | 0/125784 [00:00<?, ?it/s]

In [32]:
# show the classification report of the baseline and of the model, each
# under its own heading and wrapped in a fenced block
for report_title, report_pred in (('#### Baseline', y_pred_baseline),
                                  ('#### Model', y_pred_model)):
    mprint(report_title)
    mprint('```')
    mprint(classification_report(y_true, report_pred, zero_division=0))
    mprint('```')

mflush()

#### Baseline
```
              precision    recall  f1-score   support

           0       0.52      0.48      0.50     37388
           1       0.79      0.81      0.80     88396

    accuracy                           0.71    125784
   macro avg       0.65      0.65      0.65    125784
weighted avg       0.71      0.71      0.71    125784

```
#### Model
```
              precision    recall  f1-score   support

           0       0.53      0.45      0.48     37388
           1       0.78      0.83      0.81     88396

    accuracy                           0.72    125784
   macro avg       0.66      0.64      0.65    125784
weighted avg       0.71      0.72      0.71    125784

```

### Question 5 and 10 models combined

In [33]:
# predict all questions: questions 5 and 10 use their dedicated models,
# every other question falls back to the baseline prediction
q_predictor:Predictor = Predictor(models={
    5: dict(model=q05_model, threshold=0.52),
    10: dict(model=q10_model, threshold=0.50),
})

df_model = q_predictor.predict(labels=df_test_labels, data=df_test)
y_pred_model = df_model['correct']

2023-04-10 09:21:13 INFO     Performing feature engineering:
2023-04-10 09:21:13 INFO     	Adding elapsed time features...
2023-04-10 09:21:15 INFO     	Adding total count features...
2023-04-10 09:21:16 INFO     	Adding unique count features...
2023-04-10 09:21:17 INFO     	Adding heatmap features...


  0%|          | 0/6988 [00:00<?, ?it/s]

2023-04-10 09:29:53 INFO     Predicting the target variable


  0%|          | 0/125784 [00:00<?, ?it/s]

In [42]:
# show the classification report of the combined question 5 + 10 predictor
# (y_true still holds the labels of all questions)
mprint('#### Model')
mprint('```')
mprint(classification_report(y_true, y_pred_model, zero_division=0))
mprint('```')

mflush()

#### Model
```
              precision    recall  f1-score   support

           0       0.56      0.41      0.47     37388
           1       0.77      0.86      0.82     88396

    accuracy                           0.73    125784
   macro avg       0.67      0.63      0.64    125784
weighted avg       0.71      0.73      0.71    125784

```

The F1 macro average has now decreased while the accuracy went up by nearly 0.02. This is a bit frustrating and is due to the fact that the f1-score for the 0 class is now lower. As an experiment we could try to randomly flip some 1s to 0s.

In [97]:
def flip_to_zeros(y_pred:np.ndarray, n:int, random_state=None) -> np.ndarray:
    """
    Return a copy of `y_pred` in which `n` randomly chosen 1s are set to 0.

    Parameters
    ----------
    y_pred : np.ndarray
        One-dimensional array of binary predictions. Any array-like input
        (list, pandas Series) is converted to a NumPy array. The input is
        never modified.
    n : int
        The number of 1s to flip to 0.
    random_state : int or np.random.Generator, optional
        Seed or generator used to choose the flipped positions, making the
        result reproducible. When None (the default) the global NumPy random
        state is used, exactly as before.

    Returns
    -------
    np.ndarray
        The copy of `y_pred` with `n` fewer 1s.

    Raises
    ------
    ValueError
        If `n` is negative or larger than the number of 1s in `y_pred`.
    """
    if n < 0:
        raise ValueError(f'The number of elements to flip must not be negative, but got {n}.')

    # np.array always copies, so the caller's data stays untouched
    y_result = np.array(y_pred)

    # Find the indices of elements with value 1
    one_indices = np.where(y_result == 1)[0]

    # Check if there are enough ones to flip
    if len(one_indices) < n:
        raise ValueError(f'There are only {len(one_indices)} ones in the array, but you want to flip {n}.')

    # Randomly choose n distinct indices to flip
    if random_state is None:
        indices_to_flip = np.random.choice(one_indices, size=n, replace=False)
    else:
        rng = np.random.default_rng(random_state)
        indices_to_flip = rng.choice(one_indices, size=n, replace=False)

    # Flip the selected indices
    y_result[indices_to_flip] = 0

    return y_result

# # Example usage:
# y_example = np.array([0, 1, 1, 0, 1, 1, 0, 0, 1, 1])
# n = 3
# result = flip_to_zeros(y_example, n)
# print(result)

In [96]:
# Sweep the fraction of predictions flipped from 1 to 0 and track the macro F1.
# NOTE: flip_to_zeros draws from NumPy's global random state, so the numbers
# vary between runs unless np.random.seed(...) is called first.
y_pred_values = y_pred_model.to_numpy()  # loop-invariant, converted once
n_ones_available = int((y_pred_values == 1).sum())

flip_records = []
f1_best = 0
best_flip_sample_frac = 0

for flip_sample_frac in tqdm(np.arange(0, 0.75, 0.0001)):
    # number of predictions to flip for this fraction
    flip_sample_size = int(flip_sample_frac * len(y_pred_values))
    if flip_sample_size > n_ones_available:
        # the sizes only grow, so no later fraction can be served either;
        # stop the sweep instead of letting flip_to_zeros raise
        break

    # get the flipped predictions
    y_flipped = flip_to_zeros(y_pred_values, flip_sample_size)

    # calculate the f1 score
    f1 = f1_score(y_true, y_flipped, zero_division=0, average='macro')
    flip_records.append({
        'flip_sample_frac': flip_sample_frac,
        'flip_sample_size': flip_sample_size,
        'f1_macro': f1,
    })

    # strict comparison keeps the smallest fraction on ties
    if f1 > f1_best:
        f1_best = f1
        best_flip_sample_frac = flip_sample_frac

# keep the whole sweep for inspection (built once, not row by row)
df_flips = pd.DataFrame(flip_records)

# display the best f1 score
print(f'Best f1 score: {f1_best}')
print(f'Best flip sample fraction: {best_flip_sample_frac}')


  0%|          | 0/7500 [00:00<?, ?it/s]

Best f1 score: 0.643132872241387
Best flip sample fraction: 0.0006000000000000001


This idea was perhaps silly, but it was worth a try. In later experiments we can try class weights and biasing some questions towards 0s.