# 02-04-01 : Checkpoint

In [1]:
import sys
import gc
import pandas as pd
import numpy as np
import logging
from typing import Iterable, Tuple
from tqdm.auto import tqdm

import jo_wilder

import keras


2023-03-19 11:35:14.542661: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Configure Logging

In [2]:
# Route log records of level INFO and above to stdout (instead of the default
# stderr) using a timestamped, level-aligned line format.
logging.basicConfig(
    level=logging.INFO,
    handlers=[logging.StreamHandler(sys.stdout)],
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# marker record so the start of the run is visible in the output
logging.info("Started")

2023-03-19 11:35:15 INFO     Started


## Data Preparation & Cleaning

In [3]:
def map_question_to_level_group(question_number):
    """
    Looks up the level group that a question belongs to.

    Questions 1-3 belong to level group '0-4', questions 4-13 to '5-12'
    and questions 14-18 to '13-22'.

    Parameters
    ----------
    question_number : int
        The question number.

    Returns
    -------
    str
        The level group, or None when the question number is not one of
        the 18 known questions.
    """
    # (question numbers, level group) pairs; the ranges are half-open
    level_group_ranges = (
        (range(1, 4), '0-4'),
        (range(4, 14), '5-12'),
        (range(14, 19), '13-22'),
    )

    for question_range, level_group in level_group_ranges:
        if question_number in question_range:
            return level_group

    return None

In [4]:
def prepare_label_dataset(data : pd.DataFrame) -> pd.DataFrame:
    """
    Splits the composite label id into its parts and attaches the level group.

    The incoming 'session_id' column holds values of the form
    '<session>_q<number>'. It is renamed to 'id', and the session and
    question number are extracted from it into their own columns.

    Parameters
    ----------
    data : pd.DataFrame
        The label dataset, with 'session_id' and 'correct' columns.

    Returns
    -------
    pd.DataFrame
        A dataset with the columns 'id', 'session_id', 'question_num',
        'correct' and 'level_group'.
    """
    df_labels = data.rename(columns={'session_id': 'id'})

    # '<session>_q<number>' -> ['<session>', 'q<number>']
    id_parts = df_labels['id'].str.split('_')
    df_labels = df_labels.assign(
        session_id=id_parts.str[0].astype(int),
        question_id=id_parts.str[1])

    # strip the leading 'q' to get the numeric question number
    df_labels = df_labels.assign(
        question_num=df_labels['question_id'].str[1:].astype(int))

    # keep only the columns used downstream
    df_labels = df_labels[['id', 'session_id', 'question_num', 'correct']]

    # derive the level group from the question number
    df_labels['level_group'] = df_labels['question_num'].apply(map_question_to_level_group)

    return df_labels

In [5]:
def prepare_main_dataset(data : pd.DataFrame,
                         elapsed_time_min_clip:int=0,
                         elapsed_time_max_clip:int=3691298) -> pd.DataFrame:
    """
    Cleans the main dataset: drops duplicate rows, removes the columns
    that are not used and clips the elapsed time.

    Parameters
    ----------
    data : pd.DataFrame
        The main dataset.
    elapsed_time_min_clip : int, optional
        Lower bound applied to 'elapsed_time', by default 0
    elapsed_time_max_clip : int, optional
        Upper bound applied to 'elapsed_time', by default 3691298

    Returns
    -------
    pd.DataFrame
        The prepared main dataset.
    """
    unused_columns = ['fullscreen', 'hq', 'music', 'page', 'hover_duration']

    # remove exact duplicate rows and renumber the remaining ones
    deduplicated = data.drop_duplicates().reset_index(drop=True)

    # remove the unused columns, then the free-text column
    df_main = deduplicated.drop(columns=unused_columns).drop(columns='text')

    # limit the elapsed time to the configured range to tame outliers
    clipped_elapsed_time = df_main['elapsed_time'].clip(
        lower=elapsed_time_min_clip,
        upper=elapsed_time_max_clip)
    df_main['elapsed_time'] = clipped_elapsed_time

    return df_main

## Feature Engineering

In [6]:
def create_initial_features(X:pd.DataFrame,
                            y:pd.DataFrame) -> pd.DataFrame:
    """
    Builds the skeleton feature dataset: one row for every
    (session_id, level_group) combination found in the labels.

    Parameters
    ----------
    X : pd.DataFrame
        The main dataset (not used by this function).
    y : pd.DataFrame
        The label dataset.

    Returns
    -------
    pd.DataFrame
        The initial feature dataset with the columns 'session_id' and
        'level_group', ordered by those two columns.
    """
    group_keys = ['session_id', 'level_group']

    # counting the labels is only a vehicle to get one row per group
    group_counts = y.groupby(group_keys).agg({'correct': ['count']})

    # flatten the two-level column header left behind by the aggregation
    flattened = group_counts.reset_index().droplevel(1, axis=1)

    # the count itself is not a feature
    df_features = flattened.drop(columns=['correct']).sort_values(group_keys)

    # set the session_id to be an integer
    df_features['session_id'] = df_features['session_id'].astype(int)

    return df_features

In [7]:
def add_numeric_column_features(features:pd.DataFrame,
                                X:pd.DataFrame,
                                column:str,
                                min_values:dict=None,
                                max_values:dict=None) -> pd.DataFrame:
    """
    Adds aggregate statistics of a numeric column to the features dataset.

    For every (session_id, level_group) group in X the sum, maximum,
    minimum, mean and mode of the column are calculated and added as the
    columns '<column>_sum', '<column>_max', '<column>_min', '<column>_mean'
    and '<column>_mode'.

    Parameters
    ----------
    features : pd.DataFrame
        The features dataset, with 'session_id' and 'level_group' columns.
    X : pd.DataFrame
        The main dataset.
    column : str
        The name of the numeric column to add to the features for.
    min_values : dict, optional
        The minimum used for min-max scaling, keyed by metric name
        ('sum', 'max', 'min', 'mean', 'mode'), by default None
    max_values : dict, optional
        The maximum used for min-max scaling, keyed by metric name,
        by default None

    Returns
    -------
    pd.DataFrame
        The features dataset left-joined with the new statistic columns.
        The statistics are only scaled when both min_values and
        max_values are given; otherwise a warning is logged and the raw
        values are kept.
    """
    group_keys = ['session_id', 'level_group']

    # Define a function to calculate mode. It must keep the name `mode`
    # because pandas uses the function name for the resulting column.
    def mode(series):
        modes = series.mode()
        # a group with only missing values has no mode; report it as
        # missing instead of failing on the empty result
        if modes.empty:
            return np.nan
        return modes.iat[0]

    # calculate the sum, maximum, minimum, mean and mode for the column
    df_result = X \
        .groupby(group_keys) \
        .agg({column: ['sum', 'max', 'min', 'mean', mode]}) \
        .reset_index()

    # flatten the multi-index columns, e.g. (column, 'sum') -> 'column_sum'
    df_result.columns = ['_'.join(col).rstrip('_') for col in df_result.columns.values]

    # normalize the values
    if min_values is None or max_values is None:
        logging.warning('Not normalizing the values, min_value and max_values are not set.')
    else:
        metric_list = ['sum', 'max', 'min', 'mean', 'mode']
        for metric in metric_list:
            current_column = f'{column}_{metric}'
            value_range = max_values[metric] - min_values[metric]
            df_result[current_column] = (df_result[current_column] - min_values[metric]) / value_range

    # join the statistics onto the features, keeping every features row
    df_result = features.set_index(group_keys) \
        .join(df_result.set_index(group_keys), how='left') \
        .reset_index()

    return df_result

In [8]:
def feature_engineering(df_source:pd.DataFrame,
                        df_source_labels:pd.DataFrame) -> pd.DataFrame:
    """
    Builds the feature dataset from the prepared main and label datasets.

    Parameters
    ----------
    df_source : pd.DataFrame
        The main dataset.
    df_source_labels : pd.DataFrame
        The label dataset.

    Returns
    -------
    pd.DataFrame
        The feature dataset: one row per (session_id, level_group) with
        the scaled elapsed time statistics.
    """
    # fixed scaling bounds for each elapsed time statistic
    elapsed_time_lower_bounds = {
        'sum': 61395.0,
        'max':  990.0,
        'min':  0.0,
        'mean': 526.447,
        'mode': 0.0}
    elapsed_time_upper_bounds = {
        'sum':  9990648000,
        'max':  3691298.0,
        'min':  3691298.0,
        'mean': 3691298.0,
        'mode': 3691298.0}

    # one row per session and level group
    base_features = create_initial_features(df_source, df_source_labels)

    # attach the elapsed time statistics
    return add_numeric_column_features(
        features=base_features,
        X=df_source,
        column='elapsed_time',
        min_values=elapsed_time_lower_bounds,
        max_values=elapsed_time_upper_bounds)

## Data Selection

In [9]:
def create_feature_dataset(df_features:pd.DataFrame,
                           df_source_labels:pd.DataFrame,
                           session_list: list,
                           feature_list:list,
                           level_group:str=None,
                           include_question:bool=True,
                           expand_question:bool=False) -> np.ndarray:
    """
    Creates the feature dataset for the given level group and session list.
    If the level group is not specified it will create the dataset for all level groups.

    One output row is produced for every label row (i.e. per session and
    question). The rows are ordered by session_id and then question_num,
    which is not necessarily the order of df_source_labels.

    Parameters
    ----------
    df_features : pd.DataFrame
        The dataset of prepared features (by session_id and level_group).
    df_source_labels : pd.DataFrame
        The dataset containing the training labels (y_True). The columns
        'session_id', 'level_group' and 'question_num' are read.
    session_list : list
        The list of session ids to create the dataset for.
    feature_list : list
        The list of features to include in the dataset.
    level_group : str, optional
        The level group to create the dataset for, by default None
    include_question : bool, optional
        Whether to include the question number in the dataset as the first set of
        columns, by default True
    expand_question : bool, optional
        Whether to expand the question number into a one-hot vector to each item in the 
        case of a multi-dimensional feature, by default False

    Returns
    -------
    np.ndarray
        The feature dataset as a float32 array.

    Raises
    ------
    Exception
        If a label row has no matching row in df_features for its
        session and level group.
    """
    # get the features and labels for the given level group
    if level_group is None:
        logging.debug('Creating the dataset for all level groups')
        df_features_group = df_features.query('session_id in @session_list')
        df_labels_group = df_source_labels.query('session_id in @session_list')
    else:
        logging.debug('Creating the dataset for level group: %s', level_group)
        df_features_group = df_features.query('level_group == @level_group and session_id in @session_list')
        df_labels_group = df_source_labels.query('level_group == @level_group and session_id in @session_list')

    # sort the df_labels_group; this fixes the order of the output rows
    df_labels_group = df_labels_group.sort_values(['session_id', 'question_num'])

    feature_dataset = []

    # get the features for each row in the level group labels dataset;
    # the per-session feature rows are cached while consecutive label rows
    # belong to the same session
    current_session_id = None
    df_session_features = None

    # NOTE(review): a DataFrame.query call is issued for every label row
    # below, which is likely to dominate the run time on large inputs
    for index, row in df_labels_group.iterrows():        
        session_id = int(row['session_id'])
        session_level_group = row['level_group']
        question_num = int(row['question_num'])

        # get the features for the session
        if session_id != current_session_id:
            current_session_id = session_id
            df_session_features = df_features_group.query('session_id == @session_id')

        # get the level group features
        df_level_group_features = df_session_features.query('level_group == @session_level_group')

        # check if the session has features
        if df_level_group_features.shape[0] == 0:
            raise Exception(f'No features for session {session_id}, level group {session_level_group}!')
                            
        # get the features for the row
        row_features = []

        # get the question number one-hot encoded (18 slots, question 1 -> index 0)
        question_num_one_hot = np.zeros(18, dtype=np.int8)
        question_num_one_hot[question_num-1] = 1

        if include_question:
            row_features.extend(question_num_one_hot)

        for feature in feature_list:
            # only the first matching feature row is used
            feature_value = df_level_group_features[feature].values[0]

            # check if the feature value is iterable
            if isinstance(feature_value, Iterable):
                if expand_question:
                    # reshape the question array to match the feature array shape
                    # (assumes feature_value is a 2-D numpy array — TODO confirm)
                    question_reshaped = np.tile(
                        question_num_one_hot, 
                        (feature_value.shape[0], 1))
                    
                    # add the question columns to the feature array
                    feature_value = np.hstack((question_reshaped, feature_value))

                row_features.extend(feature_value)
            else:
                row_features.append(feature_value)

        # add the row features to the output dataset
        feature_dataset.append(row_features)

    return np.array(feature_dataset, dtype=np.float32)

## Perform Predictions

In [10]:
# load the model
# NOTE(review): model_path already ends with '/', so the f-string below
# produces a doubled separator ('../checkpoints//02-04-01.h5'); presumably
# harmless on POSIX file systems — confirm
model_path = '../checkpoints/'
# alternative model location, presumably for running on Kaggle
#model_path = '/kaggle/input/jw-02-04-91'

# `model` is read as a global by predict_all
model = keras.models.load_model(f'{model_path}/02-04-01.h5')
model.summary()

Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 23)]              0         
                                                                 
 dense_36 (Dense)            (None, 1024)              24576     
                                                                 
 dense_37 (Dense)            (None, 1)                 1025      
                                                                 
Total params: 25,601
Trainable params: 25,601
Non-trainable params: 0
_________________________________________________________________


2023-03-19 11:35:16.218910: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-19 11:35:16.219735: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [11]:
def prepare_features(X:pd.DataFrame,
                     y:pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Prepare the model features

    Parameters
    ----------
    X : pd.DataFrame
        The raw main dataset.
    y : pd.DataFrame
        The raw label dataset.

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        The feature dataset and the prepared label dataset (indexed by 'id').
    """
    # clean the raw event data
    logging.debug('Preparing the main dataset')
    main_dataset = prepare_main_dataset(
        X,
        elapsed_time_min_clip=0,
        elapsed_time_max_clip=3691298)

    # split the label ids and index the labels by their full id
    logging.debug('Preparing the label dataset')
    labels_by_id = prepare_label_dataset(y).set_index('id')

    # build the per-session, per-level-group features
    logging.debug('Performing the feature engineering')
    feature_table = feature_engineering(main_dataset, labels_by_id)

    # release the intermediate frames before returning
    gc.collect()

    return feature_table, labels_by_id

In [12]:
def predict_all(X:pd.DataFrame,
                y:pd.DataFrame,
                threshold:float) -> pd.DataFrame:
    """
    Perform predictions on the entire dataset.

    Uses the module-level `model` to score every label row.

    Parameters
    ----------
    X : pd.DataFrame
        The raw main dataset.
    y : pd.DataFrame
        The raw label dataset.
    threshold : float
        Model outputs strictly greater than this value are predicted
        as correct (1), everything else as incorrect (0).

    Returns
    -------
    pd.DataFrame
        The prepared label dataset (indexed by 'id', in the row order of
        y) with the 'correct' column replaced by the predictions.
    """
    feature_list = ['elapsed_time_sum', 'elapsed_time_max',
                    'elapsed_time_min', 'elapsed_time_mean',
                    'elapsed_time_mode']

    # get the features and labels
    df_features, df_source_labels = prepare_features(X, y)

    # create the dataset to be used for prediction
    X_pred = create_feature_dataset(
        df_features=df_features,
        df_source_labels=df_source_labels,
        session_list=df_source_labels['session_id'].unique(),
        feature_list=feature_list,
        level_group=None,
        include_question=True,
        expand_question=False)
        
    # get the predictions
    logging.debug('Predicting the labels')
    y_pred = (model.predict(X_pred, verbose=False) > threshold).astype(int)

    # create_feature_dataset emits its rows ordered by session_id and
    # question_num, so the predictions are in that order and not
    # necessarily in the order of df_source_labels. Recover the same
    # permutation and write every prediction back to the label row it
    # was computed for.
    row_order = df_source_labels \
        .reset_index(drop=True) \
        .sort_values(['session_id', 'question_num']) \
        .index.to_numpy()
    correct = np.empty(len(row_order), dtype=y_pred.dtype)
    correct[row_order] = y_pred.reshape(-1)
    df_source_labels['correct'] = correct

    return df_source_labels


In [13]:
# delete the submission.csv file if it exists
!rm submission.csv

In [14]:
def predict_iteration(df_label:pd.DataFrame, df_data:pd.DataFrame) -> None:
    """
    Perform predictions on the current iteration.

    Parameters
    ----------
    df_label : pd.DataFrame
        The label rows of the iteration; its 'session_id' column holds
        the full label id.
    df_data : pd.DataFrame
        The main dataset rows of the iteration.

    Returns
    -------
    None
        The predictions are handed to the global `env`.
    """
    global env

    # predict every label of the iteration with a fixed decision threshold
    predictions = predict_all(
        X=df_data,
        y=df_label,
        threshold=0.47)

    # swap the placeholder 'correct' column for the predicted one; the
    # predictions are indexed by the full label id, which is what the
    # 'session_id' column of df_label contains
    labels_without_target = df_label.drop(columns=['correct'])
    predicted_correct = predictions[['correct']]
    submission = labels_without_target.merge(
        predicted_correct,
        left_on='session_id',
        right_index=True)

    # submit the predictions
    env.predict(submission)

Estimate code taken from: https://www.kaggle.com/code/steubk/xgboost-baseline-and-inference-time-estimation

In [15]:
# When True, the sample test set is replayed repeatedly against a mock
# environment to estimate the inference time; when False, the predictions
# are made once against the real environment.
TIME_ESTIMATION = True

if TIME_ESTIMATION:
    # rebinds the module-level `tqdm` name (imported from tqdm.auto above)
    from tqdm.notebook import tqdm
    
    #generate a mock Env
    # stand-in for the competition environment whose predict() accepts
    # the predictions and does nothing with them
    class MockEnv:
        def predict(self, df_label):
            None

    
    # set the true Env to debug mode
    # see https://www.kaggle.com/competitions/predict-student-performance-from-game-play/discussion/384814
    # NOTE(review): relies on jo_wilder internals (`__called__`, `_state`);
    # presumably this lets make_env() be called again in the same
    # session — confirm against the linked discussion
    jo_wilder.make_env.__called__ = False
    env = jo_wilder.make_env()

    type(env)._state = type(type(env)._state).__dict__['INIT']
    iter_test = env.iter_test()    
    iters = []

    ## load sample submissions in a list 
    # every iteration is answered with its own label frame so that the
    # iterator can be consumed to the end; `count` totals the label rows
    count = 0
    for n, (df_label, df_data) in enumerate(iter_test):
        iters.append( (df_label, df_data) )
        count += df_label.shape[0]
        env.predict(df_label)

    # 18 label rows (questions) per session
    num_sids = count / 18
    print(f"num submission id in test set: {num_sids}")
    # from here on predict_iteration submits to the mock environment
    env = MockEnv()

    # iterate over 4000 x 3 : 12000 submission ids (~ 11779 train set)
    for nn in tqdm(range(4000), desc="Inference Time estimation"):
        for (df_label, test) in iters:    
            predict_iteration(df_label, test)
            
        ## we don't want to wait  for 4000 iterations
        # stops after 51 passes (nn = 0..50) over the cached iterations
        if nn == 50:
            break
        
else:
    # real run: predict each iteration served by the environment once
    env = jo_wilder.make_env()
    iter_test = env.iter_test() 
    for df_label, df_data in iter_test:
        predict_iteration(df_label, df_data)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
num submission id in test set: 3.0


Inference Time estimation:   0%|          | 0/4000 [00:00<?, ?it/s]

In [16]:
# confirm submission file
df = pd.read_csv('submission.csv')
print(df.shape)
df.sample(n=10)

(54, 2)


Unnamed: 0,session_id,correct
18,20090109393214576_q13,0
47,20090312143683264_q17,0
36,20090312331414616_q11,0
22,20090312143683264_q7,0
46,20090312143683264_q16,0
42,20090109393214576_q17,0
12,20090109393214576_q7,0
27,20090312143683264_q12,0
41,20090109393214576_q16,0
15,20090109393214576_q10,0
