# 03-01: Mutli-Threshold for F1

The core idea is to first do a global threshold to optimize the F1 score. Then leave all the samples classified as 0 in place, and do a second threshold to optimize the F1 score for the samples classified as 1.

In [1]:
import sys
import logging
from typing import Iterable, List, Tuple, Dict

import pandas as pd
import numpy as np

from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split

### Configure Logging

In [2]:
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
        handlers=[
        logging.StreamHandler(sys.stdout)
    ])

logging.info("Started")

2023-03-20 19:13:49 INFO     Started


## Data Collection

In [3]:
dtypes = {
    "session_id": np.int64,
    "elapsed_time": np.int32,
    "event_name": "category",
    "name": "category",
    "level": np.uint8,
    "page": "category",
    "room_coor_x": np.float32,
    "room_coor_y": np.float32,
    "screen_coor_x": np.float32,
    "screen_coor_y": np.float32,
    "hover_duration": np.float32,
    "text": "category",
    "fqid": "category",
    "room_fqid": "category",
    "text_fqid": "category",
    "fullscreen": "category",
    "hq": "category",
    "music": "category",
    "level_group": "category",
}

In [4]:
# load the source training set
df_source = pd.read_csv('../data/train.csv.gz',
                        compression='gzip',
                        dtype=dtypes)

print(df_source.shape)
with pd.option_context('display.max_columns', None):
    display(df_source.head(3))

(13174211, 20)


Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,,,,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4


In [5]:
# load the source training labels
df_source_labels = pd.read_csv('../data/train_labels.csv')

print(df_source_labels.shape)
with pd.option_context('display.max_columns', None):
    display(df_source_labels.head(3))

(212022, 2)


Unnamed: 0,session_id,correct
0,20090312431273200_q1,1
1,20090312433251036_q1,0
2,20090314121766812_q1,1


## Data Preparation & Cleaning

In [6]:
def map_question_to_level_group(question_number) ->str:
    """
    Maps the question number to the level group.

    Parameters
    ----------
    question_number : int
        The question number.

    Returns
    -------
    str
        The level group.
    """
    if question_number in [1, 2, 3]:
        return '0-4'
    elif question_number in [4, 5, 6, 7, 8, 9, 10, 11, 12, 13]:
        return '5-12'
    elif question_number in [14, 15, 16, 17, 18]:
        return '13-22'
    else:
        return None

In [7]:
def find_problem_sessions(data : pd.DataFrame) -> List[str]:
    """
    Finds the sessions that are duplicated on session_id and index. And
    Find sessions with reversed indexes.

    This idea is taken from the following Kaggle notebook:
    https://www.kaggle.com/code/abaojiang/eda-on-game-progress/notebook?scriptVersionId=120133716
    
    Parameters
    ----------
    data : pd.DataFrame
        The data to search.

    Returns
    -------
    List[str]
        The list of session ids that have a problem.
    """

    # find sessions duplicated on session_id and index
    sessions_with_duplicates = df_source.loc[
        data.duplicated(subset=["session_id", "index"], keep=False)] \
        ["session_id"].unique().tolist()


    # find sessions with reversed indexes
    sessions_with_reversed_index = []
    for sess_id, gp in df_source.groupby("session_id", observed=True):
        if not gp["index"].is_monotonic_increasing:
            sessions_with_reversed_index.append(sess_id)

    # via experimentation these sessions have been found to have time 
    # differences < -2000
    negative_time_diff_sessions = [
        '21030417085341900', '21070111080982292', 
        '21090108302064196', '21090409222921812']

    # combine the two lists into a single set
    return set(sessions_with_duplicates + 
               sessions_with_reversed_index + 
               negative_time_diff_sessions)

In [8]:
def prepare_label_dataset(data : pd.DataFrame) -> pd.DataFrame:
    """
    Prepares the label dataset and add columns for the level group 
    and the question number.

    Parameters
    ----------
    data : pd.DataFrame
        The label dataset.

    Returns
    -------
    pd.DataFrame
        The prepared label dataset.
    """
    # add the columns to determine the level group
    df_labels = data \
        .rename(columns={'session_id': 'id'}) \
        .assign(session_id=lambda df: df['id'].str.split('_').str[0].astype(int)) \
        .assign(question_id=lambda df: df['id'].str.split('_').str[1]) \
        .assign(question_num=lambda df: df['question_id'].str[1:].astype(int)) \
        [['session_id', 'question_num', 'correct']]
    
    # add the level group column
    df_labels['level_group'] = df_labels['question_num'] \
        .apply(map_question_to_level_group) 
        
    return df_labels

In [9]:
def prepare_main_dataset(data : pd.DataFrame,
                         elapsed_time_min_clip:int=0,
                         elapsed_time_max_clip:int=3691298) -> pd.DataFrame:
    """
    Prepares the main dataset by removing duplicates and removing 
    columns that are not needed.

    Parameters
    ----------
    data : pd.DataFrame
        The main dataset.

    Returns
    -------
    pd.DataFrame
        The prepared main dataset.
    """
    empty_columns = ['fullscreen', 'hq', 'music', 'page', 'hover_duration']

    df_main = data \
        .drop_duplicates() \
        .reset_index(drop=True) \
        .drop(empty_columns, axis=1) \
        .drop('text', axis=1)
    
   # clip the elapsed time to remove outliers
    df_main['elapsed_time'] = df_main['elapsed_time'].clip(
        lower=elapsed_time_min_clip,
        upper=elapsed_time_max_clip)
    
    return df_main

### Process the Data

In [10]:
# prepare the main dataset
df_source = prepare_main_dataset(df_source, 
                                 elapsed_time_min_clip=0,
                                 elapsed_time_max_clip=3691298)

# remove sessions with problems
problem_sessions = find_problem_sessions(df_source)
df_source = df_source[~df_source['session_id'].isin(problem_sessions)]

with pd.option_context('display.max_columns', None):
    print(df_source.shape)
    display(df_source.head(3))  

(13019794, 14)


Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,fqid,room_fqid,text_fqid,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,-413.991394,-159.314682,380.0,494.0,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0-4
1,20090312431273200,1,1323,person_click,basic,0,-413.991394,-159.314682,380.0,494.0,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0-4
2,20090312431273200,2,831,person_click,basic,0,-413.991394,-159.314682,380.0,494.0,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0-4


In [11]:
# prepare the label dataset
df_source_labels = prepare_label_dataset(df_source_labels)

# remove sessions with problems
df_source_labels = df_source_labels[~df_source_labels['session_id'] \
    .isin(problem_sessions)]

with pd.option_context('display.max_columns', None):
    display(df_source_labels.sample(n=3, random_state=51))

Unnamed: 0,session_id,question_num,correct,level_group
172317,21070319253640464,15,0,13-22
194865,21040512553883790,17,1,13-22
197728,22000108514966796,17,1,13-22


## Feature Engineering

### Functions

In [12]:
def create_initial_features(X:pd.DataFrame,
                            y:pd.DataFrame) -> pd.DataFrame:
    """
    Creates the initial dataset to which additional features 
    will be added.

    Parameters
    ----------
    X : pd.DataFrame
        The main dataset.
    y : pd.DataFrame
        The label dataset.

    Returns
    -------
    pd.DataFrame
        The initial feature dataset.
    """
    df_features =  y \
        .groupby(['session_id', 'level_group']) \
        .agg({'correct': ['count']}) \
        .reset_index() \
        .droplevel(1, axis=1) \
        .drop(columns=['correct']) \
        .sort_values(['session_id', 'level_group']) \
        
    # set the session_id to be an integer
    df_features['session_id'] = df_features['session_id'].astype(int)
        
    return df_features        

In [13]:
def add_numeric_column_features(features:pd.DataFrame,
                                X:pd.DataFrame,
                                column:str,
                                min_values:dict=None,
                                max_values:dict=None) -> pd.DataFrame:
    """
    Add the maximum elapsed time feature to the features dataset.

    Parameters
    ----------
    features : pd.DataFrame
        The features dataset.
    X : pd.DataFrame
        The main dataset.
    column : str
        The name of the numeric column to add to the features for.

    Returns
    -------
    None
    """
    # Define a function to calculate mode
    def mode(series):
        return series.mode().iat[0]

    # calculate the maximum, minimum and mean for the column
    df_result = X \
        .groupby(['session_id', 'level_group']) \
        .agg({column: ['sum', 'max', 'min', 'mean', mode]}) \
        .reset_index()
    
    # flatten the multi-index columns
    df_result.columns = [
        '_'.join(col).rstrip('_') for col in df_result.columns.values
    ]

    # normalize the values
    if min_values is None or max_values is None:
        logging.warning('Not normalizing the values, min_value and max_values are not set.')
    else:
        metric_list = ['sum', 'max', 'min', 'mean', 'mode']
        for metric in metric_list:
            current_column = f'{column}_{metric}'
            df_result[current_column] = \
                (df_result[current_column] - min_values[metric]) / \
                (max_values[metric] - min_values[metric])       

    # join the features to the result   
    df_result = features.set_index(['session_id', 'level_group']) \
        .join(df_result.set_index(['session_id', 'level_group']), how='left') \
        .reset_index()
    
    return df_result

### Create Features

#### Initial Features

In [14]:
# create the initial features
df_features = create_initial_features(df_source, df_source_labels)

with pd.option_context('display.max_columns', None):
    display(df_features.head(6))

Unnamed: 0,session_id,level_group
0,20090312431273200,0-4
1,20090312431273200,13-22
2,20090312431273200,5-12
3,20090312433251036,0-4
4,20090312433251036,13-22
5,20090312433251036,5-12


#### Elapsed Time

In [15]:
# add the elapsed time features to the features dataset
df_features = add_numeric_column_features(
    features=df_features,
    X=df_source,
    column='elapsed_time',
    min_values={
        'sum': 61395.0,
        'max':  990.0,
        'min':  0.0,
        'mean': 526.447,
        'mode': 0.0},
    max_values={
        'sum':  9990648000,
        'max':  3691298.0,
        'min':  3691298.0,
        'mean': 3691298.0,
        'mode': 3691298.0})

with pd.option_context('display.max_columns', None):
    display(df_features.head(6))

Unnamed: 0,session_id,level_group,elapsed_time_sum,elapsed_time_max,elapsed_time_min,elapsed_time_mean,elapsed_time_mode
0,20090312431273200,0-4,0.001411,0.052535,0.0,0.023103,0.0
1,20090312431273200,13-22,0.04374,0.344602,0.226677,0.281804,0.30132
2,20090312431273200,5-12,0.010577,0.135014,0.060002,0.096641,0.060002
3,20090312433251036,0-4,0.001352,0.063074,0.0,0.026311,0.0
4,20090312433251036,13-22,0.324157,1.0,0.318718,0.676403,1.0
5,20090312433251036,5-12,0.021933,0.221287,0.072301,0.150206,0.072301


## Data Selection

### Functions

In [16]:
def select_sessions(
        y: pd.DataFrame,
        random_state: int=1337,
        test_size: float=0.2,
        train_size:float=0.6) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Select samples from the dataset for training, validation and testing.
    The test set is selected first, then the training set is selected 
    from the remaining sessions. And finally the validation set is 
    selected from the remaining sessions.

    Parameters
    ----------
    y : pd.DataFrame
        The label dataset.
    random_state : int
        The random state to use.
    test_size : float
        The ratio of the sample to use for testing.
    train_size : float
        The ratio of the sample to use for training.

    Returns
    -------
    Tuple[np.ndarray, np.ndarray, np.ndarray]
        The selected session ids, the main dataset and the label dataset.
    """
    # select all the unique session ids
    all_session_ids = y['session_id'].unique()

    # set the random seed
    np.random.seed(random_state)

    # shuffle the session ids
    np.random.shuffle(all_session_ids)

    # select the session ids for the test set
    test, remainder = train_test_split(all_session_ids, test_size=1-test_size)

    # split the dataset into train and validation sets
    train, val = train_test_split(remainder, test_size=1-train_size)

    # print the number of sessions in each set
    print(f'Train: {len(train)}')
    print(f'Validation: {len(val)}')
    print(f'Test: {len(test)}')

    return train, val, test

In [26]:
def get_features_with_labels(features:pd.DataFrame,
                            y:pd.DataFrame) -> pd.DataFrame:
    """
    Create a complete dataset where the label dataset containing the
    target variable is joint to the features dataset.
    """
    return y.join(features.set_index(['session_id', 'level_group']),
                    on=['session_id', 'level_group'],
                    how='left')

### Create Datasets

In [27]:
random_state = 51

In [28]:
# split the dataset into train, validation and test sets
train, val, test = select_sessions(
    y=df_source_labels,
    random_state=random_state,
    test_size=0.60,
    train_size=0.75)

Train: 3495
Validation: 1165
Test: 6988


In [29]:
# get the prepared dataset
features_with_labels = get_features_with_labels(df_features, df_source_labels)

with pd.option_context('display.max_columns', None):
    display(features_with_labels.head(6))

Unnamed: 0,session_id,question_num,correct,level_group,elapsed_time_sum,elapsed_time_max,elapsed_time_min,elapsed_time_mean,elapsed_time_mode
0,20090312431273200,1,1,0-4,0.001411,0.052535,0.0,0.023103,0.0
1,20090312433251036,1,0,0-4,0.001352,0.063074,0.0,0.026311,0.0
2,20090314121766812,1,1,0-4,0.002928,0.106324,0.0,0.047996,0.0
3,20090314363702160,1,1,0-4,0.001627,0.05869,0.0,0.030143,0.0
4,20090314441803444,1,1,0-4,0.000824,0.047682,0.0,0.020862,0.0
5,20090315081004164,1,0,0-4,0.002515,0.092231,0.0,0.036151,0.0


In [34]:
# get the features for the train, validation and test datasets
df_train = features_with_labels[features_with_labels['session_id'].isin(train)] \
    .drop(columns=['session_id', 'level_group'])

df_val = features_with_labels[features_with_labels['session_id'].isin(val)] \
    .drop(columns=['session_id', 'level_group'])
    
df_test = features_with_labels[features_with_labels['session_id'].isin(test)] \
    .drop(columns=['session_id', 'level_group'])

In [41]:
print('Train      :', df_train.shape)
print('Validation :', df_val.shape)
print('Test       :', df_test.shape)

Train      : (62910, 7)
Validation : (20970, 7)
Test       : (125784, 7)


In [42]:
with pd.option_context('display.max_columns', None):
    display(df_train.head(6))

Unnamed: 0,question_num,correct,elapsed_time_sum,elapsed_time_max,elapsed_time_min,elapsed_time_mean,elapsed_time_mode
1,1,0,0.001352,0.063074,0.0,0.026311,0.0
4,1,1,0.000824,0.047682,0.0,0.020862,0.0
5,1,0,0.002515,0.092231,0.0,0.036151,0.0
9,1,0,0.002094,0.088831,0.0,0.037267,0.0
10,1,1,0.006778,0.2233,0.0,0.077021,0.0
13,1,1,0.002453,0.085622,0.0,0.039016,0.0


## Model Training