# 21. Feature Engineering

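The results up to this point have been disappointing. This notebook checks whether simple engineered features (the elapsed time and the event counts per level group) improve on a baseline that only uses the question number.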

In [1]:
import sys
from typing import Tuple
import logging

import pandas as pd
import numpy as np

from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from pycaret.classification import *

## Configure Logging

In [2]:
# only let ERROR (and above) records through from matplotlib's font manager logger
logging.getLogger('matplotlib.font_manager').setLevel(logging.ERROR)

In [3]:
# configure the root logger: timestamped INFO-level records are written
# both to the log file and to the notebook output (stdout)
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
        handlers=[
        logging.FileHandler("ex05_06.log"),
        logging.StreamHandler(sys.stdout)
    ])

# marks the start of the run in the log
logging.info("Started")

2023-03-08 14:06:42 INFO     Started


## Load Source Data

In [4]:
# load the source training set
# load the source training set
# index_col=1 uses the second column of the CSV file as the row index
df_source = pd.read_csv('data/train.csv.gz', compression='gzip', index_col=1)

# show the shape and the first rows without truncating the column list
print(df_source.shape)
with pd.option_context('display.max_columns', None):
    display(df_source.head(3))

(13174211, 19)


Unnamed: 0_level_0,session_id,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,20090312431273200,0,cutscene_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,,,,0-4
1,20090312431273200,1323,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4
2,20090312431273200,831,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4


In [5]:
# load the source training labels
# load the source training labels
df_source_labels = pd.read_csv('data/train_labels.csv')

# show the shape and the first rows without truncating the column list
print(df_source_labels.shape)
with pd.option_context('display.max_columns', None):
    display(df_source_labels.head(3))

(212022, 2)


Unnamed: 0,session_id,correct
0,20090312431273200_q1,1
1,20090312433251036_q1,0
2,20090314121766812_q1,1


## Functions

### Functions to prepare the data

In [6]:
def map_question_to_level_group(question_number):
    """
    Looks up the level group that a question belongs to.

    Parameters
    ----------
    question_number : int
        The question number.

    Returns
    -------
    str or None
        '0-4' for the questions 1 to 3, '5-12' for the questions 4 to 13,
        '13-22' for the questions 14 to 18 and None for any other value.
    """
    # each entry pairs a level group with the question numbers it covers
    group_ranges = (
        ('0-4', range(1, 4)),
        ('5-12', range(4, 14)),
        ('13-22', range(14, 19)),
    )

    for level_group, question_numbers in group_ranges:
        if question_number in question_numbers:
            return level_group

    # the question number does not belong to any known level group
    return None

def prepare_label_dataset(data : pd.DataFrame) -> pd.DataFrame:
    """
    Prepares the label dataset and adds columns for the question number
    and the level group.

    The 'session_id' column of the raw labels holds values of the form
    '<session id>_q<question number>'. It is split into a numeric
    'session_id' and a numeric 'question_num' column.

    Parameters
    ----------
    data : pd.DataFrame
        The label dataset with the columns 'session_id' and 'correct'.

    Returns
    -------
    pd.DataFrame
        The prepared label dataset with the columns 'session_id',
        'question_num', 'correct' and 'level_group'. The index of
        `data` is preserved and `data` itself is not modified.
    """
    # split the combined id only once and reuse both parts
    id_parts = data['session_id'].str.split('_')

    df_labels = (
        data
        .assign(
            # use an explicit 64-bit integer: the session ids have 17 digits
            # and the width of a plain `int` dtype depends on the platform
            session_id=id_parts.str[0].astype('int64'),
            # drop the leading 'q' of the question part
            question_num=id_parts.str[1].str[1:].astype('int64'),
        )
        # add the level group column
        .assign(level_group=lambda df: df['question_num'].map(map_question_to_level_group))
        [['session_id', 'question_num', 'correct', 'level_group']]
    )

    return df_labels

In [7]:
def prepare_main_dataset(data : pd.DataFrame) -> pd.DataFrame:
    """
    Prepares the main dataset: duplicated rows are removed, the index is
    renumbered and the columns that are not needed are dropped.

    Parameters
    ----------
    data : pd.DataFrame
        The main dataset.

    Returns
    -------
    pd.DataFrame
        The prepared main dataset.
    """
    unused_columns = ['fullscreen', 'hq', 'music', 'page', 'hover_duration', 'text']

    # duplicates are detected on the complete rows, before any column is dropped
    deduplicated = data.drop_duplicates().reset_index(drop=True)

    return deduplicated.drop(columns=unused_columns)

In [8]:
def select_sessions(
        y: pd.DataFrame,
        sample_size: int,
        random_state: int=1337) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Selects a random sample of sessions and splits it into a train,
    a validation and a test set.

    Parameters
    ----------
    y : pd.DataFrame
        The label dataset. Only its 'session_id' column is used.
    sample_size : int
        The number of sessions to select.
    random_state : int
        The seed used for the sampling and for both splits, so that the
        same arguments always produce the same three sets.

    Returns
    -------
    Tuple[np.ndarray, np.ndarray, np.ndarray]
        The session ids of the train, the validation and the test set
        (in this order).
    """
    # select all the unique session ids
    all_session_ids = y['session_id'].unique()

    # draw the sample without replacement from a seeded generator
    rng = np.random.default_rng(random_state)
    session_ids = rng.choice(all_session_ids, size=sample_size, replace=False)

    # hold out 40% of the sample, then halve the hold-out into
    # a test and a validation set
    train, test = train_test_split(
        session_ids, test_size=0.4, random_state=random_state)
    test, val = train_test_split(
        test, test_size=0.5, random_state=random_state)

    # print the number of sessions in each set
    print(f'Train: {len(train)}')
    print(f'Validation: {len(val)}')
    print(f'Test: {len(test)}')

    return train, val, test

## Data Preprocessing

In [9]:
# prepare the main dataset
# prepare the main dataset
# NOTE: this rebinds df_source to the prepared frame, so the cell is not
# idempotent: prepare_main_dataset drops columns by name, and running the
# cell a second time fails because these columns no longer exist.
df_source = prepare_main_dataset(df_source)

with pd.option_context('display.max_columns', None):
    print(df_source.shape)
    display(df_source.head(3))  

(13173445, 13)


Unnamed: 0,session_id,elapsed_time,event_name,name,level,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,fqid,room_fqid,text_fqid,level_group
0,20090312431273200,0,cutscene_click,basic,0,-413.991405,-159.314686,380.0,494.0,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0-4
1,20090312431273200,1323,person_click,basic,0,-413.991405,-159.314686,380.0,494.0,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0-4
2,20090312431273200,831,person_click,basic,0,-413.991405,-159.314686,380.0,494.0,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0-4


In [10]:
# prepare the label dataset
# prepare the label dataset
# NOTE: this rebinds df_source_labels to the prepared frame, so the raw
# labels are no longer available under this name afterwards.
df_source_labels = prepare_label_dataset(df_source_labels)

# show three rows; the fixed random_state keeps the displayed rows stable
with pd.option_context('display.max_columns', None):
    display(df_source_labels.sample(n=3, random_state=51))

Unnamed: 0,session_id,question_num,correct,level_group
21476,22010116250792520,2,1,0-4
84068,21000111433937450,8,1,5-12
171219,21040510125933256,15,0,13-22


## Data Selection

In [11]:
# seed that is passed to select_sessions and used as the pycaret session id
random_state = 51
# use every session that has labels
sample_size = df_source_labels['session_id'].nunique()
# for a quick run on a subset, set a smaller value instead, e.g. sample_size = 2000

In [12]:
# split the selected session ids into the train, validation and test sets;
# the three arrays are notebook-level names that later cells read directly
train, val, test = select_sessions(
    y=df_source_labels,
    sample_size=sample_size,
    random_state=random_state)

Train: 7067
Validation: 2356
Test: 2356


## Functions for Modeling Data

### Create Basic Dataset

In [13]:
def create_basic_session_features(df_session:pd.DataFrame,
                                  level_group:str) -> pd.DataFrame:
    """
    Create the baseline feature dataset for one level group of a single
    session. It only contains the level group itself.

    Parameters
    ----------
    df_session : pd.DataFrame
        The session dataset.
    level_group : str
        The level group to create the features for.

    Returns
    -------
    pd.DataFrame
        A dataset with the single column 'level_group'. It has one row
        when the session has events in the level group, otherwise none.
    """
    # keep only the level group column of the events in the requested group
    in_group = df_session['level_group'] == level_group
    group_column = df_session.loc[in_group, ['level_group']]

    # collapse the events into a single row
    return group_column.drop_duplicates().reset_index(drop=True)

# test the function
# smoke test with the first training session
# NOTE: session_id and df_session are notebook-level names; df_session is
# rebound from the raw session events to the created features below.
session_id = train[0]
df_session = df_source[df_source['session_id'] == session_id]

df_session = create_basic_session_features(df_session, '0-4')
print(df_session.shape)
with pd.option_context('display.max_columns', None):
    display(df_session.tail(3))

(1, 1)


Unnamed: 0,level_group
0,0-4


### Generic functions for creating feature datasets

In [14]:
def create_session_dataset(X:pd.DataFrame,
                           y:pd.DataFrame,
                           session_id:int,
                           create_features) -> Tuple[pd.DataFrame, np.ndarray]:
    """
    Create the feature dataset and the labels for a single session.

    The features are created once per level group and then repeated for
    every labeled question of that level group, together with the
    question number.

    Parameters
    ----------
    X : pd.DataFrame
        The main dataset. It must contain a 'session_id' column.
    y : pd.DataFrame
        The label dataset with the columns 'session_id', 'question_num',
        'correct' and 'level_group'.
    session_id : int
        The session id.
    create_features : callable
        A function that is called as create_features(df_session, level_group)
        and returns the feature dataset for this level group. The result
        must contain a 'level_group' column, which is removed again here.

    Returns
    -------
    Tuple[pd.DataFrame, np.ndarray]
        The features (one row per feature row and question, with an
        added 'question_num' column) and the matching labels. Both always
        have the same length: a label is repeated once per feature row of
        its question, so a question without feature rows contributes
        neither features nor a label.

    Raises
    ------
    KeyError
        If a label row refers to a level group other than '0-4', '5-12'
        or '13-22'.
    """
    # select the session data
    df_session_labels = y.query('session_id == @session_id')
    df_session = X.query('session_id == @session_id')

    # create the features for each level group
    features_group = {
        level_group: create_features(df_session, level_group)
        for level_group in ('0-4', '5-12', '13-22')
    }

    # collect the parts in lists and concatenate once at the end;
    # concatenating inside the loop copies all previous rows each time
    feature_parts = []
    y_true = []

    # iterate over all the questions answered in the session
    for question_num, correct, level_group in zip(
            df_session_labels['question_num'],
            df_session_labels['correct'],
            df_session_labels['level_group']):
        # get the features for the level group
        question_features = features_group[level_group] \
            .assign(question_num=question_num) \
            .drop('level_group', axis=1)

        feature_parts.append(question_features)

        # one label per feature row keeps the features and labels aligned
        y_true.extend([correct] * len(question_features))

    # pd.concat does not accept an empty list (session without labels)
    if not feature_parts:
        return pd.DataFrame(), np.array(y_true)

    features = pd.concat(feature_parts, axis=0)

    return features.reset_index(drop=True), np.array(y_true)

# test the function
# smoke test with the first training session
# NOTE: session_id, df_session and y_session are notebook-level names
session_id = train[0]
df_session, y_session = create_session_dataset(
    X=df_source,
    y=df_source_labels,
    session_id=session_id,
    create_features=create_basic_session_features)

# attach the labels as a column to show them next to the features
df_session['correct'] = y_session

print(df_session.shape)
with pd.option_context('display.max_columns', None):
    display(df_session.head(3))

(18, 2)


Unnamed: 0,question_num,correct
0,1,1
1,2,1
2,3,1


In [15]:
def create_dataset(X:pd.DataFrame,
                   y:pd.DataFrame,
                   session_list:np.ndarray,
                   create_features) -> pd.DataFrame:
    """
    Create the modeling dataset for a list of sessions.

    Parameters
    ----------
    X : pd.DataFrame
        The main dataset.
    y : pd.DataFrame
        The label dataset.
    session_list : np.ndarray
        The ids of the sessions to include.
    create_features : callable
        The feature function that is passed on to create_session_dataset.

    Returns
    -------
    pd.DataFrame
        The features of all the sessions with the label in the column
        'correct'. The frame is empty when `session_list` is empty.
    """
    # collect the per-session frames and concatenate once at the end;
    # concatenating inside the loop copies all previous rows each time
    session_frames = []

    for session_id in tqdm(session_list):
        # create the session dataset
        df_session, y_session = create_session_dataset(
            X=X,
            y=y,
            session_id=session_id,
            create_features=create_features)

        # add the label for pycaret training
        df_session['correct'] = y_session

        session_frames.append(df_session)

    # pd.concat does not accept an empty list
    if not session_frames:
        return pd.DataFrame()

    return pd.concat(session_frames, axis=0).reset_index(drop=True)


# test the function
# smoke test with the first three training sessions only
df_basic = create_dataset(
    X=df_source,
    y=df_source_labels,
    session_list=train[:3],
    create_features=create_basic_session_features)

print(df_basic.shape)
with pd.option_context('display.max_columns', None):
    display(df_basic.head(3))

  0%|          | 0/3 [00:00<?, ?it/s]

(54, 2)


Unnamed: 0,question_num,correct
0,1,1
1,2,1
2,3,1


### Elapsed Time Features

In [16]:
def create_elapsed_time_session_features(df_session:pd.DataFrame,
                                  level_group:str) -> pd.DataFrame:
    """
    Create the elapsed time feature for one level group of a single
    session: the largest 'elapsed_time' value of the level group.

    Parameters
    ----------
    df_session : pd.DataFrame
        The session dataset.
    level_group : str
        The level group to create the features for.

    Returns
    -------
    pd.DataFrame
        A dataset with the columns 'level_group' and 'elapsed_time'.
    """
    # restrict the session to the events of the requested level group
    in_group = df_session['level_group'] == level_group
    group_events = df_session[in_group]

    # largest elapsed time per level group (a single group remains here)
    max_elapsed = group_events.groupby('level_group')[['elapsed_time']].max()

    # turn the group key back into a regular column
    return max_elapsed.reset_index()

# test the function
# smoke test with the first training session
# NOTE: session_id and df_session are notebook-level names; df_session is
# rebound from the raw session events to the created features below.
session_id = train[0]
df_session = df_source[df_source['session_id'] == session_id]

df_session = create_elapsed_time_session_features(df_session, '0-4')
print(df_session.shape)
with pd.option_context('display.max_columns', None):
    display(df_session.tail(3))

(1, 2)


Unnamed: 0,level_group,elapsed_time
0,0-4,2657121


### Event Count Features

In [17]:
def create_event_count_session_features(df_session:pd.DataFrame,
                                  level_group:str) -> pd.DataFrame:
    """
    Create the event count features for one level group of a single
    session: the number of events per event name.

    Parameters
    ----------
    df_session : pd.DataFrame
        The session dataset.
    level_group : str
        The level group to create the features for.

    Returns
    -------
    pd.DataFrame
        A dataset with one row, one column per event name that occurs in
        the level group (holding its count) and a 'level_group' column.
    """
    # restrict the session to the events of the requested level group
    in_group = df_session['level_group'] == level_group
    group_events = df_session[in_group]

    # one row per event name with its number of occurrences
    event_counts = group_events \
        .groupby('event_name') \
        .agg({'event_name': ['count']})

    # transpose into a single row with one column per event name
    df_features = event_counts.T.assign(level_group=level_group)

    return df_features.reset_index(drop=True)

# test the function
# smoke test with the first training session
# NOTE: session_id and df_session are notebook-level names; df_session is
# rebound from the raw session events to the created features below.
session_id = train[0]
df_session = df_source[df_source['session_id'] == session_id]

df_session = create_event_count_session_features(df_session, '0-4')
print(df_session.shape)
with pd.option_context('display.max_columns', None):
    display(df_session.head(10))

(1, 12)


event_name,checkpoint,cutscene_click,map_click,map_hover,navigate_click,notebook_click,notification_click,object_click,object_hover,observation_click,person_click,level_group
0,1,34,2,2,74,3,12,39,8,3,23,0-4


## Model Training

In [18]:
def test_features(X:pd.DataFrame,
                  y:pd.DataFrame,
                  create_features) -> None:
    """
    Build the datasets with a feature function, compare the pycaret
    classifiers on them and print a classification report for the first
    of the selected models.

    NOTE: besides its parameters, this function reads the notebook-level
    names `train`, `val`, `test` (session id arrays) and `random_state`,
    so the data selection cells must have been run before it is called.

    Parameters
    ----------
    X : pd.DataFrame
        The main dataset.
    y : pd.DataFrame
        The label dataset.
    create_features : callable
        The feature function that is passed on to create_dataset.

    Returns
    -------
    None
        The results are only printed.
    """
    # create the datasets
    df_train = create_dataset(X=X, y=y, session_list=train, create_features=create_features)
    df_val = create_dataset(X=X, y=y, session_list=val, create_features=create_features)
    df_test = create_dataset(X=X, y=y, session_list=test, create_features=create_features)

    # prepare the classifier: df_train is the training data and df_val is
    # passed as the explicit test data of the pycaret experiment
    # NOTE(review): fix_imbalance is False, so fix_imbalance_method
    # presumably has no effect - TODO confirm
    classifier = setup(
        data=df_train,
        test_data=df_val,
        target='correct',
        train_size=1,
        session_id=random_state,
        fix_imbalance=False,
        fix_imbalance_method='RandomOverSampler',
        html=False,
        verbose=True)
    
    # compare the models (knn is left out); n_select=5 asks for five models
    top_model = compare_models(exclude=['knn'], n_select=5)

    # test the top model: only the first of the selected models is
    # evaluated on the test sessions
    df_predicted = predict_model(estimator=top_model[0], data=df_test)
    print(classification_report(y_true=df_predicted.correct, y_pred=df_predicted.prediction_label))    

### Basic Dataset

In [19]:
# baseline: the question number is the only feature
test_features(
    X=df_source,
    y=df_source_labels,
    create_features=create_basic_session_features)

  0%|          | 0/7067 [00:00<?, ?it/s]

  0%|          | 0/2356 [00:00<?, ?it/s]

  0%|          | 0/2356 [00:00<?, ?it/s]

                    Description             Value
0                    Session id                51
1                        Target           correct
2                   Target type            Binary
3           Original data shape       (169614, 2)
4        Transformed data shape       (169614, 2)
5   Transformed train set shape       (127206, 2)
6    Transformed test set shape        (42408, 2)
7              Numeric features                 1
8                    Preprocess              True
9               Imputation type            simple
10           Numeric imputation              mean
11       Categorical imputation              mode
12               Fold Generator   StratifiedKFold
13                  Fold Number                10
14                     CPU Jobs                -1
15                      Use GPU             False
16               Log Experiment             False
17              Experiment Name  clf-default-name
18                          USI              829e


                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
rf               Random Forest Classifier    0.7318  0.7301  0.9012  0.7616   
dt               Decision Tree Classifier    0.7315  0.7301  0.9050  0.7598   
gbc          Gradient Boosting Classifier    0.7315  0.7300  0.9405  0.7451   
et                 Extra Trees Classifier    0.7315  0.7301  0.9050  0.7598   
xgboost         Extreme Gradient Boosting    0.7315  0.7301  0.9050  0.7598   
lightgbm  Light Gradient Boosting Machine    0.7315  0.7301  0.9050  0.7598   
ada                  Ada Boost Classifier    0.7295  0.7224  0.9785  0.7296   
lr                    Logistic Regression    0.7042  0.5607  1.0000  0.7042   
nb                            Naive Bayes    0.7042  0.6260  1.0000  0.7042   
svm                   SVM - Linear Kernel    0.7042  0.0000  1.0000  0.7042   
ridge                    Ridge Classifier    0.7042  0.0000  1.0000  0.7042   
qda       Quadratic Discriminant Analysis    0.7042 

### Elapsed Time Dataset

In [20]:
# question number plus the largest elapsed time of the level group
test_features(
    X=df_source,
    y=df_source_labels,
    create_features=create_elapsed_time_session_features)

  0%|          | 0/7067 [00:00<?, ?it/s]

  0%|          | 0/2356 [00:00<?, ?it/s]

  0%|          | 0/2356 [00:00<?, ?it/s]

                    Description             Value
0                    Session id                51
1                        Target           correct
2                   Target type            Binary
3           Original data shape       (169614, 3)
4        Transformed data shape       (169614, 3)
5   Transformed train set shape       (127206, 3)
6    Transformed test set shape        (42408, 3)
7              Numeric features                 2
8                    Preprocess              True
9               Imputation type            simple
10           Numeric imputation              mean
11       Categorical imputation              mode
12               Fold Generator   StratifiedKFold
13                  Fold Number                10
14                     CPU Jobs                -1
15                      Use GPU             False
16               Log Experiment             False
17              Experiment Name  clf-default-name
18                          USI              c24f


                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
gbc          Gradient Boosting Classifier    0.7371  0.7382  0.9444  0.7483   
lightgbm  Light Gradient Boosting Machine    0.7364  0.7383  0.9369  0.7507   
xgboost         Extreme Gradient Boosting    0.7336  0.7326  0.9312  0.7505   
ada                  Ada Boost Classifier    0.7303  0.7288  0.9721  0.7325   
lr                    Logistic Regression    0.7042  0.4266  1.0000  0.7042   
ridge                    Ridge Classifier    0.7042  0.0000  1.0000  0.7042   
lda          Linear Discriminant Analysis    0.7042  0.5636  1.0000  0.7042   
dummy                    Dummy Classifier    0.7042  0.5000  1.0000  0.7042   
nb                            Naive Bayes    0.7025  0.5218  0.9949  0.7045   
qda       Quadratic Discriminant Analysis    0.7024  0.6224  0.9946  0.7045   
rf               Random Forest Classifier    0.6496  0.6503  0.7514  0.7512   
dt               Decision Tree Classifier    0.6493 

### Event Count

In [21]:
# question number plus the number of events per event name in the level group
test_features(
    X=df_source,
    y=df_source_labels,
    create_features=create_event_count_session_features)

  0%|          | 0/7067 [00:00<?, ?it/s]

  0%|          | 0/2356 [00:00<?, ?it/s]

  0%|          | 0/2356 [00:00<?, ?it/s]

                    Description             Value
0                    Session id                51
1                        Target           correct
2                   Target type            Binary
3           Original data shape      (169614, 13)
4        Transformed data shape      (169614, 13)
5   Transformed train set shape      (127206, 13)
6    Transformed test set shape       (42408, 13)
7              Numeric features                12
8      Rows with missing values             38.6%
9                    Preprocess              True
10              Imputation type            simple
11           Numeric imputation              mean
12       Categorical imputation              mode
13               Fold Generator   StratifiedKFold
14                  Fold Number                10
15                     CPU Jobs                -1
16                      Use GPU             False
17               Log Experiment             False
18              Experiment Name  clf-default-name


                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.7502  0.7652  0.9251  0.7677   
gbc          Gradient Boosting Classifier    0.7485  0.7602  0.9396  0.7599   
xgboost         Extreme Gradient Boosting    0.7455  0.7582  0.9140  0.7684   
ada                  Ada Boost Classifier    0.7400  0.7446  0.9375  0.7535   
rf               Random Forest Classifier    0.7399  0.7424  0.9099  0.7652   
et                 Extra Trees Classifier    0.7398  0.7450  0.9020  0.7686   
lr                    Logistic Regression    0.7121  0.6578  0.9746  0.7177   
lda          Linear Discriminant Analysis    0.7119  0.6563  0.9717  0.7184   
ridge                    Ridge Classifier    0.7114  0.0000  0.9793  0.7156   
dummy                    Dummy Classifier    0.7042  0.5000  1.0000  0.7042   
qda       Quadratic Discriminant Analysis    0.6995  0.6530  0.9145  0.7283   
nb                            Naive Bayes    0.6977 

## If Rules

In [23]:
# create the datasets
# create the train, validation and test datasets with the basic features
# (the question number is the only feature)
df_train = create_dataset(X=df_source, y=df_source_labels, session_list=train, create_features=create_basic_session_features)
df_val = create_dataset(X=df_source, y=df_source_labels, session_list=val, create_features=create_basic_session_features)
df_test = create_dataset(X=df_source, y=df_source_labels, session_list=test, create_features=create_basic_session_features)

  0%|          | 0/7067 [00:00<?, ?it/s]