# 02-02 : Time Series

Experiment with the data as a time series.

In [None]:
import sys
import gc
import logging

from typing import Iterable
from typing import Tuple, List

import pandas as pd
import numpy as np

import keras
from keras import optimizers
from keras.models import Sequential, Model

from keras.layers import Conv2D, Dense, Dropout, Flatten
from keras.layers import LeakyReLU, LSTM, MaxPooling2D, concatenate, Reshape
from keras.layers import Concatenate

from keras.layers import Input
from keras import callbacks
from keras import regularizers

from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

from tqdm.auto import tqdm
import matplotlib.pyplot as plt

## Configure Logging

In [None]:
# Raise the threshold of the 'matplotlib.font_manager' logger so that only
# records at ERROR level or above are emitted by it.
logging.getLogger('matplotlib.font_manager').setLevel(logging.ERROR)

In [None]:
# Configure the root logger: INFO and above, timestamped records, written
# to both a log file and the standard output stream.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[
        logging.FileHandler("ex05_06.log"),
        logging.StreamHandler(sys.stdout),
    ],
)

logging.info("Started")

## Load Source Data

In [None]:
# Column name -> dtype mapping, grouped by target dtype.
dtypes = {
    # columns stored with the pandas 'category' dtype
    **dict.fromkeys(
        [
            "session_id",
            "event_name",
            "name",
            "page",
            "text",
            "fqid",
            "room_fqid",
            "text_fqid",
            "fullscreen",
            "hq",
            "music",
            "level_group",
        ],
        "category",
    ),
    # integer columns
    "elapsed_time": np.int32,
    "level": np.uint8,
    # single-precision float columns
    **dict.fromkeys(
        [
            "room_coor_x",
            "room_coor_y",
            "screen_coor_x",
            "screen_coor_y",
            "hover_duration",
        ],
        np.float32,
    ),
}

In [None]:
# load the source training set
# (gzip-compressed CSV, read with the column dtypes declared in `dtypes`)
df_source = pd.read_csv('../data/train.csv.gz', compression='gzip', dtype=dtypes)

# show the dimensions and a three-row preview with every column visible
# (display is not imported in this file; presumably the IPython/Jupyter builtin)
print(df_source.shape)
with pd.option_context('display.max_columns', None):
    display(df_source.head(3))

In [None]:
# load the source training labels
# (read with pandas' default dtype inference; no dtype mapping is passed)
df_source_labels = pd.read_csv('../data/train_labels.csv')

# show the dimensions and a three-row preview with every column visible
# (display is not imported in this file; presumably the IPython/Jupyter builtin)
print(df_source_labels.shape)
with pd.option_context('display.max_columns', None):
    display(df_source_labels.head(3))

## Functions

In [None]:
def map_question_to_level_group(question_number):
    """
    Looks up the level group that a question belongs to.

    Parameters
    ----------
    question_number : int
        The question number.

    Returns
    -------
    str
        The level group: '0-4' for questions 1-3, '5-12' for questions
        4-13 and '13-22' for questions 14-18. None is returned for any
        other value.
    """
    # (level group, question numbers belonging to it)
    question_groups = (
        ('0-4', range(1, 4)),
        ('5-12', range(4, 14)),
        ('13-22', range(14, 19)),
    )

    for group_label, question_numbers in question_groups:
        if question_number in question_numbers:
            return group_label

    # not a known question number
    return None

In [None]:
def find_problem_sessions(data : pd.DataFrame) -> set:
    """
    Finds the sessions that have known data problems.

    A session is reported when any of the following holds:

    * it has rows duplicated on (session_id, index),
    * its index column is not monotonically increasing in row order,
    * it is one of the hard-coded sessions listed in the function body
      (found by experimentation to have time differences < -2000).

    This idea is taken from the following Kaggle notebook:
    https://www.kaggle.com/code/abaojiang/eda-on-game-progress/notebook?scriptVersionId=120133716

    Parameters
    ----------
    data : pd.DataFrame
        The data to search. It must contain the columns ``session_id``
        and ``index``.

    Returns
    -------
    set
        The set of session ids that have a problem. The hard-coded
        session ids are always included, whether or not they occur in
        ``data``.
    """
    # find sessions duplicated on session_id and index; only the frame
    # passed in is inspected, so the mask and the rows always line up
    duplicated_rows = data.duplicated(subset=["session_id", "index"], keep=False)
    sessions_with_duplicates = \
        data.loc[duplicated_rows, "session_id"].unique().tolist()

    # find sessions with reversed indexes
    sessions_with_reversed_index = [
        sess_id
        for sess_id, gp in data.groupby("session_id", observed=True)
        if not gp["index"].is_monotonic_increasing
    ]

    # via experimentation these sessions have been found to have time
    # differences < -2000
    negative_time_diff_sessions = [
        '21030417085341900', '21070111080982292',
        '21090108302064196', '21090409222921812']

    # combine the three lists into a single set
    return set(
        sessions_with_duplicates
        + sessions_with_reversed_index
        + negative_time_diff_sessions)


In [None]:
def prepare_label_dataset(data : pd.DataFrame) -> pd.DataFrame:
    """
    Builds the label dataset with one row per answered question.

    The ``session_id`` column of the raw labels is split on '_': the first
    part becomes the integer session id, and the second part (with its
    first character stripped) becomes the integer question number.

    Parameters
    ----------
    data : pd.DataFrame
        The label dataset. It must contain the columns ``session_id``
        and ``correct``.

    Returns
    -------
    pd.DataFrame
        The prepared label dataset with the columns ``session_id``,
        ``question_num``, ``correct`` and ``level_group``.
    """
    # the raw identifier holds the session and the question, '_'-separated
    id_parts = data['session_id'].str.split('_')
    session_ids = id_parts.str[0].astype(int)
    question_numbers = id_parts.str[1].str[1:].astype(int)

    # assemble the output in the order session_id, question_num, correct
    df_labels = data[['correct']].copy()
    df_labels.insert(0, 'session_id', session_ids)
    df_labels.insert(1, 'question_num', question_numbers)

    # add the level group column
    df_labels['level_group'] = \
        df_labels['question_num'].apply(map_question_to_level_group)

    return df_labels

In [None]:
def prepare_main_dataset(data : pd.DataFrame) -> pd.DataFrame:
    """
    Prepares the main dataset: drops fully duplicated rows, renumbers the
    rows from zero and removes the columns that are not needed.

    Parameters
    ----------
    data : pd.DataFrame
        The main dataset.

    Returns
    -------
    pd.DataFrame
        The prepared main dataset.
    """
    # the columns that are removed from the prepared dataset
    unused_columns = [
        'fullscreen', 'hq', 'music', 'page', 'hover_duration', 'text']

    # duplicates are detected on the complete rows, i.e. before any
    # column is removed
    deduplicated = data.drop_duplicates().reset_index(drop=True)

    return deduplicated.drop(columns=unused_columns)

## Data Preprocessing

In [None]:
# de-duplicate the main dataset and drop the columns that are not needed
df_source = prepare_main_dataset(df_source)

# identify the sessions with problems and keep only the other rows
problem_sessions = find_problem_sessions(df_source)
df_source = df_source.loc[~df_source['session_id'].isin(problem_sessions)]

# report the remaining dimensions and preview the first rows
print(df_source.shape)
with pd.option_context('display.max_columns', None):
    display(df_source.head(3))

In [None]:
# prepare the label dataset
df_source_labels = prepare_label_dataset(df_source_labels)

# remove sessions with problems
# The label session ids are integers (prepare_label_dataset casts them with
# astype(int)), whereas problem_sessions contains string ids. Compare on the
# string form of both sides so that the problem sessions are really removed.
df_source_labels = df_source_labels[
    ~df_source_labels['session_id']
    .astype(str)
    .isin({str(sess_id) for sess_id in problem_sessions})]

with pd.option_context('display.max_columns', None):
    display(df_source_labels.sample(n=3, random_state=51))

## Functions to Create Features

In [None]:
def create_initial_features(X:pd.DataFrame,
                            y:pd.DataFrame) -> pd.DataFrame:
    """
    Creates the initial dataset to which features will be added.

    The result holds one row for every (session_id, level_group)
    combination that occurs in the labels.

    Parameters
    ----------
    X : pd.DataFrame
        The main dataset. It is currently not used, and is kept so that
        the function has the same signature as its callers expect.
    y : pd.DataFrame
        The label dataset. It must contain the columns ``session_id``,
        ``level_group`` and ``correct``.

    Returns
    -------
    pd.DataFrame
        A frame with the columns ``session_id`` and ``level_group``,
        sorted by those two columns.
    """
    # The count of 'correct' is only a vehicle to obtain one row per group;
    # the counted column itself is dropped again. The labels are taken from
    # the ``y`` argument rather than from a notebook-level variable.
    return (
        y
        .groupby(['session_id', 'level_group'])
        .agg({'correct': ['count']})
        .reset_index()
        .droplevel(1, axis=1)
        .drop(columns=['correct'])
        .sort_values(['session_id', 'level_group'])
    )

# try the function out on the prepared datasets
df_features = create_initial_features(X=df_source, y=df_source_labels)

# preview the first rows with every column visible
with pd.option_context('display.max_columns', None):
    display(df_features.head(6))

### Time Series

In [None]:
# the list of all categorical columns
# NOTE: 'level' is declared as np.uint8 in `dtypes`, not as 'category';
# it is listed here regardless.
CATEGORICAL_COLUMNS = [
    'event_name',
    'name',
    'level',
    'fqid',
    'room_fqid',
    'text_fqid'
]

In [None]:
# print the number of distinct values of every categorical column; the
# column name is left-justified to 11 characters so the counts line up
print('UNIQUE COUNT')
for column in CATEGORICAL_COLUMNS:
    print(f'\t{column.ljust(11)}:', df_source[column].nunique())


In [None]:
# the categorical columns to use, and the order they are when limiting
# the number to include.
# NOTE: every entry needs its own trailing comma. Adjacent string literals
# are concatenated by Python, so a missing comma silently merges two
# column names into one.
SELECTED_CATEGORICAL_COLUMNS = [
    'level',
    'event_name',
    'room_fqid',
    'name',
    'text_fqid',
]

# the base columns to include in the time series
# (these are always selected; categorical columns are added on top of them)
STANDARD_COLUMNS = [
    'session_id',
    'index',
    'elapsed_time',
    'level_group'
]

In [None]:
session_id = 20110410405810250
level_group = '5-12'

# the number of categorical columns to include
cat_count = 1

# select the columns to use in the time series, ordered by the event index
df_selected = (
    df_source[STANDARD_COLUMNS + SELECTED_CATEGORICAL_COLUMNS[:cat_count]]
    .sort_values('index')
)

# add the time difference between consecutive events of the same
# session and level group
df_selected['time_diff'] = (
    df_selected
    .groupby(['session_id', 'level_group'])['elapsed_time']
    .diff()
)

# there can be negative time differences, so we need to remove them,
# but within limits
#assert df_selected['time_diff'].min() > -1000

# df_selected['time_diff'] = df_selected['time_diff'] \
#     .apply(lambda x: 0 if x < 0 else x)

#display(df_selected.head(10))

# display the sessions with time differences below -2000, ordered with
# the most negative difference first
(
    df_selected.loc[df_selected['time_diff'] < -2000]
    .sort_values('time_diff')['session_id']
    .unique()
)


In [None]:
# plot the elapsed time of a single session
# session_id is read with the 'category' dtype, so its values are strings;
# comparing against an integer literal would select no rows. Compare on the
# string form instead.
df_selected[df_selected['session_id'].astype(str) == '20110623185050776'] \
    .elapsed_time \
    .plot(kind='line')

In [None]:
# plot the level progression of a single session
# session_id is read with the 'category' dtype, so its values are strings;
# comparing against an integer literal would select no rows. Compare on the
# string form instead.
df_source[df_source['session_id'].astype(str) == '20110623185050776'] \
    .level \
    .plot(kind='line')