In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Raise pandas' display limits so frames up to 100 rows / 100 columns
# are rendered in full instead of being truncated.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [3]:
# Import my modules.
import sys, os
from pathlib import Path

# Absolute path of the current working directory, kept as a plain string.
current_dir = str(Path().resolve())

# Extend the import search path with the parent directory and its `input/`
# sub-directory (presumably where the `codes` package lives -- confirm).
# The membership check keeps re-runs of this cell from piling up duplicate
# entries in sys.path.
for extra_path in (current_dir + '/../', current_dir + '/../input/'):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

from codes import utils, loader

# Reload the local modules so that edits made on disk are picked up
# without restarting the kernel.
import importlib
for m in [utils, loader]:
    importlib.reload(m)

In [8]:
# Directory holding the CSV inputs, relative to the working directory.
PATH = '../input'

# Read the four CSV files in a fixed order and bind each resulting frame
# to its own name.
train_df, test_df, labels_df, submission_df = (
    pd.read_csv(f'{PATH}/{csv_stem}.csv')
    for csv_stem in ('train', 'test', 'train_labels', 'sample_submission')
)

In [10]:
# Preview the first rows of the labels table.
labels_df.head()

Unnamed: 0,session_id,correct
0,20090312431273200_q1,1
1,20090312433251036_q1,0
2,20090314121766812_q1,1
3,20090314363702160_q1,1
4,20090314441803444_q1,1


In [19]:
# Number of rows recorded for each session_id, computed the same way for
# the train and test frames (most frequent session first).
train_events_per_session, test_events_per_session = (
    frame['session_id'].value_counts() for frame in (train_df, test_df)
)

In [22]:
# All distinct session ids in the training data, in order of first appearance.
session_ids = np.array(train_df.loc[:, 'session_id'].unique())

# Sessions containing at least one row whose elapsed_time exceeds 4e7.
# NOTE(review): the 4e7 cut-off is a magic number; its unit and rationale
# are not visible here -- confirm.
exceeds_cutoff = train_df['elapsed_time'] > 4e7
peak_session_ids = train_df.loc[exceeds_cutoff, 'session_id'].unique()

In [25]:
# List the column labels of the training frame.
train_df.columns

Index(['session_id', 'index', 'elapsed_time', 'event_name', 'name', 'level',
       'page', 'room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y',
       'hover_duration', 'text', 'fqid', 'room_fqid', 'text_fqid',
       'fullscreen', 'hq', 'music', 'level_group'],
      dtype='object')

In [27]:
# Distinct values that occur in the `name` column of the training frame.
train_df['name'].unique()

array(['basic', 'undefined', 'close', 'open', 'prev', 'next'],
      dtype=object)

In [28]:
# Step 1: for every (session_id, level) pair, count the non-null `index`
# values, i.e. how many rows that session logged at that level.
grouped_df = (
    train_df.groupby(['session_id', 'level'])['index']
    .count()
    .reset_index(name='index_count')
)

# Step 2: average those per-session counts within each level.
# `index_count` is selected explicitly so that `session_id` never enters
# the aggregation: averaging ids is meaningless, and it would fail outright
# if the ids were loaded as a non-numeric dtype.
mean_counts = grouped_df.groupby('level')[['index_count']].mean()
mean_counts

Unnamed: 0_level_0,index_count
level,Unnamed: 1_level_1
0,29.31276
1,30.931573
2,42.071738
3,48.833857
4,18.192716
5,28.326004
6,90.434757
7,54.183987
8,37.80669
9,48.279226


In [29]:
# Row counts per `page` value, ordered by page number rather than by
# frequency; rows with a missing page are not counted by value_counts().
train_page_number_counts, test_page_number_counts = (
    frame['page'].value_counts().sort_index() for frame in (train_df, test_df)
)

In [30]:
# Display the per-page row counts of the training frame.
train_page_number_counts

0.0    37232
1.0    50714
2.0    24672
3.0    31350
4.0    44207
5.0    51154
6.0    45417
Name: page, dtype: int64

In [31]:
# Count values
# The three fqid-style columns, in the order used by both bundles below.
fqid_labels = ["fqid", "room_fqid", "text_fqid"]

# One value_counts() Series per column, for train and for test.
train_fqid_bundle = [train_df[column].value_counts() for column in fqid_labels]
test_fqid_bundle = [test_df[column].value_counts() for column in fqid_labels]

# Individual names for the same Series objects held in the bundles.
train_fqids, train_room_fqids, train_text_fqids = train_fqid_bundle
test_fqids, test_room_fqids, test_text_fqids = test_fqid_bundle


In [33]:
# Show the first 100 entries of the training `fqid` value counts.
train_fqids.head(100)

worker                            939555
archivist                         563259
gramps                            561000
wells                             394234
toentry                           392221
confrontation                     348723
crane_ranger                      251943
groupconvo                        227881
flag_girl                         224758
tomap                             202176
tostacks                          169123
tobasement                        162947
archivist_glasses                 158513
boss                              151822
journals                          150014
seescratches                      146223
groupconvo_flag                   136133
cs                                129873
teddy                             119041
expert                            118147
businesscards                     111142
ch3start                          107078
tunic.historicalsociety           106827
tofrontdesk                        99897
savedteddy      