# EDA:

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


In [3]:
# Event-level training data and the per-question label file.
train = pd.read_csv('data/train.csv')
train_labels = pd.read_csv('data/labels_q.csv')

# Raw labels: the session_id column carries a '_'-separated suffix; keep only
# the part before the first underscore so rows can be grouped per session.
raw_labels = pd.read_csv('data/raw/train_labels.csv')
raw_labels['session_id'] = raw_labels['session_id'].str.split('_', n=1).str[0]

# One row per session, with the remaining columns summed over that session.
user_answers = raw_labels.groupby('session_id', as_index=False).sum()

In [3]:
# Fraction of NaN entries per column of the train set.
n_rows = len(train)
missing = train.isna().sum().div(n_rows)

# Bar chart: one bar per column, bar height = fraction of rows that are NaN.
fig = px.bar(x=missing.index, y=missing.values)
fig.update_layout(
    title='Missing values in train set',
    xaxis_title='Feature',
    yaxis_title='Missing values',
)
fig.show()

In [4]:
# fqid values that occur fewer than 1000 times in the train set are set to
# 'rare'; NaN values are set to 'missing'.
fqid_counts = train['fqid'].value_counts()  # NaN is excluded from the counts
rare_values = fqid_counts[fqid_counts < 1000].index

# Vectorised replacement instead of a per-row lambda. NaN never matches
# `isin`, so missing entries pass through untouched and are filled afterwards.
train['fqid'] = (
    train['fqid']
    .mask(train['fqid'].isin(rare_values), 'rare')
    .fillna('missing')
)

In [5]:
# text_fqid values that occur fewer than 1000 times in the train set are set
# to 'rare'; NaN values are set to 'missing'.
text_fqid_counts = train['text_fqid'].value_counts()  # NaN is excluded from the counts
rare_values = text_fqid_counts[text_fqid_counts < 1000].index

# Vectorised replacement instead of a per-row lambda. NaN never matches
# `isin`, so missing entries pass through untouched and are filled afterwards.
train['text_fqid'] = (
    train['text_fqid']
    .mask(train['text_fqid'].isin(rare_values), 'rare')
    .fillna('missing')
)

In [6]:
# Collapse infrequent `text` values: anything seen fewer than 1000 times in
# the train set becomes 'rare'; NaN becomes 'none'.
text_counts = train['text'].value_counts()  # NaN is excluded from the counts
rare_values = text_counts[text_counts < 1000].index

# NaN never matches `isin`, so it is left for the fillna step.
is_rare_text = train['text'].isin(rare_values)
train['text'] = train['text'].mask(is_rare_text, 'rare').fillna('none')

In [7]:
# Missing room / screen coordinates are replaced with 0.
for coor_col in ('room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y'):
    train[coor_col] = train[coor_col].fillna(0)

In [8]:
# Missing `page` entries get the sentinel value -1.
page_sentinel = -1
train['page'] = train['page'].fillna(page_sentinel)

In [9]:
# Missing `hover_duration` entries get the sentinel value -1.
hover_sentinel = -1
train['hover_duration'] = train['hover_duration'].fillna(hover_sentinel)

In [10]:
# Per-session flags: 1 if the summed `hq` (resp. `music`) over the session's
# events is positive, otherwise 0.
# NOTE(review): this yields an "ever on" flag only; it does not verify that the
# setting is constant within a session — TODO add that check if it matters.
user_hq = (
    train.groupby('session_id')['hq']
    .sum()
    .gt(0)
    .astype(np.int64)
    .reset_index()
)

user_music = (
    train.groupby('session_id')['music']
    .sum()
    .gt(0)
    .astype(np.int64)
    .reset_index()
)

In [11]:
# Build the per-session metrics table as a NEW frame so that `user_answers`
# is not mutated through an alias. session_id is cast to an explicit 64-bit
# integer: the ids are parsed below as 17-digit numbers, which would overflow
# on platforms where plain `int` maps to 32 bits.
user_metrics = user_answers.assign(
    session_id=user_answers['session_id'].astype(np.int64)
)

# Attach the per-session hq and music flags. Both sides hold one row per
# session, so `validate` makes an accidental row fan-out fail loudly.
# The joins are inner joins: sessions absent from `train` are dropped.
user_metrics = user_metrics.merge(user_hq, on='session_id', validate='one_to_one')
user_metrics = user_metrics.merge(user_music, on='session_id', validate='one_to_one')

In [12]:
# Split the decimal digits of session_id into separate columns.
# NOTE(review): the field names reflect how this notebook reads the id layout
# (presumably a timestamp); the meaning of the last two digits is unknown.
session_digits = user_metrics["session_id"].astype(str)

# (column, slice start, slice stop, offset added after parsing, target dtype)
session_id_layout = [
    ("year", 0, 2, 0, np.uint8),
    ("month", 2, 4, 1, np.uint8),
    ("weekday", 4, 6, 0, np.uint8),
    ("hour", 6, 8, 0, np.uint8),
    ("minute", 8, 10, 0, np.uint8),
    ("second", 10, 12, 0, np.uint8),
    ("ms", 12, 15, 0, np.uint16),
    ("?", 15, 17, 0, np.uint8),
]
for column, start, stop, offset, dtype in session_id_layout:
    parsed = session_digits.str[start:stop].astype('int64') + offset
    user_metrics[column] = parsed.astype(dtype)


In [13]:
# Box plot of the per-session `correct` totals, one box per weekday.
overall_median = user_metrics['correct'].median()

fig = px.box(user_metrics, x='weekday', y='correct')
fig.update_layout(
    title='Answers by weekday',
    xaxis_title='Weekday',
    yaxis_title='Answers',
)
# Red horizontal reference line at the median over all sessions.
fig.add_shape(
    type="line",
    x0=-0.25, x1=6.25,
    y0=overall_median, y1=overall_median,
    line=dict(color="Red", width=2),
)
fig.show()

In [14]:
# Mean of the per-session `correct` totals for each weekday.
user_metrics['correct'].groupby(user_metrics['weekday']).mean()

weekday
0    13.322231
1    12.755945
2    12.661992
3    12.553625
4    12.510362
5    12.735147
6    13.068736
Name: correct, dtype: float64

In [15]:
# Box plot of the per-session `correct` totals, one box per hour.
overall_median = user_metrics['correct'].median()

fig = px.box(user_metrics, x='hour', y='correct')
fig.update_layout(
    title='Answers by hour',
    xaxis_title='Hour',
    yaxis_title='Answers',
)
# Red horizontal reference line at the median over all sessions.
fig.add_shape(
    type="line",
    x0=-0.25, x1=23.25,
    y0=overall_median, y1=overall_median,
    line=dict(color="Red", width=2),
)
fig.show()

In [16]:
# Box plot of the per-session `correct` totals, split by the hq flag.
overall_median = user_metrics['correct'].median()

fig = px.box(user_metrics, x='hq', y='correct')
# Only two categories, so the figure is narrowed to 500 px.
fig.update_layout(
    title='Answers by hq',
    xaxis_title='hq',
    yaxis_title='Answers',
    width=500,
)
# Red horizontal reference line at the median over all sessions.
fig.add_shape(
    type="line",
    x0=-0.25, x1=1.25,
    y0=overall_median, y1=overall_median,
    line=dict(color="Red", width=2),
)
fig.show()

In [17]:
# Box plot of the per-session `correct` totals, split by the music flag.
overall_median = user_metrics['correct'].median()

fig = px.box(user_metrics, x='music', y='correct')
# Title names the variable actually plotted (it previously read 'Answers by hour').
fig.update_layout(title='Answers by music', xaxis_title='music', yaxis_title='Answers')
# Red horizontal reference line at the median over all sessions.
fig.add_shape(
    type="line",
    x0=-0.25, x1=1.25,
    y0=overall_median, y1=overall_median,
    line=dict(color="Red", width=2),
)
fig.show()

In [19]:
# Add the weekday / hour digit fields of session_id to the event-level frame.
# The id is stringified once and sliced, rather than once per row per column.
train_session_digits = train["session_id"].astype(str)

train["weekday"] = train_session_digits.str[4:6].astype('int64').astype(np.uint8)
train["hour"] = train_session_digits.str[6:8].astype('int64').astype(np.uint8)

In [22]:
# Remove the event-level `hq` / `music` columns from `train`.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# re-runnable: a second execution is a no-op rather than a KeyError.
train = train.drop(columns=['hq', 'music'], errors='ignore')

In [23]:
# Persist the cleaned training frame as a pickle in the working directory.
cleaned_train_path = 'cleaned_train.pkl'
train.to_pickle(cleaned_train_path)

In [4]:
# Display the (rows, columns) dimensions of `train`.
train.shape

(26296946, 20)

In [6]:
# Ad-hoc ratio; the denominator equals the `train` row count printed by the
# `train.shape` cell.
# NOTE(review): the numerator 2684191 is not derived anywhere visible —
# TODO document what it counts or compute it from `train`.
2684191 / 26296946

0.10207234710829159