In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the two input tables from disk: the training events and the labels.
# NOTE(review): the label file is read from the working directory while the
# training file is read from data/ — confirm this is intentional.
train = pd.read_csv(filepath_or_buffer='data/train.csv')
train_labels = pd.read_csv(filepath_or_buffer='labels_q1.csv')

In [3]:
# Keep only the rows whose level_group is '0-4'; the column is constant after
# that filter, so it is removed in the same step.
train = train.loc[train['level_group'] == '0-4'].drop(columns='level_group')

In [4]:
# 1. Remove the screen coordinate columns.
# 2. Discard every row that is missing either room coordinate.
train = (
    train
    .drop(columns=['screen_coor_x', 'screen_coor_y'])
    .dropna(subset=['room_coor_x', 'room_coor_y'])
)

In [5]:
# Order the events by session first and by elapsed time within each session.
train = train.sort_values(by=['session_id', 'elapsed_time'])

# Number the events of every session 0, 1, 2, ... in that sorted order.
train['event_index'] = train.groupby(by='session_id').cumcount()

In [6]:
# Time elapsed since the previous row of the same session. The first row of a
# session has no predecessor, so its difference (NaN) is replaced by 0.
elapsed_by_session = train.groupby('session_id')['elapsed_time']
train['elapsed_time_diff'] = elapsed_by_session.diff().fillna(0)

In [7]:
# The 'index' column is no longer needed (event_index was computed above).
train = train.drop(columns=['index'])

In [31]:
# Fraction of missing values per column (mean of the boolean NaN mask).
train.isna().mean()

session_id           0.0
elapsed_time         0.0
event_name           0.0
level                0.0
room_coor_x          0.0
room_coor_y          0.0
fqid                 0.0
room_fqid            0.0
text_fqid            0.0
fullscreen           0.0
event_index          0.0
elapsed_time_diff    0.0
dtype: float64

In [25]:
# Remove the hover_duration column. The result is reassigned rather than
# dropped with inplace=True, matching how the earlier cells update `train`
# and avoiding in-place mutation of a frame shared across cells.
train = train.drop(columns=['hover_duration'])


In [28]:
# Remove the text, hq, music and name columns. The result is reassigned rather
# than dropped with inplace=True, matching how the earlier cells update
# `train` and avoiding in-place mutation of a frame shared across cells.
train = train.drop(columns=['text', 'hq', 'music', 'name'])

In [30]:
# Placeholder written into each identifier column wherever the value is
# missing: text_fqid -> 'none', fqid -> 'no_fqid'.
placeholders = {'text_fqid': 'none', 'fqid': 'no_fqid'}
for column, placeholder in placeholders.items():
    train[column] = train[column].fillna(placeholder)



In [22]:
# Collapse infrequent categories of every object-typed column into a single
# 'other' bucket.

# A category that occurs fewer times than this is treated as rare.
RARE_CATEGORY_MIN_COUNT = 1000

# Columns stored with the object dtype.
obj_cols = train.select_dtypes(include=['object']).columns

for col in obj_cols:
    counts = train[col].value_counts()
    rare_cats = counts[counts < RARE_CATEGORY_MIN_COUNT].index.tolist()
    # Vectorised membership test instead of a per-row `x in list` lookup,
    # which costs O(rows * rare categories). value_counts() skips missing
    # values, so NaN is never in `rare_cats` and is left untouched.
    is_rare = train[col].isin(rare_cats)
    train[col] = train[col].mask(is_rare, 'other')


In [33]:
# Write the cleaned frame to disk as CSV, leaving out the DataFrame index.
train.to_csv(path_or_buf='data/train_cleaned.csv', index=False)