In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the question labels (CSV) and the previously cleaned event log (pickle).
# NOTE(review): unpickling executes arbitrary code, so 'cleaned_train.pkl'
# must come from a trusted source.
labels = pd.read_csv('labels_q.csv')
train = pd.read_pickle('cleaned_train.pkl')

In [3]:
# Keep only the rows of the '0-4' level group; the column is constant after
# filtering, so it is dropped.
in_first_group = train['level_group'] == '0-4'
train = train.loc[in_first_group].drop(columns=['level_group'])

In [5]:
# Order events by session and then by elapsed time, number the events of each
# session from 0 (event_index), and discard the original 'index' column.
# Note: the DataFrame's own row labels are not reset by the sort.
train = (
    train
    .sort_values(by=['session_id', 'elapsed_time'])
    .assign(event_index=lambda df: df.groupby('session_id').cumcount())
    .drop(columns=['index'])
)

In [6]:
# Time elapsed since the previous row of the same session. The first row of
# each session has no predecessor, so its difference is set to 0.
session_elapsed = train.groupby('session_id')['elapsed_time']
train['elapsed_time_diff'] = session_elapsed.diff().fillna(0)

In [8]:
# Turn +/-inf into NaN, then replace every NaN with 0. fillna(0) is applied to
# the whole frame, so missing values in non-numeric columns become 0 as well.
train = train.replace([np.inf, -np.inf], np.nan).fillna(0)

In [11]:
# Collect every object-dtype column; these are treated as categorical features.
obj_cols = train.select_dtypes(include=['object']).columns

# Minimum number of rows a category needs in order to keep its own label.
RARE_CATEGORY_MIN_COUNT = 1000

# Collapse infrequent categories into a single 'other' bucket so that rare
# values do not each become their own feature.
for col in obj_cols:
    counts = train[col].value_counts()
    rare_cats = counts[counts < RARE_CATEGORY_MIN_COUNT].index
    # Vectorised membership test: isin() hashes the rare labels once instead
    # of scanning a Python list for every row, as apply(lambda ...) would.
    train[col] = train[col].where(~train[col].isin(rare_cats), 'other')


In [13]:
# One-hot encode the categorical columns: each column listed in obj_cols is
# replaced by one indicator column per distinct value.
train = pd.get_dummies(data=train, columns=obj_cols)

In [15]:
# Persist the fully prepared frame to disk.
train.to_pickle(path='prepped_train.pkl')