In [1]:
#Python packages
import numpy as np
import pandas as pd
import re
from scipy import stats


#Visualization packages
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the event-level training table; the first CSV column becomes the row index.
train = pd.read_csv('train.csv', index_col=0)

In [3]:
# Preview the first rows to check which columns are available.
train.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,...,year,dayofweek,weekofyear,dayofyear,quarter,is_month_start,Phase_Of_Day,time_by_phase_type,phase_of_day,practice_sec
0,27253bdc,45bb1e1b6b50c07b,2019-09-06 17:53:46.937000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,...,2019,4,36,249,3,0.0,Evening,0.0,Evening,47804.47
1,27253bdc,17eeb7f223665f53,2019-09-06 17:54:17.519000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,...,2019,4,36,249,3,0.0,Evening,0.0,Evening,47804.47
2,77261ab5,0848ef14a8dc6892,2019-09-06 17:54:56.302000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,...,2019,4,36,249,3,0.0,Evening,203527.66,Evening,47804.47
3,b2dba42b,0848ef14a8dc6892,2019-09-06 17:54:56.387000+00:00,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,...,2019,4,36,249,3,0.0,Evening,203527.66,Evening,47804.47
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06 17:55:03.253000+00:00,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,...,2019,4,36,249,3,0.0,Evening,203527.66,Evening,47804.47


#### Feature: number of assessment and game events per user, bucketed into practice levels

In [3]:
# Tally the event rows for every (installation_id, type) pair and flatten the
# result into a frame with columns: installation_id, type, type_count.
count = (
    train.groupby(['installation_id', 'type'])['type']
    .count()
    .rename('type_count')
    .reset_index()
)

Unnamed: 0,installation_id,type,type_count
0,0001e90f,Activity,469
1,0001e90f,Clip,5
2,0001e90f,Game,883
3,000447c4,Activity,54
4,000447c4,Clip,3
5,000447c4,Game,124
6,0006a69f,Activity,1771
7,0006a69f,Assessment,261
8,0006a69f,Clip,37
9,0006a69f,Game,1732


In [4]:
# Bucket each user's Assessment event count into a practice level:
#   < 100 -> 1, 100-499 -> 2, 500-999 -> 3, >= 1000 -> 4.
# Rows describing any other activity type stay NaN.
assessment_rows = count['type'] == 'Assessment'
assessment_level = np.digitize(count['type_count'], [100, 500, 1000]) + 1
count['assessment_prac'] = np.where(assessment_rows, assessment_level, np.nan)

In [5]:
# Bucket each user's Game event count into a practice level:
#   < 100 -> 1, 100-499 -> 2, 500-999 -> 3, 1000-7999 -> 4, >= 8000 -> 5.
# Rows describing any other activity type stay NaN.
game_rows = count['type'] == 'Game'
game_level = np.digitize(count['type_count'], [100, 500, 1000, 8000]) + 1
count['game_prac'] = np.where(game_rows, game_level, np.nan)

In [6]:
# Work on a copy so the join in the next cell does not modify `train` itself.
temp = train.copy()

In [7]:
# Key columns shared by the event table and the per-user/type count table.
cols = ['installation_id', 'type']
# Attach the count and bucket columns to every event row by matching on `cols`.
# `join` keeps temp's own index, which the later copy back into `train` relies on
# for row alignment. The suffixes only matter if a column name exists on both sides.
temp = temp.join(count.set_index(cols), on=cols, lsuffix='temp_', rsuffix='group_')

In [8]:
# Copy the two bucket columns back onto `train` (aligned on the row index).
# Because the join key includes `type`, 'assessment_prac' is only populated on
# Assessment rows and 'game_prac' only on Game rows; every other row is NaN.
# NOTE(review): if these are meant to be per-user features on every row, they
# would need to be broadcast by installation_id instead — confirm the intent.
train['assessment_prac'] = temp['assessment_prac']
train['game_prac'] = temp['game_prac']

#### feature depicting game/assessment difficulty

In [12]:
# Check that the two new bucket columns were added to `train`.
train.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,...,weekofyear,dayofyear,quarter,is_month_start,Phase_Of_Day,time_by_phase_type,phase_of_day,practice_sec,assessment_prac,game_prac
0,27253bdc,45bb1e1b6b50c07b,2019-09-06 17:53:46.937000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,...,36,249,3,0.0,Evening,0.0,Evening,47804.47,,
1,27253bdc,17eeb7f223665f53,2019-09-06 17:54:17.519000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,...,36,249,3,0.0,Evening,0.0,Evening,47804.47,,
2,77261ab5,0848ef14a8dc6892,2019-09-06 17:54:56.302000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,...,36,249,3,0.0,Evening,203527.66,Evening,47804.47,,
3,b2dba42b,0848ef14a8dc6892,2019-09-06 17:54:56.387000+00:00,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,...,36,249,3,0.0,Evening,203527.66,Evening,47804.47,,
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06 17:55:03.253000+00:00,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,...,36,249,3,0.0,Evening,203527.66,Evening,47804.47,,


In [None]:
# Scatter of game_time over time, one panel per title.
# Scattering every event row with string timestamps did not finish (the recorded
# run ended in KeyboardInterrupt), so this plots a fixed-seed random sample,
# parses the timestamps into datetimes for a real time axis, and wraps the panels
# instead of laying every title out in a single row.
plot_sample = train[['title', 'timestamp', 'game_time']]
plot_sample = plot_sample.sample(n=min(len(plot_sample), 20000), random_state=0)
plot_sample = plot_sample.assign(
    timestamp=pd.to_datetime(plot_sample['timestamp'], utc=True)
)
g = sns.FacetGrid(plot_sample, col='title', col_wrap=4)
g = g.map(plt.scatter, "timestamp", "game_time", edgecolor="w")

KeyboardInterrupt: 

In [9]:
# Mean game_time per title. Titles with a longer average game_time could be
# turned into a feature describing difficulty.
# Uses the built-in groupby mean rather than passing np.mean as a callable, which
# recent pandas versions flag as deprecated.
train.groupby('title')['game_time'].mean()

title
12 Monkeys                            0.000000
Air Show                         179259.164832
All Star Sorting                 216088.487906
Balancing Act                         0.000000
Bird Measurer (Assessment)        90174.767762
Bottle Filler (Activity)         354691.892381
Bubble Bath                      252243.488859
Bug Measurer (Activity)          126066.392383
Cart Balancer (Assessment)        44935.744695
Cauldron Filler (Assessment)      61322.941606
Chest Sorter (Assessment)         59035.508904
Chicken Balancer (Activity)      147369.875760
Chow Time                        266010.943418
Costume Box                           0.000000
Crystal Caves - Level 1               0.000000
Crystal Caves - Level 2               0.000000
Crystal Caves - Level 3               0.000000
Crystals Rule                    238294.839778
Dino Dive                        194701.451205
Dino Drink                       194815.691968
Egg Dropper (Activity)           167486.742383
Firewor

#### Exponential moving average features

In [36]:
def rolling_exponential_average(df):
    """Return per-user exponentially weighted means of the numeric columns.

    Rows are ordered by ``installation_id`` and ``timestamp``; within each
    ``installation_id`` every float64/int64 column is replaced by its
    exponentially weighted mean (``alpha=0.1``, ``min_periods=1``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``installation_id`` and ``timestamp`` columns. The input
        frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        Only the float64/int64 columns of ``df``, smoothed, in sorted row order.
        The index holds the original row labels in a single level named
        ``'level_1'``. The columns ``accuracy_group``, ``CRYSTALCAVES``,
        ``MAGMAPEAK`` and ``TREETOPCITY`` (when present) get a ``rolling_``
        prefix; all other column names are unchanged.
    """
    numeric_cols = list(df.select_dtypes(include=['float64', 'int64']).columns)
    ordered = df.sort_values(['installation_id', 'timestamp'])
    # group_keys=False keeps the original row labels as a single-level index on
    # every pandas version. Without it, newer pandas prepends installation_id as
    # an extra index level and the one-name assignment below raises.
    smoothed = ordered.groupby(['installation_id'], group_keys=False)[numeric_cols].apply(
        lambda grp: grp.ewm(alpha=0.1, min_periods=1).mean()
    )
    smoothed = smoothed.rename(columns={
        'accuracy_group': 'rolling_accuracy_group',
        'CRYSTALCAVES': 'rolling_CRYSTALCAVES',
        'MAGMAPEAK': 'rolling_MAGMAPEAK',
        'TREETOPCITY': 'rolling_TREETOPCITY',
    })
    smoothed.index.names = ['level_1']
    return smoothed

In [37]:
# NOTE(review): this rebinds `train` to the function's output, which is built from
# the float64/int64 columns only. The "sessions per month" cell further down groups
# `train` by 'installation_id' — confirm that column is still available after this
# step, or keep the smoothed features in a separate variable.
train = rolling_exponential_average(train)

#### sessions per month by user

In [None]:
# Distinct game sessions per user per month.
# Each row of `train` is one event and a game_session id repeats across its events,
# so a plain row count would report events per month rather than sessions.
games_per_month = (
    train.groupby(['installation_id', 'month'])['game_session']
    .nunique()
    .reset_index()
)
games_per_month

In [None]:
# NOTE(review): `spec`, `df_train` and `df_test` are not created in the cells shown
# above — make sure they are loaded before this cell runs.

# Collapse event ids whose spec text is identical (ignoring case) into one id.
spec['info']=spec['info'].str.upper()
# Python salts str hashes per process, so `hashed_info` values differ between kernel
# sessions; here they only serve as a join key within this cell.
spec['hashed_info']=spec['info'].transform(hash)
spec_unique=pd.DataFrame(spec[['hashed_info']].drop_duplicates())
spec_unique['deduped_event_id']=np.arange(len(spec_unique))
spec=pd.merge(spec,spec_unique,on='hashed_info',how='left')

# Replace the raw event ids in train/test with their de-duplicated ids.
z=dict(zip(spec.event_id,spec.deduped_event_id))
df_train['event_id']=df_train['event_id'].map(z)
df_test['event_id']=df_test['event_id'].map(z)
#df_train=df_train[df_train['event_id'].isin(df_test['event_id'])]
df_train=df_train[df_train['event_id']!=137]  # this particular event id only has 2 records in train and none in test....

# One row per (installation_id, game_session), one 'event_id_<id>' column per event id.
df_event_id_train=pd.pivot_table(
    df_train.loc[:,['installation_id','game_session','event_id']],
    aggfunc=len,
    columns=['event_id'],
    index=['installation_id','game_session'],
).add_prefix('event_id_').rename_axis(None,axis=1).reset_index()
df_event_id_test=pd.pivot_table(
    df_test.loc[:,['installation_id','game_session','event_id']],
    aggfunc=len,
    columns=['event_id'],
    index=['installation_id','game_session'],
).add_prefix('event_id_').rename_axis(None,axis=1).reset_index()

# Sessions that never produced a given event id get 0 instead of NaN. The test
# frame is filled as well (the train fill was previously repeated twice instead).
df_event_id_train=df_event_id_train.fillna(0)
df_event_id_test=df_event_id_test.fillna(0)