In [2]:
#Python packages
import numpy as np
import pandas as pd
import re
from scipy import stats


#Visualization packages
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#modeling packages
from catboost import CatBoostClassifier
from time import time
from tqdm import tqdm_notebook as tqdm

import warnings
warnings.filterwarnings('ignore')

In [3]:
train = pd.read_csv('train_fin.csv', index_col=0)
specs = pd.read_csv('specs.csv', index_col=0)
train_labels = pd.read_csv('train_labels.csv', index_col=0)

In [21]:
print(train_labels.shape)
print(train_labels.head())

(17690, 6)
                 installation_id  title  num_correct  num_incorrect  accuracy  \
game_session                                                                    
6bdf9623adc94d89        0006a69f     19            1              0       1.0   
77b8ee947eb84b4e        0006a69f     13            0             11       0.0   
901acc108f55a5a1        0006a69f     19            1              0       1.0   
9501794defd84e4d        0006a69f     19            1              1       0.5   
a9ef3ecb3d1acc6a        0006a69f     13            1              0       1.0   

                  accuracy_group  
game_session                      
6bdf9623adc94d89               3  
77b8ee947eb84b4e               0  
901acc108f55a5a1               3  
9501794defd84e4d               2  
a9ef3ecb3d1acc6a               3  


In [4]:
train.shape

(11341043, 31)

#### encoding game title. Credit for code: https://www.kaggle.com/mhviraf/a-new-baseline-for-dsb-2019-catboost-model

In [11]:
# encode title
list_of_user_activities = list(set(train['title'].value_counts().index))
activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))

train['title'] = train['title'].map(activities_map)
train_labels['title'] = train_labels['title'].map(activities_map)

In [12]:
train['timestamp']= pd.to_datetime(train['timestamp'])

In [13]:
win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
win_code[activities_map['Bird Measurer (Assessment)']] = 4110

In [19]:
train.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,...,time_by_phase_type,phase_of_day,practice_sec,assessment_prac,game_prac,months_played,recent_ratio,total_game_time,difficulty,assess_profile
0,27253bdc,45bb1e1b6b50c07b,2019-09-06 17:53:46.937000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0.0,36,Clip,...,0.0,Evening,47804.47,-999.0,-999.0,1.0,0.75,1357.0,0.0,0.0
1,27253bdc,17eeb7f223665f53,2019-09-06 17:54:17.519000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0.0,31,Clip,...,0.0,Evening,47804.47,-999.0,-999.0,1.0,0.75,1357.0,0.0,0.0
2,77261ab5,0848ef14a8dc6892,2019-09-06 17:54:56.302000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0.0,9,Activity,...,203527.66,Evening,47804.47,-999.0,-999.0,1.0,0.75,1357.0,1.0,0.0
3,b2dba42b,0848ef14a8dc6892,2019-09-06 17:54:56.387000+00:00,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53.0,9,Activity,...,203527.66,Evening,47804.47,-999.0,-999.0,1.0,0.75,1357.0,1.0,0.0
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06 17:55:03.253000+00:00,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972.0,9,Activity,...,203527.66,Evening,47804.47,-999.0,-999.0,1.0,0.75,1357.0,1.0,0.0


In [14]:
def get_data(user_sample, test_set=False):
    last_activity = 0
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    all_assessments = []
    accumulated_accuracy_group = 0
    accumulated_accuracy=0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0 
    accumulated_actions = 0
    counter = 0
    durations = []
    for i, session in user_sample.groupby('game_session', sort=False):
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        if test_set == True:
            second_condition = True
        else:
            if len(session)>1:
                second_condition = True
            else:
                second_condition= False
            
        if (session_type == 'Assessment') & (second_condition):
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()
            features = user_activities_count.copy()
    #         features['installation_id'] = session['installation_id'].iloc[0]
#             features['game_session'] = i
            features['session_title'] = session['title'].iloc[0] 
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
            if durations == []:
                features['duration_mean'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            accumulated_accuracy += accuracy
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1

            features.update(accuracy_groups)
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            features['accumulated_actions'] = accumulated_actions
            accumulated_accuracy_group += features['accuracy_group']
            accuracy_groups[features['accuracy_group']] += 1
            if test_set == True:
                all_assessments.append(features)
            else:
                if true_attempts+false_attempts > 0:
                    all_assessments.append(features)
                
            counter += 1

    #         break

        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activitiy = session_type

    if test_set:
        return all_assessments[-1] 
    return all_assessments

In [15]:
compiled_data = []
for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort=False)), total=17000):
    compiled_data += get_data(user_sample)

HBox(children=(IntProgress(value=0, max=17000), HTML(value='')))

In [16]:
new_data = pd.DataFrame(compiled_data)

In [17]:
new_data.head()

Unnamed: 0,Activity,Assessment,Clip,Game,accumulated_accuracy,accumulated_correct_attempts,accumulated_uncorrect_attempts,accuracy_group,duration_mean,session_title,0,1,2,3,accumulated_accuracy_group,accumulated_actions
0,3,0,11,4,0.0,0,0,3,0.0,19,0,0,0,0,0.0,647
1,4,1,14,6,1.0,1,0,0,39.0,13,0,0,0,1,3.0,1143
2,4,2,14,6,0.5,1,11,3,65.5,19,1,0,0,1,1.5,1230
3,9,4,24,10,0.5,2,11,2,41.25,19,2,0,0,2,1.5,2159
4,10,5,28,13,0.5,3,12,3,39.2,13,2,0,1,2,1.6,2586


In [18]:
new_data.shape

(17690, 16)

### Merging with specs

In [None]:
    #spec=None
    spec['info']=spec['info'].str.upper()
    spec['hashed_info']=spec['info'].transform(hash)
    spec_unique=pd.DataFrame(spec[['hashed_info']].drop_duplicates())
    spec_unique['deduped_event_id']=np.arange(len(spec_unique))
    spec=pd.merge(spec,spec_unique,on='hashed_info',how='left')
    z=dict(zip(spec.event_id,spec.deduped_event_id))
    df_train['event_id']=df_train['event_id'].map(z)
    df_test['event_id']=df_test['event_id'].map(z)
        #df_train=df_train[df_train['event_id'].isin(df_test['event_id'])]
    df_train=df_train[df_train['event_id']!=137]  # this particular event id only has 2 records in train and none in test....
    df_event_id_train=pd.pivot_table(df_train.loc[:,['installation_id','game_session','event_id']],aggfunc=len,columns=['event_id'],index=['installation_id','game_session']).add_prefix('event_id_').rename_axis(None,axis=1).reset_index()
    df_event_id_test=pd.pivot_table(df_test.loc[:,['installation_id','game_session','event_id']],aggfunc=len,columns=['event_id'],index=['installation_id','game_session']).add_prefix('event_id_').rename_axis(None,axis=1).reset_index()
    df_event_id_train=df_event_id_train.fillna(0)
    df_event_id_train=df_event_id_train.fillna(0)
    df_event_id_test=df_event_id_test.fillna(0)
