In [1]:
#Python packages
import numpy as np
import pandas as pd
import re
from scipy import stats
import json

#Visualization packages
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#modeling packages
from catboost import CatBoostClassifier
from time import time
from tqdm import tqdm_notebook as tqdm

import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('train_fin.csv', index_col=0)
# spec = pd.read_csv('specs.csv', index_col=0)
train_labels = pd.read_csv('train_labels.csv', index_col=0)
test = pd.read_csv('test_fin.csv', index_col=0)
new_train = pd.read_csv('new_data.csv', index_col=0)

In [3]:
'''creating a feature determining whether or not a user
cleared an assessment or not, based on the event code information
given to us in the competition'''

from sklearn.model_selection import train_test_split

# Define cleared or not cleared
# 
train['cleared'] = True
train.loc[train['event_data'].str.contains('false') & train['event_code'].isin([4100, 4110]), 'cleared'] = False

test['cleared'] = True
test.loc[test['event_data'].str.contains('false') & test['event_code'].isin([4100, 4110]), 'cleared'] = False

### group by installation_id

In [4]:
# trains=train.groupby('installation_id').last()
# tests = test.groupby('installation_id').last()

# trains = trains.reset_index()
# tests = tests.reset_index()

### Model #1: accumulated activities per user (installation_id)

In [3]:
from sklearn.metrics import confusion_matrix
def qwk(act,pred,n=4,hist_range=(0,3)):
    
    O = confusion_matrix(act,pred)
    O = np.divide(O,np.sum(O))
    
    W = np.zeros((n,n))
    for i in range(n):
        for j in range(n):
            W[i][j] = ((i-j)**2)/((n-1)**2)
            
    act_hist = np.histogram(act,bins=n,range=hist_range)[0]
    prd_hist = np.histogram(pred,bins=n,range=hist_range)[0]
    
    E = np.outer(act_hist,prd_hist)
    E = np.divide(E,np.sum(E))
    
    num = np.sum(np.multiply(W,O))
    den = np.sum(np.multiply(W,E))
        
    return 1-np.divide(num,den)

In [12]:
'''encoding game titles'''

list_of_user_activities = list(set(train['title'].value_counts().index).union(set(test['title'].value_counts().index)))
activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))

train['title'] = train['title'].map(activities_map)
test['title'] = test['title'].map(activities_map)
train_labels['title'] = train_labels['title'].map(activities_map)

In [15]:
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

In [13]:
win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
win_code[activities_map['Bird Measurer (Assessment)']] = 4110

In [3]:
train.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,...,time_by_phase_type,phase_of_day,practice_sec,assessment_prac,game_prac,months_played,recent_ratio,total_game_time,difficulty,assess_profile
0,27253bdc,45bb1e1b6b50c07b,2019-09-06 17:53:46.937000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0.0,Welcome to Lost Lagoon!,Clip,...,0.0,Evening,47804.47,-999.0,-999.0,1.0,0.75,1357.0,0.0,0.0
1,27253bdc,17eeb7f223665f53,2019-09-06 17:54:17.519000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0.0,Magma Peak - Level 1,Clip,...,0.0,Evening,47804.47,-999.0,-999.0,1.0,0.75,1357.0,0.0,0.0
2,77261ab5,0848ef14a8dc6892,2019-09-06 17:54:56.302000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0.0,Sandcastle Builder (Activity),Activity,...,203527.66,Evening,47804.47,-999.0,-999.0,1.0,0.75,1357.0,1.0,0.0
3,b2dba42b,0848ef14a8dc6892,2019-09-06 17:54:56.387000+00:00,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53.0,Sandcastle Builder (Activity),Activity,...,203527.66,Evening,47804.47,-999.0,-999.0,1.0,0.75,1357.0,1.0,0.0
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06 17:55:03.253000+00:00,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972.0,Sandcastle Builder (Activity),Activity,...,203527.66,Evening,47804.47,-999.0,-999.0,1.0,0.75,1357.0,1.0,0.0


In [9]:
def get_data(user_sample, test_set=False):
    last_activity = 0
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    all_assessments = []
    accumulated_accuracy_group = 0
    accumulated_accuracy=0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0 
    accumulated_actions = 0
    counter = 0
    durations = []
    for i, session in user_sample.groupby('game_session', sort=False):
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        if test_set == True:
            second_condition = True
        else:
            if len(session)>1:
                second_condition = True
            else:
                second_condition= False
        if (session_type == 'Assessment') & (second_condition):
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()
            features = user_activities_count.copy()
    #         features['installation_id'] = session['installation_id'].iloc[0]
#             features['game_session'] = i
            features['session_title'] = session['title'].iloc[0] 
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
            if durations == []:
                features['duration_mean'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            accumulated_accuracy += accuracy
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1

            features.update(accuracy_groups)
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            features['accumulated_actions'] = accumulated_actions
            accumulated_accuracy_group += features['accuracy_group']
            accuracy_groups[features['accuracy_group']] += 1
            if test_set == True:
                all_assessments.append(features)
            else:
                if true_attempts+false_attempts > 0:
                    all_assessments.append(features)
                
            counter += 1

    #         break

        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activitiy = session_type

    if test_set:
        return all_assessments[-1] 
    return all_assessments

In [13]:
trains = train[['installation_id','practice_sec','assessment_prac','game_prac',
                'recent_ratio', 'months_played','total_game_time']]

In [15]:
new_train.head()

Unnamed: 0,Activity,Assessment,Clip,Game,accumulated_accuracy,accumulated_correct_attempts,accumulated_uncorrect_attempts,accuracy_group,duration_mean,session_title,0,1,2,3,accumulated_accuracy_group,accumulated_actions
0,3,0,11,4,0.0,0,0,3,0.0,19,0,0,0,0,0.0,647
1,4,1,14,6,1.0,1,0,0,39.0,13,0,0,0,1,3.0,1143
2,4,2,14,6,0.5,1,11,3,65.5,19,1,0,0,1,1.5,1230
3,9,4,24,10,0.5,2,11,2,41.25,19,2,0,0,2,1.5,2159
4,10,5,28,13,0.5,3,12,3,39.2,13,2,0,1,2,1.6,2586


In [16]:
new_train.shape

(17690, 16)

In [17]:
len(trains.installation_id.unique())

17002

In [35]:
for i, (ins_id, user_sample) in tqdm(enumerate(train[:30].groupby('installation_id', sort=False)), total=17000):
    print(user_sample)

    event_id      game_session                        timestamp  \
0   27253bdc  45bb1e1b6b50c07b 2019-09-06 17:53:46.937000+00:00   
1   27253bdc  17eeb7f223665f53 2019-09-06 17:54:17.519000+00:00   
2   77261ab5  0848ef14a8dc6892 2019-09-06 17:54:56.302000+00:00   
3   b2dba42b  0848ef14a8dc6892 2019-09-06 17:54:56.387000+00:00   
4   1bb5fbdb  0848ef14a8dc6892 2019-09-06 17:55:03.253000+00:00   
5   1325467d  0848ef14a8dc6892 2019-09-06 17:55:06.279000+00:00   
6   1325467d  0848ef14a8dc6892 2019-09-06 17:55:06.913000+00:00   
7   1325467d  0848ef14a8dc6892 2019-09-06 17:55:07.546000+00:00   
8   1325467d  0848ef14a8dc6892 2019-09-06 17:55:07.979000+00:00   
9   1325467d  0848ef14a8dc6892 2019-09-06 17:55:08.566000+00:00   
10  1325467d  0848ef14a8dc6892 2019-09-06 17:55:08.966000+00:00   
11  1325467d  0848ef14a8dc6892 2019-09-06 17:55:09.673000+00:00   
12  1325467d  0848ef14a8dc6892 2019-09-06 17:55:09.930000+00:00   
13  1325467d  0848ef14a8dc6892 2019-09-06 17:55:10.157000+00:0

[30 rows x 31 columns]


In [23]:
compiled_data = []
for i, (ins_id, user_sample) in enumerate(trains.groupby('installation_id', sort=False)):
    compiled_data += user_sample

ValueError: Unable to coerce to Series, length must be 7: given 0

In [7]:
trains_=pd.DataFrame(compiled_data)
trains_.head()

In [31]:
len(new_train['Assessment'].unique())

187

In [24]:
all_features = [x for x in new_train.columns if x not in ['accuracy_group']]
cat_features = ['session_title']
X, y = new_train[all_features], new_train['accuracy_group']

In [5]:
def make_classifier():
    clf = CatBoostClassifier(
                               loss_function='MultiClass',
    #                            eval_metric="AUC",
                               task_type="CPU",
                               learning_rate=0.01,
                               iterations=2000,
                               od_type="Iter",
#                                depth=8,
                               early_stopping_rounds=500,
    #                            l2_leaf_reg=1,
    #                            border_count=96,
                               random_seed=2019
                              )
        
    return clf
oof = np.zeros(len(X))

In [6]:
from sklearn.model_selection import KFold
import datetime
# preds = np.zeros(len(X_test))
oof = np.zeros(len(X))
NFOLDS = 5
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=2019)

training_start_time = time()
for fold, (trn_idx, test_idx) in enumerate(folds.split(X, y)):
    start_time = time()
    print(f'Training on fold {fold+1}')
    clf = make_classifier()
    clf.fit(X.loc[trn_idx, all_features], y.loc[trn_idx], eval_set=(X.loc[test_idx, all_features], y.loc[test_idx]),
                          use_best_model=True, verbose=500)
    
#     preds += clf.predict(X_test).reshape(len(X_test))/NFOLDS
    oof[test_idx] = clf.predict(X.loc[test_idx, all_features]).reshape(len(test_idx))
    
    print('Fold {} finished in {}'.format(fold + 1, str(datetime.timedelta(seconds=time() - start_time))))
    
print('-' * 30)
print('OOF QWK:', qwk(y, oof))
print('-' * 30)

Training on fold 1
0:	learn: 1.3792089	test: 1.3792932	best: 1.3792932 (0)	total: 234ms	remaining: 7m 48s
500:	learn: 1.0048554	test: 1.0414856	best: 1.0414856 (500)	total: 19.4s	remaining: 58s
1000:	learn: 0.9789609	test: 1.0372794	best: 1.0372794 (1000)	total: 39.6s	remaining: 39.5s
1500:	learn: 0.9536591	test: 1.0361311	best: 1.0361115 (1499)	total: 1m 2s	remaining: 20.7s
1999:	learn: 0.9326911	test: 1.0365752	best: 1.0360566 (1520)	total: 1m 25s	remaining: 0us

bestTest = 1.03605656
bestIteration = 1520

Shrink model to first 1521 iterations.
Fold 1 finished in 0:01:25.656842
Training on fold 2
0:	learn: 1.3790483	test: 1.3793715	best: 1.3793715 (0)	total: 37.8ms	remaining: 1m 15s
500:	learn: 1.0055635	test: 1.0443320	best: 1.0443320 (500)	total: 19.3s	remaining: 57.7s
1000:	learn: 0.9811364	test: 1.0366911	best: 1.0366911 (1000)	total: 38.8s	remaining: 38.7s
1500:	learn: 0.9588217	test: 1.0335298	best: 1.0335298 (1500)	total: 1m	remaining: 20s
1999:	learn: 0.9394033	test: 1.032140

In [7]:
clf = make_classifier()
clf.fit(X, y, verbose=500)

del X, y

0:	learn: 1.3790183	total: 11.5ms	remaining: 22.9s
500:	learn: 1.0083652	total: 4.4s	remaining: 13.2s
1000:	learn: 0.9833509	total: 8.89s	remaining: 8.87s
1500:	learn: 0.9605747	total: 13.7s	remaining: 4.54s
1999:	learn: 0.9418766	total: 18.6s	remaining: 0us


In [16]:
# process test set
new_test = []
for ins_id, user_sample in tqdm(test.groupby('installation_id', sort=False), total=1000):
    a = get_data(user_sample, test_set=True)
    new_test.append(a)
    
X_test = pd.DataFrame(new_test)
del test

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [17]:
preds = clf.predict(X_test)
del X_test

In [21]:
submission = pd.read_csv('sample_submission.csv',index_col=0)

In [22]:
submission['accuracy_group'] = np.round(preds).astype('int')
submission.to_csv('submission.csv', index=None)

In [40]:
submission.reset_index(inplace=True)
submission.head()

Unnamed: 0,installation_id,accuracy_group
0,00abaee7,3
1,01242218,3
2,017c5718,3
3,01a44906,3
4,01bc6cb6,3


In [44]:
submission.to_csv('submission1.csv')

In [42]:
samp = pd.read_csv('sample_submission.csv', index_col=0)
samp.head()

Unnamed: 0_level_0,accuracy_group
installation_id,Unnamed: 1_level_1
00abaee7,3
01242218,3
017c5718,3
01a44906,3
01bc6cb6,3
