In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import features

In [2]:
feats=pd.read_csv('installation_features.csv')
labels=pd.read_csv('installation_labels.csv')

In [3]:
feature_pipe=features.get_data_processing_pipe(feats,log_features=['game_time', 'event_count'], categorical_features=['last_world', 'last_assessment'])

In [19]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, cohen_kappa_score, make_scorer, accuracy_score, mean_squared_error
from sklearn.pipeline import Pipeline
import inspect
import xgboost as xgb
import soft_kappa_loss as kappa
from sklearn.utils import class_weight
from OrdinalRegressor import OrdinalRegressor
from OptimizeThresholds import OptimizedRounder

X_train, X_test, y_train, y_test = train_test_split(feats, labels.accuracy_group, test_size=0.05, random_state=42)

### setup the pipeline
ordinal_pipe = Pipeline(steps=[
    ('preprocess', feature_pipe),
    ('clf', OrdinalRegressor(xgb.XGBRegressor,
                              **{'colsample_bytree':0.5,
                                 'learning_rate':0.1,
                                 'max_depth':7,
                                 'subsample':1}))])


ordinal_pipe.fit(X_train, y_train)
y_pred = ordinal_pipe.predict(X_test, **{'classify': True})
print((y_pred==y_test).mean())
print(cohen_kappa_score(y1=y_test, y2=y_pred, weights='quadratic'))
confusion_matrix(y_test, y_pred)

[0.5 1.5 2.5]
[1.5 1.5 2.5]
[0.5 2.5 2.5]
[0.5 1.5 3.5]
[-0.19926585  0.80073415  2.35149226]
[0.15036708 1.15036708 2.42574613]
[0.69573668 1.34787078 2.46769131]
[0.69573668 1.3738387  2.34541838]
[0.84321787 1.46282934 2.16422798]
[0.81725957 1.34214598 1.94683043]
[0.74039865 1.52947811 2.13950652]
[0.99869706 1.65856097 2.16031007]
[1.08230668 1.7682051  2.36884687]
[1.08218898 1.69632152 2.075291  ]
[0.91887866 1.75126921 2.13463507]
[0.98192528 1.59841677 2.16307262]
[1.02518732 1.67586769 2.21420806]
[0.98853311 1.72318608 2.23219943]
[1.0184816  1.77773441 2.22638417]
[1.01551435 1.82150041 2.27090363]
[0.99397949 1.79329905 2.17103402]
[1.04664058 1.76576817 2.22002463]
[1.01403095 1.78094877 2.21175542]
[1.01532035 1.77061551 2.22578172]
[1.01393207 1.76714884 2.24095391]
[1.02230414 1.7672879  2.2246914 ]
[1.02787738 1.76386041 2.22042216]
[1.02492536 1.76510096 2.23171861]
[1.02423052 1.77068611 2.2246986 ]
[1.02066857 1.76736126 2.2236264 ]
[1.02280532 1.76700547 2.223902

array([[111,  69,  28,  23],
       [ 15,  41,  40,  26],
       [  6,  25,  29,  51],
       [ 10,  24,  85, 302]])

In [20]:
regressor_kappa_score = make_scorer(lambda y, y_pred: cohen_kappa_score(y, y_pred, weights='quadratic'), needs_proba=True)
regressor_accuracy_score = make_scorer(lambda y, y_pred: accuracy_score(y, y_pred), needs_proba=True)

ordinal_cv = GridSearchCV(ordinal_pipe, cv=10, 
                      scoring={'kappa': regressor_kappa_score,
                               'accuracy': regressor_accuracy_score,
                               'rmse': make_scorer(mean_squared_error)},
                      param_grid={'clf__max_depth': [5, 6, 7, 8, 9],
                                  'clf__learning_rate': [0.01, 0.03, 0.1],
                                  'clf__subsample': [0.8, 1],
                                  'clf__colsample_bytree': [0.3, 0.5, 0.8]},
                      refit=False)
ordinal_cv.fit(feats, labels.accuracy_group)
pd.DataFrame(ordinal_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__colsample_bytree,param_clf__learning_rate,param_clf__max_depth,param_clf__subsample,params,split0_test_kappa,...,split3_test_rmse,split4_test_rmse,split5_test_rmse,split6_test_rmse,split7_test_rmse,split8_test_rmse,split9_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse
0,1.248235,0.121510,0.029205,0.000448,0.3,0.01,5,0.8,"{'clf__colsample_bytree': 0.3, 'clf__learning_...",0.426128,...,1.527094,1.511351,1.535135,1.549557,1.529234,1.503019,1.626322,1.534657,0.033195,2
1,1.228184,0.052097,0.029413,0.000751,0.3,0.01,5,1,"{'clf__colsample_bytree': 0.3, 'clf__learning_...",0.425651,...,1.527802,1.524168,1.544002,1.562350,1.538148,1.511784,1.634600,1.543826,0.033406,1
2,1.390350,0.051530,0.029461,0.000336,0.3,0.01,6,0.8,"{'clf__colsample_bytree': 0.3, 'clf__learning_...",0.453609,...,1.511639,1.498218,1.524101,1.530724,1.513676,1.488007,1.611000,1.518860,0.033356,4
3,1.354959,0.073191,0.029926,0.000609,0.3,0.01,6,1,"{'clf__colsample_bytree': 0.3, 'clf__learning_...",0.423182,...,1.514419,1.510285,1.530466,1.544247,1.523655,1.496594,1.628662,1.528539,0.036245,3
4,1.519205,0.080073,0.029766,0.000363,0.3,0.01,7,0.8,"{'clf__colsample_bytree': 0.3, 'clf__learning_...",0.452718,...,1.501825,1.490037,1.513835,1.518565,1.503371,1.479171,1.605219,1.508334,0.034784,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,1.847548,0.061400,0.030999,0.000818,0.8,0.1,7,1,"{'clf__colsample_bytree': 0.8, 'clf__learning_...",0.574742,...,1.028130,1.116717,1.115493,1.010590,1.036850,1.079949,1.139471,1.078751,0.045071,68
86,2.048280,0.049164,0.031392,0.000634,0.8,0.1,8,0.8,"{'clf__colsample_bytree': 0.8, 'clf__learning_...",0.588021,...,1.046987,1.119440,1.143399,1.039388,1.030929,1.081872,1.113661,1.081283,0.036502,65
87,2.043642,0.019643,0.031686,0.000349,0.8,0.1,8,1,"{'clf__colsample_bytree': 0.8, 'clf__learning_...",0.561599,...,1.035838,1.115880,1.134545,1.030925,1.042269,1.086439,1.127543,1.084473,0.041002,59
88,2.249689,0.055419,0.032278,0.000959,0.8,0.1,9,0.8,"{'clf__colsample_bytree': 0.8, 'clf__learning_...",0.560643,...,1.080851,1.129795,1.117845,1.031247,1.049196,1.092669,1.180964,1.095994,0.040598,48


In [25]:
# print(ordinal_cv.best_params_)
pd.DataFrame(ordinal_cv.cv_results_).sort_values('mean_test_kappa', ascending=False).head()[['mean_test_kappa', 'mean_test_accuracy', 'mean_test_rmse']]

Unnamed: 0,mean_test_kappa,mean_test_accuracy,mean_test_rmse
21,0.551784,0.507631,1.068293
50,0.551402,0.505031,1.063029
80,0.550968,0.503901,1.064869
20,0.550338,0.501244,1.069027
52,0.550332,0.5,1.068208


In [29]:
params = pd.DataFrame(ordinal_cv.cv_results_).sort_values('mean_test_kappa', ascending=False)['params'].values[1]
print(params)

{'clf__colsample_bytree': 0.5, 'clf__learning_rate': 0.1, 'clf__max_depth': 5, 'clf__subsample': 0.8}


In [30]:
ordinal_pipe = Pipeline(steps=[
    ('preprocess', feature_pipe),
    ('clf', OrdinalRegressor(xgb.XGBRegressor,
                              **{'clf__colsample_bytree': 0.5, 'clf__learning_rate': 0.1, 'clf__max_depth': 5, 'clf__subsample': 0.8}))])


ordinal_pipe.fit(X_train, y_train)
y_pred = ordinal_pipe.predict(X_test, **{'classify': True})
print((y_pred==y_test).mean())
print(cohen_kappa_score(y1=y_test, y2=y_pred, weights='quadratic'))
confusion_matrix(y_test, y_pred)

0.5536723163841808
0.621847880310938


array([[123,  51,  33,  24],
       [ 18,  39,  33,  32],
       [ 10,  21,  25,  55],
       [ 10,  37,  71, 303]])