In [3]:
import catboost as cb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [4]:
my_xtrain = pd.read_csv("../ourfeatures/my_bigX_version9.csv")
my_ytrain = pd.read_csv("../ourfeatures/y_train2.csv")

In [5]:
my_xtrain.columns

Index(['app_id', 'transaction_number_max', 'amnt_min', 'amnt_max', 'amnt_mean',
       'amnt_median', 'currency_num_unique_values_x', 'currency_median',
       'operation_type_group_median', 'operation_kind_median',
       ...
       'mymcc90', 'mymcc91', 'mymcc92', 'mymcc93', 'mymcc94', 'mymcc95',
       'mymcc96', 'mymcc97', 'mymcc98', 'mymcc99'],
      dtype='object', length=236)

In [6]:
x_train, x_test, y_train, y_test = train_test_split(my_xtrain, my_ytrain, test_size=0.1)

In [7]:
dataset = cb.Pool(x_train, y_train['flag'])

In [8]:
model1 = cb.CatBoostRegressor(loss_function="Poisson")
grid = {'iterations': [192, 193],
        'learning_rate': [ 0.1],
        'depth': [7, 9],
        'l2_leaf_reg': [3, 4 ]}
model1.grid_search(grid, dataset)

0:	learn: 0.7049961	test: 0.7049160	best: 0.7049160 (0)	total: 606ms	remaining: 1m 55s
1:	learn: 0.5173711	test: 0.5171658	best: 0.5171658 (1)	total: 855ms	remaining: 1m 21s
2:	learn: 0.3981327	test: 0.3979456	best: 0.3979456 (2)	total: 918ms	remaining: 57.9s
3:	learn: 0.3188212	test: 0.3186104	best: 0.3186104 (3)	total: 957ms	remaining: 45s
4:	learn: 0.2645715	test: 0.2643368	best: 0.2643368 (4)	total: 999ms	remaining: 37.4s
5:	learn: 0.2267439	test: 0.2265225	best: 0.2265225 (5)	total: 1.04s	remaining: 32.3s
6:	learn: 0.1996131	test: 0.1993808	best: 0.1993808 (6)	total: 1.1s	remaining: 29.2s
7:	learn: 0.1799106	test: 0.1796793	best: 0.1796793 (7)	total: 1.16s	remaining: 26.6s
8:	learn: 0.1658373	test: 0.1655892	best: 0.1655892 (8)	total: 1.21s	remaining: 24.7s
9:	learn: 0.1545817	test: 0.1542846	best: 0.1542846 (9)	total: 1.26s	remaining: 23s
10:	learn: 0.1468205	test: 0.1465054	best: 0.1465054 (10)	total: 1.31s	remaining: 21.6s
11:	learn: 0.1405717	test: 0.1402205	best: 0.1402205 (1

{'params': {'depth': 9,
  'l2_leaf_reg': 3,
  'iterations': 192,
  'learning_rate': 0.1},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45,

In [10]:
pred = model1.predict(x_test)
rmse = (np.sqrt(mean_squared_error(y_test['flag'], pred)))
r2 = r2_score(y_test['flag'], pred)
score = model1.score(x_test, y_test['flag'])
local_score = model1.score(x_train, y_train['flag'])
aucs = roc_auc_score(y_test['flag'], pred)
print("Testing performance")
print("RMSE: {:.2f}".format(rmse))
print("R2: {:.2f}".format(r2))
print("Score: {:.4f}".format(score))
print("Local Score: {:.4f}".format(local_score))

print("Best params: ", model1._get_params())

Testing performance
RMSE: 0.16
R2: 0.03
Score: 0.0317
Local Score: 0.0823
Best params:  {'depth': 9, 'verbose': 0, 'iterations': 192, 'l2_leaf_reg': 3, 'learning_rate': 0.1, 'loss_function': 'Poisson'}


In [None]:
model9 = cb.CatBoostClassifier()
grid = {'iterations': [190, 180],
        'learning_rate': [0.03, 0.1],
        'depth': [4, 5, 6,],
        'l2_leaf_reg': [0.2, 1, 3]}
model9.grid_search(grid, dataset)

0:	learn: 0.6429227	test: 0.6430383	best: 0.6430383 (0)	total: 167ms	remaining: 31.5s
1:	learn: 0.5974587	test: 0.5975414	best: 0.5975414 (1)	total: 212ms	remaining: 19.9s
2:	learn: 0.5555755	test: 0.5556342	best: 0.5556342 (2)	total: 249ms	remaining: 15.5s
3:	learn: 0.5174456	test: 0.5174489	best: 0.5174489 (3)	total: 283ms	remaining: 13.1s
4:	learn: 0.4831524	test: 0.4831282	best: 0.4831282 (4)	total: 314ms	remaining: 11.6s
5:	learn: 0.4518710	test: 0.4518397	best: 0.4518397 (5)	total: 345ms	remaining: 10.6s
6:	learn: 0.4232018	test: 0.4231615	best: 0.4231615 (6)	total: 377ms	remaining: 9.85s
7:	learn: 0.3973755	test: 0.3973166	best: 0.3973166 (7)	total: 411ms	remaining: 9.35s
8:	learn: 0.3737799	test: 0.3737235	best: 0.3737235 (8)	total: 446ms	remaining: 8.97s
9:	learn: 0.3524435	test: 0.3523918	best: 0.3523918 (9)	total: 480ms	remaining: 8.64s
10:	learn: 0.3329515	test: 0.3329111	best: 0.3329111 (10)	total: 522ms	remaining: 8.5s
11:	learn: 0.3153008	test: 0.3152641	best: 0.3152641 

In [None]:
pred = model9.predict(x_test)
rmse = (np.sqrt(mean_squared_error(y_test['flag'], pred)))
r2 = r2_score(y_test['flag'], pred)
score = model9.score(x_test, y_test['flag'])
local_score = model9.score(x_train, y_train['flag'])
print("Testing performance")
print("RMSE: {:.2f}".format(rmse))
print("R2: {:.2f}".format(r2))
print("Score: {:.4f}".format(score))
print("Local Score: {:.4f}".format(local_score))

### Get contest results

In [11]:
contest_x = pd.read_csv("../ourfeatures/my_bigtestX_version9.csv")
contest_y = pd.read_csv("../ourfeatures/y_test2.csv")

In [None]:
prediction = model9.predict_proba(contest_x)[:,1]
contest_y["flag"] = prediction

In [12]:
prediction = model1.predict(contest_x)
contest_y["flag"] = prediction

In [18]:
contest_y

Unnamed: 0,app_id,flag
0,805133,0.043691
1,805134,0.015463
2,805135,0.019833
3,805136,0.007219
4,805137,0.043972
...,...,...
188669,1003045,0.030917
188670,1003047,0.008587
188671,1003048,0.013913
188672,1003049,0.029707


In [19]:
contest_y.to_csv("results/newres-super-12.csv", index=False)