In [1]:
import catboost as cb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [None]:
my_xtrain = pd.read_csv("../ourfeatures/my_bigX_version23.csv")
my_ytrain = pd.read_csv("../ourfeatures/y_train2.csv")

In [5]:
my_xtrain.columns

Index(['app_id', 'transaction_number_max', 'amnt_min', 'amnt_max', 'amnt_mean',
       'amnt_median', 'currency_num_unique_values_x', 'currency_median',
       'operation_type_group_median', 'operation_kind_median',
       ...
       'mymcc90', 'mymcc91', 'mymcc92', 'mymcc93', 'mymcc94', 'mymcc95',
       'mymcc96', 'mymcc97', 'mymcc98', 'mymcc99'],
      dtype='object', length=236)

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
scaler = StandardScaler()

In [15]:
X_scaled = scaler.fit_transform(my_xtrain)

In [16]:
x_train, x_test, y_train, y_test = train_test_split(X_scaled, my_ytrain, test_size=0.1)

In [18]:
dataset = cb.Pool(x_train, y_train['flag'])

In [27]:
model1 = cb.CatBoostRegressor(loss_function="Poisson")
grid = {'iterations': [194, 193],
        'learning_rate': [ 0.15],
        'depth': [10, 9],
        'l2_leaf_reg': [3, 4 ]}
model1.grid_search(grid, dataset)

0:	learn: 0.5950682	test: 0.5953516	best: 0.5953516 (0)	total: 849ms	remaining: 2m 43s
1:	learn: 0.3917021	test: 0.3922965	best: 0.3922965 (1)	total: 1.79s	remaining: 2m 51s
2:	learn: 0.2818943	test: 0.2826404	best: 0.2826404 (2)	total: 2.68s	remaining: 2m 50s
3:	learn: 0.2211345	test: 0.2220191	best: 0.2220191 (3)	total: 4s	remaining: 3m 9s
4:	learn: 0.1855278	test: 0.1865158	best: 0.1865158 (4)	total: 6.67s	remaining: 4m 11s
5:	learn: 0.1635398	test: 0.1646564	best: 0.1646564 (5)	total: 8.06s	remaining: 4m 12s
6:	learn: 0.1492282	test: 0.1504222	best: 0.1504222 (6)	total: 9.15s	remaining: 4m 4s
7:	learn: 0.1392621	test: 0.1405623	best: 0.1405623 (7)	total: 10.4s	remaining: 4m 1s
8:	learn: 0.1328215	test: 0.1342366	best: 0.1342366 (8)	total: 11.5s	remaining: 3m 56s
9:	learn: 0.1279713	test: 0.1296702	best: 0.1296702 (9)	total: 12.5s	remaining: 3m 50s
10:	learn: 0.1245107	test: 0.1263409	best: 0.1263409 (10)	total: 13.5s	remaining: 3m 45s
11:	learn: 0.1216883	test: 0.1237635	best: 0.12


KeyboardInterrupt



In [6]:
pred = model1.predict(x_test)
rmse = (np.sqrt(mean_squared_error(y_test['flag'], pred)))
r2 = r2_score(y_test['flag'], pred)
score = model1.score(x_test, y_test['flag'])
local_score = model1.score(x_train, y_train['flag'])
aucs = roc_auc_score(y_test['flag'], pred)
print("Testing performance")
print("RMSE: {:.2f}".format(rmse))
print("R2: {:.2f}".format(r2))
print("Score: {:.4f}".format(score))
print("Local Score: {:.4f}".format(local_score))

print("Best params: ", model1._get_params())

Testing performance
RMSE: 0.16
R2: 0.03
Score: 0.0262
Local Score: 0.0905
Best params:  {'depth': 9, 'verbose': 0, 'iterations': 193, 'l2_leaf_reg': 3, 'learning_rate': 0.1, 'loss_function': 'Poisson'}


In [19]:
model9 = cb.CatBoostClassifier()
grid = {'iterations': [190, 180],
        'learning_rate': [0.03, 0.1],
        'depth': [4,  6,],
        'l2_leaf_reg': [0.2, 1, 3]}
model9.grid_search(grid, dataset)

0:	learn: 0.6434651	test: 0.6435664	best: 0.6435664 (0)	total: 76.5ms	remaining: 14.5s
1:	learn: 0.5976632	test: 0.5978814	best: 0.5978814 (1)	total: 177ms	remaining: 16.6s
2:	learn: 0.5562616	test: 0.5565217	best: 0.5565217 (2)	total: 252ms	remaining: 15.7s
3:	learn: 0.5181140	test: 0.5183784	best: 0.5183784 (3)	total: 341ms	remaining: 15.8s
4:	learn: 0.4837684	test: 0.4840517	best: 0.4840517 (4)	total: 418ms	remaining: 15.5s
5:	learn: 0.4523614	test: 0.4526721	best: 0.4526721 (5)	total: 517ms	remaining: 15.9s
6:	learn: 0.4238837	test: 0.4242128	best: 0.4242128 (6)	total: 598ms	remaining: 15.6s
7:	learn: 0.3979587	test: 0.3983199	best: 0.3983199 (7)	total: 678ms	remaining: 15.4s
8:	learn: 0.3744052	test: 0.3748049	best: 0.3748049 (8)	total: 754ms	remaining: 15.2s
9:	learn: 0.3529789	test: 0.3534132	best: 0.3534132 (9)	total: 837ms	remaining: 15.1s
10:	learn: 0.3334611	test: 0.3339259	best: 0.3339259 (10)	total: 930ms	remaining: 15.1s
11:	learn: 0.3157300	test: 0.3162349	best: 0.316234

{'params': {'depth': 6,
  'l2_leaf_reg': 3,
  'iterations': 190,
  'learning_rate': 0.1},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45,

In [26]:
pred = model9.predict(x_test)
rmse = (np.sqrt(mean_squared_error(y_test['flag'], pred)))
r2 = r2_score(y_test['flag'], pred)
score = model9.score(x_test, y_test['flag'])
local_score = model9.score(x_train, y_train['flag'])
print("Testing performance")
print("RMSE: {:.2f}".format(rmse))
print("R2: {:.2f}".format(r2))
print("Score: {:.4f}".format(score))
print("Local Score: {:.4f}".format(local_score))
print("Best params: ", model9._get_params())

Testing performance
RMSE: 0.16
R2: -0.02
Score: 0.9739
Local Score: 0.9733
Best params:  {'depth': 6, 'verbose': 0, 'loss_function': 'Logloss', 'iterations': 190, 'l2_leaf_reg': 3, 'learning_rate': 0.1}


### Get contest results

In [21]:
contest_x = pd.read_csv("../ourfeatures/my_bigXtest_version23.csv")
contest_y = pd.read_csv("../ourfeatures/y_test2.csv")

In [22]:
X_test__scaled = scaler.fit_transform(contest_x)

In [23]:
prediction = model9.predict_proba(X_test__scaled)[:,1]
contest_y["flag"] = prediction

In [9]:
prediction = model1.predict(contest_x)
contest_y["flag"] = prediction

In [24]:
contest_y

Unnamed: 0,app_id,flag
0,805133,0.028185
1,805134,0.014281
2,805135,0.009593
3,805136,0.006475
4,805137,0.050528
...,...,...
188669,1003045,0.023119
188670,1003047,0.009953
188671,1003048,0.008818
188672,1003049,0.022593


In [25]:
contest_y.to_csv("results/newres-super-14.csv", index=False)