# **Problem statement**

This is a problem of *binary classification*.

**Train set:** 10000 samples.

**Test set:** 2000 samples.

**Features:** 30.

Steps:
1.   Feature engineering.
2.   Validation setup.
3.   Parameter tuning.

Expected result: for each sample in the test set, the predicted probability that it belongs to the positive class.

Solution quality is measured by the standard ROC-AUC metric:
```
from sklearn.metrics import roc_auc_score

score = roc_auc_score(test_target, solution_frame)
```
Points are calculated using the formula:

`result = 100 if y > 0.82 else (40 + (y - 0.6) / 0.22 * 60 if y > 0.6 else 0)`

where *y* is the ROC-AUC score. The number of points is rounded to an integer.

# **Install modules**

In [1]:
!pip install catboost



# **Import modules**

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
pd.plotting.register_matplotlib_converters()
import seaborn as sns

from catboost import *
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score

  import pandas.util.testing as tm


# **Load data**

In [3]:
# Locations of the three input CSV files; adjust them if the data lives elsewhere.
train_path = './data/train.csv'
target_path = './data/train-target.csv'
test_path = './data/test.csv'

# The files are read with header=None, so pandas labels the columns 0, 1, 2, ...
# Read order is unchanged: train features, train labels, test features.
train_data, target_data, test_data = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (train_path, target_path, test_path)
)

# Optional: uncomment to show floats with two decimals in the tables below.
# pd.set_option('display.float_format', lambda x: '%.2f' % x)

# **Analyse data**

In [4]:
# Preview the first five rows of the training features (n=5 is the default, spelled out).
train_data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,-220.53053,-70.19744,119.035181,20.711737,-6.152986,52.225051,-23.230903,-166.521871,-41.571463,2.620401,271.292251,2.695313,44.529272,12.998666,25.157748,2.192931,3.438528,-81.684213,49.566982,-36.818566,16.93642,46.543585,-80.962171,-146.652218,10.899085,4.370985,4.060272,40.68089,-37.942655,33.838225
1,-8.536541,-8.305435,-117.828269,-28.588333,14.22324,-42.087807,-45.538664,58.898976,27.749744,2.545916,-97.233793,-4.899233,-104.382283,2.867261,-128.433975,2.280566,3.412432,31.50191,-47.950176,103.060202,57.027508,121.304258,31.507396,88.282912,66.766185,-4.363974,-5.791376,-32.101939,-52.255449,-4.826111
2,126.229973,133.463504,-105.511797,-149.110267,-3.673355,-104.302244,5.815395,100.838385,-46.240211,0.1417,194.054804,-3.134316,-145.510845,86.972373,-126.200888,0.598976,3.351103,51.08496,141.443443,-150.020696,4.819849,88.905713,51.475105,-73.257358,108.947287,8.56724,0.116269,-117.205053,-107.835928,-93.990332
3,369.571563,11.850181,-299.969407,29.371721,-3.457523,-115.901854,159.134323,-149.741411,-108.847522,0.078277,-215.802195,-1.724561,-128.050861,-99.99771,138.824836,0.460472,1.891893,-73.956562,-116.557681,310.894782,80.930568,-166.476192,-73.882682,108.222355,19.573192,-2.242024,0.515601,-25.218215,121.674228,-16.877368
4,-99.563708,-85.166292,-73.363391,-35.357907,5.437025,-54.892519,56.430532,-49.237377,-50.054608,2.586612,-115.997344,-0.058934,97.666751,36.416791,-124.206264,0.638184,3.989474,-23.264695,-92.966637,121.929801,31.682235,209.163125,-22.847302,-62.067337,58.109167,2.862082,0.920341,-132.875142,3.492953,-15.799596


In [5]:
# Per-column summary statistics of the training features.
# The quartiles listed are pandas' defaults, written out explicitly.
train_data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,12.844797,-0.299114,17.4454,-1.089765,0.06051,-0.067073,2.00918,-28.428487,16.467341,1.349504,21.271771,0.009568,-12.566037,0.868286,17.187938,1.077477,3.061034,-13.037922,0.671168,21.573813,0.211515,-2.63617,-13.042007,0.428028,0.999935,-0.153092,0.49348,1.096011,0.690934,-8.751687
std,331.21897,87.84063,166.830889,77.132117,7.079223,95.32823,112.574723,124.090562,141.363732,1.263255,181.467439,2.024503,87.855783,170.544586,117.722386,0.743995,0.846151,62.050977,80.674359,229.410302,46.418509,165.031271,62.060615,82.821135,77.482044,8.557124,1.888925,99.041787,100.623747,78.557761
min,-1625.085835,-285.272638,-670.544061,-287.536745,-25.612006,-404.876372,-395.781633,-514.417062,-526.193023,1.1e-05,-779.558716,-8.085427,-333.520093,-619.878237,-420.900247,0.001132,0.098466,-254.889907,-314.385808,-819.812924,-196.452972,-766.155471,-255.701616,-362.646319,-276.221505,-31.560708,-9.925537,-381.244544,-388.045812,-338.939731
25%,-186.892257,-60.3042,-90.723945,-52.634237,-4.781103,-65.870246,-74.324525,-110.342167,-78.871151,0.073837,-93.345528,-1.365797,-71.433708,-113.590347,-61.817411,0.490904,2.449836,-54.146848,-53.979555,-132.440268,-31.016369,-115.622198,-54.202381,-56.1847,-51.076999,-5.897427,0.045169,-65.880474,-66.301084,-61.817245
50%,30.376174,-0.576299,18.361277,-1.449727,-0.017278,-1.396145,2.512421,-27.85759,14.174901,2.525594,24.600206,0.005653,-13.759096,-5.891067,15.790737,0.885891,3.049842,-12.724674,0.733288,17.014555,0.459627,-8.188229,-12.62832,0.334218,0.464923,-0.101883,0.492704,0.700402,-4.469156,-10.324727
75%,231.413537,58.639722,126.733466,51.377692,4.827844,63.905144,76.779477,55.226946,110.592913,2.600066,137.302646,1.375353,43.800738,111.017308,95.642441,1.618811,3.747585,28.655681,54.538959,172.038852,31.697139,101.651604,28.586541,55.119716,52.929229,5.634585,0.941508,67.585794,62.984446,43.972321
max,1746.951115,348.103285,786.045193,305.712632,28.931791,351.851613,447.816362,402.650987,598.634382,2.67441,870.338426,8.354485,378.61752,630.578879,465.070306,3.785537,4.988274,202.886675,291.272532,1038.727492,191.379527,681.156385,202.958195,302.583135,264.326359,33.644683,12.157634,418.514443,455.981845,340.571609


In [6]:
# Preview the first five training labels (n=5 is the default, spelled out).
target_data.head(n=5)

Unnamed: 0,0
0,1
1,1
2,0
3,0
4,1


In [7]:
# Summary statistics of the label column. Assuming the labels are 0/1, the
# reported mean is the share of positive samples (a quick class-balance check).
target_data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,0
count,10000.0
mean,0.5051
std,0.499999
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [8]:
# Preview the first five rows of the test features (n=5 is the default, spelled out).
test_data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,394.650101,-102.778703,-48.854416,17.463193,4.744214,-90.50426,135.111157,-27.691507,192.961067,1.755201,-213.139157,-1.371032,167.220896,-438.697273,117.776726,2.678264,2.336249,2.098611,-131.893003,439.386126,-56.947165,140.952506,1.885744,11.483086,-131.665132,12.876561,0.156747,-185.849095,-29.314711,-84.254441
1,62.200701,-139.3235,235.644493,10.860594,1.92111,39.08844,-186.024836,-53.158605,-122.290561,1.426944,235.208582,1.730758,30.691925,232.770741,-150.823103,2.363756,0.432066,0.924914,-5.622402,-402.651882,-45.854744,-148.360929,2.614495,-24.895114,-40.050598,7.376349,1.962719,-123.77691,65.756136,-10.378203
2,232.736729,5.502143,204.738165,45.738959,1.338264,33.604743,92.235943,-115.127927,-95.263085,2.048801,189.115207,0.951928,-47.048687,80.40848,130.937671,1.001301,2.613453,2.824269,-84.963692,-264.639656,-15.638165,-234.60649,0.409558,286.011182,48.109151,-9.277772,1.838195,19.461847,126.144518,-121.300353
3,340.045437,-129.924115,62.694027,61.899097,-1.294288,-58.616589,-55.080664,10.115134,9.051856,2.732693,-21.916455,1.305913,11.176007,44.407738,-27.797463,0.696783,2.966355,0.467539,-72.113602,-56.165896,13.407883,-56.332212,2.801811,70.875892,26.673509,2.389621,2.188665,109.700845,109.378365,-58.425305
4,-60.576957,71.958472,489.203093,121.380719,-3.620431,44.666983,-161.487192,137.920737,540.680255,1.172919,272.675402,0.539,-9.820108,577.827262,-235.411444,0.808339,2.801212,0.059909,-93.998931,-369.64109,-51.559846,-6.645415,1.889414,-21.814728,-91.701674,-9.206418,1.211916,-11.602746,-8.162451,-87.90297


In [9]:
# Per-column summary statistics of the test features, for comparison with the
# training statistics. The quartiles listed are pandas' defaults, written out explicitly.
test_data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,12.33003,0.594732,21.036386,-1.068319,0.28073,2.078126,1.368706,-30.727283,13.734982,1.526372,26.092106,0.061548,-10.591496,-2.027994,11.19285,1.50888,1.471736,1.494923,1.276287,15.396867,0.299517,-3.980763,1.488627,-0.649107,-0.104626,0.228077,1.494488,-2.989028,2.247501,-7.972199
std,328.044932,87.544043,162.581255,78.113002,7.268029,93.784396,112.922529,123.900147,141.015919,0.879543,177.318166,2.068898,86.821138,169.421484,119.353158,0.86672,0.882983,0.862269,79.328245,222.993899,45.563554,166.370946,0.865149,83.293617,77.554074,8.635147,0.860202,96.719378,101.362127,76.902106
min,-1378.019194,-278.832794,-496.539117,-261.798297,-30.233249,-305.723206,-360.742123,-551.178542,-425.413503,0.004806,-565.358521,-8.959283,-337.703345,-562.933726,-472.332656,0.000238,0.000628,0.003018,-270.694067,-820.626551,-134.89269,-570.100943,0.000176,-264.977882,-237.884728,-25.897209,0.001916,-314.590496,-390.183141,-255.942366
25%,-184.510033,-62.338859,-85.934816,-52.77638,-4.544456,-61.315477,-74.239382,-116.618401,-81.085627,0.774433,-86.432018,-1.308002,-67.099251,-116.454502,-65.928997,0.721389,0.718332,0.76399,-52.222029,-139.438406,-31.603735,-113.846773,0.730311,-56.848055,-53.436358,-5.412394,0.743161,-67.248976,-65.97272,-57.571427
50%,26.355285,-0.258353,25.447463,-0.401079,0.308074,-0.481592,0.687663,-30.509444,14.443198,1.521427,33.4161,0.120962,-12.376938,-11.147522,11.209582,1.515249,1.437937,1.483309,-0.976532,3.541607,-0.005193,-13.143199,1.507663,-2.288634,-0.548459,0.327443,1.488349,-7.102902,-5.558406,-9.936732
75%,224.14215,61.381564,125.605397,52.32719,5.492492,62.86743,78.602144,52.159752,107.182238,2.324663,147.175637,1.462057,47.060195,110.385643,90.804152,2.26805,2.240815,2.250212,56.664534,167.597548,31.608875,99.427691,2.226652,53.654201,51.593613,5.872039,2.232687,62.150788,64.326872,41.716363
max,1260.03755,309.024478,705.392347,272.912431,22.347921,376.926664,507.25396,389.551551,588.96808,2.999879,618.520385,7.273638,314.003973,638.697169,421.935986,2.998587,2.999357,2.999751,293.785474,727.874581,170.590508,599.481226,2.999377,286.011182,298.695818,31.723915,2.998223,342.78096,407.922899,273.218286


In [None]:
# Draw one histogram (20 bins) with a density curve per training feature.
# The panels are driven by the frame's actual column labels and the grid height
# is derived from the number of columns, instead of assuming labels 0..n-1 and a
# fixed 6x5 grid. That keeps the cell working when train_data has fewer columns
# or non-consecutive labels (for example after features have been dropped).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; consider
# sns.histplot(..., kde=True) once the installed seaborn version provides it.
train_feature_labels = list(train_data.columns)
train_grid_cols = 5
train_grid_rows = max(1, -(-len(train_feature_labels) // train_grid_cols))  # ceiling division

plt.figure(figsize=(27, 27))
for position, label in enumerate(train_feature_labels, start=1):
    plt.subplot(train_grid_rows, train_grid_cols, position)
    sns.distplot(train_data[label], bins=20)

In [None]:
# Draw one histogram (20 bins) with a density curve per test feature.
# The panels are driven by the frame's actual column labels and the grid height
# is derived from the number of columns, instead of assuming labels 0..n-1 and a
# fixed 6x5 grid, so the cell does not depend on the exact column layout.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; consider
# sns.histplot(..., kde=True) once the installed seaborn version provides it.
test_feature_labels = list(test_data.columns)
test_grid_cols = 5
test_grid_rows = max(1, -(-len(test_feature_labels) // test_grid_cols))  # ceiling division

plt.figure(figsize=(27, 27))
for position, label in enumerate(test_feature_labels, start=1):
    plt.subplot(test_grid_rows, test_grid_cols, position)
    sns.distplot(test_data[label], bins=20)

# **Preprocess data**

In [12]:
# Column labels excluded from modelling.
# NOTE(review): the selection rationale is not recorded; in the describe() tables
# of this notebook these six columns show clearly different statistics in the
# train and test sets, which is presumably why they are removed - confirm.
features_to_drop = [9, 15, 16, 17, 22, 26]

# Build new frames instead of re-binding train_data. The raw frame stays intact,
# so this cell can be re-run without a KeyError (dropping already-dropped columns)
# and earlier cells that display or plot train_data keep seeing the same data.
train_features = train_data.drop(columns=features_to_drop)
X_test = test_data.drop(columns=features_to_drop)

# Training matrix and flat label vector used by the model cells below.
X = np.array(train_features)
y = np.ravel(target_data)

# Cheap invariants: one label per training row, same feature count in train and test.
assert X.shape[0] == y.shape[0], "number of training rows and labels differ"
assert X.shape[1] == X_test.shape[1], "train and test feature counts differ"

# **Build Catboost model**

In [13]:
# Hold out part of the labelled data for validation: 80% of the rows go to
# training, the rest to validation. The fixed random_state makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=123,
)

In [14]:
# Baseline CatBoost configuration.
# Author's note: these parameters give ROC AUC 0.8224 (the note does not say on
# which data that figure was measured).
baseline_params = dict(
    iterations=1400,
    learning_rate=0.01,
    l2_leaf_reg=8.0,
    depth=10,
    rsm=0.5,
    loss_function='Logloss',
    logging_level='Silent',
    # Relies on an eval_set being supplied to fit().
    use_best_model=True,
    random_state=123,
)
model = CatBoostClassifier(**baseline_params)

In [15]:
# Train the baseline model on the training split; the hold-out split is passed
# as the evaluation set. fit() is the last expression, so the cell shows its return value.
model.fit(X_train, y_train, eval_set=(X_valid, y_valid))

<catboost.core.CatBoostClassifier at 0x7fe584620828>

In [16]:
# Evaluate the baseline model with ROC-AUC on the training and validation splits.
#
# The task metric is ROC-AUC over predicted probabilities, so each split is now
# scored on predict_proba() output (the "(proba)" lines). Previously the training
# split was scored only on hard class labels, which left no training figure
# comparable to the probability-based validation score.
# The label-based scores are kept for reference; they are lower because
# thresholding discards the ranking information ROC-AUC measures.
train_labels = model.predict(X_train)
train_proba = model.predict_proba(X_train)[:, 1]  # column 1 = positive class
predictions = model.predict(X_valid)
preds_proba = model.predict_proba(X_valid)

score_train = roc_auc_score(y_train, train_labels)
print("Prediction score for training data:", score_train)
score_train_proba = roc_auc_score(y_train, train_proba)
print("Prediction score for training data (proba):", score_train_proba)

score = roc_auc_score(y_valid, predictions)
print("Prediction score for validation data:", score)
score_proba = roc_auc_score(y_valid, preds_proba[:, 1])
print("Prediction score for validation data (proba):", score_proba)

Prediction score for training data: 0.9853696191456116
Prediction score for validation data: 0.801920192019202
Prediction score for validation data (proba): 0.8427112711271126


# **Cross-validation**

In [17]:
#cv_params = model.get_params()
#cv_data = cv(
#    Pool(X, label=y),
#    cv_params,
#    fold_count=5,
#    shuffle=True,
#    plot=True
#    )

In [18]:
#print('Best validation AUC score: {:.6f}±{:.2f} on step {}'.format(
#    np.max(cv_data['test-AUC-mean']),
#    cv_data['test-AUC-std'][np.argmax(cv_data['test-AUC-mean'])],
#    np.argmax(cv_data['test-AUC-mean'])
#))

In [19]:
# Wrap the training and validation splits (features plus labels) in CatBoost Pool objects.
# NOTE(review): no other cell visible in this notebook uses these pools.
train_pool = Pool(data=X_train, label=y_train)
validate_pool = Pool(data=X_valid, label=y_valid)

# **Hyperparameter tuning**

In [20]:
#param_grid = {
#    'iterations': [500, 1000, 2000],
#    'learning_rate': [0.01, 0.03],
#    'l2_leaf_reg': [7, 9, 11],
#    'depth': [8, 10, 12],
#    'rsm': [0.4, 0.5, 0.6]
#    }

#cbc = CatBoostClassifier(loss_function='Logloss')
#tuned_model = cbc.grid_search(param_grid,
#                              X=X,
#                              y=y,
#                              cv=5
#                              )

In [21]:
# Configuration of the tuned model.
# NOTE(review): l2_leaf_reg=3.5 and rsm=0.98 are not among the candidates of the
# commented-out grid search in this notebook, so these values were presumably
# chosen by hand - confirm.
tuned_params = {
    'iterations': 1500,
    'learning_rate': 0.01,
    'l2_leaf_reg': 3.5,
    'depth': 8,
    'rsm': 0.98,
    'loss_function': 'Logloss',
    'logging_level': 'Silent',
    # Relies on an eval_set being supplied to fit().
    'use_best_model': True,
    'random_state': 123,
}
tuned_model = CatBoostClassifier(**tuned_params)

# Train on the training split with the hold-out split as the evaluation set.
# fit() is the last expression, so the cell shows its return value.
tuned_model.fit(X_train, y_train, eval_set=(X_valid, y_valid))

<catboost.core.CatBoostClassifier at 0x7fe5847ae5f8>

In [22]:
#tuned_model = CatBoostClassifier(
#    iterations=best['iterations'],
#    learning_rate=best['learning_rate'],
#    l2_leaf_reg=best['l2_leaf_reg'],
#    loss_function=best['loss_function'],
#    depth=best['depth'],
#    rsm=best['rsm'],
#    task_type='GPU',
#    verbose=False
#)
#cv_data = cv(Pool(X, y),
#             tuned_model.get_params(),
#             fold_count=5)

In [23]:
#print('Precise validation AUC score: {}'.format(np.max(cv_data['test-AUC-mean'])))

In [24]:
# Evaluate the tuned model with ROC-AUC on the training and validation splits.
#
# The task metric is ROC-AUC over predicted probabilities, so each split is now
# scored on predict_proba() output (the "(proba)" lines). Previously the training
# split was scored only on hard class labels, which left no training figure
# comparable to the probability-based validation score.
# The label-based scores are kept for reference; they are lower because
# thresholding discards the ranking information ROC-AUC measures.
train_labels = tuned_model.predict(X_train)
train_proba = tuned_model.predict_proba(X_train)[:, 1]  # column 1 = positive class
predictions = tuned_model.predict(X_valid)
preds_proba = tuned_model.predict_proba(X_valid)

score_train = roc_auc_score(y_train, train_labels)
print("Prediction score for training data:", score_train)
score_train_proba = roc_auc_score(y_train, train_proba)
print("Prediction score for training data (proba):", score_train_proba)

score = roc_auc_score(y_valid, predictions)
print("Prediction score for validation data:", score)
score_proba = roc_auc_score(y_valid, preds_proba[:, 1])
print("Prediction score for validation data (proba):", score_proba)

Prediction score for training data: 0.9260416359993822
Prediction score for validation data: 0.7998799879987999
Prediction score for validation data (proba): 0.8447094709470947


In [25]:
# Score the test set with the tuned model and keep column 1 of predict_proba(),
# i.e. the probability of the positive class requested by the task.
test_proba = tuned_model.predict_proba(X_test)
predictions = test_proba[:, 1]

# Write one probability per line, without index or header row.
output = pd.DataFrame(predictions)
output.to_csv('./data/submission.csv', index=False, header=False)