In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame as df
# import glob
# from collections import Counter
# from tqdm import tqdm, tqdm_notebook
import pickle
from catboost import CatBoostClassifier, Pool
import pyarrow.parquet as pq

In [2]:
def unpickle_file(file):
    with open(file, 'rb') as f:
        return pickle.load(f)

In [3]:
%%time
X_train, y_train = unpickle_file('train_dataset.pcl')
X_valid, y_valid = unpickle_file('valid_dataset.pcl')
X_test = unpickle_file('test_dataset.pcl')

CPU times: user 13.9 s, sys: 3.38 s, total: 17.2 s
Wall time: 20.3 s


In [4]:
X_train.head()

Unnamed: 0,instanceId_userId,instanceId_objectType,instanceId_objectId,audit_pos,audit_clientType,audit_timestamp,audit_timePassed,audit_resourceType,metadata_ownerId,metadata_ownerType,...,HAS_VIDEOS,IS_CENSORED,IS_DISGUSTING,IS_EXTERNAL_SHARE,IS_GIF,IS_INTERNAL_GROUP_SHARE,IS_INTERNAL_SHARE,IS_PART_OF_ALBUM,IS_PART_OF_TOPIC,IS_PROMO
0,508328,Post,23153928,1,MOB,1520363859368,614568,8,14428,GROUP_OPEN_OFFICIAL,...,0,0,0,0,0,0,0,0,1,0
1,12579254,Video,517400,11,WEB,1521568985423,17848919,6,3455,GROUP_OPEN,...,0,0,0,0,0,0,0,0,1,0
2,4965989,Post,37223618,21,API,1518149398057,76599952,8,79231,GROUP_OPEN_OFFICIAL,...,1,0,0,1,0,0,0,0,1,0
3,4099766,Post,28873723,6,MOB,1518408049203,28155035,8,75557,GROUP_OPEN_OFFICIAL,...,0,0,0,0,0,0,0,0,1,0
4,4422422,Post,11752365,2,API,1518442465926,2296891,8,60528,GROUP_OPEN_OFFICIAL,...,1,0,0,1,0,0,0,0,1,0


In [None]:
# audit_timestamp
# index
# #убираем привязку к конкретному времени
# instanceId_objectId
# audit_timePassed

# #перекодировать
# metadata_options

# instanceId_objectId - заменить на эмбединги

In [5]:
ignore_columns = ['instanceId_userId', #нам ненужны идентификаторы конкретного клиента - мы их "перекодировали" в user_activity
                'audit_timestamp', 
                'audit_timePassed', 
                'instanceId_objectId',
                'ImageId']

# ignore_columns_idx = [X_train.columns.get_loc(column_name) for column_name in ignore_columns]
# ignore_columns_idx

In [6]:
# #удаляем ненужные колонки
X_train.drop(ignore_columns, axis=1, inplace=True)
X_valid.drop(ignore_columns, axis=1, inplace=True)

In [7]:
X_train['metadata_numVideos']

0          0
1          0
2          0
3          0
4          0
5          0
6          0
7          0
8          0
9          0
10         0
11         0
12         3
13         2
14         0
15         0
16         0
17         0
18         0
19         0
20         0
21         0
22         0
23         0
24         0
25         0
26         0
27         0
28         0
29         0
          ..
1049970    0
1049971    0
1049972    0
1049973    2
1049974    0
1049975    0
1049976    0
1049977    0
1049978    0
1049979    0
1049980    3
1049981    0
1049982    3
1049983    0
1049984    0
1049985    0
1049986    0
1049987    0
1049988    0
1049989    0
1049990    0
1049991    3
1049992    0
1049993    0
1049994    0
1049995    0
1049996    0
1049997    0
1049998    0
1049999    0
Name: metadata_numVideos, Length: 1050000, dtype: int64

In [8]:
cat_columns = X_train.select_dtypes('object').columns
cat_columns

Index(['instanceId_objectType', 'audit_clientType', 'metadata_ownerType',
       'metadata_platform', 'membership_status'],
      dtype='object')

In [None]:
eval_set = Pool(X_valid, y_valid, cat_features=cat_columns)

In [None]:
model = CatBoostClassifier(iterations=200,
                           od_pval=0.05,
                           early_stopping_rounds=10,
                           loss_function='CrossEntropy',
                           verbose=False,
                           custom_metric=['AUC'],
                           task_type='GPU')

model.fit(X_train, y_train, plot=True, eval_set=(X_valid, y_valid), cat_features=cat_columns)

In [None]:
model.save_model('gr_boost', 
           format="cbm", 
           export_parameters=None,
           pool=None)

In [None]:
model.get_feature_importance(prettified=True)

In [9]:
model = CatBoostClassifier().load_model(fname='gr_boost', format='catboost')

In [10]:
#удаляем ненужные колонки
test_data = X_test.drop(ignore_columns, axis=1)
X_test.fillna(0, inplace=True, downcast=False)
test_data.fillna(0, inplace=True, downcast=False)

In [10]:
set(X_valid.columns)^set(test_data.columns)

set()

In [11]:
%%time
predicted = model.predict_proba(test_data, thread_count=-1, verbose=True)

CPU times: user 1min 44s, sys: 696 ms, total: 1min 45s
Wall time: 1min 2s


In [12]:
X_test['predicted'] = np.argmax(predicted, axis=1)
result = X_test[["instanceId_userId", "instanceId_objectId", "predicted"]].sort_values(by=['instanceId_userId', 'predicted'])
result.head(10)

Unnamed: 0,instanceId_userId,instanceId_objectId,predicted
381646,8,34024426,0
381647,8,32525896,0
381648,8,26815124,0
381649,8,22387388,0
381650,8,36665670,0
381651,8,14385580,0
381652,8,20989661,0
381653,8,22160290,0
381654,8,20974659,0
381655,8,38659430,0


In [13]:
X_test['predicted'].sum()

139248

In [14]:
model.classes_

In [15]:
submit = result.groupby("instanceId_userId")['instanceId_objectId'].apply(list)
submit.head(10)

instanceId_userId
8      [34024426, 32525896, 26815124, 22387388, 36665...
59                        [29096263, 39141251, 23685237]
92                        [19818572, 12367356, 23774973]
107    [39090161, 35377184, 22664858, 35614366, 10501...
158                       [33303760, 30223730, 19280083]
179                       [23137073, 25605851, 31763374]
188                                 [30483180, 37981047]
206               [576587, 35123322, 22440260, 32493490]
242                                 [30060756, 17014343]
254    [8065156, 24219820, 15200850, 16081726, 31074144]
Name: instanceId_objectId, dtype: object

In [16]:
import csv
submit.to_csv('submit.csv.gz', header = False, compression='gzip')#, quoting=csv.QUOTE_NONE, escapechar=' '