In [1]:
%matplotlib inline
import matplotlib.pylab as plt
# Use matplotlib's 'bmh' style sheet for figures.
plt.style.use(['bmh'])
import seaborn as sns

import warnings
# NOTE(review): this suppresses every warning category for the whole session,
# so deprecation warnings raised by the libraries used below are hidden too.
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from pprint import pprint
import re
from collections import Counter
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import joblib
import shap
from sklearn.metrics import *

### Load data

In [2]:
# Read the gzip-compressed, line-delimited JSON feature table and index it by 'id'.
data_file = '../data/features.json.gzip'
data = pd.read_json(data_file, lines=True, compression='gzip')
data = data.set_index('id')
print(data.shape)

# Remove columns that are not used as features; errors='ignore' tolerates any
# that are absent. The axis is given via `columns=` because passing it
# positionally (`drop(labels, 1)`) is rejected by pandas 2.0 and later.
data = data.drop(columns=['text', 'url', 'order', 'domain'], errors='ignore')
print(data.shape)

(258446, 116)
(258446, 113)


In [3]:
# Prefixes shared by the feature column names, in the order used for the
# tag and class columns.
_prefixes = ['self', 'parent', 'grand_parent', 'right', 'left']

# '<prefix>_tag' columns, later passed to the Pool as categorical features.
categorical_cols = [f'{prefix}_tag' for prefix in _prefixes]

# Columns later passed to the Pool as text features: the '<prefix>_class'
# columns in the order above, followed by the '_id' and '_itemprop' columns,
# each listed alphabetically by prefix.
text_cols = [f'{prefix}_class' for prefix in _prefixes] + [
    f'{prefix}_{attribute}'
    for attribute in ('id', 'itemprop')
    for prefix in sorted(_prefixes)
]

In [4]:
# Replace missing values group by group: 'NA' for the categorical columns,
# an empty string for the text columns, and 'none' for the label.
for _columns, _placeholder in (
    (categorical_cols, 'NA'),
    (text_cols, ''),
    (['label'], 'none'),
):
    data[_columns] = data[_columns].fillna(_placeholder)

In [5]:
# Split the frame into the feature matrix (everything except 'label') and the
# target column. `columns=` is used because a positional axis argument
# (`drop('label', 1)`) is rejected by pandas 2.0 and later.
X_train = data.drop(columns='label')
y_train = data['label']
print(X_train.shape)

(258446, 112)


In [6]:
from catboost import CatBoost, CatBoostClassifier, Pool

In [7]:
# Bundle features and labels into a Pool, declaring which columns are
# categorical and which are text.
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=categorical_cols,
    text_features=text_cols,
)

In [8]:
import json
# Parse the stored parameter dictionary straight from the open file handle.
with open('../data/cb_parameters.json') as param_file:
    param = json.load(param_file)

In [9]:
# Display the parameter dictionary loaded from the JSON file.
param

{'auto_class_weights': 'SqrtBalanced',
 'bagging_temperature': 0.5,
 'border_count': 254,
 'class_names': ['none', 'content', 'header', 'date', 'summary', 'subheader'],
 'custom_metric': ['F1', 'Accuracy', 'TotalF1'],
 'depth': 9,
 'dictionaries': [{'dictionary_id': 'Word',
   'gram_count': '1',
   'max_dictionary_size': '50000',
   'occurence_lower_bound': 5},
  {'dictionary_id': 'BiGram',
   'gram_count': '2',
   'max_dictionary_size': '50000',
   'occurence_lower_bound': 5}],
 'early_stopping_rounds': 250,
 'eval_metric': 'TotalF1:use_weights=true;average=Macro',
 'has_time': True,
 'l2_leaf_reg': 0.1,
 'learning_rate': 0.3,
 'min_data_in_leaf': 6,
 'n_estimators': 3000,
 'objective': 'MultiClassOneVsAll',
 'random_strength': 2.0,
 'task_type': 'GPU'}

In [10]:
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

# 'balanced' class weights, returned in the order of param['class_names'] and
# then divided by 5.
# scikit-learn documents `classes` as an ndarray, so the list read from the
# JSON file is converted before the call.
# NOTE(review): the divisor 5 is unexplained, and `class_weight` is only
# referenced by a commented-out override further down — confirm it is still needed.
class_weight = compute_class_weight(
    class_weight='balanced',
    classes=np.asarray(param['class_names']),
    y=y_train,
) / 5
class_weight

array([ 0.03574871,  0.65983967,  5.33098185,  5.34421009, 16.22385436,
       13.31509531])

In [11]:
# Overrides applied on top of the parameters loaded from JSON.
param.update({
    'n_estimators': 1000,
    'early_stopping_rounds': None,
    'auto_class_weights': 'SqrtBalanced',
    # Disabled alternative to auto_class_weights: 'class_weights': class_weight
    'objective': 'MultiClassOneVsAll',
})

In [12]:
# Build the classifier from the parameter dictionary and train it on the pool
# (no evaluation set is passed), reporting progress every 100 iterations.
# fit() hands back the estimator, so binding its result leaves nothing for the
# cell to echo.
model = CatBoostClassifier(**param).fit(train_pool, verbose=100)

0:	learn: 0.5324296	total: 217ms	remaining: 3m 36s
100:	learn: 0.9941927	total: 17.7s	remaining: 2m 37s
200:	learn: 0.9982018	total: 33.2s	remaining: 2m 11s
300:	learn: 0.9992539	total: 48s	remaining: 1m 51s
400:	learn: 0.9996054	total: 1m 2s	remaining: 1m 32s
500:	learn: 0.9997960	total: 1m 16s	remaining: 1m 16s
600:	learn: 0.9999098	total: 1m 31s	remaining: 1m
700:	learn: 0.9999399	total: 1m 45s	remaining: 45s
800:	learn: 0.9999749	total: 2m	remaining: 29.8s
900:	learn: 0.9999909	total: 2m 14s	remaining: 14.8s
999:	learn: 0.9999954	total: 2m 28s	remaining: 0us


In [13]:
# Write the trained model to disk in 'cbm' format.
weights_path = '../data/model_weights_sqrt.cbm'
model.save_model(weights_path, format='cbm')