In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MultiLabelBinarizer
from sklearn.metrics import f1_score

import optuna

from catboost import CatBoostClassifier
from sklearn.multiclass import OneVsRestClassifier

In [2]:
# Load the labelled reviews; missing values in every column become the literal 'Unknown'.
Xy_train_val = pd.read_csv('data/train.csv', index_col='review_id')
Xy_train_val = Xy_train_val.fillna('Unknown')

# Every column but the last is a feature; the last column is the target.
X_train_val = Xy_train_val.iloc[:, :-1]
y_train_val = Xy_train_val.iloc[:, -1]

# Encode the target as an indicator matrix over the label names '0'..'8'.
# NOTE(review): the raw target values are handed to the binarizer unchanged; if they are
# delimiter-separated strings, each string is iterated character by character — confirm
# this is the intended parsing.
mb = MultiLabelBinarizer(classes=list(map(str, range(9))))
y_train_val = mb.fit_transform(y_train_val)

# The unlabelled test reviews get the same missing-value treatment.
X_test = pd.read_csv('data/test.csv', index_col='review_id').fillna('Unknown')

# Hold out 20% of the labelled rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42
)

X_train_val.shape, y_train_val.shape, X_test.shape



((50876, 10), (50876, 9), (50651, 10))

In [4]:
def objective(trial, seeds=(0, 42, 100)):
    """Optuna objective: mean sample-averaged F1 of a one-vs-rest CatBoost model.

    For each seed, `X_train` / `y_train` (notebook-level variables) are split 90/10,
    a `OneVsRestClassifier` wrapping a `CatBoostClassifier` with the sampled
    hyperparameters is fitted on the larger part, and the smaller part is scored
    with `f1_score(..., average='samples')`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the CatBoost hyperparameters.
    seeds : sequence of int, optional
        Random states for the repeated train/validation splits. Must be non-empty.

    Returns
    -------
    float
        The F1 score averaged over all seeds.
    """
    param_catboost = {
        'depth': trial.suggest_int('depth', 2, 9),
        # `step` is passed by keyword: newer Optuna signatures declare it keyword-only.
        'iterations': trial.suggest_int('iterations', 10, 50, step=10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.25),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.01, 0.5),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10, step=1),
        'border_count': trial.suggest_categorical('border_count', [32, 64, 128, 256]),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 10),
    }

    scores = []
    for seed in seeds:
        train_x, valid_x, train_y, valid_y = train_test_split(
            X_train, y_train, test_size=0.1, random_state=seed
        )

        pipeline = OneVsRestClassifier(CatBoostClassifier(
            **param_catboost,
            cat_features=['city', 'position'],
            text_features=['positive', 'negative'],
            allow_writing_files=False, verbose=25,
        ))
        pipeline.fit(train_x, train_y)

        scores.append(f1_score(valid_y, pipeline.predict(valid_x), average='samples'))

    # Average over however many seeds were used instead of a hard-coded divisor.
    return sum(scores) / len(scores)

# Seed the sampler explicitly so repeated runs propose the same hyperparameter sequence.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=5)

[32m[I 2022-02-17 00:35:44,269][0m A new study created in memory with name: no-name-f755a9fc-153b-4e46-a7b4-35c62c0d8de4[0m
Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 0.6643839	total: 2s	remaining: 1m 37s
25:	learn: 0.4186543	total: 1m 16s	remaining: 1m 10s
49:	learn: 0.3798355	total: 3m 3s	remaining: 0us
0:	learn: 0.6221399	total: 4.98s	remaining: 4m 3s


KeyboardInterrupt: 

In [None]:
# Summarise the search: number of trials run, the best score, and its hyperparameters.
trial = study.best_trial

print(f"Number of finished trials: {len(study.trials)}")

print(f"Best trial value: {trial.value:.2f}")

print("Params: ")
for key, value in trial.params.items():
    # Only floats get numeric formatting; integers and categorical choices are shown
    # as-is so they are neither rendered as e.g. 5.00 nor rejected by a float format.
    shown = f"{value:.4g}" if isinstance(value, float) else f"{value}"
    print(f"\t {key}: {shown}")

In [None]:
# `**` unpacking requires a mapping, so keep the best trial's hyperparameters as a dict
# (a list of (key, value) pairs raises TypeError when unpacked with `**`).
param_catboost = dict(trial.params)

# NOTE(review): the tuning objective wrapped CatBoostClassifier in OneVsRestClassifier,
# whereas this cell fits a bare CatBoostClassifier on the 2-D indicator target — confirm
# that the tuned parameters and the multi-label setup carry over as intended.
model_catboost = CatBoostClassifier(
    **param_catboost,
    cat_features=['city', 'position'],
    text_features=['positive', 'negative'],
    allow_writing_files=False, verbose=25,
)
# Refit on all labelled data (train + validation) and persist the model.
model_catboost.fit(X_train_val, y_train_val)
model_catboost.save_model('17022022_catboost')

In [None]:
# Pair each feature column with the importance reported by the fitted model.
[
    (column, importance)
    for column, importance in zip(X_train.columns, model_catboost.feature_importances_)
]

In [6]:
# Optional: reload a previously saved model instead of retraining it.
# model_catboost = CatBoostClassifier()
# model_catboost.load_model('17022022_catboost')

In [None]:
def predict_multilabel(model, X, binarizer=None):
    """Predict a comma-separated label string per row, never returning an empty one.

    Rows for which `model.predict` selects at least one label get those labels
    joined with ','. Rows with no selected label fall back to the single label
    with the highest `model.predict_proba` score.

    Parameters
    ----------
    model : object
        Fitted classifier exposing `predict(X)` (indicator matrix) and
        `predict_proba(X)` (one score column per label).
    X : array-like or DataFrame
        Rows to predict.
    binarizer : object, optional
        Provides `inverse_transform` and `classes_`. Defaults to the
        notebook-level `mb`.

    Returns
    -------
    numpy.ndarray
        One label string per row of `X`.
    """
    if binarizer is None:
        binarizer = mb

    # Decode the indicator matrix into label tuples and join each into one string.
    label_sets = binarizer.inverse_transform(model.predict(X))
    joined = np.array([','.join(map(str, labels)) for labels in label_sets], dtype=str)

    # Map the arg-max column back to its class name rather than using the raw column
    # index, and keep it a string so both np.where branches share one dtype.
    classes = np.asarray(binarizer.classes_)
    top1 = classes[np.asarray(model.predict_proba(X)).argmax(axis=1)].astype(str)

    return np.where(joined != '', joined, top1)

In [9]:
# Build the submission table (review id plus predicted label string) and write it as CSV.
pd.DataFrame(
    dict(
        review_id=X_test.index,
        target=predict_multilabel(model_catboost, X_test),
    )
).to_csv('answers.csv', index=False)