In [1]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from pathlib import Path
import pandas as pd
from sklearn.metrics import f1_score

In [2]:
work_path = Path('.')
df = pd.read_csv(work_path.joinpath('full_data.csv').__str__())
target_class_map = {target: n for n, target in enumerate(df.topic.unique())}
df.topic = df.topic.map(target_class_map)

test_idx = []
for target in df.topic.unique():
    test_idx += df[df.topic == target].sample(frac=0.25, random_state=42).index.tolist()

In [3]:
train_df = df.drop(index=test_idx)
test_df = df.loc[test_idx]

# CatBoostClassifier on text_feature

In [83]:
train_set = Pool(data=train_df[['content']],
                  label=train_df.topic.to_numpy(),
                  text_features=['content'])

eval_set = Pool(data=test_df[['content']],
                  label=test_df.topic.to_numpy(),
                  text_features=['content'])

In [84]:
param_grid = {
    'iterations': [100, 200],
    'learning_rate': [0.01, 0.1],
    'depth': [3, 6, 9],
    'auto_class_weights': ['Balanced', 'SqrtBalanced']
}

scorer = make_scorer(f1_score, average='weighted')

In [85]:
base_params = {
    'random_state': 42,
    "verbose": False,
    'early_stopping_rounds': 5,
    #'use_best_model': True,
    "loss_function": "MultiClass",
    'text_features': ['content']
}
model = CatBoostClassifier(**base_params)

In [105]:
clf = GridSearchCV(model, param_grid)

In [87]:
clf.fit(train_df[['content']], train_df.topic.to_numpy())

In [88]:
pd.DataFrame(clf.cv_results_)#.dropna(subset=['split0_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_auto_class_weights,param_depth,param_iterations,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,4.26056,0.016294,0.067887,0.0088,Balanced,3,100,0.01,"{'auto_class_weights': 'Balanced', 'depth': 3,...",0.84366,0.872347,0.878623,0.854232,0.547776,0.799328,0.126394,4
1,4.086935,0.012434,0.06075,0.002067,Balanced,3,100,0.1,"{'auto_class_weights': 'Balanced', 'depth': 3,...",0.8035,0.826162,0.814921,0.834433,0.593687,0.774541,0.091026,5
2,7.077548,0.075237,0.05156,0.004011,Balanced,3,200,0.01,"{'auto_class_weights': 'Balanced', 'depth': 3,...",0.839931,0.866036,0.861119,0.852511,0.581349,0.800189,0.109778,3
3,6.782198,0.022347,0.052281,0.002161,Balanced,3,200,0.1,"{'auto_class_weights': 'Balanced', 'depth': 3,...",0.762478,0.786575,0.772453,0.786514,0.6,0.741604,0.071384,7
4,20.061338,0.209043,0.06262,0.004044,Balanced,6,100,0.01,"{'auto_class_weights': 'Balanced', 'depth': 6,...",0.778256,0.802352,0.757532,0.791679,0.545481,0.73506,0.095964,8
5,19.543366,0.071017,0.06231,0.001218,Balanced,6,100,0.1,"{'auto_class_weights': 'Balanced', 'depth': 6,...",0.733792,0.7751,0.691535,0.769871,0.5934,0.71274,0.066765,12
6,37.94712,1.21665,0.057299,0.002405,Balanced,6,200,0.01,"{'auto_class_weights': 'Balanced', 'depth': 6,...",0.769076,0.792025,0.743185,0.781923,0.563845,0.730011,0.084677,10
7,36.386769,0.576203,0.055955,0.003241,Balanced,6,200,0.1,"{'auto_class_weights': 'Balanced', 'depth': 6,...",0.708835,0.739816,0.684648,0.738594,0.595696,0.693518,0.053024,18
8,67.473048,0.355515,0.06642,0.00353,Balanced,9,100,0.01,"{'auto_class_weights': 'Balanced', 'depth': 9,...",0.798623,0.806081,0.764706,0.785653,0.571306,0.745274,0.08811,6
9,67.459434,0.244417,0.067873,0.002905,Balanced,9,100,0.1,"{'auto_class_weights': 'Balanced', 'depth': 9,...",0.736948,0.740103,0.712769,0.752941,0.595122,0.707577,0.057709,14


In [89]:
pred = clf.best_estimator_.predict(test_df[['content']])

In [90]:
f1_score(test_df.topic.to_numpy(), pred, average='weighted')

0.8552614328833373

**Вывод:** CatBoost плохо работает со встроенными параметрами обработки текста.

# CatBoostClassifier on clear text 

In [4]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
mystem = Mystem() 
russian_stopwords = stopwords.words("russian")

In [6]:
def preprocess_text(text):
    tokens = mystem.lemmatize(text.lower())
    tokens = [token for token in tokens if token not in russian_stopwords\
              and token != " " \
              and token.strip() not in punctuation]
    
    text = " ".join(tokens)
    
    return text

In [7]:
train_df.content = train_df.content.apply(preprocess_text)
test_df.content = test_df.content.apply(preprocess_text)

In [8]:
param_grid = {
    'iterations': [100, 200],
    'learning_rate': [0.01, 0.1],
    'depth': [3, 6, 9],
    'auto_class_weights': ['Balanced', 'SqrtBalanced']
}

scorer = make_scorer(f1_score, average='weighted')

In [9]:
base_params = {
    'random_state': 42,
    "verbose": False,
    'early_stopping_rounds': 5,
    #'use_best_model': True,
    "loss_function": "MultiClass",
    'text_features': ['content']
}
model = CatBoostClassifier(**base_params)

In [10]:
clf = GridSearchCV(model, param_grid)

In [13]:
clf.fit(train_df[['content']], train_df.topic.to_numpy())

In [14]:
pd.DataFrame(clf.cv_results_)#.dropna(subset=['split0_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_auto_class_weights,param_depth,param_iterations,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,3.768715,0.055168,0.053888,0.004351,Balanced,3,100,0.01,"{'auto_class_weights': 'Balanced', 'depth': 3,...",0.863167,0.87206,0.886657,0.887518,0.664562,0.834793,0.085606,3
1,3.708021,0.011115,0.050073,0.003418,Balanced,3,100,0.1,"{'auto_class_weights': 'Balanced', 'depth': 3,...",0.816982,0.825301,0.810617,0.839885,0.640172,0.786592,0.073862,5
2,7.028702,0.214062,0.04443,0.003129,Balanced,3,200,0.01,"{'auto_class_weights': 'Balanced', 'depth': 3,...",0.860298,0.862593,0.879484,0.874605,0.670301,0.829456,0.079901,4
3,6.549841,0.051249,0.042555,0.003323,Balanced,3,200,0.1,"{'auto_class_weights': 'Balanced', 'depth': 3,...",0.766208,0.798336,0.783357,0.810043,0.632999,0.758189,0.064298,10
4,19.820013,0.122248,0.049499,0.003356,Balanced,6,100,0.01,"{'auto_class_weights': 'Balanced', 'depth': 6,...",0.803787,0.825014,0.811478,0.832138,0.617217,0.777927,0.080967,6
5,19.471139,0.156368,0.05266,0.003063,Balanced,6,100,0.1,"{'auto_class_weights': 'Balanced', 'depth': 6,...",0.734079,0.780264,0.752654,0.794261,0.618077,0.735867,0.062515,15
6,38.734633,0.224491,0.046289,0.003116,Balanced,6,200,0.01,"{'auto_class_weights': 'Balanced', 'depth': 6,...",0.777395,0.809524,0.790244,0.820373,0.618077,0.763123,0.074038,9
7,37.951181,0.332758,0.045783,0.003819,Balanced,6,200,0.1,"{'auto_class_weights': 'Balanced', 'depth': 6,...",0.712565,0.763052,0.729412,0.782496,0.626112,0.722727,0.054187,19
8,64.978013,0.652478,0.054627,0.00239,Balanced,9,100,0.01,"{'auto_class_weights': 'Balanced', 'depth': 9,...",0.789443,0.81813,0.790818,0.812339,0.627834,0.767713,0.07086,8
9,65.779137,0.302328,0.054401,0.00325,Balanced,9,100,0.1,"{'auto_class_weights': 'Balanced', 'depth': 9,...",0.738669,0.771658,0.732855,0.794835,0.622095,0.732022,0.059406,17


In [15]:
pred = clf.best_estimator_.predict(test_df[['content']])

In [16]:
f1_score(test_df.topic.to_numpy(), pred, average='weighted')

0.8679819666887993