In [1]:
import pandas as pd
import numpy as np
import catboost as cb
import yaml

from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [2]:
# Read the config file with paths

CONFIG_PATH = "config.yaml"
with open(CONFIG_PATH, "r", encoding="utf-8") as config_file:
    # safe_load only constructs plain YAML types (mappings, sequences, scalars),
    # which is all a config of paths needs, and it never instantiates
    # arbitrary Python objects from tags in the file.
    CONFIG = yaml.safe_load(config_file)

In [3]:
# Load the preprocessed dataset

processed_csv_path = "/".join((CONFIG['datasets_folder'], "processed_df.csv"))

# Columns to keep, in the order the rest of the notebook works with
columns_order = [
    'post_id',
    'topic',
    'tfidf_sum',
    'tfidf_mean',
    'tfidf_max',
    'user_id',
    'gender',
    'age',
    'country',
    'city',
    'exp_group',
    'os',
    'source',
    'month',
    'hour',
    'day',
    'weekday',
    'timestamp',
    'target',
]

# Read the CSV and immediately keep only the listed columns, in that order
data = pd.read_csv(processed_csv_path).loc[:, columns_order]

In [4]:
# Preview the first rows to sanity-check the loaded columns and their order
data.head()

Unnamed: 0,post_id,topic,tfidf_sum,tfidf_mean,tfidf_max,user_id,gender,age,country,city,exp_group,os,source,month,hour,day,weekday,timestamp,target
0,3699,covid,5.503323,4e-06,0.20018,31483,1,24,Russia,Yekaterinburg,1,0,0,11,6,1,0,2021-11-01 06:01:56,1
1,3648,covid,5.228833,4e-06,0.212564,62188,0,22,Russia,Moscow,2,0,0,11,6,1,0,2021-11-01 06:01:56,1
2,368,business,19.789338,1.5e-05,0.233924,89579,0,24,Belarus,Byarozawka,2,1,0,11,6,1,0,2021-11-01 06:01:56,1
3,6433,movie,6.80985,5e-06,0.16298,4258,1,23,Russia,Verkhnyaya Toyma,4,0,0,11,6,1,0,2021-11-01 06:01:56,1
4,2982,covid,5.144025,4e-06,0.195852,103627,1,36,Russia,Arshty,2,0,0,11,6,1,0,2021-11-01 06:01:56,1


In [5]:
# Dataset dimensions as (rows, columns)
data.shape

(4555190, 19)

In [6]:
# Split the dataset into train and validation parts

# Number of trailing rows held out for validation
VAL_SIZE = 1_000_000

# Identifiers, the raw timestamp and the label itself are not used as features
X = data.drop(columns=['timestamp', 'target', 'user_id', 'post_id'])
y = data['target']

# With negative slicing a too-small dataset would silently produce an empty
# train set, so fail loudly instead.
if not 0 < VAL_SIZE < len(data):
    raise ValueError(
        f"VAL_SIZE={VAL_SIZE} must be positive and smaller than the dataset ({len(data)} rows)"
    )

# NOTE(review): the split is purely positional (last VAL_SIZE rows go to
# validation); presumably the rows are already sorted by timestamp so that
# this is a time-based hold-out - confirm against the preprocessing step.
split_idx = len(data) - VAL_SIZE

X_train = X.iloc[:split_idx].copy()
y_train = y.iloc[:split_idx].copy()

X_val = X.iloc[split_idx:].copy()
y_val = y.iloc[split_idx:].copy()

In [7]:
# Fit CatBoost on the train part

# Columns passed to CatBoost as categorical features
cat_features = [
    'country',
    'city',
    'topic',
    'gender',
    'exp_group',
    'os',
    'source',
    'month',
    'hour',
    'day',
    'weekday',
]

# All other hyperparameters are left at the library defaults
params = {
    "cat_features": cat_features,
    "verbose": False,
    "random_seed": 42,
}

catboost_model = CatBoostClassifier(**params)

# plot=True renders the interactive training chart inside the notebook
catboost_model.fit(X=X_train, y=y_train, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x244f54729e0>

In [8]:
# Accuracy on the held-out validation rows
val_accuracy = catboost_model.score(X_val, y_val)
print(f"Качество на тесте: {val_accuracy}")

# Per-class precision / recall / F1 on the same rows, shown as the cell output
val_pred = catboost_model.predict(X_val)
classification_report(y_val, val_pred, output_dict=True)

Качество на тесте: 0.602874


{'0': {'precision': 0.6040533197082442,
  'recall': 0.565653452921344,
  'f1-score': 0.5842230788720421,
  'support': 493249},
 '1': {'precision': 0.6018617115183411,
  'recall': 0.6391028335415224,
  'f1-score': 0.6199234728546846,
  'support': 506751},
 'accuracy': 0.602874,
 'macro avg': {'precision': 0.6029575156132927,
  'recall': 0.6023781432314332,
  'f1-score': 0.6020732758633633,
  'support': 1000000},
 'weighted avg': {'precision': 0.6029427200664026,
  'recall': 0.602874,
  'f1-score': 0.6023142892231401,
  'support': 1000000}}

In [9]:
# ROC AUC is computed from the predicted probability of the positive class (column 1)
train_proba = catboost_model.predict_proba(X_train)[:, 1]
print(f"Качество на трейне: {roc_auc_score(y_train, train_proba)}")

val_proba = catboost_model.predict_proba(X_val)[:, 1]
print(f"Качество на вале: {roc_auc_score(y_val, val_proba)}")

Качество на трейне: 0.5849210457749224
Качество на вале: 0.6433835276652008


Обучим финальную модель на полном датасете (train + val)

In [10]:
# Columns passed to CatBoost as categorical features
cat_features = [
    'country',
    'city',
    'topic',
    'gender',
    'exp_group',
    'os',
    'source',
    'month',
    'hour',
    'day',
    'weekday',
]

# Shallow trees and a fixed number of boosting iterations for the final model
params = {
    "cat_features": cat_features,
    "depth": 2,
    "iterations": 250,
    "verbose": False,
    "random_seed": 42,
}

catboost_model = CatBoostClassifier(**params)

# Fit on every available row (X, y are not split here);
# plot=True renders the interactive training chart inside the notebook
catboost_model.fit(X=X, y=y, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x26b6a83be50>

In [11]:
# Accuracy on the same rows the model was fitted on (X, y), i.e. an in-sample figure
full_accuracy = catboost_model.score(X, y)
print(f"Качество: {full_accuracy}")

# Per-class precision / recall / F1 on those rows, shown as the cell output
full_pred = catboost_model.predict(X)
classification_report(y, full_pred, output_dict=True)

Качество: 0.5985704218704379


{'0': {'precision': 0.6101877254489632,
  'recall': 0.5578047997624356,
  'f1-score': 0.5828216100097918,
  'support': 2289905},
 '1': {'precision': 0.5886922404825281,
  'recall': 0.6397791006429655,
  'f1-score': 0.6131734287367956,
  'support': 2265285},
 'accuracy': 0.5985704218704379,
 'macro avg': {'precision': 0.5994399829657456,
  'recall': 0.5987919502027006,
  'f1-score': 0.5979975193732937,
  'support': 4555190},
 'weighted avg': {'precision': 0.5994980726217066,
  'recall': 0.5985704218704379,
  'f1-score': 0.5979154962549321,
  'support': 4555190}}

In [12]:
# In-sample ROC AUC from the predicted probability of the positive class (column 1)
full_proba = catboost_model.predict_proba(X)[:, 1]
print(f"Качество на трейне: {roc_auc_score(y, full_proba)}")

Качество на трейне: 0.6382765788638578


In [13]:
# Persist the fitted model to the configured data folder in the "cbm" format
model_path = "/".join((CONFIG['data_folder'], "catboost_model"))
catboost_model.save_model(model_path, format="cbm")