In [None]:
import numpy as np
import pandas as pd
import yaml

from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# Read the YAML config file that holds the project paths.
# Requires PyYAML (`import yaml`) to be available from the notebook's imports cell.

CONFIG_PATH = "config.yaml"
with open(CONFIG_PATH, "r", encoding="utf-8") as config_file:
    # safe_load only builds plain Python containers and scalars, which is all
    # a paths config needs, and never instantiates arbitrary Python objects.
    CONFIG = yaml.safe_load(config_file)

# Fail fast with a clear message: later cells index CONFIG['datasets_folder'],
# and an empty or malformed file would otherwise surface as an obscure
# TypeError/KeyError further down the notebook.
if not isinstance(CONFIG, dict) or "datasets_folder" not in CONFIG:
    raise ValueError(
        f"{CONFIG_PATH} must contain a mapping with a 'datasets_folder' key"
    )

In [None]:
# Load the preprocessed dataset and put its columns into a fixed order.

columns_order = [
    'post_id', 'topic',
    'tfidf_sum', 'tfidf_mean', 'tfidf_max',
    'user_id', 'gender', 'age', 'country', 'city',
    'exp_group', 'os', 'source',
    'month', 'hour', 'day', 'weekday', 'timestamp',
    'target',
]

processed_csv_path = CONFIG['datasets_folder'] + '/processed_df.csv'
# Select exactly the listed columns; a missing column raises KeyError.
data = pd.read_csv(processed_csv_path).loc[:, columns_order]

In [None]:
# Preview the first rows of the loaded, column-reordered dataset.
data.head()

In [None]:
# Split the dataset into train and test parts.

# Number of trailing rows held out as the test set.
TEST_SIZE = 712_175

# Columns removed from the feature matrix: identifiers, the raw timestamp and
# the label itself.
NON_FEATURE_COLUMNS = ['timestamp', 'target', 'user_id', 'post_id']

# Guard against a dataset that is too small: with len(data) <= TEST_SIZE the
# negative slice below would silently produce an empty training set.
if len(data) <= TEST_SIZE:
    raise ValueError(
        f"Dataset has {len(data)} rows, need more than TEST_SIZE={TEST_SIZE} "
        "to leave a non-empty training set"
    )

X = data.drop(columns=NON_FEATURE_COLUMNS)
y = data['target']

# The split is purely positional: the last TEST_SIZE rows become the test set.
# NOTE(review): presumably the rows are ordered by timestamp so that this is a
# time-based hold-out — confirm against the preprocessing step.
X_train = X.iloc[:-TEST_SIZE].copy()
y_train = y.iloc[:-TEST_SIZE].copy()

X_test = X.iloc[-TEST_SIZE:].copy()
y_test = y.iloc[-TEST_SIZE:].copy()

In [None]:
# The classes are imbalanced, so compute a weight for each of them.

classes = np.unique(y_train)
weights = compute_class_weight(
    y=y_train,
    classes=classes,
    class_weight='balanced',
)
# Map every class label to its weight.
class_weights = {label: weight for label, weight in zip(classes, weights)}

In [None]:
# Display the computed class -> weight mapping.
class_weights

In [None]:
# Train the CatBoost classifier.

categorical_features = ['country', 'city', 'topic']

# Constructor arguments collected in one place: the listed columns are treated
# as categorical and the per-class weights offset the class imbalance.
catboost_params = {
    'class_weights': class_weights,
    'cat_features': categorical_features,
}
catboost_model = CatBoostClassifier(**catboost_params)

catboost_model.fit(X_train, y_train)

In [None]:
# Rough quality check of the trained model on the test set.

test_score = catboost_model.score(X_test, y_test)
print(f"Качество на тесте: {test_score}")

# Per-class metrics, returned as a nested dict and shown as the cell output.
test_predictions = catboost_model.predict(X_test)
classification_report(y_test, test_predictions, output_dict=True)

In [None]:
# Persist the trained model in CatBoost's binary "cbm" format inside the
# folder configured as CONFIG['datasets_folder'].
model_path = CONFIG['datasets_folder'] + '/catboost_model'
catboost_model.save_model(model_path, format="cbm")