In [1]:
import re
import pandas as pd
import numpy as np
import yaml

from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from nltk.stem import WordNetLemmatizer
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.word2vec import Word2Vec
from joblib import Parallel, delayed
from tqdm import tqdm
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Load the YAML config; later cells read its 'data_folder' and
# 'datasets_folder' entries to build file paths.

CONFIG_PATH = "config.yaml"
with open(CONFIG_PATH, "r", encoding="utf-8") as config_file:
    # NOTE(review): if the config only holds plain scalars/lists/dicts,
    # yaml.safe_load would be a stricter choice — confirm before switching.
    CONFIG = yaml.load(config_file, Loader=yaml.FullLoader)

In [None]:
# Load the posts dataset; its 'text' column is processed in the cells below.

post_df = pd.read_csv(CONFIG['data_folder'] + '/post_data.csv')

In [None]:
# Preview the first rows of the freshly loaded posts table.
post_df.head()

### Предобработаем и лемматизируем text

In [None]:
def preprocess_text(text):
    """Normalise raw text into a lowercase, space-separated token string.

    Tokens are maximal runs of word characters excluding the underscore, so
    punctuation, whitespace and underscores all act as separators. Tokens that
    are a single character long are dropped.

    Args:
        text: Raw text. Anything that is not a ``str`` (for example ``None``
            or a float NaN coming from a missing cell) is treated like empty
            text.

    Returns:
        The surviving tokens, lowercased and joined by single spaces, or the
        literal ``'placeholder text'`` when no token survives.
    """
    if not isinstance(text, str):
        # Missing values would otherwise fail inside ``re.findall``; give them
        # the same fallback as an empty string.
        return 'placeholder text'
    # Raw string: '\W' is not a valid string escape, so a non-raw literal
    # triggers an invalid-escape warning on recent Python versions.
    tokens = re.findall(r'[^\W_]+', text)
    # Length is checked on the original token, lowercasing happens afterwards.
    cleaned = " ".join(token.lower() for token in tokens if len(token) > 1)
    return cleaned if cleaned else 'placeholder text'

def lemmatize_row(row: str, lemmatizer: WordNetLemmatizer) -> str:
    """Lemmatize every whitespace-separated word of ``row``.

    Args:
        row: Text to process; it is split on runs of whitespace.
        lemmatizer: Object whose ``lemmatize(word)`` method maps a single word
            to its lemma.

    Returns:
        The lemmas in their original order, joined by single spaces.
    """
    lemmatize = lemmatizer.lemmatize
    lemmas = [lemmatize(word) for word in row.split()]
    return ' '.join(lemmas)

def lemmatize_text_column(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Add a ``lemmatized_<column>`` column holding cleaned, lemmatized text.

    Every value of ``df[column]`` is first passed through ``preprocess_text``
    and then through ``lemmatize_row`` with one shared ``WordNetLemmatizer``.

    Args:
        df: Frame to extend. It is modified in place.
        column: Name of the text column to read.

    Returns:
        The same ``df`` object, now carrying the extra column.
    """
    wordnet = WordNetLemmatizer()
    target = f'lemmatized_{column}'
    # Stage 1: tokenise / lowercase / drop one-character tokens.
    df[target] = df[column].apply(preprocess_text)
    # Stage 2: lemmatize the cleaned text word by word.
    df[target] = df[target].apply(lambda cleaned: lemmatize_row(cleaned, wordnet))
    return df

In [None]:
# Adds the 'lemmatized_text' column derived from 'text'.
post_df = lemmatize_text_column(post_df, 'text')

In [None]:
# Spot-check one random row: lemmatized text next to the original text.
post_df[['lemmatized_text', 'text']].sample(1).values

In [None]:
# Checkpoint the lemmatized posts; a later cell writes to this same path again
# after the w2v embedding columns have been added.
post_df.to_csv(CONFIG['datasets_folder'] + '/post_data_lemmatized_and_embs.csv', index=False)

### Text w2v

In [None]:
class LossLogger(CallbackAny2Vec):
    """Training callback that prints the loss on epoch 0 and on every tenth epoch.

    On epoch 0 the value returned by ``model.get_latest_training_loss()`` is
    printed as is. On later reported epochs the printed number is the
    difference between that value and the one seen at the end of the previous
    epoch.
    """

    def __init__(self):
        # Zero-based index of the epoch whose end is reported next.
        self.epoch = 0

    def on_epoch_end(self, model):
        """Print the loss if this epoch is due, then remember it and advance."""
        current = model.get_latest_training_loss()
        template = 'Loss after epoch {}: {}'
        if self.epoch == 0:
            print(template.format(self.epoch, current))
        elif self.epoch % 10 == 0:
            print(template.format(self.epoch, current - self.loss_previous_step))
        # Stored on every epoch so the next call can compute a difference.
        self.loss_previous_step = current
        self.epoch += 1
        

class EpochLogger(CallbackAny2Vec):
    """Training callback that prints a running epoch number after each epoch."""

    def __init__(self):
        # Zero-based counter: printed first, incremented afterwards.
        self.epoch = 0

    def on_epoch_end(self, model):
        """Print the current epoch index and advance the counter by one."""
        print(f'Epoch {self.epoch}')
        self.epoch = self.epoch + 1
        
class Word2VecTransformer(BaseEstimator, TransformerMixin):
    """Turn whitespace-tokenised texts into fixed-size vectors via a word2vec model.

    Each text becomes the sum of the vectors of its in-vocabulary tokens
    divided by ``alpha * (n_known_tokens + 1)``. The ``+ 1`` keeps the
    denominator non-zero for texts without any known token (they map to the
    zero vector); it also means the result is slightly smaller than a plain
    mean of the token vectors.

    Args:
        w2v_model: Object exposing ``wv.vector_size``, ``wv.key_to_index`` and
            ``wv.get_vector(token)``.
        alpha: Extra divisor applied to every text vector.
    """

    def __init__(self, w2v_model, alpha=1):
        self.w2v_model = w2v_model
        self.alpha = alpha

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the word2vec model is passed in ready to use."""
        return self

    def transform(self, X, y=None):
        """Embed every item of ``X``.

        Args:
            X: Sized iterable of texts. Items without a ``split`` method
                (e.g. ``None`` or NaN) produce an all-zero row.
            y: Ignored.

        Returns:
            Array of shape ``(len(X), vector_size)``.
        """
        word_vectors = self.w2v_model.wv
        vector_size = word_vectors.vector_size
        X_transformed = np.zeros((len(X), vector_size))
        for i, title in enumerate(X):
            try:
                tokens = title.split()
            except AttributeError:
                # Non-text row: keep the zero vector. Only the missing-method
                # case is caught, so KeyboardInterrupt/SystemExit propagate.
                continue

            title_vector = np.zeros((vector_size,))
            # Starts at 1, not 0: see the class docstring.
            counter = 1
            for token in tokens:
                if token in word_vectors.key_to_index:
                    title_vector += word_vectors.get_vector(token)
                    counter += 1

            X_transformed[i] = title_vector / (self.alpha * counter)

        return X_transformed

In [None]:
# One list of tokens per post: the corpus the word2vec model is trained on.
text_w2v_corpus = post_df.lemmatized_text.str.split()
# vector_size=10 fixes the embedding width; the text_w2v_1 .. text_w2v_10
# column names used further down rely on it.
text_w2v_model = Word2Vec(sg=1, hs=1, vector_size=10)
text_w2v_model.build_vocab(text_w2v_corpus)
# compute_loss=True is what lets LossLogger read the loss after each epoch.
text_w2v_model.train(
    text_w2v_corpus,
    total_examples=text_w2v_model.corpus_count,
    epochs=250,
    compute_loss=True,
    callbacks=[LossLogger()]
)

In [None]:
# Persist both the full model and, separately, just its word vectors
# (text_w2v_model.wv).
text_w2v_model.save(CONFIG['datasets_folder'] + '/text_w2v_model_10')
text_word_vectors = text_w2v_model.wv
text_word_vectors.save(CONFIG['datasets_folder'] + '/text_w2v_word_vectors_10')

In [None]:
# Embed every lemmatized post with the trained model.
text2vec = Word2VecTransformer(w2v_model=text_w2v_model)
w2v_text_transform = text2vec.transform(post_df.lemmatized_text.values)

# One 1-based column name per embedding dimension. The width is taken from the
# matrix itself so the names cannot drift from the model's vector_size.
text_w2v_cols = [f'text_w2v_{i}' for i in range(1, w2v_text_transform.shape[1] + 1)]

w2v_df = pd.DataFrame(w2v_text_transform, columns=text_w2v_cols)
# Both indexes are reset so the column-wise concat pairs rows by position.
post_df = pd.concat((post_df.reset_index(drop=True), w2v_df.reset_index(drop=True)), axis=1)

In [None]:
# Preview the posts table with the text_w2v_* embedding columns appended.
post_df.head()

In [None]:
# Write the posts with lemmatized text and embeddings; the training section
# below reads this file back.
post_df.to_csv(CONFIG['datasets_folder'] + '/post_data_lemmatized_and_embs.csv', index=False)

## Обучим катбуст

In [3]:
# Join the user-actions dataset with the text embeddings computed above.

data = pd.read_csv(CONFIG['datasets_folder'] + '/processed_df.csv')
# 'topic' is dropped from the posts table before merging; the 'topic' selected
# in columns_order below therefore has to come from processed_df.
post_df = pd.read_csv(CONFIG['datasets_folder'] + '/post_data_lemmatized_and_embs.csv').drop(['topic'], axis=1)
# merge() without `how` is an inner join in pandas, so action rows whose
# post_id is absent from post_df are dropped here.
data = data.merge(post_df, on='post_id')

# Names of the ten embedding columns (text_w2v_1 .. text_w2v_10).
text_w2v_cols = [f'text_w2v_{i}' for i in range(1, 11)]

# Keep only the listed columns, in this order; everything else from the merge
# (e.g. the raw and lemmatized text) is discarded.
columns_order = [
    'post_id', 'topic', 'tfidf_sum', 
    'tfidf_mean', 'tfidf_max', 'user_id',
    'gender', 'age', 'country', 'city', 
    'exp_group', 'os', 'source','month', 
    'hour', 'day', 'weekday', 'timestamp', 'target'
] + text_w2v_cols
data = data[columns_order]

In [4]:
# Split the dataset into train and test parts.

# Number of trailing rows held out as the test set.
# NOTE(review): the split is positional — presumably the rows are ordered by
# timestamp so that the test set is the most recent slice; confirm.
TEST_SIZE = 712175

# Identifiers, the timestamp and the label itself are not used as features.
X = data.drop(['timestamp', 'target', 'user_id', 'post_id'], axis=1)
y = data['target']

X_train = X.iloc[:-TEST_SIZE].copy()
y_train = y.iloc[:-TEST_SIZE].copy()

X_test = X.iloc[-TEST_SIZE:].copy()
y_test = y.iloc[-TEST_SIZE:].copy()

In [5]:
# The classes are imbalanced, so compute a weight for each of them.

classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
# Map each class label to its weight, the form passed to CatBoost below.
class_weights = dict(zip(classes, weights))

In [6]:
# Show the computed per-class weights.
class_weights

{0: 0.5721843779474648, 1: 3.963353250504535}

In [7]:
# Train the CatBoost classifier.

# Columns handed to CatBoost as categorical features.
categorical_features = ['country', 'city', 'topic']
catboost_model = CatBoostClassifier(
    class_weights=class_weights, 
    cat_features=categorical_features, 
    verbose=False
)

# plot=True renders the interactive training chart in the notebook output.
catboost_model.fit(X_train, y_train, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x1812d4cd990>

In [8]:
# Rough quality check of the trained model on the hold-out set.

# Predict once and reuse the result: the report's 'accuracy' entry is the same
# number that catboost_model.score(X_test, y_test) returns, so a second full
# prediction pass over the test set is avoided.
test_report = classification_report(y_test, catboost_model.predict(X_test), output_dict=True)
print(f"Качество на тесте: {test_report['accuracy']}")
test_report

Качество на тесте: 0.6165787903254116


{'0': {'precision': 0.9221623049854076,
  'recall': 0.6146808674155576,
  'f1-score': 0.7376621579947025,
  'support': 624568},
 '1': {'precision': 0.18658149124585952,
  'recall': 0.6301094661385506,
  'f1-score': 0.2879100418028148,
  'support': 87607},
 'accuracy': 0.6165787903254116,
 'macro avg': {'precision': 0.5543718981156336,
  'recall': 0.6223951667770541,
  'f1-score': 0.5127860998987587,
  'support': 712175},
 'weighted avg': {'precision': 0.831676078497142,
  'recall': 0.6165787903254116,
  'f1-score': 0.6823366640596126,
  'support': 712175}}

In [9]:
# Save the trained model in the "cbm" format.
# NOTE(review): this writes under 'data_folder', while the other artifacts in
# this notebook are written under 'datasets_folder' — confirm that is intended.
catboost_model.save_model(CONFIG['data_folder'] + '/catboost_model_with_text_embs', format="cbm")