In [1]:
import re
import pandas as pd
import numpy as np
import yaml

from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score
from nltk.stem import WordNetLemmatizer
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.word2vec import Word2Vec
from joblib import Parallel, delayed
from tqdm import tqdm
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Прочтём файл конфига с путями

CONFIG_PATH = "config.yaml"
with open(CONFIG_PATH, "r", encoding="utf-8") as config_file:
    CONFIG = yaml.load(config_file, Loader=yaml.FullLoader)

In [3]:
# Загрузим датасет постов и обработаем тексты

post_df = pd.read_csv(CONFIG['data_folder'] + '/post_data.csv')

In [4]:
post_df.head()

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks The UK manufact...,business
1,2,Aids and climate top Davos agenda Climate cha...,business
2,3,Asian quake hits European shares Shares in Eu...,business
3,4,India power shares jump on debut Shares in In...,business
4,5,Lacroix label bought by US firm Luxury goods ...,business


### Предобработаем и лемматизируем text

In [5]:
def preprocess_text(text):
    text = re.findall('[^\W_]+', text)
    text = [token.lower() for token in text if len(token) > 1]
    text = " ".join(text)
    if len(text) == 0:
        return 'placeholder text'
    return text

def lemmatize_row(row: str, lemmatizer: WordNetLemmatizer) -> str:
    list_of_words = row.split()
    list_of_words = list(map(lemmatizer.lemmatize, list_of_words))
    return ' '.join(list_of_words)

def lemmatize_text_column(df: pd.DataFrame, column: str) -> pd.DataFrame:
    lemmatizer = WordNetLemmatizer()
    lemmatized_col = f'lemmatized_{column}'
    df[lemmatized_col] = df[column].apply(lambda x: preprocess_text(x))
    df[lemmatized_col] = df[lemmatized_col].apply(
        lambda x: lemmatize_row(x, lemmatizer)
    )
    return df

In [6]:
post_df = lemmatize_text_column(post_df, 'text')

In [7]:
post_df[['lemmatized_text', 'text']].sample(1).values

array([['uk young top euro earnings league british child enjoy the highest average annual income in europe more than double that of spanish or italian youngster report suggests child in the uk between the age of 10 and 17 had an annual income of 775 said market analyst datamonitor they use pester power to get their parent to stump up nearly third of this income the report said a for how they spend their cash the bulk go on personal care soft drink and food datamonitor said datamonitor add that british teenager are keen on personal care because it help them combine two seemingly contradictory emotional need the desire to fit in and the desire to express their individuality british teenage girl compared to their counterpart in seven european country are the most keen to use make up product nearly three out of four girl said they used make up according to the datamonitor report the trend marked british teenager out a particularly important to cosmetic manufacturer a they are likely to exp

In [8]:
post_df.to_csv(CONFIG['datasets_folder'] + '/post_data_lemmatized_and_embs.csv', index=False)

### Text w2v

In [9]:
class LossLogger(CallbackAny2Vec):
    
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            print('Loss after epoch {}: {}'.format(self.epoch, loss))
        elif self.epoch % 10 == 0:
            print('Loss after epoch {}: {}'.format(self.epoch, loss - self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss
        

class EpochLogger(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        print(f'Epoch {self.epoch}')
        self.epoch += 1
        
class Word2VecTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, w2v_model, alpha=1):
        
        self.w2v_model = w2v_model
        self.alpha = alpha
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_transformed = np.zeros((len(X), self.w2v_model.wv.vector_size))
        for i, title in enumerate(X):
            title_vector = np.zeros((self.w2v_model.wv.vector_size,))
            try:
                tokens = title.split()
            except BaseException:
                continue
            
            counter = 1
            
            for token in tokens:
                if token in self.w2v_model.wv.key_to_index:
                    title_vector += self.w2v_model.wv.get_vector(token)
                    counter += 1 
                    
            X_transformed[i] = title_vector / (self.alpha * counter)
        
        return X_transformed

In [10]:
text_w2v_corpus = post_df.lemmatized_text.str.split()
text_w2v_corpus = text_w2v_corpus
text_w2v_model = Word2Vec(sg=1, hs=1, vector_size=10)
text_w2v_model.build_vocab(text_w2v_corpus)
text_w2v_model.train(
    text_w2v_corpus,
    total_examples=text_w2v_model.corpus_count,
    epochs=250,
    compute_loss=True,
    callbacks=[LossLogger()]
)

Loss after epoch 0: 25454584.0
Loss after epoch 10: 735712.0
Loss after epoch 20: 840792.0
Loss after epoch 30: 893192.0
Loss after epoch 40: 902232.0
Loss after epoch 50: 892272.0
Loss after epoch 60: 894320.0
Loss after epoch 70: 859496.0
Loss after epoch 80: 845536.0
Loss after epoch 90: 0.0
Loss after epoch 100: 0.0
Loss after epoch 110: 0.0
Loss after epoch 120: 0.0
Loss after epoch 130: 0.0
Loss after epoch 140: 0.0
Loss after epoch 150: 0.0
Loss after epoch 160: 0.0
Loss after epoch 170: 0.0
Loss after epoch 180: 0.0
Loss after epoch 190: 0.0
Loss after epoch 200: 0.0
Loss after epoch 210: 0.0
Loss after epoch 220: 0.0
Loss after epoch 230: 0.0
Loss after epoch 240: 0.0


(289528214, 381662000)

In [11]:
text_w2v_model.save(CONFIG['datasets_folder'] + '/text_w2v_model_10')
text_word_vectors = text_w2v_model.wv
text_word_vectors.save(CONFIG['datasets_folder'] + '/text_w2v_word_vectors_10')

In [12]:
text2vec = Word2VecTransformer(w2v_model=text_w2v_model)
w2v_text_transform = text2vec.transform(post_df.lemmatized_text.values)

text_w2v_cols = [f'text_w2v_{i}' for i in range(1, 11)]

w2v_df = pd.DataFrame(w2v_text_transform, columns=text_w2v_cols)
post_df = pd.concat((post_df.reset_index(drop=True), w2v_df.reset_index(drop=True)), axis=1)

In [13]:
post_df.head()

Unnamed: 0,post_id,text,topic,lemmatized_text,text_w2v_1,text_w2v_2,text_w2v_3,text_w2v_4,text_w2v_5,text_w2v_6,text_w2v_7,text_w2v_8,text_w2v_9,text_w2v_10
0,1,UK economy facing major risks The UK manufact...,business,uk economy facing major risk the uk manufactur...,-0.368771,0.140649,0.088472,0.021023,0.443825,-0.061287,-0.286586,0.539613,0.303215,0.001286
1,2,Aids and climate top Davos agenda Climate cha...,business,aid and climate top davos agenda climate chang...,-0.419259,0.070178,0.138067,-0.007298,0.547702,-0.148632,-0.261452,0.35081,0.175346,-0.051671
2,3,Asian quake hits European shares Shares in Eu...,business,asian quake hit european share share in europe...,-0.376187,0.177674,0.145735,0.042862,0.485289,0.022764,-0.256666,0.49182,0.283088,0.007006
3,4,India power shares jump on debut Shares in In...,business,india power share jump on debut share in india...,-0.302412,0.191225,0.093895,0.012854,0.595289,-0.010114,-0.180129,0.466709,0.306224,0.04517
4,5,Lacroix label bought by US firm Luxury goods ...,business,lacroix label bought by u firm luxury good gro...,-0.178043,0.146726,0.135657,-0.064135,0.651315,-0.080695,-0.196958,0.340326,0.142488,-0.037439


In [14]:
post_df.to_csv(CONFIG['datasets_folder'] + '/post_data_lemmatized_and_embs.csv', index=False)

## Обучим катбуст

In [15]:
# Соединим исходный датасет действий пользователей с полученными эмбеддингами текстов

data = pd.read_csv(CONFIG['datasets_folder'] + '/processed_df.csv')
post_df = pd.read_csv(CONFIG['datasets_folder'] + '/post_data_lemmatized_and_embs.csv').drop(['topic'], axis=1)
data = data.merge(post_df, on='post_id')

text_w2v_cols = [f'text_w2v_{i}' for i in range(1, 11)]

columns_order = [
    'post_id', 'topic', 'tfidf_sum', 
    'tfidf_mean', 'tfidf_max', 'user_id',
    'gender', 'age', 'country', 'city', 
    'exp_group', 'os', 'source','month', 
    'hour', 'day', 'weekday', 'timestamp', 'target'
] + text_w2v_cols
data = data[columns_order]

In [16]:
X = data.drop(['timestamp', 'target', 'user_id', 'post_id'], axis=1)
y = data['target']

In [18]:
# Обучим катбуст

categorical_features = ['country', 'city', 'topic']
catboost_model = CatBoostClassifier(
    cat_features=categorical_features,
    depth=2,
    iterations=250,
    verbose=False,
    random_seed=42,
)

catboost_model.fit(X, y, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x2087a67e1d0>

In [19]:
# Грубо оценим качество обученной модели

print(f"Качество: {catboost_model.score(X, y)}")
classification_report(y, catboost_model.predict(X), output_dict=True)

Качество: 0.6040527398418069


{'0': {'precision': 0.6131435330731188,
  'recall': 0.5754138272111725,
  'f1-score': 0.5936798320124734,
  'support': 2289905},
 '1': {'precision': 0.5959336612093622,
  'recall': 0.6330029113334525,
  'f1-score': 0.6139092137629217,
  'support': 2265285},
 'accuracy': 0.6040527398418069,
 'macro avg': {'precision': 0.6045385971412405,
  'recall': 0.6042083692723126,
  'f1-score': 0.6037945228876975,
  'support': 4555190},
 'weighted avg': {'precision': 0.6045851053050367,
  'recall': 0.6040527398418069,
  'f1-score': 0.6037398547642279,
  'support': 4555190}}

In [20]:
print(f"Качество на трейне: {roc_auc_score(y, catboost_model.predict_proba(X)[:, 1])}")

Качество на трейне: 0.6461308174783567


In [21]:
catboost_model.save_model(CONFIG['data_folder'] + '/catboost_model_with_text_embs', format="cbm")