In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from catboost import CatBoostClassifier, Pool

from transformers import AutoTokenizer
from transformers import DistilBertModel

In [4]:
# Load the user table from the mounted Google Drive folder and preview the first rows.
# The preview shows one row per user_id with gender, age, country, city, exp_group, os and source.
user_data = pd.read_csv('drive//MyDrive//Colab Notebooks//data//users.csv')
user_data.head()

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source
0,200,1,34,Russia,Degtyarsk,3,Android,ads
1,201,0,37,Russia,Abakan,0,Android,ads
2,202,1,17,Russia,Smolensk,4,Android,ads
3,203,0,18,Russia,Moscow,1,iOS,ads
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads


In [5]:
# Load the post table (post_id, raw text, topic) and preview the first rows.
# The `text` column is what gets embedded further down in the notebook.
post_data = pd.read_csv('drive//MyDrive//Colab Notebooks//data//posts.csv')
post_data.head()

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business


In [6]:
# Load the feed log: one row per (timestamp, user_id, post_id, action) with the
# binary `target` column that the classifier is trained to predict.
feed_data = pd.read_csv('drive//MyDrive//Colab Notebooks//data//feeds.csv')
feed_data.head()

Unnamed: 0,timestamp,user_id,post_id,action,target
0,2021-10-14 12:45:21,110257,652,view,0
1,2021-10-31 18:54:52,5320,3811,view,0
2,2021-11-17 18:21:38,107771,5536,view,0
3,2021-11-21 17:02:22,83907,2736,view,0
4,2021-10-13 18:35:53,303,6491,view,0


Поработаем с постами и топиками из таблицы post_data

In [7]:
from transformers import AutoTokenizer
from transformers import BertModel
from transformers import RobertaModel
from transformers import DistilBertModel


def get_model(model_name):
    """Return a ``(tokenizer, model)`` pair for one of the supported encoders.

    Args:
        model_name: one of ``'bert'``, ``'roberta'`` or ``'distilbert'``.

    Returns:
        Tuple of the tokenizer created by ``AutoTokenizer.from_pretrained`` and
        the model created by the matching model class' ``from_pretrained``,
        both loaded from the same checkpoint name.

    Raises:
        AssertionError: if ``model_name`` is not one of the supported names.
    """
    assert model_name in ('bert', 'roberta', 'distilbert')

    # Short name -> (checkpoint identifier, model class).
    registry = {
        'bert': ('bert-base-cased', BertModel),
        'roberta': ('roberta-base', RobertaModel),
        'distilbert': ('distilbert-base-cased', DistilBertModel),
    }
    checkpoint, model_cls = registry[model_name]

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = model_cls.from_pretrained(checkpoint)
    return tokenizer, model

In [8]:
# Load the cased DistilBERT checkpoint together with its tokenizer.
# NOTE(review): get_model('distilbert'), defined in the previous cell, maps to
# this same checkpoint name and model class, so this cell duplicates it.
model_name = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = DistilBertModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Сделаем датасет для постов

In [9]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

class PostDataset(Dataset):
    """Dataset that tokenises all post texts once, up front.

    Each item is a dict with the ``'input_ids'`` and ``'attention_mask'``
    entries of a single text, taken from the batch encoding stored in
    ``self.texts``.
    """

    def __init__(self, texts, tokenizer):
        """Encode ``texts`` with ``tokenizer.batch_encode_plus`` and keep the result.

        Args:
            texts: collection of raw text strings.
            tokenizer: object exposing ``batch_encode_plus``.
        """
        super().__init__()

        # NOTE(review): padding=True is applied to the whole corpus in one call,
        # so presumably every item is already padded to the same length and a
        # padding collator downstream has nothing left to pad — confirm.
        encode_options = dict(
            add_special_tokens=True,
            return_token_type_ids=False,
            return_tensors='pt',
            truncation=True,
            padding=True,
        )
        self.texts = tokenizer.batch_encode_plus(texts, **encode_options)
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of encoded texts."""
        return len(self.texts['input_ids'])

    def __getitem__(self, idx):
        """Return the token ids and attention mask of the text at position ``idx``."""
        encoded = self.texts
        return {field: encoded[field][idx] for field in ('input_ids', 'attention_mask')}


# Tokenise every post text once and wrap the result in a dataset.
post_texts = post_data['text'].values.tolist()
dataset = PostDataset(post_texts, tokenizer)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# shuffle=False keeps batches in post_data row order; the embeddings are later
# attached back to post_data by position.
loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=False,
    pin_memory=True,
    collate_fn=data_collator,
)

In [10]:
import torch
from tqdm import tqdm


@torch.inference_mode()
def get_embeddings_labels(model, loader, device=None):
    """Compute one embedding vector per sample for every batch in ``loader``.

    Despite the name, only embeddings are returned (no labels).

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            whose result exposes a ``'last_hidden_state'`` entry.
        loader: iterable of batches; each batch must provide
            ``'attention_mask'`` and ``'input_ids'`` tensors.
        device: device the batches are moved to. When ``None`` (default) the
            device of the model's first parameter is used, so the function no
            longer depends on a notebook-level ``device`` variable that is only
            defined in a later cell.

    Returns:
        CPU tensor obtained by concatenating, along dim 0, the position-0 slice
        of ``last_hidden_state`` for every batch.
    """
    model.eval()

    if device is None:
        first_param = next(model.parameters(), None)
        # A parameter-free model has no device of its own; stay on the CPU.
        device = first_param.device if first_param is not None else torch.device('cpu')

    total_embeddings = []

    for batch in tqdm(loader):
        # Forward only the two tensors the model consumes; drop anything else.
        batch = {key: batch[key].to(device) for key in ('attention_mask', 'input_ids')}

        # Keep the hidden state at sequence position 0 for each sample.
        embeddings = model(**batch)['last_hidden_state'][:, 0, :]

        # Move results off the accelerator right away so its memory is not held.
        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0)

In [11]:
# Prefer the first CUDA GPU and fall back to the CPU when none is available.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(device)
# The GPU name can only be queried when CUDA is usable; calling it on a
# CPU-only runtime raises, which would defeat the fallback above.
if device.type == 'cuda':
    print(torch.cuda.get_device_name())

model = model.to(device)

cuda:0
Tesla T4


In [12]:
# Embed every post: one row per post, taken from position 0 of the model's
# last hidden state (see get_embeddings_labels), converted to a NumPy array.
embeddings = get_embeddings_labels(model, loader).numpy()

embeddings

100%|██████████| 220/220 [01:50<00:00,  1.99it/s]


array([[ 3.6315086e-01,  4.8937496e-02, -2.6408118e-01, ...,
        -1.4159346e-01,  1.5918216e-02,  9.1982896e-05],
       [ 2.3641640e-01, -1.5950108e-01, -3.2779828e-01, ...,
        -2.8993604e-01,  1.1936528e-01, -1.6235473e-03],
       [ 3.7519148e-01, -1.1394388e-01, -2.4054705e-01, ...,
        -3.3891949e-01,  5.8694065e-02, -2.1265799e-02],
       ...,
       [ 3.4038273e-01,  6.6492192e-02, -1.6318429e-01, ...,
        -8.6562753e-02,  2.0340374e-01,  3.2090571e-02],
       [ 4.3209219e-01,  1.1091532e-02, -1.1730607e-01, ...,
         7.5401559e-02,  1.0273975e-01,  1.5274222e-02],
       [ 3.0427766e-01, -7.6215670e-02, -6.7758739e-02, ...,
        -5.4348916e-02,  2.4438348e-01, -1.4148588e-02]], dtype=float32)

In [13]:
# Text clustering, step 1: project the embeddings onto 50 principal components.
from sklearn.decomposition import PCA

# Centre every embedding dimension on its own mean. axis=0 is required for
# that: a bare .mean() is a single scalar over the whole matrix and leaves the
# individual columns uncentred.
centered = embeddings - embeddings.mean(axis=0)

# random_state keeps the projection reproducible across runs if a randomized
# SVD solver is selected for this input size.
pca = PCA(n_components=50, random_state=0)
pca_decomp = pca.fit_transform(centered)

In [15]:
from sklearn.cluster import KMeans

n_clusters = 15

# Cluster the PCA-reduced post embeddings.
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(pca_decomp)

# Cluster id of every post, attached by row position.
post_data['TextCluster'] = kmeans.labels_

# One column per cluster holding the output of kmeans.transform for that post.
dists_columns = ['DistanceToCluster_' + str(cluster_id) for cluster_id in range(n_clusters)]
dists_df = pd.DataFrame(kmeans.transform(pca_decomp), columns=dists_columns)

dists_df.head()



Unnamed: 0,DistanceToCluster_0,DistanceToCluster_1,DistanceToCluster_2,DistanceToCluster_3,DistanceToCluster_4,DistanceToCluster_5,DistanceToCluster_6,DistanceToCluster_7,DistanceToCluster_8,DistanceToCluster_9,DistanceToCluster_10,DistanceToCluster_11,DistanceToCluster_12,DistanceToCluster_13,DistanceToCluster_14
0,2.353855,3.625337,3.411175,3.371418,2.836325,3.005784,3.441464,3.384828,1.809452,3.470074,3.459047,2.221048,1.952203,3.420595,3.753656
1,2.310817,3.362058,3.221624,3.325222,2.560121,2.853049,2.978178,3.371665,1.421095,3.246447,3.141553,2.239701,2.197514,3.328099,3.553338
2,2.386565,3.365002,3.287446,3.26821,2.887404,3.039246,2.967098,3.499609,1.706188,3.393992,3.14052,3.0397,1.823302,3.348239,3.560918
3,2.813685,3.798079,3.697577,3.513192,3.379107,3.282822,3.716256,3.74227,2.469405,4.063752,3.793526,3.397228,2.437142,3.730276,2.981264
4,2.034604,3.053057,2.84655,3.034991,2.144238,2.645245,2.639252,2.802445,2.128224,3.239778,2.777034,2.935046,1.466523,2.816569,3.332887


In [17]:
# Append the per-cluster distance columns side by side (aligned on the row
# index) and drop the raw text column in the same expression.
post_data = pd.concat([post_data, dists_df], axis=1).drop(columns=["text"])

post_data.head()

Unnamed: 0,post_id,topic,TextCluster,DistanceToCluster_0,DistanceToCluster_1,DistanceToCluster_2,DistanceToCluster_3,DistanceToCluster_4,DistanceToCluster_5,DistanceToCluster_6,DistanceToCluster_7,DistanceToCluster_8,DistanceToCluster_9,DistanceToCluster_10,DistanceToCluster_11,DistanceToCluster_12,DistanceToCluster_13,DistanceToCluster_14
0,1,business,8,2.353855,3.625337,3.411175,3.371418,2.836325,3.005784,3.441464,3.384828,1.809452,3.470074,3.459047,2.221048,1.952203,3.420595,3.753656
1,2,business,8,2.310817,3.362058,3.221624,3.325222,2.560121,2.853049,2.978178,3.371665,1.421095,3.246447,3.141553,2.239701,2.197514,3.328099,3.553338
2,3,business,8,2.386565,3.365002,3.287446,3.26821,2.887404,3.039246,2.967098,3.499609,1.706188,3.393992,3.14052,3.0397,1.823302,3.348239,3.560918
3,4,business,12,2.813685,3.798079,3.697577,3.513192,3.379107,3.282822,3.716256,3.74227,2.469405,4.063752,3.793526,3.397228,2.437142,3.730276,2.981264
4,5,business,12,2.034604,3.053057,2.84655,3.034991,2.144238,2.645245,2.639252,2.802445,2.128224,3.239778,2.777034,2.935046,1.466523,2.816569,3.332887


In [18]:
# Enrich every feed row with the attributes of its user; the left join keeps
# all feed rows even when a user_id has no match in user_data.
df = feed_data.merge(user_data, how='left', on='user_id')

In [19]:
# Add the post features (topic, text cluster, cluster distances) to every feed
# row; the left join keeps all feed rows.
df = df.merge(post_data, how='left', on='post_id')

In [28]:
# Parse the timestamp strings once, then derive calendar features from them.
timestamps = pd.to_datetime(df['timestamp'])
df['timestamp'] = timestamps
df['month'] = timestamps.dt.month
df['day'] = timestamps.dt.day  # day of the month

In [29]:
# Remove the identifier, raw-action and raw-timestamp columns so that they are
# not used as model features.
df = df.drop(columns=['action', 'user_id', 'post_id', 'timestamp'])

In [27]:
# NOTE(review): this cell carries execution count In [27], i.e. it ran before
# In [28]/In [29]. Its saved output still lists timestamp, user_id, post_id and
# action and has no month/day columns, so it is stale relative to the current df.
df.dtypes

timestamp                object
user_id                   int64
post_id                   int64
action                   object
target                    int64
gender                    int64
age                       int64
country                  object
city                     object
exp_group                 int64
os                       object
source                   object
topic                    object
TextCluster               int32
DistanceToCluster_0     float32
DistanceToCluster_1     float32
DistanceToCluster_2     float32
DistanceToCluster_3     float32
DistanceToCluster_4     float32
DistanceToCluster_5     float32
DistanceToCluster_6     float32
DistanceToCluster_7     float32
DistanceToCluster_8     float32
DistanceToCluster_9     float32
DistanceToCluster_10    float32
DistanceToCluster_11    float32
DistanceToCluster_12    float32
DistanceToCluster_13    float32
DistanceToCluster_14    float32
dtype: object

In [30]:
# Every object-dtype (string) column is passed to CatBoost as categorical.
# NOTE(review): integer-coded columns such as TextCluster are not selected
# here, so they are presumably treated as numeric — confirm this is intended.
cat_features = df.select_dtypes(include=['object']).columns.tolist()
cat_features

['country', 'city', 'os', 'source', 'topic']

In [32]:
from sklearn.model_selection import train_test_split

# Separate the label from the features, then hold out 33% of the rows for
# evaluation; random_state fixes the split.
y = df['target']
X = df.drop(columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

In [34]:
X_train.columns

Index(['gender', 'age', 'country', 'city', 'exp_group', 'os', 'source',
       'topic', 'TextCluster', 'DistanceToCluster_0', 'DistanceToCluster_1',
       'DistanceToCluster_2', 'DistanceToCluster_3', 'DistanceToCluster_4',
       'DistanceToCluster_5', 'DistanceToCluster_6', 'DistanceToCluster_7',
       'DistanceToCluster_8', 'DistanceToCluster_9', 'DistanceToCluster_10',
       'DistanceToCluster_11', 'DistanceToCluster_12', 'DistanceToCluster_13',
       'DistanceToCluster_14', 'month', 'day'],
      dtype='object')

In [39]:
# Hyper-parameters of the second model version, gathered in one place.
# NOTE(review): learning_rate=1 is unusually high for gradient boosting —
# confirm that it is intentional.
catboost_params = {
    'iterations': 500,
    'learning_rate': 1,
    'depth': 2,
    'random_seed': 111,
    'thread_count': -1,
    'task_type': "GPU",
    'cat_features': cat_features,
}
catboost = CatBoostClassifier(**catboost_params)

catboost.fit(X_train, y_train)

0:	learn: 0.3387840	total: 275ms	remaining: 2m 16s
1:	learn: 0.3343532	total: 464ms	remaining: 1m 55s
2:	learn: 0.3324897	total: 647ms	remaining: 1m 47s
3:	learn: 0.3315867	total: 834ms	remaining: 1m 43s
4:	learn: 0.3312603	total: 1.01s	remaining: 1m 39s
5:	learn: 0.3311613	total: 1.18s	remaining: 1m 36s
6:	learn: 0.3309282	total: 1.38s	remaining: 1m 37s
7:	learn: 0.3304090	total: 1.57s	remaining: 1m 36s
8:	learn: 0.3303385	total: 1.83s	remaining: 1m 39s
9:	learn: 0.3302968	total: 2.1s	remaining: 1m 42s
10:	learn: 0.3292041	total: 2.37s	remaining: 1m 45s
11:	learn: 0.3291093	total: 2.71s	remaining: 1m 50s
12:	learn: 0.3290454	total: 3s	remaining: 1m 52s
13:	learn: 0.3289032	total: 3.26s	remaining: 1m 53s
14:	learn: 0.3287552	total: 3.45s	remaining: 1m 51s
15:	learn: 0.3286732	total: 3.63s	remaining: 1m 49s
16:	learn: 0.3284553	total: 3.8s	remaining: 1m 47s
17:	learn: 0.3283352	total: 3.97s	remaining: 1m 46s
18:	learn: 0.3283185	total: 4.14s	remaining: 1m 44s
19:	learn: 0.3282307	total:

<catboost.core.CatBoostClassifier at 0x7ff6a40eaf80>

In [40]:
# Check how well the model performs: predicted class labels for the hold-out set.
pred = catboost.predict(X_test)
pred

array([0, 0, 0, ..., 0, 0, 0])

In [41]:
# Decision threshold applied to the positive-class probability; it is lower
# than 0.5 so that more rows are labelled positive.
DECISION_THRESHOLD = 0.3

# Column 1 of predict_proba is the probability of the positive class.
positive_proba = catboost.predict_proba(X_test)[:, 1]

# Vectorised thresholding replaces the element-by-element Python loop.
# NOTE: despite its name, pred_proba holds hard 0./1. labels from here on;
# the name is kept because later cells refer to it.
pred_proba = (positive_proba >= DECISION_THRESHOLD).astype(float)

pred_proba

array([0., 0., 0., ..., 0., 0., 0.])

In [42]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

# F1 for the default-threshold labels and for the 0.3-threshold labels, in that order.
tuple(f1_score(y_test, labels) for labels in (pred, pred_proba))

(1.1361828799963642e-05, 0.006318619762491598)

In [43]:
# (accuracy, precision, recall) of the default-threshold predictions.
tuple(metric(y_test, pred) for metric in (accuracy_score, precision_score, recall_score))

(0.8933175757575758, 0.5, 5.680978946292025e-06)

In [44]:
# (accuracy, precision, recall) of the 0.3-threshold predictions.
tuple(metric(y_test, pred_proba) for metric in (accuracy_score, precision_score, recall_score))

(0.8924896969696969, 0.22614274258219727, 0.003204072125708702)

Сравним с работой первой модели для pred_proba

Precision (0.1859 против 0.2261), recall (0.0004 против 0.0032) и f1_score (0.0008 против 0.0063) у новой модели выше, чем у первой.

Сохраним модель в форматах .cbm и .pkl




In [45]:
# Persist the trained model in CatBoost's native "cbm" format; the file name is
# used exactly as given (no extension is added here).
catboost.save_model('catboost_model_v2', format="cbm")

In [46]:
import pickle
filename = 'sklearn_model_v2.pkl'

# Context managers guarantee the file is flushed and closed before it is read
# back, instead of leaving the handles to be closed by garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(catboost, model_file)

# Round-trip check: reload the model that was just written.
# NOTE: pickle.load executes arbitrary code; only load files you created yourself.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)