In [1]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")
# NOTE(review): gc is not referenced in the cells visible here.
import gc

In [2]:
import pandas as pd
from sqlalchemy import create_engine

def batch_load_sql(query: str) -> pd.DataFrame:
    """Run ``query`` against the PostgreSQL database and return all rows as one frame.

    The result is read in chunks of ``chunksize`` rows over a connection opened
    with ``stream_results=True`` and the chunks are concatenated with a fresh
    0..n-1 index.

    The connection URL is read from the ``STARTML_DB_URL`` environment variable
    when it is set; otherwise the URL that was historically hard-coded here is used.

    Args:
        query: SQL text handed to ``pandas.read_sql`` unchanged.

    Returns:
        A single DataFrame containing every chunk produced by the query.
    """
    import os  # local import: this cell does not import os at module level

    chunksize = 200000
    # NOTE(review): credentials embedded in source are a security risk; prefer
    # setting STARTML_DB_URL and removing the fallback once callers are migrated.
    db_url = os.environ.get(
        "STARTML_DB_URL",
        "postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
        "postgres.lab.karpov.courses:6432/startml",
    )
    engine = create_engine(db_url)
    try:
        conn = engine.connect().execution_options(stream_results=True)
        try:
            # Materialize all chunks while the connection is still open.
            chunks = list(pd.read_sql(query, conn, chunksize=chunksize))
        finally:
            # Close the connection even when reading fails part-way through.
            conn.close()
    finally:
        # A new engine is created on every call, so release its pool explicitly.
        engine.dispose()
    return pd.concat(chunks, ignore_index=True)


In [3]:
# Load the whole post_text_df table into memory and preview its first rows.
post_text_df = batch_load_sql('SELECT * FROM post_text_df')
post_text_df.head()

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business


In [4]:
# Show which columns the loaded posts table has.
post_text_df.columns

Index(['post_id', 'text', 'topic'], dtype='object')

In [5]:
### Build embeddings for the posts

from transformers import AutoTokenizer
from transformers import BertModel  # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import \
    RobertaModel  # https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel
from transformers import \
    DistilBertModel  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel


def get_model(model_name):
    """Return a ``(tokenizer, model)`` pair of pretrained objects for ``model_name``.

    ``model_name`` must be ``'bert'``, ``'roberta'`` or ``'distilbert'``; any
    other value fails the assertion below. The tokenizer is loaded first, then
    the model, both from the same checkpoint id.
    """
    supported_names = ('bert', 'roberta', 'distilbert')
    assert model_name in supported_names

    # name -> (Hugging Face checkpoint id, model class)
    registry = {
        'bert': ('bert-base-cased', BertModel),  # https://huggingface.co/bert-base-cased
        'roberta': ('roberta-base', RobertaModel),  # https://huggingface.co/roberta-base
        'distilbert': ('distilbert-base-cased', DistilBertModel),  # https://huggingface.co/distilbert-base-cased
    }
    checkpoint, model_cls = registry[model_name]

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    pretrained_model = model_cls.from_pretrained(checkpoint)
    return tokenizer, pretrained_model

In [6]:
# Load the DistilBERT tokenizer and model.
# NOTE(review): the saved output under this cell mentions RobertaModel, so it appears
# to come from an earlier run with a different model name — re-run to refresh it.
tokenizer, model = get_model('distilbert')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
### Build a dataset for the posts

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding


class PostDataset(Dataset):
    """Dataset over texts that are tokenized once, in the constructor.

    All texts are encoded in a single ``batch_encode_plus`` call that requests
    PyTorch tensors with truncation and padding enabled; items are then served
    by indexing into the stored encoding.
    """

    def __init__(self, texts, tokenizer):
        super().__init__()

        # Options forwarded to the tokenizer for the one-off batch encoding.
        encode_options = dict(
            add_special_tokens=True,
            return_token_type_ids=False,
            return_tensors='pt',
            truncation=True,
            padding=True,
        )
        self.texts = tokenizer.batch_encode_plus(texts, **encode_options)
        self.tokenizer = tokenizer

    def __len__(self):
        # The number of items equals the number of encoded input_ids rows.
        return len(self.texts['input_ids'])

    def __getitem__(self, idx):
        # Return the idx-th row of each field the model consumes.
        encoded = self.texts
        return {field: encoded[field][idx] for field in ('input_ids', 'attention_mask')}


# Tokenize the 'text' column of the posts frame into a dataset.
dataset = PostDataset(post_text_df['text'].values.tolist(), tokenizer)

# Collator that assembles individual items into a batch using the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Batches of 32; shuffle=False keeps batches in the same order as the input texts.
loader = DataLoader(dataset, batch_size=32, collate_fn=data_collator, pin_memory=True, shuffle=False)

In [8]:
import torch

# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Show which device was selected.
print(device)

# Move the model to the selected device.
model = model.to(device)

cpu


In [9]:
import torch
from tqdm import tqdm


@torch.inference_mode()
def get_embeddings_labels(model, loader):
    """Embed every batch from ``loader`` with ``model`` and stack the results.

    For each batch the model is called with ``input_ids`` and ``attention_mask``
    and the slice ``[:, 0, :]`` of ``last_hidden_state`` (sequence position 0)
    is kept as the embedding. Despite the name, only embeddings are returned.

    Inputs are moved to the device of the model's first parameter, so the
    function does not depend on a notebook-level ``device`` variable; a model
    without parameters is fed CPU tensors.

    Args:
        model: module exposing ``eval()``, ``parameters()`` and a call whose
            result supports ``['last_hidden_state']``.
        loader: iterable of batches holding ``'input_ids'`` and
            ``'attention_mask'`` tensors.

    Returns:
        A CPU tensor: the per-batch embeddings concatenated along dim 0.
        Note that ``torch.cat`` fails if ``loader`` yields no batches.
    """
    model.eval()

    # Follow the model's own placement instead of a global set in another cell.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    total_embeddings = []

    for batch in tqdm(loader):
        batch = {key: batch[key].to(target_device) for key in ['attention_mask', 'input_ids']}

        embeddings = model(**batch)['last_hidden_state'][:, 0, :]

        # Collect results on the CPU so they can be concatenated at the end.
        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0)

In [10]:
import string
import re
import nltk

# Download the WordNet corpus (see the saved output for the target directory).
nltk.download('wordnet')
# Shared lemmatizer instance; it is the default `token` argument of preprocessing().
wnl = nltk.stem.WordNetLemmatizer()

def preprocessing(line, token=wnl):
    """Normalize one text: lowercase, blank out punctuation, flatten newlines, lemmatize.

    Args:
        line: input text.
        token: object with a ``lemmatize(word)`` method; defaults to the
            module-level ``wnl`` lemmatizer.

    Returns:
        The processed text. Words are split and re-joined on a single space,
        so runs of spaces in the intermediate text are preserved.
    """
    line = line.lower()
    # Build a real character class from string.punctuation. The former pattern
    # r'{string.punctuation}' was a plain literal (no f-string, no brackets) and
    # therefore never matched any punctuation. re.escape keeps characters such
    # as ']', '\\', '^' and '-' literal inside the class.
    line = re.sub(f'[{re.escape(string.punctuation)}]', ' ', line)
    line = line.replace('\n\n', ' ').replace('\n', ' ')
    line = ' '.join([token.lemmatize(word) for word in line.split(' ')])
    return line

[nltk_data] Downloading package wordnet to /Users/zkv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
# Embed all posts and convert the resulting tensor to a NumPy array.
# NOTE(review): the saved output shows this cell ended with KeyboardInterrupt, so
# `embeddings` may be undefined in the saved session — re-run before the cells below.
embeddings = get_embeddings_labels(model, loader).numpy()

embeddings

  0%|          | 0/220 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  1%|▏         | 3/220 [00:19<23:13,  6.42s/it]


KeyboardInterrupt: 

In [None]:
### Try to cluster the texts

from sklearn.decomposition import PCA

# Center every embedding dimension on its own mean (axis=0). A bare `.mean()`
# is one scalar over the whole matrix and does not center the columns.
centered = embeddings - embeddings.mean(axis=0)

# Reduce to 50 components. random_state fixes the result across runs in case
# scikit-learn selects a randomized SVD solver for this input size.
pca = PCA(n_components=50, random_state=0)
pca_decomp = pca.fit_transform(centered)

In [None]:
from sklearn.cluster import KMeans

# Number of clusters to fit on the PCA-reduced embeddings.
n_clusters = 15

# Fixed seed so the clustering is repeatable.
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(pca_decomp)

# Attach each post's cluster label to the posts frame (adds/overwrites a column).
post_text_df['TextCluster'] = kmeans.labels_

# One column name per cluster, numbered 1..n_clusters.
dists_columns = ['dist_to_cluster_' + str(number) for number in range(1, n_clusters + 1)]

# Output of kmeans.transform for every post, one column per cluster.
dists_df = pd.DataFrame(kmeans.transform(pca_decomp), columns=dists_columns)

dists_df.head()

In [None]:
# Append the distance columns to the posts frame. Any dist_to_cluster_* columns
# left by a previous run of this cell are dropped first, so re-executing the
# cell does not create duplicate columns.
post_text_df = pd.concat(
    (post_text_df.drop(columns=dists_columns, errors='ignore'), dists_df),
    axis=1,
)

post_text_df.head()

In [None]:
# Training rows: 'view' actions from feed_data joined with user_data on user_id.
# hour and month are extracted from `timestamp` and cast to INT; at most
# 10,000,000 rows are fetched.
# NOTE(review): LIMIT is used without ORDER BY, so which rows are returned is
# left to the database — confirm this sampling is acceptable.
feed_data_query = """
    SELECT 
        post_id,
        age,
        city,
        country,
        exp_group,
        gender,
        os,
        source,
        CAST(EXTRACT(HOUR FROM timestamp) AS INT) AS hour,
        CAST(EXTRACT(MONTH FROM timestamp) AS INT) AS month,
        target
    FROM 
        feed_data AS f
    JOIN  
        user_data AS u USING (user_id)
    WHERE action = 'view'
    LIMIT 10000000       
"""
feed_data_df = batch_load_sql(feed_data_query)
feed_data_df.head()

In [None]:
# Left-join the post features onto every feed row by post_id.
# The result rebinds feed_data_df, so this cell is meant to run once per load.
feed_data_df = feed_data_df.merge(post_text_df, on='post_id', how='left')
feed_data_df.head()

In [None]:
# Feature column order used for training: post features first
# (topic, cluster label, distances to clusters 1..15), then user/context features.
cols_order = (
    ['topic', 'TextCluster']
    + [f'dist_to_cluster_{number}' for number in range(1, 16)]
    + ['age', 'city', 'country', 'exp_group', 'gender', 'os', 'source', 'hour', 'month']
)

In [None]:
# Split the merged frame into the label column and the model inputs,
# with the inputs arranged in cols_order.
target = feed_data_df.target
features = feed_data_df[cols_order]
features.head()

In [None]:
from catboost import CatBoostClassifier

# Columns passed to CatBoost as categorical features.
object_cols = ['topic', 'TextCluster', 'age', 'city', 'country', 
               'exp_group', 'gender', 'os', 'source', 'hour', 'month']

# `device` is a torch.device object rather than a string, so check its `.type`
# instead of comparing it with the literal 'cuda:0'; otherwise the GPU branch
# may never be taken even when CUDA was selected.
task_type = 'GPU' if device.type == 'cuda' else None

catboost = CatBoostClassifier(
    iterations=200,
    learning_rate=1,
    depth=2,
    random_seed=1,
    thread_count=-1,
    task_type=task_type
)

catboost.fit(X=features, y=target, cat_features=object_cols)

In [None]:
import os

# Create the output directory first so saving also works on a fresh checkout
# where ./model does not exist yet.
os.makedirs('./model', exist_ok=True)
catboost.save_model('./model/catboost_model_june.cbm', format='cbm')

In [None]:
# Preview the posts frame before it is written to the database.
post_text_df.head()

In [None]:
# Row and column counts of the frame that is about to be uploaded.
post_text_df.shape

In [None]:
import os

# Connection URL: taken from STARTML_DB_URL when set, otherwise the URL that was
# historically hard-coded here.
# NOTE(review): credentials embedded in source are a security risk; prefer the
# environment variable and drop the fallback once it is configured everywhere.
db_url = os.environ.get(
    "STARTML_DB_URL",
    "postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
    "postgres.lab.karpov.courses:6432/startml",
)

# Write the featured posts table, replacing any existing table of the same name.
post_text_df.to_sql(name='n_koren_3_posts_featured_df_june',
                    con=db_url,
                    if_exists='replace',
                    index=False)

In [None]:
import os

# Connection URL: taken from STARTML_DB_URL when set, otherwise the URL that was
# historically hard-coded here.
# NOTE(review): credentials embedded in source are a security risk; prefer the
# environment variable and drop the fallback once it is configured everywhere.
db_url = os.environ.get(
    "STARTML_DB_URL",
    "postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
    "postgres.lab.karpov.courses:6432/startml",
)

# Read the uploaded table back to verify the write.
ddff = pd.read_sql("SELECT * FROM n_koren_3_posts_featured_df_june",
                   con=db_url)

In [None]:
# Preview the table that was read back from the database.
ddff.head()

In [3]:
# Scratch check: dropping the last four characters removes the ".sdf" suffix.
filename = 'dddfdf.sdf'
filename[:len(filename) - 4]

'dddfdf'