In [None]:
import numpy as np
import pandas as pd
import torch
import transformers
from tqdm import notebook
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Load the news corpus and keep a fixed 400-row sample (seeded, so the same
# rows are drawn on every run); the index is reset to 0..399.
df_news = pd.read_csv('/datasets/news_data.csv')
df_news = df_news.sample(400, random_state=42).reset_index(drop=True)

# Tokenizer built from the vocabulary file stored alongside the model files.
tokenizer = transformers.BertTokenizer(
    vocab_file='/datasets/ds_bert/vocab.txt')

# Every text is truncated or padded to exactly this many token ids.
# NOTE(review): presumably matches the model's maximum sequence length --
# confirm against bert_config.json.
max_len = 512

# One fixed-length array of token ids per article. Special tokens are added
# by the tokenizer, so position 0 of every row is the leading special token.
tokenized = df_news['text'].apply(
    lambda x: np.array(
        tokenizer.encode(
            x,
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
            padding='max_length'
        )
    )
)

# Stack the per-row arrays into a single (n_texts, max_len) integer matrix.
padded = np.array(tokenized.values.tolist())

# 1 for real tokens, 0 for padding. Compare against the tokenizer's own pad
# id instead of a literal 0, so the mask stays correct even if [PAD] is not
# the first entry of the vocabulary file.
attention_mask = np.where(padded != tokenizer.pad_token_id, 1, 0)

# Build the BERT encoder from the local config and weight files.
config = transformers.BertConfig.from_json_file(
    '/datasets/ds_bert/bert_config.json')
model = transformers.BertModel.from_pretrained(
    '/datasets/ds_bert/rubert_model.bin', config=config)

# Number of texts pushed through the encoder per forward pass.
batch_size = 100
embeddings = []

# Walk the matrix by start offset rather than by "number of full batches":
# slicing past the end simply yields a shorter final batch, so no rows are
# dropped when the sample size is not a multiple of batch_size (dropping them
# would leave `features` shorter than the label column).
for start in notebook.tqdm(range(0, padded.shape[0], batch_size)):
    stop = start + batch_size
    batch = torch.LongTensor(padded[start:stop])
    attention_mask_batch = torch.LongTensor(attention_mask[start:stop])

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        batch_embeddings = model(batch, attention_mask=attention_mask_batch)

    # Output 0 is the per-token hidden-state tensor; keep only position 0 of
    # each sequence as the fixed-size representation of the whole text.
    embeddings.append(batch_embeddings[0][:, 0, :].numpy())

# One row of features per input text, in the same order as `padded`.
features = np.concatenate(embeddings)
# Class labels: the rubric column of the sampled articles.
target = df_news['rubric']

# Hold out 200 samples for evaluation; the seed keeps the split reproducible.
(train_features, test_features,
 train_target, test_target) = train_test_split(
    features,
    target,
    random_state=42,
    test_size=200,
)

# NOTE: `model` is rebound here -- from this point on it refers to the
# logistic-regression classifier, not the BERT encoder used earlier.
model = LogisticRegression().fit(train_features, train_target)

# Mean accuracy on the held-out samples.
test_accuracy = model.score(test_features, test_target)
print(test_accuracy)