# Классификация на эмбеддингах

In [1]:
import numpy as np
import pandas as pd
import torch
import transformers
from tqdm import notebook
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import random
from sklearn.metrics import accuracy_score

In [2]:
# Load the tweets and keep a 400-row random subset (sampling without
# replacement) so the BERT forward pass below stays cheap.
# random_state is fixed so that a fresh kernel draws the same rows every time;
# without it each run works on a different subset.
df_tweets = pd.read_csv('/datasets/tweets.csv')
df_tweets = df_tweets.sample(n=400, replace=False, axis=0, random_state=42)

In [3]:
# Build the tokenizer from the local vocabulary file.
tokenizer = transformers.BertTokenizer(vocab_file='/datasets/ds_bert/vocab.txt')

# Encode each tweet's text into a list of token ids, special tokens included.
tokenized = df_tweets['text'].apply(
    lambda text: tokenizer.encode(text, add_special_tokens=True)
)

# Length of the longest encoded tweet (0 when there are no tweets).
max_len = max((len(token_ids) for token_ids in tokenized.values), default=0)

# Right-pad every id list with zeros up to max_len so the rows form a
# rectangular array.
padded = np.array(
    [token_ids + [0] * (max_len - len(token_ids)) for token_ids in tokenized.values]
)

# 1 where an id is non-zero, 0 where it is zero (which includes the padding
# appended above).
attention_mask = np.where(padded != 0, 1, 0)

In [4]:
# Read the BERT configuration from its JSON file.
config = transformers.BertConfig.from_json_file('/datasets/ds_bert/bert_config.json')

# Instantiate a BertModel with that configuration and load the checkpoint
# weights from disk.
model = transformers.BertModel.from_pretrained(
    '/datasets/ds_bert/rubert_model.bin',
    config=config,
)

Some weights of the model checkpoint at /datasets/ds_bert/rubert_model.bin were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Run the padded token ids through the BERT model in mini-batches and collect
# one embedding vector per tweet.
batch_size = 100
embeddings = []
# Iterate by start offset rather than by `len // batch_size` so that a final,
# shorter batch is still processed when the number of rows is not a multiple
# of batch_size; otherwise those rows would be silently dropped and the
# embeddings would no longer line up with the rows of df_tweets.
for start in notebook.tqdm(range(0, padded.shape[0], batch_size)):
    stop = start + batch_size
    batch = torch.LongTensor(padded[start:stop])
    attention_mask_batch = torch.LongTensor(attention_mask[start:stop])

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        batch_embeddings = model(batch, attention_mask=attention_mask_batch)

    # First element of the model output, indexed [row, position, feature];
    # keep the vector at sequence position 0 for every row.
    embeddings.append(batch_embeddings[0][:, 0, :].numpy())

  0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
# Stack the per-batch arrays row-wise into one matrix and print it as a sanity check.
print(np.concatenate(embeddings, axis=0))

[[-0.8799267  -0.39289227  0.2960124  ... -0.6363873   1.1163164
  -0.84750557]
 [ 0.1490422  -0.08720324 -0.92215    ...  0.11262817  0.87561214
  -0.14010319]
 [ 0.33105883  0.09662683 -0.22858934 ...  0.10178915  0.14729555
   0.07224853]
 ...
 [ 0.0186805   0.02629564 -0.11648229 ...  0.23331897  0.38168383
  -0.41653198]
 [ 0.05847712 -0.11280286 -1.3415956  ... -0.06008217  0.62527317
  -0.5654581 ]
 [ 0.25448525 -0.10225122 -0.19057123 ...  0.09132414  0.21200801
  -0.2871247 ]]


In [8]:
# Fit a logistic-regression classifier on the embeddings and report its
# accuracy on a held-out half of the data.
features = np.concatenate(embeddings)
target = df_tweets['positive']

# random_state is fixed so the 50/50 split is the same on every run for a
# given features/target pair; without it the printed accuracy varies.
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)

# NOTE: this rebinds `model`, which the embedding loop uses as the BERT
# network. Re-running that loop after this cell requires reloading BERT first.
model = LogisticRegression()
model.fit(features_train, target_train)
predictions = model.predict(features_test)
print(accuracy_score(target_test, predictions))

0.885
