In [31]:
import pandas as pd 
import numpy as np

In [19]:
# Load the pre-split text / tag / target tables for the training split.
# index_col=0 makes the first CSV column the DataFrame index.
text_train, tags_train, targets_train = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../../data/processed/train/text_train_df.csv",
        "../../data/processed/train/tags_train_df.csv",
        "../../data/processed/train/target_train_df.csv",
    )
)

# Same three tables for the validation split.
text_val, tags_val, targets_val = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../../data/processed/val/text_val_df.csv",
        "../../data/processed/val/tags_val_df.csv",
        "../../data/processed/val/target_val_df.csv",
    )
)

In [21]:
# Replace missing values with empty strings in both text frames.
# NOTE(review): presumably so that every entry of the 'text' column handed to
# the tokenizer later is a str rather than NaN -- confirm against the raw data.
text_train, text_val = (frame.fillna("") for frame in (text_train, text_val))

In [3]:
import torch
from transformers import AutoModel, AutoTokenizer

In [5]:
# The encoder and its tokenizer are loaded from the same pretrained checkpoint.
checkpoint = "cointegrated/rubert-tiny"
rubert = AutoModel.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [17]:
def embed_bert_cls(text, model, tokenizer):
    """Embed text as the L2-normalized hidden state of the first token.

    The input is tokenized with padding and truncation enabled, passed through
    ``model`` with gradient tracking disabled, and the last-layer hidden state
    at sequence position 0 (the [CLS] token for BERT-style tokenizers) is
    taken as the embedding and scaled to unit L2 norm per row.

    Args:
        text: A single string, or a sequence of strings to embed as one batch.
        model: Encoder whose output exposes ``last_hidden_state`` and which has
            a ``device`` attribute; the token tensors are moved to that device.
        tokenizer: Callable compatible with ``model`` that returns a mapping of
            PyTorch tensors when called with ``return_tensors='pt'``.

    Returns:
        numpy.ndarray: shape ``(hidden_size,)`` when ``text`` is a ``str``;
        shape ``(len(text), hidden_size)`` when ``text`` is a sequence of
        strings (previously only the first row of a batch was returned and the
        rest were silently dropped).
    """
    # Remember the input kind up front so a lone string keeps yielding a 1-D vector.
    is_single_text = isinstance(text, str)
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    # Position 0 along the sequence axis holds the first-token representation.
    embeddings = model_output.last_hidden_state[:, 0, :]
    # normalize() defaults to p=2, dim=1, i.e. one unit-length vector per input row.
    embeddings = torch.nn.functional.normalize(embeddings)
    vectors = embeddings.cpu().numpy()
    return vectors[0] if is_single_text else vectors

In [39]:
# Sanity check: embed one sentence and inspect the dimensionality of the result.
sample_embedding = embed_bert_cls("Здравствуй, мир.", rubert, tokenizer)
sample_embedding.shape

(312,)

In [54]:
# Embed every training text, then build the feature frame in a single step.
# Appending rows one at a time with `.loc[i] = ...` re-allocates the frame on
# each insertion (quadratic in the number of texts); collecting the vectors
# first and stacking them once avoids that.
train_vectors = [embed_bert_cls(text, rubert, tokenizer) for text in text_train['text']]
X_train = pd.DataFrame(
    # np.vstack rejects an empty list, so keep the empty case a 0 x 312 frame.
    np.vstack(train_vectors) if train_vectors else np.empty((0, 312)),
    columns=range(312),  # 312 = embedding width reported by the shape check above
)
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,302,303,304,305,306,307,308,309,310,311
0,0.090183,0.061351,-0.065949,-0.099538,0.01596,0.069783,-0.066221,0.006863,-0.121204,-0.015498,...,0.00703,-0.034012,0.00774,0.014849,0.04782,0.027915,-0.000539,-0.001336,0.016548,-0.071149
1,-0.021558,0.00223,0.016219,-0.051915,-0.026662,0.040939,0.061662,-0.01569,-0.009321,-0.069248,...,0.01026,0.009149,-0.021632,-0.024911,-0.001464,-0.03434,-0.053662,0.107122,0.102947,-0.071786
2,0.016584,-0.033119,-0.023893,-0.068441,0.04383,0.015474,0.01301,-0.007507,0.006783,-0.09778,...,-0.036983,0.089537,0.04848,0.039076,0.023839,-0.048955,-0.057562,0.104591,0.140778,0.015352
3,0.013997,0.012985,-0.005958,-0.062241,0.006339,0.007202,-0.035187,0.022503,0.002928,-0.024576,...,0.010641,0.13041,-0.03453,0.018799,0.082025,-0.061748,-0.042123,0.059411,0.053553,-0.033946
4,0.017832,0.000991,-0.070902,-0.108358,0.006684,0.037815,-0.020891,-0.015415,-0.015879,0.004351,...,0.021067,-0.118142,-0.056713,-0.011428,-0.001978,0.040969,0.091797,0.002272,0.000739,-0.036165


In [56]:
# Embed every validation text, then build the feature frame in a single step.
# Appending rows one at a time with `.loc[i] = ...` re-allocates the frame on
# each insertion (quadratic in the number of texts); collecting the vectors
# first and stacking them once avoids that.
val_vectors = [embed_bert_cls(text, rubert, tokenizer) for text in text_val['text']]
X_val = pd.DataFrame(
    # np.vstack rejects an empty list, so keep the empty case a 0 x 312 frame.
    np.vstack(val_vectors) if val_vectors else np.empty((0, 312)),
    columns=range(312),  # 312 = embedding width of the encoder output
)
X_val.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,302,303,304,305,306,307,308,309,310,311
0,0.013822,-0.053897,-0.012163,-0.063863,0.027213,0.023654,0.017786,-0.014802,-0.029428,-0.045091,...,-0.017359,0.121824,0.009049,0.012295,0.033944,0.00411,-0.048581,0.007655,0.042258,-0.040448
1,0.010554,0.00828,-0.057215,-0.02954,-0.001637,0.004654,-0.045566,-0.098403,0.017578,0.019131,...,-0.03805,-0.038038,0.022338,0.031933,0.013576,0.031918,0.034809,-0.063666,0.024365,-0.041229
2,0.065814,0.071933,-0.041662,-0.026181,0.020469,0.018593,-0.059573,-0.026816,-0.005678,0.002147,...,-0.028424,-0.049337,-0.01325,0.02697,-0.008498,0.045078,-0.010328,-0.027391,0.008616,-0.088844
3,-0.013023,0.015756,-0.053805,-0.104616,0.04037,0.045786,-0.008176,-0.029072,-0.043986,-0.103641,...,0.075606,0.117067,-0.019637,-0.018873,0.009311,-0.073178,-0.044545,0.070349,0.078428,-0.064259
4,-0.049665,-0.021748,-0.086969,-0.057905,0.022839,-0.024867,-0.00044,-0.046062,-0.010201,-0.030629,...,-0.009451,0.071562,-0.039206,0.031532,0.015328,-0.027826,0.044672,0.015873,0.030742,-0.014962


In [60]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

In [62]:
# MultiOutputClassifier fits one copy of the base estimator per target column.
logreg_estimator = LogisticRegression(max_iter=1000)
model = MultiOutputClassifier(logreg_estimator)

In [68]:
# Train on the embedding features against the raw target matrix.
y_train = targets_train.values
model.fit(X_train, y_train)

In [70]:
from sklearn.metrics import accuracy_score

In [72]:
# Score the logistic-regression model on the validation embeddings.
# NOTE(review): with a 2-D label-indicator target, accuracy_score counts a row
# as correct only when every label matches (subset accuracy) -- confirm that
# targets_val is in that format.
prediction = model.predict(X_val)
accuracy_score(y_true=targets_val.values, y_pred=prediction)

0.20995670995670995

In [74]:
from sklearn.svm import SVC

In [76]:
# Second experiment: one default-parameter SVC per target column.
# This rebinds `model`, replacing the logistic-regression classifier above.
svc_estimator = SVC()
model = MultiOutputClassifier(svc_estimator)
y_train = targets_train.values
model.fit(X_train, y_train)

In [77]:
# Score the SVC-based model on the validation embeddings, using the same
# metric call as for the logistic-regression run so the numbers are comparable.
prediction = model.predict(X_val)
accuracy_score(y_true=targets_val.values, y_pred=prediction)

0.2510822510822511