In [1]:
import pandas as pd
import re
import torch
from torch import nn
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

from tqdm import tqdm
# Hook tqdm into pandas so that `progress_apply` (used further down when
# embedding the reviews) is available and shows a progress bar.
tqdm.pandas()

In [2]:
# Load the review dataset: a tab-separated file decoded as UTF-8.
df = pd.read_csv(
    'women-clothing-accessories.3-class.balanced.csv',
    sep='\t',
    encoding="utf8",
)

In [3]:
# Keep only the rows whose sentiment is not the neutral class, leaving a
# binary problem.
# NOTE(review): the 'neautral' spelling is presumably how the label is written
# in the dataset itself (the later progress bar shows the frame shrinking to
# 60000 rows) -- do not "fix" it without checking the data.
df = df[df['sentiment'].ne('neautral')]

In [4]:
from transformers import AutoTokenizer, AutoModel

# Pretrained tokenizer/encoder pair used to turn each review into a vector.
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny")

# Pick the accelerator with the same cuda -> mps -> cpu fallback the notebook
# uses for the classifier, instead of calling .cuda() unconditionally (which
# raises on machines without a CUDA device). embed_bert_cls reads
# `model.device`, so it follows whatever is chosen here.
bert_device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
# Last expression: returns the module, so the cell still displays its repr.
model.to(bert_device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/241k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/468k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/47.7M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(29564, 312, padding_idx=0)
    (position_embeddings): Embedding(512, 312)
    (token_type_embeddings): Embedding(2, 312)
    (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-2): 3 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=312, out_features=312, bias=True)
            (key): Linear(in_features=312, out_features=312, bias=True)
            (value): Linear(in_features=312, out_features=312, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=312, out_features=312, bias=True)
            (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
    

In [5]:
def embed_bert_cls(text):
    """Return the L2-normalized embedding of the first token for ``text``.

    Relies on the module-level ``tokenizer`` and ``model`` (the pretrained
    encoder). Note that a later cell rebinds ``model`` to the classifier, so
    this function only works while ``model`` still refers to the encoder.

    Args:
        text: input passed straight to ``tokenizer`` (truncation enabled).

    Returns:
        numpy array holding the normalized first-position hidden state of the
        first sequence in the batch.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Move every tokenizer output onto the device the encoder lives on.
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    with torch.no_grad():
        model_output = model(**inputs)
    # Position 0 of the last hidden state, for every sequence in the batch.
    cls_vectors = model_output.last_hidden_state[:, 0, :]
    cls_vectors = torch.nn.functional.normalize(cls_vectors)
    return cls_vectors[0].cpu().numpy()

In [6]:
# Embed every review one at a time (with a progress bar) and store the
# resulting vectors in a new 'vector' column.
df['vector'] = (
    df['review']
    .progress_apply(embed_bert_cls)
)

100%|██████████| 60000/60000 [05:02<00:00, 198.19it/s]


In [7]:
# Display the frame to check the new 'vector' column next to each review.
df

Unnamed: 0,review,sentiment,vector
0,качество плохое пошив ужасный (горловина напер...,negative,"[0.09496676, 0.019417694, -0.030974533, -0.052..."
1,"Товар отдали другому человеку, я не получила п...",negative,"[0.01727799, -0.020034771, -0.07157768, -0.044..."
2,"Ужасная синтетика! Тонкая, ничего общего с пре...",negative,"[0.05852786, 0.024297824, 0.019940354, -0.0296..."
3,"товар не пришел, продавец продлил защиту без м...",negative,"[-0.080169454, -0.06430159, -0.06941224, -0.03..."
4,"Кофточка голая синтетика, носить не возможно.",negative,"[0.0016684594, 0.008178324, -0.022118963, -0.0..."
...,...,...,...
89995,сделано достаточно хорошо. на ткани сделан рис...,positive,"[0.04227045, 0.016408293, -0.06218766, -0.0595..."
89996,Накидка шикарная. Спасибо большое провдо линяе...,positive,"[0.07874523, 0.08048287, -0.011663313, -0.0429..."
89997,спасибо большое ) продовца рекомендую.. заказа...,positive,"[-0.004518722, 0.058602538, -0.02277402, -0.01..."
89998,Очень довольна заказом! Меньше месяца в РБ. К...,positive,"[0.06756876, 0.04385761, -0.07193526, -0.03579..."


In [8]:
# Features: one plain list per review. list(...) is used on purpose (rather
# than .tolist()) so the elements stay numpy scalars and keep their dtype when
# they are turned back into arrays later.
X = list(map(list, df['vector'].values))
# Target: 1 for 'positive', 0 for everything else.
y = df['sentiment'].apply(lambda label: int(label == 'positive'))

In [9]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split the same across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Turn every sample into its own tensor; the lists are stacked into a single
# batch tensor later, at training / prediction time.
X_train = [torch.from_numpy(np.array(sample)) for sample in X_train]
X_test = [torch.from_numpy(np.array(sample)) for sample in X_test]

In [11]:
# Convert the label Series into tensors (np.array makes a fresh copy of the
# underlying values before handing them to torch).
y_train = torch.from_numpy(np.array(y_train.to_numpy()))
y_test = torch.from_numpy(np.array(y_test.to_numpy()))

In [12]:
# Classifier head: 312 input features -> 256 hidden units (ReLU) -> 2 logits.
# NOTE: this rebinds `model`, which until now referred to the pretrained
# encoder that embed_bert_cls uses; re-running the embedding cell after this
# one will not work without reloading the encoder.
layers = [
    nn.Linear(312, 256),
    nn.ReLU(),
    nn.Linear(256, 2),
]
model = nn.Sequential(*layers)
del layers

In [13]:
# SGD with momentum over all parameters of the classifier (a single
# parameter group, so the parameters can be passed directly).
optimizer = torch.optim.SGD(model.parameters(), lr=3e-2, momentum=0.9)
# Cross-entropy between the two output logits and the integer class labels.
loss_fn = nn.CrossEntropyLoss()

In [14]:
# Choose the compute backend: CUDA if present, otherwise Apple MPS,
# otherwise plain CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [15]:
# Move the classifier onto the selected device; the call returns the module,
# which is why its structure is displayed as the cell output.
model.to(device)

Sequential(
  (0): Linear(in_features=312, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=2, bias=True)
)

In [16]:
# torch.stack turns the list of per-sample tensors into one batch tensor.
# The training data does not change between iterations, so it is stacked and
# moved to the device once, up front, rather than on every epoch.
epochs = 3000
X = torch.stack(X_train).to(device)
y = y_train.to(device)

# Full-batch training: each iteration is one optimizer step on the whole
# training set.
for i in range(epochs):
    # Compute prediction error
    pred = model(X)
    loss = loss_fn(pred, y)

    # Backpropagation. Gradients are cleared first so that anything left
    # over from an interrupted earlier run cannot leak into this step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(loss.item())

0.6921634674072266
0.6920928359031677
0.6919588446617126
0.691768229007721
0.6915265917778015
0.691240131855011
0.6909137964248657
0.6905518174171448
0.6901583671569824
0.6897371411323547
0.6892911791801453
0.6888225078582764
0.6883341670036316
0.6878271698951721
0.687302827835083
0.6867623925209045
0.6862062811851501
0.6856351494789124
0.6850488185882568
0.6844472885131836
0.6838303804397583
0.6831974983215332
0.6825484037399292
0.6818820834159851
0.6811982989311218
0.6804959177970886
0.679774284362793
0.6790323853492737
0.6782692670822144
0.6774840354919434
0.6766754388809204
0.6758424639701843
0.6749839782714844
0.6740986108779907
0.6731855273246765
0.6722432971000671
0.6712710857391357
0.6702675819396973
0.6692314743995667
0.6681618094444275
0.6670575141906738
0.6659177541732788
0.6647416949272156
0.6635280251502991
0.662276029586792
0.6609846353530884
0.6596528887748718
0.6582797169685364
0.656864583492279
0.6554067730903625
0.6539045572280884
0.6523576378822327
0.6507651209831238

In [17]:
# Inference only: run the forward pass without autograd so no computation
# graph is built for the whole test set. The predicted class is the index of
# the larger logit along dim 1.
with torch.no_grad():
    y_pred = model(torch.stack(X_test).to(device)).argmax(dim=1).cpu().numpy()

In [18]:
from sklearn.metrics import accuracy_score

In [19]:
# Fraction of held-out samples whose predicted class matches the true label.
accuracy_score(y_test, y_pred)

0.8815