In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm

In [None]:
class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        reviews: Indexable sequence of review texts (each item is passed
            through ``str()`` before tokenization).
        labels: Indexable sequence of integer class labels aligned with
            ``reviews``.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, reviews, labels, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Tokenize the review at ``idx``.

        Returns:
            dict with keys ``review_text`` (str), ``input_ids`` and
            ``attention_mask`` (flattened tensors), and ``label``
            (``torch.long`` scalar tensor).
        """
        review = str(self.reviews[idx])
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # Fixed typo: the strategy name is 'max_length', not 'max_lengh'.
            padding='max_length',
            # Without truncation a review longer than max_len would yield a
            # longer tensor than its neighbours and break batch collation.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'review_text': review,
            # flatten() drops the leading batch dimension added by
            # return_tensors='pt' (presumably shape (1, max_len)).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
    """Wrap a DataFrame of reviews in a ``DataLoader``.

    Args:
        df: DataFrame with a ``text`` column (reviews) and a ``label`` column.
        tokenizer: Tokenizer forwarded to ``ReviewDataset``.
        max_len: Sequence length forwarded to ``ReviewDataset``.
        batch_size: Number of samples per batch.
        shuffle: Reshuffle the samples every epoch. Defaults to ``False``,
            which keeps the previous behaviour.
        num_workers: Number of loader worker processes. Defaults to ``4``,
            the value that used to be hard-coded.

    Returns:
        A ``torch.utils.data.DataLoader`` over a ``ReviewDataset``.
    """
    ds = ReviewDataset(
        # to_numpy() gives positional indexing regardless of the frame's index.
        reviews=df['text'].to_numpy(),
        labels=df['label'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

In [None]:
class SentimentModel(torch.nn.Module):
    """Thin ``nn.Module`` wrapper around a pretrained BERT sequence classifier.

    Args:
        n_classes: Number of output labels passed to ``from_pretrained`` as
            ``num_labels``.
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=n_classes,
        )

    def forward(self, input_ids, attention_mask):
        """Delegate to the wrapped classifier and return its output unchanged."""
        encoder_inputs = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
        }
        return self.bert(**encoder_inputs)

def train_model(model, data_loader, loss_fn, optimizer, device):
    """Run one training pass over ``data_loader``.

    Each batch is a dict with ``input_ids``, ``attention_mask`` and ``label``
    tensors; the model output must expose ``.logits``.

    Returns:
        The sum of per-batch losses divided by ``len(data_loader)``.
    """
    model = model.train()
    batch_losses = []
    for batch in tqdm(data_loader):
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['label'].to(device)

        logits = model(input_ids=ids, attention_mask=mask).logits
        batch_loss = loss_fn(logits, targets)

        # Clear stale gradients, backpropagate, then apply the update.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(data_loader)

def eval_model(model, data_loader, device):
    """Collect predictions and ground-truth labels over ``data_loader``.

    The model is switched to eval mode and run without gradient tracking.
    Each batch is a dict with ``input_ids``, ``attention_mask`` and ``label``
    tensors; the model output must expose ``.logits``.

    Returns:
        ``(predictions, true_labels)`` as two 1-D CPU tensors of equal length.
        They remain iterable and support ``len()``, as the lists returned
        previously did, and can be passed directly to tensor-based metrics.
        Both are empty ``torch.long`` tensors when the loader yields nothing.
    """
    model = model.eval()
    pred_chunks = []
    label_chunks = []
    with torch.no_grad():
        for data in data_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = outputs.logits.argmax(dim=1)
            # Move to CPU so predictions and labels end up on the same device.
            pred_chunks.append(preds.detach().cpu())
            label_chunks.append(data['label'].detach().cpu())

    if not pred_chunks:
        # torch.cat rejects an empty list; return well-typed empty results.
        empty = torch.empty(0, dtype=torch.long)
        return empty, empty.clone()

    predictions = torch.cat(pred_chunks)
    true_labels = torch.cat(label_chunks)
    return predictions, true_labels

In [None]:
def predict_review(review_text, model, tokenizer, max_len=128, device='cpu'):
    """Predict the class index for a single review.

    Args:
        review_text: Text to classify.
        model: ``torch.nn.Module`` whose output exposes ``.logits``.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_len: Length the encoded sequence is padded/truncated to.
        device: Device the input tensors are moved to; the model is expected
            to already live there.

    Returns:
        The arg-max class index as a Python ``int``.
    """
    encoding = tokenizer.encode_plus(
        review_text,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        # Replaces the deprecated pad_to_max_length=True; truncation keeps
        # over-long reviews within max_len.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Inference must not depend on whichever mode the model was left in:
    # disable dropout and gradient tracking, then restore the previous mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(input_ids=input_ids, attention_mask=attention_mask)
    finally:
        model.train(was_training)

    prediction = output.logits.argmax(dim=1)
    return prediction.item()

# ЗАПУСК ОБУЧЕНИЯ

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

In [None]:
# Tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Read both CSV files up front.
# NOTE(review): df_test is not used anywhere else in this cell.
df_train = pd.read_csv('out_train.csv')
df_test = pd.read_csv('out_test.csv')

# Hold out 10% of the training file; the fixed seed keeps the split repeatable.
df_subtrain, df_val = train_test_split(df_train, test_size=0.1, random_state=42)

train_data_loader = create_data_loader(
    df_subtrain,
    tokenizer,
    max_len=128,
    batch_size=16,
)
# NOTE(review): despite its name, this loader wraps the validation split
# (df_val), not df_test.
test_data_loader = create_data_loader(
    df_val,
    tokenizer,
    max_len=128,
    batch_size=16,
)

df_train

In [None]:
from torcheval.metrics import MulticlassAccuracy

In [None]:
# Build the 10-class classifier and place it on the selected device.
model = SentimentModel(n_classes=10).to(device)
# NOTE(review): this only marks the final parameter as trainable; presumably a
# no-op unless parameters were frozen beforehand - confirm the intent.
[*model.parameters()][-1].requires_grad = True

In [None]:
# Every model parameter is handed to the optimizer, so this is a full
# fine-tune. The previous lr of 2e-2 is far above the 2e-5..5e-5 range
# commonly recommended for BERT fine-tuning and tends to make training diverge.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss().to(device)

for epoch in range(3):
    loss_func_val = train_model(model, train_data_loader, loss_fn, optimizer, device)
    y_pred, y_true = eval_model(model, test_data_loader, device)

    # The metric's update() expects tensors. Stacking accepts both a list of
    # 0-d tensors and an already 1-D tensor, and .cpu() puts predictions and
    # labels on the same device.
    y_pred = torch.stack(list(y_pred)).cpu()
    y_true = torch.stack(list(y_true)).cpu()

    # A fresh metric per epoch so accuracy is not accumulated across epochs.
    metric = MulticlassAccuracy()
    metric.update(y_pred, y_true)
    acc = metric.compute()
    # NOTE(review): test_data_loader is built from the validation split, so
    # the figure printed as "accuracy on test" is validation accuracy.
    print(f'epoch: {epoch}, loss: {loss_func_val}, accuracy on test: {acc}')

In [None]:
# Smoke-test the trained classifier on one hand-written review.
review_text = "This is a great product!"
predicted_score = predict_review(
    review_text=review_text,
    model=model,
    tokenizer=tokenizer,
    device=device,
)
print(f'Predicted score: {predicted_score}')