## Классификация текстов с использованием предобученных языковых моделей.

В данном задании вам предстоит обратиться к задаче классификации текстов и решить ее с использованием предобученной модели BERT.

In [24]:
import json
# do not change the code in the block below
# __________start of block__________
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from IPython.display import clear_output
from accelerate import optimizer
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
%matplotlib inline
# __________end of block__________

cuda


Обратимся к набору данных SST-2. Holdout часть данных (которая понадобится вам для посылки) доступна по ссылке ниже.

In [2]:
# do not change the code in the block below
# __________start of block__________

!wget https://raw.githubusercontent.com/girafe-ai/ml-course/refs/heads/24f_yandex_ml_trainings/homeworks/hw04_bert_and_co/texts_holdout.json
# __________end of block__________

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [86]:
# do not change the code in the block below
# __________start of block__________
# Read the SST-2 training TSV straight from GitHub. header=None means the file
# is parsed without a header row, so the columns are addressed as 0 and 1 below.
df = pd.read_csv(
    "https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv",
    delimiter="\t",
    header=None,
)

# Column 0 feeds the text arrays and column 1 the label arrays.
# The first 5000 rows form the training split; every remaining row is the test split.
texts_train = df[0].values[:5000]
y_train = df[1].values[:5000]
texts_test = df[0].values[5000:]
y_test = df[1].values[5000:]
# The holdout texts are read from a local JSON file, so texts_holdout.json must
# already exist in the working directory when this cell runs.
with open("texts_holdout.json") as iofile:
    texts_holdout = json.load(iofile)
# __________end of block__________

Весь остальной код предстоит написать вам.

Для успешной сдачи на максимальный балл необходимо добиться хотя бы __84.5% accuracy на тестовой части выборки__.

In [83]:
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader

# Load the pretrained tokenizer of the 'bert-base-cased' checkpoint (the same
# checkpoint name is used for the classification model further down).
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')


# Tokenization helper for labelled data.
def tokenize_and_prepare(texts, labels, tokenizer, max_length=128):
    """Tokenize labelled texts and move the resulting tensors to ``device``.

    Args:
        texts: Texts to encode. May be an object exposing ``tolist()`` (such
            as a NumPy array), any other iterable of strings (list, tuple,
            ...), or a single string, which is treated as one text.
        labels: Class labels, one per text, in any form ``torch.tensor``
            accepts.
        tokenizer: Callable tokenizer. It is invoked with a plain list of
            texts plus ``padding='max_length'``, ``truncation=True``,
            ``max_length`` and ``return_tensors='pt'``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Dict with the keys ``'input_ids'``, ``'attention_mask'`` and
        ``'labels'``. All three tensors are moved to the module-level
        ``device``; the labels are ``torch.long``.
    """
    # Always hand the tokenizer a plain Python list. A bare string must not be
    # passed to list(), which would split it into characters.
    if isinstance(texts, str):
        text_list = [texts]
    elif hasattr(texts, 'tolist'):
        text_list = texts.tolist()
    else:
        text_list = list(texts)

    encodings = tokenizer(
        text_list,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    return {
        'input_ids': encodings['input_ids'].to(device),
        'attention_mask': encodings['attention_mask'].to(device),
        'labels': torch.tensor(labels, dtype=torch.long).to(device),
    }


# Tokenize the training split once; tokenize_and_prepare already returns
# tensors that live on `device`.
train_data = tokenize_and_prepare(texts_train, y_train, tokenizer)

# Load BERT with a 2-class sequence-classification head and move it to `device`.
# The head weights are not part of the checkpoint, so the model has to be
# fine-tuned before its predictions are meaningful.
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2).to(device)

# NOTE: this rebinds the module-level name `optimizer`, which the import cell
# at the top of the file also binds.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# The dataset and the loader do not depend on the epoch, so they are built once
# instead of on every pass. With shuffle=True the loader still draws a new
# sample order each time it is iterated.
dataset = TensorDataset(
    train_data['input_ids'],
    train_data['attention_mask'],
    train_data['labels'],
)
loader = DataLoader(dataset, batch_size=16, shuffle=True)

for epoch in range(3):
    model.train()

    # Batches are slices of tensors that are already on `device`, so no
    # per-batch transfer is needed.
    for input_ids, attention_mask, labels in loader:
        optimizer.zero_grad()
        # Passing `labels` makes the model return the classification loss.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Сдача задания в контест
Сохраните в словарь `out_dict` вероятности принадлежности к классу с меткой 1 (положительному) для выборок `train`, `test` и `holdout`.

In [95]:
# test_data = tokenize_and_prepare(texts_test, y_test, tokenizer)

# print(list(texts_test))

def tokenize_and_prepare(texts, tokenizer=None, max_length=128):
    """Tokenize unlabelled texts and move the resulting tensors to ``device``.

    NOTE: this definition rebinds ``tokenize_and_prepare``. The earlier
    definition in this file has the signature
    ``(texts, labels, tokenizer, max_length)``, so after this one has run a
    call written for the labelled variant binds its arguments to the wrong
    parameters.

    Args:
        texts: Texts to encode, in a form the tokenizer accepts. Objects
            exposing ``tolist()`` (such as NumPy arrays) are converted to a
            plain list first.
        tokenizer: Callable tokenizer. Required despite the ``None`` default.
            It is invoked with ``padding='max_length'``, ``truncation=True``,
            ``max_length`` and ``return_tensors='pt'``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Dict with the keys ``'input_ids'`` and ``'attention_mask'``; both
        tensors are moved to the module-level ``device``.

    Raises:
        TypeError: If ``tokenizer`` is ``None``.
    """
    if tokenizer is None:
        # Calling None raises TypeError as well, but with a message that does
        # not say which argument is missing.
        raise TypeError("tokenize_and_prepare() requires a tokenizer, got None")

    if hasattr(texts, 'tolist'):
        texts = texts.tolist()

    encodings = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    return {
        'input_ids': encodings['input_ids'].to(device),
        'attention_mask': encodings['attention_mask'].to(device),
    }


def predict_in_batches(model, texts, tokenizer, batch_size=32, max_length=128):
    """Return class probabilities for ``texts``, computed batch by batch.

    The model is switched to eval mode and left in it. Each batch is tokenized
    on the fly, moved to the device the model's parameters live on, run
    without gradient tracking, and the softmax of its logits is collected on
    the CPU.

    Args:
        model: Module whose forward accepts the tokenizer output as keyword
            arguments and returns an object with a ``logits`` attribute.
        texts: Iterable of strings (list, tuple, NumPy array, ...).
        tokenizer: Callable tokenizer. It is invoked with a list of texts plus
            ``padding='max_length'``, ``truncation=True``, ``max_length`` and
            ``return_tensors='pt'``; its result must support ``.to(device)``.
        batch_size: Number of texts per forward pass; must be positive.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        CPU tensor with one row of probabilities per text, in input order.
        For empty ``texts`` the result has zero rows and
        ``model.config.num_labels`` columns (zero columns if the model exposes
        no such attribute).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Materialise once so slicing always yields plain lists for the tokenizer.
    texts = list(texts)

    model.eval()
    # Inputs have to sit on the same device as the weights; reading it from
    # the model keeps the two in sync without relying on a global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    predictions = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(
                texts[start:start + batch_size],
                padding='max_length',
                truncation=True,
                max_length=max_length,
                return_tensors='pt',
            ).to(target_device)

            logits = model(**inputs).logits
            # Keep only the small probability matrix and keep it on the CPU.
            # No torch.cuda.empty_cache() here: the per-batch tensors are
            # released when the names are rebound on the next iteration, and
            # flushing the allocator cache on every batch only slows the loop.
            predictions.append(torch.softmax(logits, dim=1).cpu())

    if not predictions:
        # torch.cat() rejects an empty list, so build the empty result directly.
        num_labels = getattr(getattr(model, 'config', None), 'num_labels', 0)
        return torch.empty((0, num_labels))

    return torch.cat(predictions)

# NOTE(review): `holdout` is not read by any code visible after this line; the
# predictions below tokenize the holdout texts again, batch by batch.
holdout = tokenize_and_prepare(texts_holdout, tokenizer)

# Probability of class 1 for every text of each split, stored as plain Python
# floats. predict_in_batches already returns a CPU tensor, so column 1 is
# converted in one step with tolist().
out_dict = {
    split_name: predict_in_batches(model, list(split_texts), tokenizer)[:, 1].tolist()
    for split_name, split_texts in (
        ('train', texts_train),
        ('test', texts_test),
        ('holdout', texts_holdout),
    )
}

Несколько `assert`'ов для проверки вашей посылки:

In [113]:
# Sanity checks on the submission dict. For every split the value must be a
# list, its first element must be a Python float, and its length must equal
# the expected split size.

# Train split: 5000 probabilities.
assert isinstance(out_dict["train"], list), "Object must be a list of floats"
assert isinstance(out_dict["train"][0], float), "Object must be a list of floats"
assert (
        len(out_dict["train"]) == 5000
), "The predicted probas list length does not match the train set size"

# Test split: 1920 probabilities.
assert isinstance(out_dict["test"], list), "Object must be a list of floats"
assert isinstance(out_dict["test"][0], float), "Object must be a list of floats"
assert (
        len(out_dict["test"]) == 1920
), "The predicted probas list length does not match the test set size"

# Holdout split: 500 probabilities.
assert isinstance(out_dict["holdout"], list), "Object must be a list of floats"
assert isinstance(out_dict["holdout"][0], float), "Object must be a list of floats"
assert (len(
    out_dict["holdout"]) == 500
), "The predicted probas list length does not match the holdout set size"
# Per-class probability tensor for the test split, consumed by the accuracy
# check in the next cell. This repeats the same predict_in_batches call that
# produced out_dict["test"].
y_preds = predict_in_batches(model, list(texts_test), tokenizer)


Запустите код ниже для генерации посылки.

In [114]:
from sklearn.metrics import accuracy_score

# Turn the per-class probabilities into hard labels: for each row, the index of
# the largest value along dim=1, as a NumPy array.
# NOTE: this rebinds y_preds, so re-running this cell without re-running the
# previous one would hand a NumPy array to torch.argmax.
y_preds = torch.argmax(y_preds, dim=1).cpu().numpy()

# Accuracy of those labels against the test-split ground truth.
print(accuracy_score(y_test, y_preds))
# do not change the code in the block below
# __________start of block__________
FILENAME = "submission_dict_hw_text_classification_with_bert.json"

with open(FILENAME, "w") as iofile:
    json.dump(out_dict, iofile)
print(f"File saved to `{FILENAME}`")
# __________end of block__________

0.9010416666666666
File saved to `submission_dict_hw_text_classification_with_bert.json`


На этом задание завершено. Поздравляем!