## Загрузка необходимых библиотек


In [None]:
# Install the third-party packages for this notebook, one %pip call per package.
# NOTE(review): deeppavlov is not imported in any of the visible cells — confirm
# it is still needed; its install log pins urllib3 and charset-normalizer.
# NOTE(review): pandas and scikit-learn are imported below but not installed
# here — presumably they are already present in the environment; verify.
%pip install deeppavlov
%pip install transformers
%pip install torch
%pip install tqdm
%pip install fastparquet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.9/48.9 kB[0m [31m696.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting charset-normalizer~=2.0.0 (from requests<3.0.0,>=2.19.0->deeppavlov)
  Downloading charset_normalizer-2.0.12-py3-none-any.whl (39 kB)
Downloading urllib3-1.26.18-py2.py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.8/143.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Installing collected packages: urllib3, charset-normalizer
Successfully installed charset-normalizer-2.0.12 urllib3-1.26.18
Defaulting to user installation because normal site-packages is not writeable
Collecting urllib3<1.27,>=1.21.1 (from requests->transformers)
  Using cached urllib3-1.26.18-py2.py3-none-any.whl.metadata (48 kB)
Collecting charset-normalizer~=2.0.0 (from requests->transformers)
  Using cached charset_normalizer-2.0.12-py3-none-any.whl (39 kB)
Using cached urllib3-1.26.18-py2.py3-none-an

# Обучение модели на основании данных авторазметки
Описание: реализация совершенно нового подхода к обучению модели с использованием таких данных, как:

1. Время просмотра
2. Позиция в списке выдачи
3. Эмоции
4. Попадание в топ

In [None]:
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm
import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Dataset that turns each row of the auto-markup table into a text/label pair
class CustomDataset(Dataset):
    """Expose the rows of a pandas DataFrame as ``{'text', 'label'}`` samples.

    ``text`` is the space-joined string form of the columns listed in
    ``FEATURE_COLUMNS``; ``label`` is the integer value of the ``vtop``
    column, with missing values mapped to 0.
    """

    # Columns concatenated (in this order) into the model's input text.
    FEATURE_COLUMNS = ['query', 'video_id', 'duration', 'position', 'watchtime', 'emotion']
    # Column holding the target class.
    LABEL_COLUMN = 'vtop'

    def __init__(self, data):
        """Store the DataFrame whose rows are served as samples."""
        self.data = data

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at positional index ``idx``.

        Returns:
            dict with ``'text'`` (str) and ``'label'`` (int).
        """
        # Fetch the row once instead of once per field.
        row = self.data.iloc[idx]
        label = row[self.LABEL_COLUMN]

        # pandas stores missing values as NaN (or pd.NA), not None, so an
        # ``is None`` check lets NaN through and int(NaN) raises ValueError.
        # pd.isna covers None, NaN and pd.NA alike.
        label = 0 if pd.isna(label) else int(label)

        # Serialise the feature values into one whitespace-separated string.
        text = ' '.join(map(str, row[self.FEATURE_COLUMNS].values))

        return {'text': text, 'label': label}

# Path to the Parquet file with the auto-markup data
data_path = 'train_data/train_data/automarkup.parquet'

# Load the data and keep only the first 50000 rows.  .copy() turns the slice
# into an independent frame so the column assignments below do not write
# into a view of the loaded table.
df = pd.read_parquet(data_path)[:50000].copy()

# Replace missing 'emotion' values with a default.  The result is assigned
# back to the column: calling fillna(inplace=True) on df['emotion'] operates
# on an intermediate selection, which is deprecated and is not guaranteed to
# update df itself.
df['emotion'] = df['emotion'].fillna('нет эмоции')

# 'duration' is assumed to be measured in milliseconds; convert it to seconds
df['duration'] = df['duration'] / 1000

# Split into training and test sets (80/20, fixed seed for reproducibility)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Wrap both splits in datasets and batch them; only the training set is shuffled
train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)
train_loader = DataLoader(train_dataset, batch_size=2000, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2000, shuffle=False)

# Pretrained Russian BERT checkpoint shared by the tokenizer and the classifier
model_name = 'cointegrated/rubert-tiny2'
tokenizer = BertTokenizer.from_pretrained(model_name)
# Two output classes: in the top or not
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# Loss function and optimizer over all model parameters
# NOTE(review): AdamW here is the class imported from transformers; newer
# transformers releases reportedly deprecate it in favour of torch.optim.AdamW
# — confirm against the installed version.
criterion = torch.nn.CrossEntropyLoss()
optimizer = AdamW(params=model.parameters(), lr=5e-5)

# Model training
num_epochs = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        # Tokenize the raw texts of the batch and move the tensors to the device
        inputs = tokenizer(batch['text'], return_tensors='pt', padding=True, truncation=True).to(device)
        labels = batch['label'].to(device)

        # Clear gradients left over from the previous step; without this
        # backward() keeps adding to them and every update after the first
        # is computed from the sum of all earlier batches' gradients.
        optimizer.zero_grad()

        outputs = model(**inputs).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean per-batch loss over the epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss}")

# Model evaluation
model.eval()
all_predictions = []
all_labels = []

# Inference needs no gradients: no_grad() stops autograd from recording the
# forward pass, which otherwise keeps activations for every batch in memory.
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        inputs = tokenizer(batch['text'], return_tensors='pt', padding=True, truncation=True).to(device)
        labels = batch['label'].to(device)

        outputs = model(**inputs).logits
        # Predicted class = index of the largest logit for each sample
        predictions = torch.argmax(outputs, dim=1)

        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_predictions)
print(f"Accuracy: {accuracy}")

# Persist the fine-tuned model and its tokenizer side by side in one directory
output_dir = "BERT_V2.0_100000"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/1: 100%|██████████| 40/40 [01:20<00:00,  2.01s/it]


Epoch 1/1, Loss: 0.1917418352793902


Evaluating: 100%|██████████| 10/10 [00:18<00:00,  1.88s/it]

Accuracy: 0.9997





('BERT_V2.0_100000/tokenizer_config.json',
 'BERT_V2.0_100000/special_tokens_map.json',
 'BERT_V2.0_100000/vocab.txt',
 'BERT_V2.0_100000/added_tokens.json')