In [6]:
%pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.25.1-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.45.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading huggingface_hub-0.25.1-py3-none-any.whl (436 kB)
Downloading safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (435 kB)
Downloading tokeniz

In [1]:
import pandas as pd
import os
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

from collections import Counter

In [2]:
# Directory holding the input CSV files -- replace with your own /path/to/data/.
path = '/home/jupyter/datasphere/project/data'

# Read the three tables in a fixed order: train_events, video_info_v2, train_targets.
data, video, targets = (
    pd.read_csv(os.path.join(path, csv_name))
    for csv_name in ('train_events.csv', 'video_info_v2.csv', 'train_targets.csv')
)

In [3]:
from tqdm import tqdm

In [4]:
import torch

# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (more slowly) on machines where CUDA is not available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel, T5EncoderModel
import torch

# Register tqdm's pandas integration so that `progress_apply`
# (used by generate_embeddings below) is available on Series/DataFrames.
tqdm.pandas()

def generate_embeddings(df, model, tokenizer) -> pd.DataFrame:
    """
    Append mean-pooled text embeddings of the 'title' column to a dataframe.

    Every title is tokenized and passed through ``model`` on its own. The
    model's ``last_hidden_state`` is averaged over dim 1 (mean pooling) and the
    resulting vector is spread over columns ``embedding_0``, ``embedding_1``, ...

    :param df: Dataframe with a 'title' column (e.g. alongside 'rutube_video_id').
    :param model: Callable model whose output exposes ``last_hidden_state``,
        for example one loaded with Hugging Face ``AutoModel``.
    :param tokenizer: Tokenizer for ``model``; it is called with
        ``return_tensors='pt', padding=True, truncation=True``.
    :return: A new dataframe holding the columns of ``df`` followed by the
        embedding columns, aligned on ``df.index``. ``df`` is not modified.
    """
    # Send inputs to wherever the model actually lives instead of relying on a
    # notebook-level `device` variable that may be undefined or out of sync.
    target_device = getattr(model, "device", None)
    if target_device is None:
        try:
            target_device = next(model.parameters()).device
        except (AttributeError, StopIteration):
            target_device = torch.device("cpu")

    def get_embedding(text):
        """Return the mean-pooled embedding of a single title as a 1-D numpy array."""
        # Missing titles (None / NaN) are embedded as an empty string and other
        # non-string values are stringified, rather than crashing the tokenizer.
        if not isinstance(text, str):
            text = "" if pd.isna(text) else str(text)
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        inputs = {key: value.to(target_device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean pooling over dim 1; squeeze(0) removes only the batch axis so the
        # result is always a 1-D vector.
        return outputs.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()

    titles = df['title']
    # Show a progress bar when `tqdm.pandas()` has been registered; otherwise
    # fall back to a plain `apply` so the function does not depend on that call.
    apply_to_titles = getattr(titles, 'progress_apply', titles.apply)
    embeddings = apply_to_titles(get_embedding)

    # One row per title, one column per embedding dimension, same index as `df`.
    embeddings_df = pd.DataFrame(embeddings.tolist(), index=df.index)
    embeddings_df.columns = [f'embedding_{i}' for i in range(embeddings_df.shape[1])]

    # Place the embedding columns next to the original ones.
    return pd.concat([df, embeddings_df], axis=1)



In [13]:
# Checkpoint used for the title embeddings (the cell output shows a 3-layer BertModel with hidden size 312).
model_name = 'cointegrated/rubert-tiny2'

# Load the tokenizer and the encoder for that checkpoint; the encoder is moved to `device` right away.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

# Bare expression so the notebook still renders the module summary as the cell output.
model



BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(83828, 312, padding_idx=0)
    (position_embeddings): Embedding(2048, 312)
    (token_type_embeddings): Embedding(2, 312)
    (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-2): 3 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=312, out_features=312, bias=True)
            (key): Linear(in_features=312, out_features=312, bias=True)
            (value): Linear(in_features=312, out_features=312, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=312, out_features=312, bias=True)
            (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
   

In [7]:
# Attach the video metadata to each event row, matching on the shared video id
# (default join type), then print the schema of the merged frame.
full_df = pd.merge(data, video, on='rutube_video_id')
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1759616 entries, 0 to 1759615
Data columns (total 13 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   event_timestamp  object
 1   region           object
 2   ua_device_type   object
 3   ua_client_type   object
 4   ua_os            object
 5   ua_client_name   object
 6   total_watchtime  int64 
 7   rutube_video_id  object
 8   viewer_uid       int64 
 9   title            object
 10  category         object
 11  duration         int64 
 12  author_id        int64 
dtypes: int64(4), object(9)
memory usage: 187.9+ MB


In [None]:
# Embed the 'title' of every row in `video`; the result keeps the original
# columns and adds one `embedding_<i>` column per embedding dimension.
df_with_embeddings = generate_embeddings(video, model, tokenizer)

In [15]:
# Persist the embeddings table as CSV. `index` is left at its default, so the
# row index is written as the first (unnamed) column of the file.
df_with_embeddings.to_csv('/home/jupyter/datasphere/project/video_embed.csv')