In [1]:
# Mount Google Drive at /content/drive so that files stored there can be
# accessed through ordinary filesystem paths.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# Install or upgrade torch and transformers in the kernel's environment
# (-q: quiet output, -U: upgrade packages that are already installed).
# NOTE(review): no versions are pinned, so a re-run may pick up different releases.
# NOTE(review): sentence_transformers and polars are imported by this notebook but
# not installed here — presumably preinstalled in the runtime image; verify.
%pip install -qU torch transformers

# Imports

In [3]:
from pathlib import Path
import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor
from tqdm import tqdm, trange
from transformers import AutoTokenizer, AutoModel

import polars as pl
import pickle
import matplotlib.pyplot as plt
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Load chunks

In [5]:
# Location of the pickled document chunks on the mounted drive.
chunks_path = Path("/content/drive/MyDrive/legal_ai/docs_db/chunks.pkl")

# NOTE: unpickling can execute code embedded in the file, so only load
# pickles that come from a trusted source.
with chunks_path.open(mode="rb") as f:
    chunks = pickle.load(f)

# Calculate embeddings

We compute the chunk embeddings once, up front, and save them to disk so that they do not have to be recalculated on every run.

In [8]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [9]:
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Mean-pool token vectors over the sequence axis, ignoring masked positions.

    Args:
        last_hidden_states: Token-level vectors, reduced over dimension 1
            (treated here as ``(batch, sequence, hidden)``).
        attention_mask: ``(batch, sequence)`` mask whose non-zero entries mark
            the tokens that take part in the average.

    Returns:
        One pooled vector per batch row, shape ``(batch, hidden)``. A row whose
        mask is entirely zero yields a zero vector instead of NaN.
    """
    # Zero out masked positions so they contribute nothing to the sum.
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    # Clamp the token count to at least 1: a fully masked row would otherwise
    # compute 0 / 0 and poison the result with NaN.
    token_counts = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
    return last_hidden.sum(dim=1) / token_counts

In [10]:
# Hugging Face Hub identifier of the embedding checkpoint.
model_name = 'deepvk/USER-bge-m3'

# Load the matching tokenizer, then load the encoder and place it on the
# device selected earlier.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.33M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/963 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/697 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

In [14]:
def process_in_batches(model, tokenizer, input_texts, batch_size=16, device=torch.device("cpu")):
    """Embed ``input_texts`` batch by batch and return L2-normalised vectors.

    Each batch is tokenised with padding and truncation, moved to ``device``,
    run through ``model`` without gradient tracking, reduced with
    ``average_pool`` and L2-normalised along dimension 1. Per-batch results are
    moved to the CPU and concatenated in input order.

    Args:
        model: Callable encoder whose output exposes ``last_hidden_state``.
            It is not moved by this function, so it should already be on
            ``device``. If it reports ``training == True`` it is switched to
            eval mode for the duration of the call and restored afterwards.
        tokenizer: Callable that turns a batch of strings into a mapping of
            tensors containing at least ``attention_mask``.
        input_texts: Sliceable sequence of strings to embed.
        batch_size: Number of texts per forward pass; must be positive.
        device: Device the tokenised tensors are moved to.

    Returns:
        A CPU tensor with one row per input text. For empty input, an empty
        tensor of shape ``(0, hidden_size)`` is returned, where ``hidden_size``
        is read from ``model.config.hidden_size`` when present and is 0
        otherwise.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Layers such as dropout must be inactive, otherwise the embeddings are
    # noisy; remember the previous mode so it can be restored afterwards.
    was_training = bool(getattr(model, "training", False))
    if was_training:
        model.eval()

    embeddings = []
    try:
        for i in trange(0, len(input_texts), batch_size):
            batch_texts = input_texts[i:i + batch_size]

            batch_dict = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt')
            batch_dict = {k: v.to(device) for k, v in batch_dict.items()}

            with torch.no_grad():
                outputs = model(**batch_dict)
                # NOTE(review): this applies masked mean pooling; confirm it is the
                # pooling the checkpoint passed as `model` was trained with (some
                # embedding models expect CLS pooling instead).
                batch_embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
                batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)

            # Keep only the CPU copy so GPU memory does not grow with the corpus.
            embeddings.append(batch_embeddings.cpu())

            # Drop the references to this batch's tensors and hand cached GPU
            # blocks back before the next batch is processed.
            del batch_dict, outputs, batch_embeddings
            torch.cuda.empty_cache()
    finally:
        if was_training:
            model.train()

    if not embeddings:
        # torch.cat rejects an empty list, so build the empty result explicitly.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return torch.empty((0, hidden_size))

    return torch.cat(embeddings, dim=0)

In [16]:
# Build one text per chunk: the chunk's title on the first line, followed by
# the chunk body. The last line displays how many texts were produced.
input_texts = [
    "\n".join([chunk.metadata["title"], chunk.page_content])
    for chunk in chunks
]
len(input_texts)

17285

In [20]:
%%time
embeddings = process_in_batches(model, tokenizer, input_texts, batch_size=64, device=device)
embeddings.shape

100%|██████████| 271/271 [32:05<00:00,  7.10s/it]

CPU times: user 31min 34s, sys: 9.34 s, total: 31min 43s
Wall time: 32min 5s





torch.Size([17285, 1024])

In [21]:
# Persist the embeddings next to the chunks as a NumPy .npy file.
np.save(
    file="/content/drive/MyDrive/legal_ai/docs_db/chunks_emb.npy",
    arr=embeddings.numpy(),
)

In [22]:
# Print PyTorch's CUDA memory report. The call is skipped on runtimes without
# CUDA, where querying CUDA memory raises instead of printing a table.
if torch.cuda.is_available():
    print(torch.cuda.memory_summary())
else:
    print("CUDA is not available; no GPU memory summary to show.")

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   1377 MiB |   4197 MiB |  17778 GiB |  17777 GiB |
|       from large pool |   1376 MiB |   4195 MiB |  17735 GiB |  17733 GiB |
|       from small pool |      1 MiB |      3 MiB |     43 GiB |     43 GiB |
|---------------------------------------------------------------------------|
| Active memory         |   1377 MiB |   4197 MiB |  17778 GiB |  17777 GiB |
|       from large pool |   1376 MiB |   4195 MiB |  17735 GiB |  17733 GiB |
|       from small pool |      1 MiB |      3 MiB |     43 GiB |     43 GiB |
|---------------------------------------------------------------