In [1]:
%pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.9 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.9 PyMuPDFb-1.24.9
Note: you may need to restart the kernel to use updated packages.


In [19]:
import numpy as np
import pandas as pd
import os
import torch
import pymupdf

# Switch off parallelism in the Hugging Face `tokenizers` backend for this process.
# NOTE(review): presumably set to avoid the fork-related parallelism warning that
# library emits in notebooks — confirm; keep it ahead of the first tokenizer use.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Preprocess document

In [None]:
# Source PDF to index. This is an absolute path under the Kaggle input mount;
# point it at a local copy when running the notebook anywhere else.
file_path = "/kaggle/input/rag-document/data/Computer Networks.pdf"

In [None]:
def clean_text(text: str):
    """
    Flatten raw page text into a single line.

    Applied in order:
      1. a hyphen directly followed by a newline is removed, re-joining words
         that were split across lines;
      2. every remaining newline becomes a space;
      3. every non-breaking space (U+00A0) becomes a regular space;
      4. runs of spaces are collapsed to one space.

    Leading/trailing spaces are not stripped, and other whitespace or control
    characters (tabs, form feeds, ...) are left untouched.
    """
    # Order matters: the hyphen rule has to see the newline before it is
    # turned into a space by the second rule.
    substitutions = (
        ('-\n', ''),
        ('\n', ' '),
        (u'\xa0', u' '),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    # One pass only halves a run of spaces, so repeat until no pair is left.
    while '  ' in text:
        text = text.replace('  ', ' ')

    return text

def load_document(file_path: str, chunk_size: int = 32, stride: int = 2):
    """
    Loads pdf from `file_path` and generates a list of overlapping word chunks.

    Each page's text is cleaned with `clean_text`, split into words, and
    scanned with a sliding window of `chunk_size` words that advances `stride`
    words at a time. Windows never cross a page boundary. A page holding fewer
    than `chunk_size` words yields one chunk containing all of its words; a
    page with no words yields nothing.

    Args:
        file_path: Path of the PDF to read.
        chunk_size: Number of words per chunk; must be positive.
        stride: Number of words the window advances between consecutive
            chunks; must be positive.

    Returns:
        A list of dicts with the keys `chunk_id` (running counter across the
        whole document, starting at 0), `page` (0-based page index) and `text`
        (the chunk's words joined by single spaces).

    Raises:
        ValueError: If `chunk_size` or `stride` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if stride <= 0:
        raise ValueError("stride must be a positive integer")

    output = []

    # The context manager closes the PDF even if extraction fails part-way.
    with pymupdf.open(file_path) as doc:
        for i, page in enumerate(doc):
            text = clean_text(page.get_text())

            # split() with no argument drops the empty tokens that a leading
            # or trailing space would otherwise put into the first/last chunk.
            words = text.split()
            if not words:
                continue

            # Clamp to 0 so a short page still produces one (shorter) chunk
            # instead of being dropped from the index.
            last_start = max(len(words) - chunk_size, 0)
            for j in range(0, last_start + 1, stride):
                output.append({
                    'chunk_id': len(output),
                    'page': i,
                    'text': ' '.join(words[j:j + chunk_size]),
                })

    return output

# Chunk the whole PDF with the default settings of `load_document`,
# then preview the first five chunk records.
chunks = load_document(file_path)
chunks[:5]

In [None]:
# One row per chunk; the columns come from the keys of the chunk dicts.
df = pd.DataFrame(chunks)
df.head()

In [14]:
from transformers import AutoTokenizer, AutoModel

# RESET = True skips the local copy under model/, fetches `model_name` again
# and overwrites the saved copy. Set it to False to reuse the saved copy.
RESET = True
# model_name = "bert-base-uncased"
model_name = "BAAI/bge-small-en-v1.5"

# Resolved before the branch below so that `device` exists, and the model is
# placed on it, no matter which branch loads the weights.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = None
model = None

if os.path.isdir("model/tokenizer") and os.path.isdir("model/embedding") and not RESET:
    # Reuse the copy saved by an earlier run.
    tokenizer = AutoTokenizer.from_pretrained("model/tokenizer")
    model = AutoModel.from_pretrained("model/embedding")

else:
    # Then init
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Keep a local copy so later runs can load without RESET.
    tokenizer.save_pretrained("model/tokenizer")
    model.save_pretrained("model/embedding")


assert tokenizer is not None
assert model is not None

# The notebook only runs inference: move to the target device and make the
# evaluation mode explicit.
model.to(device)
model.eval()

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

In [15]:
def get_embedding(text: list[str] | str):
    """
    Embed one text or a batch of texts with the module-level `tokenizer`/`model`.

    The embedding of a text is the mean of the model's last hidden states over
    that text's real tokens. Padding positions are excluded through the
    attention mask, so a text gets the same vector whether it is embedded
    alone or inside a batch padded to a longer sequence.

    Args:
        text: A single string or a list of strings. Inputs longer than the
            tokenizer's limit are truncated (`truncation=True`).

    Returns:
        A list with one embedding (a list of floats) per input text; a single
        string therefore gives a list of length 1. An empty list gives `[]`.
    """
    # Nothing to embed; skip the tokenizer/model call entirely.
    if isinstance(text, (list, tuple)) and len(text) == 0:
        return []

    # Send the inputs to wherever the model's weights currently live.
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(model.device)

    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state

        # 1 for real tokens, 0 for padding; the trailing axis broadcasts over
        # the hidden dimension.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)

        # reduce mean in sequence length axis, counting real tokens only
        token_count = mask.sum(dim=1).clamp(min=1)
        output = (hidden * mask).sum(dim=1) / token_count

    return output.tolist()

384

In [None]:
# Compute an embedding for each chunk, one batch at a time

batch_size = 1000

total = []

df_values = df["text"].values

# Ceiling division: a partial last batch counts as a batch, and no empty batch
# is produced when len(df) is an exact multiple of batch_size.
num_batch = (len(df) + batch_size - 1) // batch_size

for i in range(num_batch):
    print(f"Embedding batch {i + 1}")
    start = i * batch_size
    # Slicing past the end is safe; the last batch is simply shorter.
    total += get_embedding(df_values[start : start + batch_size].tolist())

# Guard against silently misaligned rows before attaching the column.
assert len(total) == len(df), "expected exactly one embedding per chunk"

df["embedding"] = pd.Series(total, index=df.index)
df.head()

In [17]:
# Save data
# Writes the chunk metadata and embeddings to a CSV in the current working
# directory, without the DataFrame index.
# NOTE(review): `embedding` holds Python lists, which CSV stores as text — a
# reader will presumably need to parse them back into lists; confirm with the consumer.
df[['chunk_id', 'page', 'text', 'embedding']].to_csv('Computer Networks.csv', index=False)