In [1]:
%pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.9 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.9 PyMuPDFb-1.24.9
Note: you may need to restart the kernel to use updated packages.


In [19]:
import numpy as np
import pandas as pd
import os
import torch
import pymupdf

os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Preprocess document

In [None]:
file_path = "/kaggle/input/rag-document/data/Computer Networks.pdf"

In [6]:
def clean_text(text: str):
    # filter = ''.join([chr(i) for i in range(1, 32)])
    # text = text.translate(str.maketrans('', '', filter)).strip()
    text = text.replace('-\n', '')
    text = text.replace('\n', ' ')
    text = text.replace(u'\xa0', u' ')

    while text.find('  ') != -1:
        text = text.replace('  ', ' ') 

    return text

def load_document(file_path: str, chunk_size=32):
    """
    Loads pdf from `file_path` and generate list of chunks from the file
    """
    doc = pymupdf.open(file_path)
    output = []

    chunk_id = 0
    for i, page in enumerate(doc):
        # Get text per page
        text = page.get_text()

        # Clean text
        text = clean_text(text)

        words = text.split(' ')
        for j in range(0, len(words) - chunk_size + 1, 2):
            chunk = ' '.join(words[j:j + chunk_size])

            output.append({
                'chunk_id': chunk_id,
                'page': i,
                'text': chunk,
            })

            chunk_id += 1
    
    return output

chunks = load_document(file_path)
chunks[:5]

[{'chunk_id': 0,
  'page': 4,
  'text': 'COMPUTER NETWORKS FIFTH EDITION ANDREW S. TANENBAUM Vrije Universiteit Amsterdam, The Netherlands DAVID J. WETHERALL University of Washington Seattle, WA PRENTICE HALL Boston Columbus Indianapolis New York San Francisco Upper Saddle River'},
 {'chunk_id': 1,
  'page': 4,
  'text': 'FIFTH EDITION ANDREW S. TANENBAUM Vrije Universiteit Amsterdam, The Netherlands DAVID J. WETHERALL University of Washington Seattle, WA PRENTICE HALL Boston Columbus Indianapolis New York San Francisco Upper Saddle River Amsterdam Cape'},
 {'chunk_id': 2,
  'page': 4,
  'text': 'ANDREW S. TANENBAUM Vrije Universiteit Amsterdam, The Netherlands DAVID J. WETHERALL University of Washington Seattle, WA PRENTICE HALL Boston Columbus Indianapolis New York San Francisco Upper Saddle River Amsterdam Cape Town Dubai'},
 {'chunk_id': 3,
  'page': 4,
  'text': 'TANENBAUM Vrije Universiteit Amsterdam, The Netherlands DAVID J. WETHERALL University of Washington Seattle, WA PRENTIC

In [7]:
df = pd.DataFrame(chunks)
df.head()

Unnamed: 0,chunk_id,page,text,num_words,num_char
0,0,4,COMPUTER NETWORKS FIFTH EDITION ANDREW S. TANE...,32,238
1,1,4,FIFTH EDITION ANDREW S. TANENBAUM Vrije Univer...,32,235
2,2,4,ANDREW S. TANENBAUM Vrije Universiteit Amsterd...,32,232
3,3,4,"TANENBAUM Vrije Universiteit Amsterdam, The Ne...",32,236
4,4,4,"Universiteit Amsterdam, The Netherlands DAVID ...",32,232
...,...,...,...,...,...
192777,192777,961,"and MobiSys, and cofounded the ACM HotNets wor...",32,203
192778,192778,961,and cofounded the ACM HotNets workshops. He ha...,32,201
192779,192779,961,the ACM HotNets workshops. He has served on nu...,32,195
192780,192780,961,HotNets workshops. He has served on numerous p...,32,196


In [14]:
from transformers import AutoTokenizer, AutoModel

RESET = True
# model_name = "bert-base-uncased"
model_name = "BAAI/bge-small-en-v1.5"

tokenizer = None
model = None

if os.path.isdir("model/tokenizer") and os.path.isdir("model/embedding") and not RESET:
    tokenizer = AutoTokenizer.from_pretrained("model/tokenizer")
    model = AutoModel.from_pretrained("model/embedding")
    
else:
    # Then init
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model.to(device)

    tokenizer.save_pretrained("model/tokenizer")
    model.save_pretrained("model/embedding")


assert tokenizer is not None
assert model is not None

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

In [15]:
def compute_embedding(text: list[str] | str):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)

    with torch.no_grad():
        # reduce mean in sequence length axis
        output = model(**inputs).last_hidden_state.mean(dim=1)

    return output.tolist()

384

In [16]:
# Computer embedding for each chunk

batch_size = 1000

total = []

df_values = df["text"].values
num_batch = len(df) // batch_size

for i in range(num_batch + 1):
    print(f"Embedding batch {i + 1}")
    total += compute_embedding(df_values[i * batch_size : min((i + 1) * batch_size, len(df))].tolist())

df["embedding"] = pd.Series(total)
df

Embedding batch 1
Embedding batch 2
Embedding batch 3
Embedding batch 4
Embedding batch 5
Embedding batch 6
Embedding batch 7
Embedding batch 8
Embedding batch 9
Embedding batch 10
Embedding batch 11
Embedding batch 12
Embedding batch 13
Embedding batch 14
Embedding batch 15
Embedding batch 16
Embedding batch 17
Embedding batch 18
Embedding batch 19
Embedding batch 20
Embedding batch 21
Embedding batch 22
Embedding batch 23
Embedding batch 24
Embedding batch 25
Embedding batch 26
Embedding batch 27
Embedding batch 28
Embedding batch 29
Embedding batch 30
Embedding batch 31
Embedding batch 32
Embedding batch 33
Embedding batch 34
Embedding batch 35
Embedding batch 36
Embedding batch 37
Embedding batch 38
Embedding batch 39
Embedding batch 40
Embedding batch 41
Embedding batch 42
Embedding batch 43
Embedding batch 44
Embedding batch 45
Embedding batch 46
Embedding batch 47
Embedding batch 48
Embedding batch 49
Embedding batch 50
Embedding batch 51
Embedding batch 52
Embedding batch 53
Em

Unnamed: 0,chunk_id,page,text,num_words,num_char,embedding
0,0,4,COMPUTER NETWORKS FIFTH EDITION ANDREW S. TANE...,32,238,"[-0.24107325077056885, -0.11326102167367935, 0..."
1,1,4,FIFTH EDITION ANDREW S. TANENBAUM Vrije Univer...,32,235,"[0.0692388117313385, 0.11854249238967896, 0.69..."
2,2,4,ANDREW S. TANENBAUM Vrije Universiteit Amsterd...,32,232,"[0.4598182439804077, 0.353718101978302, 0.3641..."
3,3,4,"TANENBAUM Vrije Universiteit Amsterdam, The Ne...",32,236,"[0.3230662941932678, 0.3634697198867798, 0.205..."
4,4,4,"Universiteit Amsterdam, The Netherlands DAVID ...",32,232,"[0.2969452142715454, 0.027495326474308968, 0.2..."
...,...,...,...,...,...,...
192777,192777,961,"and MobiSys, and cofounded the ACM HotNets wor...",32,203,"[-0.18572883307933807, 0.10274776816368103, 0...."
192778,192778,961,and cofounded the ACM HotNets workshops. He ha...,32,201,"[0.012503870762884617, -0.022333532571792603, ..."
192779,192779,961,the ACM HotNets workshops. He has served on nu...,32,195,"[-0.14394520223140717, -0.11508528143167496, 0..."
192780,192780,961,HotNets workshops. He has served on numerous p...,32,196,"[-0.2275371253490448, -0.16411814093589783, 0...."


In [17]:
# Save data
df[['chunk_id', 'page', 'text', 'embedding']].to_csv('Computer Networks.csv', index=False)