In [1]:
import torch.nn.functional as F
import pypdf
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import numpy as np

tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-small-v2')

contents = []
# creating a pdf reader object

reader = pypdf.PdfReader(r"D:\Download\Mechanics of Materials 10th Edition by Russell C. Hibbeler (z-lib.org) (5).pdf")
for i in tqdm(range(len(reader.pages))):
    content = reader.pages[i].extract_text()
    for j in tokenizer.encode(content, max_length=505, padding=True,truncation=True, return_overflowing_tokens=True):
        tok = j
        if len(j)<10:
            continue
        prefix = tokenizer.encode('passage: ')[:-1]
        prefix.extend(tok[1:])
        passage = tokenizer.decode(prefix)
        contents.append(passage)



100%|████████████████████████████████████████████████████████████████████████████████| 901/901 [02:53<00:00,  5.19it/s]


In [10]:
tok_pass_batch = []
batch_size = 64
i = batch_size
j = 0
while(True):
    tok_pass = tokenizer(contents[j:i], max_length=512, padding=True, truncation=True, return_tensors='pt')
    tok_pass_batch.append(tok_pass)
    i = i + batch_size
    j = j + batch_size
    if i>len(contents):
        i = len(contents)-1
        tok_pass = tokenizer(contents[j:i], max_length=512, padding=True, truncation=True, return_tensors='pt')
        tok_pass_batch.append(tok_pass)
        break

In [11]:
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

question = ['query: What is shear stress',
               'query: Why strength of matrtial important',]
# Each input text should start with "query: " or "passage: ".
# For tasks other than retrieval, you can simply use the "query: " prefix.
input_texts = []
tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-small-v2')
model = AutoModel.from_pretrained('intfloat/e5-small-v2').cuda()

# Tokenize the input texts
batch_dict = tokenizer(question, max_length=512, padding=True, truncation=True, return_tensors='pt')
batch_dict = batch_dict.to('cuda')

In [12]:
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [13]:
import torch

In [14]:
with torch.no_grad():
    outputs = model(**batch_dict)
    embeddings_q = average_pool(outputs.last_hidden_state, batch_dict['attention_mask']).to('cpu')
    batch_dict = batch_dict.to('cpu')

In [15]:
list_embeddings_v = []
with torch.no_grad():
    for i in tqdm(tok_pass_batch):
        batch_dict = i
        batch_dict = batch_dict.to('cuda')
        outputs = model(**batch_dict)
        embeddings_v = average_pool(outputs.last_hidden_state, batch_dict['attention_mask']).to('cpu')
        del outputs
        del batch_dict
        list_embeddings_v.append(embeddings_v)

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:06<00:00,  3.23it/s]


In [24]:

for j in tqdm(range(len(list_embeddings_v))):
    # normalize embeddings
    embeddings_q = F.normalize(embeddings_q, p=2, dim=1)
    embeddings_v = F.normalize(list_embeddings_v[j], p=2, dim=1)
    scores = (embeddings_q @ embeddings_v.T) * 100
    fn_scores = F.softmax(scores)
    break

  fn_scores = F.softmax(scores)
  0%|                                                                                           | 0/20 [00:00<?, ?it/s]


In [25]:
fn_scores

tensor([[2.1231e-05, 3.5656e-05, 9.8224e-07, 3.9608e-06, 2.7174e-05, 2.9820e-03,
         1.7511e-03, 9.0427e-05, 1.2992e-04, 7.7315e-05, 4.4685e-05, 2.8075e-03,
         2.6352e-02, 5.6908e-04, 4.2344e-03, 2.7006e-01, 1.0014e-02, 6.5788e-04,
         1.1644e-03, 1.3069e-04, 3.0106e-02, 1.0383e-03, 1.8039e-04, 5.0195e-05,
         1.2938e-03, 3.0632e-05, 5.7133e-03, 4.5089e-07, 3.6721e-06, 3.1355e-03,
         3.7485e-04, 1.3265e-03, 1.2359e-04, 6.4495e-04, 2.5481e-04, 1.5148e-02,
         1.3561e-01, 7.5287e-07, 2.7957e-03, 8.3291e-04, 8.1535e-03, 9.6101e-04,
         4.6715e-03, 1.6543e-03, 8.4937e-03, 2.4322e-03, 1.3805e-01, 7.0337e-08,
         3.0354e-02, 3.9482e-05, 6.9559e-02, 4.4839e-02, 6.1094e-02, 1.9533e-02,
         8.3044e-03, 1.1173e-02, 1.3067e-02, 6.3372e-03, 8.6362e-03, 1.4141e-02,
         2.2888e-03, 1.2027e-06, 6.6361e-03, 1.9759e-02],
        [5.4593e-04, 2.3580e-03, 1.6316e-04, 1.6104e-03, 1.1433e-02, 2.2643e-03,
         1.6271e-02, 1.8059e-02, 1.1382e-02, 2.1113