# Get embeddings

In [71]:
import os
import torch
from transformers import BertTokenizer, BertModel

from eval import getembeddings
from eval import gpuutils

import importlib
importlib.reload(getembeddings)



# Input data lives in a 'digitalize_handwritten' directory that sits in the
# parent of the current working directory.
ocr_data_dir = os.path.join(os.path.split(os.getcwd())[0], 'digitalize_handwritten')
# Sub-directories of that root used below for ground truth and OCR text.
groundtruth_dir, ocr_dir = (
    os.path.join(ocr_data_dir, subdir) for subdir in ('data', 'OCR')
)

In [68]:
# Load the tokenizer and the encoder from the same pretrained checkpoint
# (tokenizer first, then model).
tokenizer, model = (
    loader.from_pretrained("bert-base-uncased")
    for loader in (BertTokenizer, BertModel)
)

# NOTE(review): presumably returns the device with the most memory available;
# confirm against eval/gpuutils.py.
device = gpuutils.get_gpu_most_memory()
# Left as the cell's last expression, so the notebook displays the model repr.
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

## Ground truth embeddings

In [65]:
# Glob pattern matching every ground-truth transcription (.txt) of the Bentham dataset.
text_glob_path = os.path.join(
    os.path.join(groundtruth_dir, 'BenthamDataset', 'GT', 'GT_Extracted'),
    '*.txt',
)
# Output directory for the ground-truth embeddings, under the current working directory.
save_dir = os.path.join(
    os.getcwd(),
    os.path.join('data', 'embeddings', 'ground_truth', 'bentham'),
)

# NOTE(review): the recorded output of this cell warns about a 555-token
# sequence exceeding the model's 512-token maximum; confirm that
# get_all_embeddings truncates or chunks long inputs.
getembeddings.get_all_embeddings(
    text_glob_path,
    save_dir,
    model,
    tokenizer,
    device,
)


Token indices sequence length is longer than the specified maximum sequence length for this model (555 > 512). Running this sequence through the model will result in indexing errors


# Raw OCR embeddings

In [66]:
# Glob pattern matching every completed OCR text file (.txt) for Bentham.
text_glob_path = os.path.join(
    os.path.join(ocr_dir, 'completed-OCR', 'Bentham'),
    '*.txt',
)
# Output directory for the raw-OCR embeddings, under the current working directory.
save_dir = os.path.join(
    os.getcwd(),
    os.path.join('data', 'embeddings', 'ocr', 'bentham'),
)

# Same call as for the ground truth, pointed at the OCR text instead.
getembeddings.get_all_embeddings(
    text_glob_path,
    save_dir,
    model,
    tokenizer,
    device,
)


# LLM processed OCR

# Just testing things

In [74]:
# Scratch check: embed one hard-coded sample text and save the result to disk.
test_text = "366. The evidence of the engagement, confined to a portable instrument, instead of a Book. _ Taken from Exche-quer Bills,_ Differs from Stock Annuities _ Agrees with Irish Debentures, and the now disused Navy Victu-alling Transport and Ordnance Bills or Debentures: _ also with India Bonds, Bank Notes, Banker's Promissory Notes; and private Promissory Notes and Bills of Exchange. 7. The Paper, by its size, shape, texture, are thinness, par--ticularly fitted for circulation _ Taken from Bank Paper. Agrees more or less with Bankers Paper and with the French Assignats. Differs from all the other above mentioned Engagements: _- Eexcept from some late issues of Exchequer Bills, in respect of size. _ 8. Application of the profit of the measure towards the reduction of the National Debt. _ Taken from the Sale of the Land Tax i:e: the exchange of so many portions of the annual produce of that Tax for the portions of Stock Annuities. Differs from all the other engagements above mentioned._ 16 Aug 1800"

embedding = getembeddings.get_embedding_pool(
    text=test_text,
    model=model,
    tokenizer=tokenizer,
    device=device
)
# Move the result to host memory before serialising it.
embedding = embedding.cpu()
save_path = os.path.join('data', 'embeddings', 'test', 'bentham', 'test_002_080_001.pt')
# torch.save does not create missing parent directories, so make sure the
# target directory exists; otherwise this cell fails on a fresh checkout.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(embedding, save_path)