In [1]:
import pdfplumber
import pytesseract
from PIL import Image
import io
import os
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import spacy
from transformers import DistilBertTokenizer
import re
from pinecone import Pinecone, ServerlessSpec
from transformers import BertTokenizer, BertModel
import torch

# Pinecone connection

In [2]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="09c53ad3-28c3-4a7f-88fa-68260478262e")

pc.create_index(
    name="bert-test88",
    dimension=768, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

index = pc.Index("bert-test88")


# PDF Reading

In [3]:
def read_text_from_pdf(pdf_path):
    document_text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    document_text += page_text + "\n"
                else:
                    # Perform OCR if no text is found
                    img = page.to_image(resolution=300).original
                    page_text = pytesseract.image_to_string(img)
                    document_text += page_text + "\n"
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")

    document_length = len(document_text.strip())
    print(f"Document length for {os.path.basename(pdf_path)}: {document_length} characters")
    
    return document_text

# Data Cleaning

In [4]:
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = text.strip()  # Trim leading and trailing spaces
    return text

In [5]:
def filter_non_textual_content(text):
    lines = text.split('\n')
    filtered_lines = []
    for line in lines:
        # This regex matches lines with mostly numbers
        if not re.match(r'^(\d+(\.\d+)?[\s\t,]*){2,}$', line):
            filtered_lines.append(line)
    filtered_text = ' '.join(filtered_lines)
    filtered_text = re.sub(r'\s{2,}', ' ', filtered_text)
    return preprocess_text(filtered_text)

# Text Chunking

In [6]:
model_name = "BlueOrangeDigital/distilbert-cross-segment-document-chunking"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)


In [7]:
def chunk_text(text, max_len=450):
    text = preprocess_text(text)
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)  # Sentence splitting
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_len = len(tokenizer.encode(sentence, add_special_tokens=False))

        if current_length + sentence_len > max_len:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_length = 0

        # Check if a single sentence exceeds the maximum length, if so, truncate
        if sentence_len > max_len:
            sentence = tokenizer.decode(tokenizer.encode(sentence, add_special_tokens=False)[:max_len])
            sentence_len = max_len

        current_chunk.append(sentence)
        current_length += sentence_len

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Vector Generation

In [8]:
global counter
counter = 1
def vector_gen(paragraphs,model_name):

    vectors = []
    
    num_paragraphs = len(paragraphs) 

    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    
    for i,text in enumerate(paragraphs):
        
        inputs  = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()

        record_dict = {"id": str(counter)+'-'+str(i), "values":cls_embedding, "metadata":{"text":text}}

        vectors.append(record_dict)

    return vectors

# Main Loop

In [9]:
directory = './'
pdf_paths = []
for file in os.listdir(directory):
    
    if file.endswith('.pdf'):
        print(f"Starting {file}")
        # Read PDF
        document_text = read_text_from_pdf(file)
        # CLean PDF by removing white spaces
        cleaned_text = preprocess_text(document_text)
        # Filter PDF to keep text only and remove any special characters
        filtered_text = filter_non_textual_content(cleaned_text)
        print(f' * Finnished Cleaning')
        # Split the data into chunks
        chunks = chunk_text(filtered_text)
        print(f' * number of chunks = {len(chunks)}')
        # Start Text embeddings
        embeddings = vector_gen(chunks, 'bert-base-uncased')
        
        index.upsert(vectors=embeddings)
        print(f"{file} Finished")
    
    counter += 1
        
        



Starting EGPTds.pdf
Document length for EGPTds.pdf: 434 characters
 * Finnished Cleaning
 * number of chunks = 1
EGPTds.pdf Finished
Starting ForeignCDs.pdf
Document length for ForeignCDs.pdf: 2882 characters
 * Finnished Cleaning
 * number of chunks = 1
ForeignCDs.pdf Finished
Starting ifrs-9-financial-instruments.pdf
Error reading ifrs-9-financial-instruments.pdf: tesseract is not installed or it's not in your PATH. See README file for more information.
Document length for ifrs-9-financial-instruments.pdf: 486264 characters
 * Finnished Cleaning
 * number of chunks = 243
ifrs-9-financial-instruments.pdf Finished
Starting IstithmarFundFactSheet.pdf
Document length for IstithmarFundFactSheet.pdf: 5153 characters
 * Finnished Cleaning
 * number of chunks = 4
IstithmarFundFactSheet.pdf Finished
Starting JPM_Report_May_2024_1714798734_240504_145944.pdf
Document length for JPM_Report_May_2024_1714798734_240504_145944.pdf: 47833 characters
 * Finnished Cleaning
 * number of chunks = 27
JPM_