In [12]:
from PyPDF2 import PdfReader, PdfWriter
from dotenv import load_dotenv
import os
import json
import pdfplumber
from dataclasses import dataclass, field
import numpy as np
import requests
from tqdm import tqdm
from openai import AzureOpenAI, BadRequestError
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
) 
# Load .env file
# (must run before the os.getenv calls below so values defined there are visible)
load_dotenv()

# Parent of the current working directory; data paths are resolved against it.
directory = os.path.dirname(os.getcwd())
sample_file = os.getenv("SAMPLE_FILE")
# os.getenv returns None for an unset variable, in which case os.path.join
# raises TypeError here -- both SAMPLE_FILE and DATA_DIR must be set.
data_file_path = os.path.join(directory, os.getenv("DATA_DIR"), sample_file)
# Azure OpenAI settings consumed by `client`, `get_embedding` and `split_pdf`.
deployment_name = os.getenv("DEPLOYMENT_NAME")
openai_endpoint = os.getenv("OPENAI_ENDPOINT")
openai_api_key = os.getenv("OPENAI_API_KEY")
openai_api_version = os.getenv("OPENAI_API_VERSION")
embedding_model_deployment = os.getenv("EMBEDDING_MODEL_DEPLOYMENT")

def zero_vector(dim=1536):
    """Return a 1-D array of zeros used as a placeholder embedding.

    Args:
        dim: Length of the vector. Defaults to 1536, the size this notebook
            previously hard-coded (presumably the embedding deployment's
            output dimension -- confirm if the deployment changes).

    Returns:
        A ``numpy.ndarray`` of shape ``(dim,)`` filled with ``0.0``.
    """
    return np.zeros(dim)

@dataclass
class ChunkEntity:
    id: str = None
    title: str = None
    content: str = None
    summary: str = None
    source: str = None
    page: str = None
    content_vector: np.ndarray = field(default_factory=zero_vector)
    summary_vector: np.ndarray = field(default_factory=zero_vector)

# Shared Azure OpenAI client, configured from the settings read from the
# environment earlier in this cell; generate_response sends chat requests through it.
client = AzureOpenAI(
    azure_endpoint=openai_endpoint,
    api_key=openai_api_key,
    api_version=openai_api_version,
)

In [13]:
def get_embedding(text, timeout=60):
    """Request embeddings for ``text`` from the Azure OpenAI embeddings endpoint.

    Args:
        text: The input to embed. ``None`` or an empty value is rejected
            locally without issuing a request.
        timeout: Seconds to wait for the HTTP call before giving up. The
            previous ``timeout=None`` could block the notebook indefinitely.

    Returns:
        A list with one embedding (a list of floats) per entry in the
        response's ``data`` array, or ``[]`` when there is nothing to embed,
        the request times out, or the service answers with a non-200 status.
    """
    # Guard first: a missing summary (None) or empty text cannot be embedded,
    # so skip the round trip and report the failure the same way as below.
    if not text:
        print("Error : no text supplied for embedding")
        return []

    # Tolerate an endpoint configured with a trailing slash ("https://x/").
    base_url = str(openai_endpoint).rstrip("/")
    request_url = (
        f"{base_url}/openai/deployments/{embedding_model_deployment}"
        f"/embeddings?api-version={openai_api_version}"
    )
    headers = {
        "Content-Type": "application/json",
        "api-key": openai_api_key
    }
    request_payload = {
        'input': text
    }
    try:
        embedding_response = requests.post(
            request_url, json=request_payload, headers=headers, timeout=timeout
        )
    except requests.Timeout as exc:
        # Only the newly possible timeout is handled here; other transport
        # errors still propagate exactly as before.
        print(f"Error : embedding request timed out after {timeout}s: {exc}")
        return []

    if embedding_response.status_code != 200:
        print(f"Error : {embedding_response.text}")
        return []

    data_values = embedding_response.json()["data"]
    return [data_value["embedding"] for data_value in data_values]

@retry(
    stop=stop_after_attempt(5),
    wait=wait_random_exponential(multiplier=1, max=10)
)
def generate_response(content, prompt, deployment_name):
    """Run a single chat completion and return the model's reply text.

    Any exception that escapes this function makes tenacity call it again:
    up to 5 attempts in total, with a randomised exponential wait capped at
    10 seconds between them. A ``BadRequestError`` is caught here instead, so
    it is never retried.

    Args:
        content: Text sent as the user message.
        prompt: Instruction sent as the system message.
        deployment_name: Model deployment to send the request to.

    Returns:
        The content of the first choice's message, or ``None`` when the
        service rejects the request with ``BadRequestError``.
    """
    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": content}
    ]
    try:
        response = client.chat.completions.create(
            model=deployment_name,
            messages=messages,
            temperature=0.0,
        )
    except BadRequestError as e:
        # Returning (rather than raising) bypasses the retry decorator, so the
        # old "Retrying..." wording was misleading: this path gives up at once.
        print("content generation failed (bad request, not retried). error: " + str(e))
        return None

    return response.choices[0].message.content

In [14]:
def split_pdf(file_path, chunk_size):
    """Split a PDF into fixed-size page groups and build a ChunkEntity for each.

    For every group of ``chunk_size`` consecutive pages the extracted text is
    concatenated, summarised and titled via ``generate_response``, and both the
    text and the summary are embedded via ``get_embedding``.

    Args:
        file_path: Path of the PDF to read.
        chunk_size: Number of pages per chunk; must be at least 1.

    Returns:
        A list of ``ChunkEntity`` objects in document order. ``id`` and
        ``page`` both hold the 1-based chunk number.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # Fail before opening the file: 0 would otherwise blow up inside range()
    # and a negative value would silently yield no chunks at all.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    prompt = "Get a semantic summary of the following text: "
    title_prompt = "Create a title for the following text: "
    # Label chunks with the file actually being read rather than a global.
    source_name = os.path.basename(file_path)
    entities = []
    with pdfplumber.open(file_path) as pdf:
        total_pages = len(pdf.pages)
        for i in tqdm(range(0, total_pages, chunk_size)):
            last_page = min(i + chunk_size, total_pages)
            # extract_text() can return None for a page without a text layer;
            # treat that as empty text instead of failing on str + None.
            text = ''.join(
                pdf.pages[page].extract_text() or ''
                for page in range(i, last_page)
            )
            summary = generate_response(text, prompt, deployment_name)
            title = generate_response(text, title_prompt, deployment_name)
            content_vector = get_embedding(text)
            # generate_response returns None on a rejected request; there is
            # nothing to embed then, so keep the empty result without a call.
            summary_vector = get_embedding(summary) if summary else []
            chunk_number = i // chunk_size + 1
            entity = ChunkEntity(
                id=chunk_number,
                title=title,
                content=text,
                summary=summary,
                source=source_name,
                # NOTE(review): this stores the chunk number, not a PDF page
                # number -- confirm that is what consumers of `page` expect.
                page=chunk_number,
                content_vector=content_vector,
                summary_vector=summary_vector
            )
            entities.append(entity)
    return entities

# Chunk the sample PDF five pages at a time (one summary, title and pair of embeddings per chunk).
entities = split_pdf(file_path=data_file_path, chunk_size=5)

100%|██████████| 39/39 [03:25<00:00,  5.27s/it]


In [16]:
# Write one JSON object per chunk to <directory>/data/output_chunks.jsonl.
output_file = os.path.join(directory, 'data', 'output_chunks.jsonl')


def _flatten_vector(vector):
    """Return ``vector`` as a flat list, collapsing one level of nesting.

    Already-flat input is returned unchanged in content, so running this cell
    again no longer fails with "'float' object is not iterable". A NumPy array
    (the ChunkEntity default) is converted to plain Python floats so that
    ``json.dumps`` can serialise it.
    """
    if isinstance(vector, np.ndarray):
        return vector.ravel().tolist()
    flat = []
    for item in vector:
        if isinstance(item, (list, tuple)):
            flat.extend(item)
        else:
            flat.append(item)
    return flat


with open(output_file, 'w', encoding='utf-8') as f:
    for entity in entities:
        # The flattened vectors are stored back on the entity, as before, so
        # any later cell that reads `entities` sees the same state.
        entity.content_vector = _flatten_vector(entity.content_vector)
        entity.summary_vector = _flatten_vector(entity.summary_vector)
        # Convert the entity to a dictionary and then to a JSON string
        json_str = json.dumps(entity.__dict__)
        # Write the JSON string to the file
        f.write(json_str + '\n')