## Installing required libraries

In [11]:
# !pip install unstructured==0.16.6
# !pip install pdfminer.six
# !pip install pillow 
# !pip install pi_heif
# !pip install unstructured_inference
# !pip install poppler-utils
# !pip install pdf2image
# !conda install -c conda-forge poppler  
# !pip install unstructured_pytesseract
# !conda install -c conda-forge tesseract

## Importing the required components

In [12]:
from unstructured.partition.pdf import partition_pdf

## Parsing the PDF

In [None]:
import os

file_path = "./docs/attention.pdf"

# Fail fast instead of printing and carrying on: every later cell indexes into
# `chunks`, so a swallowed error here would only resurface as a confusing
# NameError (fresh kernel) or silently reuse a stale `chunks` from an earlier run.
# The extension check is case-insensitive so "paper.PDF" is accepted as well.
if not (os.path.exists(file_path) and file_path.lower().endswith('.pdf')):
    raise FileNotFoundError(
        f"File does not exist or is not a valid PDF: {file_path}"
    )

# Any exception raised by partition_pdf is allowed to propagate so the full
# traceback is visible in the notebook.
chunks = partition_pdf(
    filename=file_path,
    infer_table_structure=True,            # later cells read metadata.text_as_html
    strategy="hi_res",
    extract_image_block_types=["Image"],
    extract_image_block_to_payload=True,   # later cells read metadata.image_base64
    chunking_strategy="by_title",
    max_characters=10000,
    combine_text_under_n_chars=2000,
    new_after_n_chars=6000
)

In [None]:
# Show the full list of chunks produced by the parsing cell as this cell's output.
chunks

In [None]:
from IPython.display import display, Markdown

# Render the text of the chunk at index 2 as Markdown.
# NOTE(review): index 2 is a hand-picked sample; it raises IndexError if the
# document yields fewer than three chunks.
display(Markdown(chunks[2].text))

In [None]:
# Inspect the elements recorded on chunk 2's metadata; the separation cell
# further down iterates this same attribute to find Image elements.
chunks[2].metadata.orig_elements

In [None]:
# Inspect the HTML rendering stored on chunk 6's metadata.
# NOTE(review): presumably chunk 6 is a table in this particular PDF — confirm;
# the index is document-specific.
chunks[6].metadata.text_as_html

## Separate images, texts, and tables

In [None]:
# Show the parsed chunks again before splitting them into texts, tables and images.
chunks

In [19]:
# Import the submodule explicitly: a bare `import unstructured` is not guaranteed
# to load `unstructured.documents.elements`, so the attribute lookups below would
# only work if some earlier import happened to load it.
import unstructured.documents.elements

# Buckets filled by the loop below:
#   texts  - CompositeElement chunks
#   tables - Table chunks
#   images - base64 payloads of Image elements found inside the text chunks
texts, tables, images = [], [], []

for chunk in chunks:

    if isinstance(chunk, unstructured.documents.elements.Table):
        tables.append(chunk)

    elif isinstance(chunk, unstructured.documents.elements.CompositeElement):
        texts.append(chunk)

        # orig_elements is optional metadata; treat a missing value as "no elements"
        # rather than failing on iteration over None.
        for element in chunk.metadata.orig_elements or []:

            if isinstance(element, unstructured.documents.elements.Image):
                image_b64 = element.metadata.image_base64
                # Skip images without a payload so later base64 decoding
                # never receives None.
                if image_b64:
                    images.append(image_b64)

In [None]:
# Report how many items ended up in each bucket after the separation step.
print(f"Total Texts: {len(texts)}")
print(f"Total Images: {len(images)}")
print(f"Total Tables: {len(tables)}")

In [None]:
import base64
from IPython.display import Image, display

# `images` holds base64-encoded strings; decode the second one back to raw bytes.
# NOTE(review): index 1 is a hand-picked sample and raises IndexError when fewer
# than two images were extracted.
image_data = base64.b64decode(images[1])
    
# Render the decoded bytes inline in the notebook.
display(Image(data=image_data))

In [34]:
import base64
from IPython.display import Image, display

# Write every extracted image into the current working directory.
# File names are 0-based: image_0.jpeg, image_1.jpeg, ...
for idx, image in enumerate(images):
    path = f"image_{idx}.jpeg"
    image_data = base64.b64decode(image)  # base64 text -> raw image bytes
    with open(path, "wb") as f:
        f.write(image_data)

In [4]:
# summarize images
import ollama

def get_image_summary(file_path, model='llama3.2-vision', prompt='Summarize the image:'):
    """Ask an Ollama vision model to summarize a single image.

    Args:
        file_path: Location of the image; passed to Ollama unchanged in the
            message's ``images`` list.
        model: Name of the Ollama model to query. Defaults to the model this
            notebook was written against.
        prompt: Instruction sent as the user message content.

    Returns:
        The ``message.content`` of the chat response.
    """
    response = ollama.chat(
        model=model,
        messages=[{
            'role': 'user',
            'content': prompt,
            'images': [file_path]
        }]
    )
    return response.message.content

In [None]:
from tqdm import tqdm

# The images are saved with 0-based names (image_0.jpeg ... image_{N-1}.jpeg),
# so the index is used as-is. The previous `i+1` skipped the first image and
# requested image_N.jpeg, which is never written.
image_summaries = [
    get_image_summary(f"image_{i}.jpeg")
    for i in tqdm(range(len(images)))
]

In [None]:
import pickle

# Load previously saved image summaries from disk. This rebinds
# `image_summaries`, replacing whatever value the name held before this cell ran.
# NOTE(review): pickle.load can execute arbitrary code; only load files you
# created yourself or otherwise trust.
with open("./docs/image_summaries.pickle", "rb") as h:
    image_summaries = pickle.load(h)

## Summarize tables and texts

In [None]:
def get_table_summary(table_html, model='llama3.2:1b', prompt='Summarize this table:'):
    """Ask an Ollama text model to summarize a table.

    Args:
        table_html: The table content to summarize; it is interpolated into the
            user message after the prompt, separated by a single space.
        model: Name of the Ollama model to query. Defaults to the model this
            notebook was written against.
        prompt: Instruction placed in front of the table content.

    Returns:
        The ``message.content`` of the chat response.
    """
    response = ollama.chat(
        model=model,
        messages=[{
            'role': 'user',
            'content': f'{prompt} {table_html}'
        }]
    )

    return response.message.content

# Summarize each extracted table from its HTML representation, in order,
# showing a progress bar while the model is queried.
table_summaries = [
    get_table_summary(table.metadata.text_as_html)
    for table in tqdm(tables)
]