Currently, no pictures are read in.


In [94]:
# from PyPDF2 import PdfReader

# def extract_pdf_pages(pdf_path):
#     reader = PdfReader(pdf_path)
#     pages = []
#     for i, page in enumerate(reader.pages):
#         text = page.extract_text()
#         if not text:
#             text = "Empty"
#         pages.append({'page_number': i+1, 'text': text})
#     return pages

# pdf_pages = extract_pdf_pages("Makeathon TUM presentation.pdf")

# pdf_pages

### OpenAI Vision API


In [95]:
import base64
import io

import fitz  # PyMuPDF
from openai import OpenAI
from PIL import Image
from PyPDF2 import PdfReader

client = OpenAI()

def extract_text_from_pdf(pdf_path):
    """Return one record per PDF page with its 1-based number and its text.

    Each record is ``{'page_number': int, 'text': str}``. When the reader
    yields no text for a page (``None`` or an empty string), the placeholder
    string "Empty" is stored instead.
    """
    return [
        {'page_number': number, 'text': page.extract_text() or "Empty"}
        for number, page in enumerate(PdfReader(pdf_path).pages, start=1)
    ]

def extract_images_from_pdf(pdf_path):
    """Extract the images referenced on each page of a PDF.

    Args:
        pdf_path: Path of the PDF file, opened with PyMuPDF (``fitz``).

    Returns:
        A list of dicts in page order, one per image reference found on a
        page: ``{"page_number": <1-based page number>, "image": <PIL image>}``.
    """
    images = []
    # The context manager closes the document even when extraction or
    # decoding raises, so the file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        for page_index in range(len(doc)):
            for img in doc[page_index].get_images(full=True):
                xref = img[0]  # first tuple entry is the reference handed to extract_image
                image_bytes = doc.extract_image(xref)["image"]
                # Decoded from an in-memory copy of the bytes, so the image
                # stays usable after the document has been closed.
                image = Image.open(io.BytesIO(image_bytes))
                images.append({"page_number": page_index + 1, "image": image})
    return images

def describe_image_with_gpt4v(image):
    """Send a PIL image to the "gpt-4.1" chat model and return its description.

    The image is re-encoded as PNG, base64-encoded and sent as a ``data:`` URL
    together with a short text prompt.

    Args:
        image: A PIL image (anything exposing ``mode``, ``convert`` and
            ``save(fp, format=...)``).

    Returns:
        The message content of the first choice in the model response.
    """
    # The PNG writer cannot store CMYK / YCbCr pixel data (both occur in
    # JPEGs embedded in PDFs), so convert those to RGB before encoding.
    if image.mode in ("CMYK", "YCbCr"):
        image = image.convert("RGB")

    # Serialize the image to PNG bytes in memory
    png_buffer = io.BytesIO()
    image.save(png_buffer, format="PNG")

    # Encode the PNG bytes in base64 for the data URL
    img_base64 = base64.b64encode(png_buffer.getvalue()).decode('utf-8')

    # Image part of the message, in the "image_url" content format
    image_data = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/png;base64,{img_base64}"
        }
    }

    # One user message carrying the text prompt followed by the image
    response = client.chat.completions.create(
        model="gpt-4.1",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text",
                     # prompt for the image
                     "text": "Describe briefly, mention diagrams or tables and also include any relevant details. Keep it concise. "
                     "This is used in combination with text from the PDF."},
                    image_data
                ],
            }
        ],
        max_tokens=500,
    )

    # Extract and return the description
    return response.choices[0].message.content

def combine_text_and_images(pdf_path):
    """Merge per-page text with descriptions of the images on that page.

    Returns a list with one dict per text page, in page order:
    ``{"page_number": ..., "text": ..., "image_descriptions": [...]}``.
    Pages without images get an empty description list.
    """
    text_pages = extract_text_from_pdf(pdf_path)

    # Bucket the extracted images under the page number they came from
    images_by_page = {}
    for entry in extract_images_from_pdf(pdf_path):
        images_by_page.setdefault(entry["page_number"], []).append(entry["image"])

    # One output record per text page; each image on it is described in order
    return [
        {
            "page_number": page["page_number"],
            "text": page["text"],
            "image_descriptions": [
                describe_image_with_gpt4v(picture)
                for picture in images_by_page.get(page["page_number"], [])
            ],
        }
        for page in text_pages
    ]

# Example usage: run the text + image pipeline on the sample deck
pdf_path = "Makeathon TUM presentation.pdf"
final_pages = combine_text_and_images(pdf_path)

# Preview each page: first 100 characters of the text and of every image description
for page in final_pages:
    print(f"Page {page['page_number']}:")
    print(f"Text: {page['text'][:100]}...")
    # An empty description list simply prints nothing here
    for image_no, desc in enumerate(page["image_descriptions"], start=1):
        print(f"Image {image_no} description: {desc[:100]}...")
    print("=" * 60)


Page 1:
Text: Empty...
Page 2:
Text: Flow
Speaks2) Text Analyzer 
for keywords1) Speech to 
text converter3) Slide mover
Presentation 
sl...
Page 3:
Text: Flow
Speaks2) Text Analyzer 
for keywords1) Speech to 
text converter3a) Slide 
mover
Presentation 
...
Page 4:
Text: Structure of presentation
•Introduction (ideally something funny)
•What problem is our product solvi...
Page 5:
Text: Introduction (ideally something funny)
•30MM (Power Point) presentations are created per day
•https:...
Page 6:
Text: What problem is our product solving?
•Presentation have a fixed storyline and do not allow the prese...
Page 7:
Text: What solutions already exist?
...
Image 1 description: **Description:**

This image is from the Intuiface platform and showcases **Voice-Controlled Present...
Image 2 description: **Description:**

The image shows the logo for "presentations.AI." It features a rounded square icon...
Image 3 description: **Brief Description:**

The image introduces "The World's Best AI Pr

### Hugging Face: https://huggingface.co/sentence-transformers


In [96]:
# from sentence_transformers import SentenceTransformer

# model = SentenceTransformer('all-MiniLM-L6-v2')  # You can pick another
# dim = model.get_sentence_embedding_dimension()  # <<<<<<<< GET DIMENSION EARLY
# print(f"Embedding dimension: {dim}")

# def embed_batch(pdf_pages):
#     for page in pdf_pages:
#         if page['text']:
#             page['embedding'] = model.encode(page['text'])
#     return [page['embedding'] for page in pdf_pages if 'embedding' in page]  # type: ignore

# # After extracting pdf_pages
# embeddings = embed_batch(pdf_pages)
# print(f"Embedded {len(embeddings)} pages.")


### OPENAI Alternative: https://platform.openai.com/docs/guides/embeddings?lang=python


In [97]:
from dotenv import load_dotenv
load_dotenv()                        # make sure OPENAI_API_KEY is in your .env

from openai import OpenAI

client = OpenAI()                   # reads api key from env

def embed_batch(pdf_pages, batch_size: int = 16):
    """Embed each page's text with "text-embedding-3-small" and store it on the page.

    Pages whose 'text' is missing or empty are not sent to the API (the
    embeddings endpoint rejects empty-string input); they get
    ``page['embedding'] = None`` instead.

    Args:
        pdf_pages: Iterable of page dicts; each may carry a 'text' entry.
            The dicts are updated in place with an 'embedding' entry.
        batch_size: Number of texts per embeddings request; must be >= 1.

    Returns:
        A list aligned with ``pdf_pages``: the embedding for each page that
        has text, ``None`` for each page that has none.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    pages = list(pdf_pages)

    # 1) remember the position of every page that has text to embed
    to_embed = [
        (position, page['text'])
        for position, page in enumerate(pages)
        if page.get('text')
    ]

    # 2) call the API in chunks of batch_size, writing results back by position
    all_embeddings = [None] * len(pages)
    for start in range(0, len(to_embed), batch_size):
        chunk = to_embed[start : start + batch_size]
        resp = client.embeddings.create(
            model="text-embedding-3-small",
            input=[text for _, text in chunk]
        )
        # resp.data is a list of objects, each with an .embedding list,
        # consumed here in the same order as the inputs were sent
        for (position, _), item in zip(chunk, resp.data):
            all_embeddings[position] = item.embedding

    # 3) assign back to pages (None for pages that had no text)
    for page, emb in zip(pages, all_embeddings):
        page['embedding'] = emb

    return all_embeddings

# — example usage —
# embed_batch expects pdf_pages = [ {"page_number":1, "text": "..."} , ... ].
# No other active cell defines `pdf_pages`, so build it here from the same PDF;
# this keeps the cell working after Restart Kernel -> Run All.
pdf_pages = extract_text_from_pdf(pdf_path)
embs = embed_batch(pdf_pages)
dim = len(embs[0])  # <<<<<<<< GET DIMENSION EARLY (used as the Qdrant vector size)
print(f"Embedded {len(embs)} pages, each a {len(embs[0])}-dim vector")


Embedded 10 pages, each a 1536-dim vector


### DELETE collection


In [98]:
# client.delete_collection(collection_name="pdf_collection")

In [99]:
import os
from qdrant_client import QdrantClient, models
from dotenv import load_dotenv
from qdrant_client.models import PointStruct, VectorParams, Distance

load_dotenv()

# NOTE(review): this rebinds the notebook-global `client`. Earlier cells bind
# that name to an OpenAI client, which describe_image_with_gpt4v and
# embed_batch look up at call time, so re-running them after this cell would
# hit the Qdrant client instead.
client = QdrantClient(
    url=os.getenv("QDRANT_HOST"),
    api_key=os.getenv("QDRANT_API_KEY"),
)

# client.delete_collection(collection_name="pdf_collection")

# Target collection; falls back to "pdf_collection" when the env var is unset
COLL = os.getenv("QDRANT_COLLECTION_NAME", "pdf_collection")

# 5) Create the collection only when the server does not already list it
collections = client.get_collections().collections
exists = COLL in {c.name for c in collections}

if not exists:
    # Vector size comes from `dim`, measured on the embeddings in an earlier cell
    client.create_collection(
        collection_name=COLL,
        vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
    )


In [100]:
# 6) Prepare your points and upsert
# embed_batch stores None as the embedding of pages without text; those pages
# are skipped because a point cannot be built without a vector.
points = [
    PointStruct(
        id=page["page_number"],  # the page number doubles as the point id
        vector=page["embedding"],
        payload={
            "text":        page["text"],
            "page_number": page["page_number"],
        },
    )
    for page in pdf_pages
    if page.get("embedding") is not None
]

client.upsert(collection_name=COLL, points=points)
print(f"✅ Upserted {len(points)} pages (dim={dim}) into “{COLL}”.")

✅ Upserted 10 pages (dim=1536) into “pdf_collection2”.


In [106]:
import os
from dotenv import load_dotenv
from openai import OpenAI
from qdrant_client import QdrantClient

load_dotenv()

# — clients & globals —
# Dedicated names for the two clients, both read by semantic_search below
openai_client = OpenAI()  # reads OPENAI_API_KEY from env
qdrant = QdrantClient(url=os.getenv("QDRANT_HOST"), api_key=os.getenv("QDRANT_API_KEY"))

# Model used to embed queries
EMBEDDING_MODEL = "text-embedding-3-small"
# Collection to search; falls back to "pdf_collection" when the env var is unset
COLLECTION_NAME = os.getenv("QDRANT_COLLECTION_NAME", "pdf_collection")
print(f"Using Qdrant collection “{COLLECTION_NAME}”.")

def semantic_search(query: str, top_k: int = 2):
    """Return the ``top_k`` stored pages that best match ``query``.

    The query is embedded with ``EMBEDDING_MODEL`` through ``openai_client``
    and the vector is passed to ``qdrant.query_points`` on ``COLLECTION_NAME``.

    Args:
        query: Free-text search query.
        top_k: Maximum number of results to request.

    Returns:
        A list of dicts, in the order returned by Qdrant, with the keys
        ``"score"``, ``"page_number"`` and ``"text_snippet"`` (first 200
        characters of the stored text with newlines replaced by spaces).
        Points without a payload or without text yield ``None`` as the page
        number and/or an empty snippet instead of raising.
    """
    # 1️⃣ Embed your query with OpenAI
    resp = openai_client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=[query],
    )
    query_vector = resp.data[0].embedding

    # 2️⃣ Fire the new Query API
    qr = qdrant.query_points(
        collection_name=COLLECTION_NAME,
        query=query_vector,   # your dense vector
        limit=top_k,          # how many neighbors to return
        with_payload=True,    # pull back your stored page text & page_number
    )

    # 3️⃣ Extract the list of ScoredPoint objects
    scored_points = qr.points  # type: ignore[attr-defined]

    # 4️⃣ Format the results
    results = []
    for sp in scored_points:
        # payload and the stored text may both be None; treat them as empty
        payload = sp.payload or {}
        text = payload.get("text") or ""
        results.append({
            "score":        sp.score,
            "page_number":  payload.get("page_number"),
            "text_snippet": text[:200].replace("\n", " ")
        })
    return results

# Example usage: print the two best-matching pages for a sample query
if __name__ == "__main__":
    for hit in semantic_search("What solution?", top_k=2):
        page_number, score, snippet = hit['page_number'], hit['score'], hit['text_snippet']
        print(f"Page {page_number}   score={score:.3f}")
        print(f" → {snippet!r}\n")


Using Qdrant collection “pdf_collection2”.
Page 7   score=0.603
 → 'What solutions already exist? '

Page 8   score=0.403
 → 'Sources for solutions that already exist •https://intuiville.intuiface.com/usage/voice -controlled -presentations -work •https://www.presentations.ai/ '

