In [None]:
import os
import io
import base64  # encode and decode binary data using Base64 encoding.
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple

# Load environment variables from .env file
from dotenv import load_dotenv

# Get the directory where the notebook is located and load .env
notebook_dir = Path.cwd()
env_path = notebook_dir / ".env"

load_dotenv(dotenv_path=env_path, override=True)

print(f"Loading .env from: {env_path}")
print(f"QDRANT_URL loaded: {' Yes' if os.getenv('QDRANT_URL') else ' No'}")
print(f"QDRANT_API_KEY loaded: {' Yes' if os.getenv('QDRANT_API_KEY') else ' No'}")
print(f"GOOGLE_API_KEY loaded: {' Yes' if os.getenv('GOOGLE_API_KEY') else ' No'}")

# PDF Processing
import fitz  # PyMuPDF

# Table Extraction
import camelot

# Image Processing
from PIL import Image

# ML/Embeddings
import torch
from transformers import CLIPProcessor, CLIPModel

# Vector Database
from qdrant_client import QdrantClient
from qdrant_client.http import models as qdrant_models
from qdrant_client.http.models import Distance, VectorParams, PointStruct, BinaryQuantization, BinaryQuantizationConfig

import google.generativeai as genai

print("\n All imports successful!")

Loading .env from: c:\Users\Dell\Documents\GitHub\DeepRetrieve\backend\.env
QDRANT_URL loaded:  Yes
QDRANT_API_KEY loaded:  Yes
GOOGLE_API_KEY loaded:  Yes

 All imports successful!


In [None]:
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Qdrant Cloud Configuration
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
COLLECTION_NAME = "multimodal_rag"

# CLIP Configuration  
CLIP_MODEL_NAME = "openai/clip-vit-base-patch32"
EMBEDDING_DIM = 512

# RAG Configuration
TOP_K = 3
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# Output Paths
OUTPUT_FOLDER = "extracted_content"
IMAGES_FOLDER = os.path.join(OUTPUT_FOLDER, "images")
TABLES_FOLDER = os.path.join(OUTPUT_FOLDER, "tables")

# Create directories
os.makedirs(IMAGES_FOLDER, exist_ok=True)
os.makedirs(TABLES_FOLDER, exist_ok=True)

In [28]:
def init_clip_model():
    """Initialize CLIP model and processor"""
    print("Loading CLIP model...")
    model = CLIPModel.from_pretrained(CLIP_MODEL_NAME)
    processor = CLIPProcessor.from_pretrained(CLIP_MODEL_NAME)
    print("CLIP model loaded!")
    return model, processor

def init_qdrant_client():
    """Initialize Qdrant Cloud client"""
    print("Connecting to Qdrant Cloud...")
    client = QdrantClient(
        url=QDRANT_URL,
        api_key=QDRANT_API_KEY,
    )
    print("Qdrant Cloud connected!")
    return client

def init_gemini():
    """Initialize Gemini model"""
    print("Initializing Gemini...")
    genai.configure(api_key=GOOGLE_API_KEY)
    model = genai.GenerativeModel("gemini-2.0-flash")
    print("Gemini initialized!")
    return model

# Initialize all models
clip_model, clip_processor = init_clip_model()
gemini_model = init_gemini()
qdrant_client = init_qdrant_client()

Loading CLIP model...
CLIP model loaded!
Initializing Gemini...
Gemini initialized!
Connecting to Qdrant Cloud...
CLIP model loaded!
Initializing Gemini...
Gemini initialized!
Connecting to Qdrant Cloud...
Qdrant Cloud connected!
Qdrant Cloud connected!


In [None]:
# PDF Extraction Functions
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract all text from a PDF file"""
    doc = fitz.open(pdf_path)
    full_text = []
    
    for page in doc:
        text = page.get_text("text")
        full_text.append(text)
    
    doc.close()
    return "\n".join(full_text)


def extract_images_from_pdf(pdf_path: str, output_folder: str = IMAGES_FOLDER) -> List[Dict]:
    """Extract all images from a PDF file with metadata"""
    os.makedirs(output_folder, exist_ok=True)
    doc = fitz.open(pdf_path)
    
    extracted_images = []
    pdf_name = Path(pdf_path).stem
    
    for page_index, page in enumerate(doc):
        image_list = page.get_images(full=True)
        
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            
            # Save image
            image = Image.open(io.BytesIO(image_bytes))
            image_filename = f"{pdf_name}_page{page_index}_img{img_index}.{image_ext}"
            image_path = os.path.join(output_folder, image_filename)
            image.save(image_path)
            
            extracted_images.append({
                "path": image_path,
                "page": page_index,
                "index": img_index,
                "source_pdf": pdf_path
            })
    
    doc.close()
    return extracted_images


def extract_tables_from_pdf(pdf_path: str, output_folder: str = TABLES_FOLDER) -> List[Dict]:
    """Extract tables from PDF using Camelot and convert to text format"""
    os.makedirs(output_folder, exist_ok=True)
    
    extracted_tables = []
    pdf_name = Path(pdf_path).stem
    
    try:
        # Try 'lattice' method first (for tables with borders)
        tables = camelot.read_pdf(pdf_path, pages="all", flavor="lattice")
        
        # If no tables found, try 'stream' method (for borderless tables)
        if len(tables) == 0:
            tables = camelot.read_pdf(pdf_path, pages="all", flavor="stream")
        
        for i, table in enumerate(tables):
            # Get the DataFrame
            df = table.df
            
            # Convert table to markdown format for better LLM understanding
            table_markdown = df.to_markdown(index=False)
            
            # Also save as CSV
            csv_path = os.path.join(output_folder, f"{pdf_name}_table_{i}.csv")
            table.to_csv(csv_path)
            
            # Create a text representation with context
            table_text = f"TABLE {i+1} (Page {table.page}):\n{table_markdown}"
            
            extracted_tables.append({
                "table_index": i,
                "page": table.page,
                "content": table_text,
                "csv_path": csv_path,
                "accuracy": table.accuracy,
                "source_pdf": pdf_path
            })
            
        print(f"   Extracted {len(tables)} tables")
        
    except Exception as e:
        print(f" Table extraction failed: {e}")
    
    return extracted_tables


def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split text into overlapping chunks"""
    if not text.strip():
        return []
    
    words = text.split()
    chunks = []
    
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        if chunk.strip():
            chunks.append(chunk)
    
    return chunks


def process_pdf(pdf_path: str) -> Dict[str, Any]:
    """Process a PDF and extract text chunks, images, and tables"""
    print(f"Processing: {pdf_path}")
    
    # Extract text and chunk it
    raw_text = extract_text_from_pdf(pdf_path)
    text_chunks = chunk_text(raw_text)
    
    # Extract images
    images = extract_images_from_pdf(pdf_path)
    
    # Extract tables
    tables = extract_tables_from_pdf(pdf_path)
    
    result = {
        "pdf_path": pdf_path,
        "text_chunks": text_chunks,
        "images": images,
        "tables": tables,
        "total_chunks": len(text_chunks),
        "total_images": len(images),
        "total_tables": len(tables)
    }
    
    print(f"   ‚úÖ Extracted {len(text_chunks)} text chunks, {len(images)} images, {len(tables)} tables")
    return result

In [None]:
# Utility: Index Multiple PDFs from a folder
def index_folder(folder_path: str, collection_name: str = COLLECTION_NAME):
    """Index all PDFs in a folder"""
    pdf_files = list(Path(folder_path).glob("*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files in {folder_path}")
    
    total_chunks = 0
    total_images = 0
    
    for pdf_path in pdf_files:
        result = index_pdf(str(pdf_path), collection_name)
        total_chunks += result["text_chunks"]
        total_images += result["images"]
    
    print(f"\n Indexing complete!")
    print(f"   Total text chunks: {total_chunks}")
    print(f"   Total images: {total_images}")

In [30]:
# Embedding Functions (CLIP)

def embed_text(text: str) -> List[float]:
    """Embed text using CLIP"""
    inputs = clip_processor(
        text=text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=77  # CLIP's max token length
    )
    
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)
        # Normalize embeddings to unit vector
        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().tolist()


def embed_image(image_input) -> List[float]:
    """Embed image using CLIP. Accepts file path or PIL Image"""
    if isinstance(image_input, str):
        image = Image.open(image_input).convert("RGB")
    elif isinstance(image_input, Image.Image):
        image = image_input.convert("RGB")
    else:
        raise ValueError("Input must be a file path or PIL Image")
    
    inputs = clip_processor(images=image, return_tensors="pt")
    
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
        # Normalize embeddings to unit vector
        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().tolist()


def embed_image_base64(base64_string: str) -> List[float]:
    """Embed a base64 encoded image using CLIP"""
    image_bytes = base64.b64decode(base64_string)
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    return embed_image(image)


print(" Embedding functions defined!")

 Embedding functions defined!


In [31]:
# Qdrant Vector Database Functions

def create_collection(collection_name: str = COLLECTION_NAME, recreate: bool = False):
    """Create a Qdrant collection with Binary Quantization for efficient storage"""
    
    if recreate and qdrant_client.collection_exists(collection_name=collection_name):
        print(f"Deleting existing collection: {collection_name}")
        qdrant_client.delete_collection(collection_name)
    
    if not qdrant_client.collection_exists(collection_name=collection_name):
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(
                size=EMBEDDING_DIM,
                distance=Distance.COSINE,
                on_disk=True  # Store vectors on disk for large collections
            ),
            quantization_config=BinaryQuantization(
                binary=BinaryQuantizationConfig(
                    always_ram=True  # Keep quantized vectors in RAM for fast search
                )
            ),
        )
        print(f" Created collection: {collection_name} (with Binary Quantization)")
    else:
        print(f" Collection '{collection_name}' already exists")


def add_text_to_qdrant(
    text_chunks: List[str],
    source_pdf: str,
    collection_name: str = COLLECTION_NAME
) -> int:
    """Add text chunks to Qdrant with embeddings"""
    points = []
    
    for i, chunk in enumerate(text_chunks):
        embedding = embed_text(chunk)

        """
        In Qdrant, every entry in the vector database is called a point
        A point contains three things:
            id - a unique integer
            vector - the embedding (list of floats)
            payload - a dictionary of metadata

        PointStruct is the Qdrant SDK class that represents one point.
        """

        point = PointStruct(
            id=hash(f"{source_pdf}_{i}") % (2**63),  # Unique ID using hashing
            vector=embedding, # The embedding vector
            payload={  # Metadata about the chunk
                "type": "text",
                "content": chunk,
                "source": source_pdf,
                "chunk_index": i
            }
        )
        points.append(point)
    
    if points:
        qdrant_client.upsert(collection_name=collection_name, points=points)
    
    return len(points)


def add_images_to_qdrant(
    images: List[Dict],
    collection_name: str = COLLECTION_NAME
) -> int:
    """Add images to Qdrant with embeddings"""
    points = []
    
    for img_info in images:
        image_path = img_info["path"]
        
        try:
            embedding = embed_image(image_path)
            
            # Read and encode image for storage
            with open(image_path, "rb") as f:
                image_base64 = base64.b64encode(f.read()).decode()
            
            point = PointStruct(
                id=hash(image_path) % (2**63),
                vector=embedding,
                payload={
                    "type": "image",
                    "path": image_path,
                    "image_base64": image_base64,
                    "source": img_info["source_pdf"],
                    "page": img_info["page"]
                }
            )
            points.append(point)
        except Exception as e:
            print(f" Failed to process image {image_path}: {e}")
    
    if points:
        qdrant_client.upsert(collection_name=collection_name, points=points)
    
    return len(points)


def add_tables_to_qdrant(
    tables: List[Dict],
    collection_name: str = COLLECTION_NAME
) -> int:
    """Add extracted tables to Qdrant with embeddings (embedded as markdown text)"""
    points = []
    
    for table_info in tables:
        try:
            # Embed the markdown representation of the table
            table_content = table_info["content"]
            embedding = embed_text(table_content)
            
            point = PointStruct(
                id=hash(f"{table_info['source_pdf']}_table_{table_info['page']}_{table_info['table_index']}") % (2**63),
                vector=embedding,
                payload={
                    "type": "table",
                    "content": table_content,  # Store markdown for LLM context
                    "csv_path": table_info["csv_path"],
                    "source": table_info["source_pdf"],
                    "page": table_info["page"],
                    "table_index": table_info["table_index"]
                }
            )
            points.append(point)
        except Exception as e:
            print(f" Failed to process table from page {table_info.get('page', '?')}: {e}")
    
    if points:
        qdrant_client.upsert(collection_name=collection_name, points=points)
    
    return len(points)


def index_pdf(pdf_path: str, collection_name: str = COLLECTION_NAME):
    """Index a PDF into Qdrant (text + images + tables)"""
    # Process PDF
    pdf_data = process_pdf(pdf_path)
    
    # Add text chunks
    text_count = add_text_to_qdrant(
        pdf_data["text_chunks"],
        pdf_path,
        collection_name
    )
    print(f" Indexed {text_count} text chunks")
    
    # Add images
    image_count = add_images_to_qdrant(
        pdf_data["images"],
        collection_name
    )
    print(f" Indexed {image_count} images")
    
    # Add tables
    table_count = add_tables_to_qdrant(
        pdf_data["tables"],
        collection_name
    )
    print(f" Indexed {table_count} tables")

    return {"text_chunks": text_count, "images": image_count, "tables": table_count}

print(" Qdrant functions defined!")

 Qdrant functions defined!


In [32]:
# Retrieval Functions

def search_similar(
    query: str,
    top_k: int = TOP_K,
    collection_name: str = COLLECTION_NAME,
    content_type: str = None  # "text", "image", "table", or None for all
) -> List[Dict]:
    """Search for similar content in Qdrant"""
    
    # Embed the query
    query_embedding = embed_text(query)
    
    # Build filter if content type specified
    query_filter = None
    if content_type:
        query_filter = {
            "must": [{"key": "type", "match": {"value": content_type}}]
        }
    
    # Search
    results = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=top_k,
        query_filter=query_filter
    )
    
    # Format results
    formatted_results = []
    for result in results:
        formatted_results.append({
            "score": result.score,
            "type": result.payload.get("type"),
            "content": result.payload.get("content"),
            "image_base64": result.payload.get("image_base64"),
            "source": result.payload.get("source"),
            "page": result.payload.get("page"),
            "path": result.payload.get("path"),
            "csv_path": result.payload.get("csv_path"),  # For tables
            "table_index": result.payload.get("table_index")  # For tables
        })
    
    return formatted_results


def search_by_image(
    image_input,
    top_k: int = TOP_K,
    collection_name: str = COLLECTION_NAME
) -> List[Dict]:
    """Search using an image as query"""
    
    # Embed the image
    query_embedding = embed_image(image_input)
    
    # Search
    results = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=top_k
    )
    
    # Format results
    formatted_results = []
    for result in results:
        formatted_results.append({
            "score": result.score,
            "type": result.payload.get("type"),
            "content": result.payload.get("content"),
            "image_base64": result.payload.get("image_base64"),
            "source": result.payload.get("source"),
            "page": result.payload.get("page"),
            "csv_path": result.payload.get("csv_path"),
            "table_index": result.payload.get("table_index")
        })
    
    return formatted_results


print(" Retrieval functions defined!")

 Retrieval functions defined!


In [None]:
# Gemini LLM Functions

def prepare_context_for_llm(search_results: List[Dict]) -> Tuple[str, List]:
    """Prepare retrieved context for Gemini (text + images + tables)"""
    
    text_context = []
    images_for_llm = []
    
    for i, result in enumerate(search_results):
        if result["type"] == "text":
            text_context.append(f"[Source {i+1} - Text]: {result['content']}")
        
        elif result["type"] == "table":
            # Tables are stored as markdown - include directly in text context
            text_context.append(f"[Source {i+1} - Table (page {result.get('page', 'unknown')})]: \n{result['content']}")
        
        elif result["type"] == "image" and result.get("image_base64"):
            # Decode base64 to PIL Image for Gemini
            image_bytes = base64.b64decode(result["image_base64"])
            image = Image.open(io.BytesIO(image_bytes))
            images_for_llm.append(image)
            text_context.append(f"[Source {i+1} - Image]: Refer to Image {len(images_for_llm)} (from page {result.get('page', 'unknown')})")
    
    context_text = "\n\n".join(text_context)
    return context_text, images_for_llm


def generate_response(
    query: str,
    search_results: List[Dict],
    include_images: bool = True
) -> str:
    """Generate response using Gemini with multimodal context"""
    
    # Prepare context
    context_text, images = prepare_context_for_llm(search_results)
    
    # Build prompt
    prompt = f"""You are an intelligent document assistant specialized in answering questions using retrieved context from a knowledge base.

## Your Task
Answer the user's question accurately using ONLY the information provided in the context below. The context may include:
- **Text excerpts** from PDF documents
- **Tables** (structured data in markdown format) extracted from documents
- **Images** (charts, diagrams, figures) extracted from documents

## Instructions
1. **Analyze all provided context** - text, tables, and images carefully
2. **For tables**: Parse the markdown table structure to extract specific data, values, and relationships
3. **For images**: Describe what you see and extract relevant information (data from charts, text from diagrams)
4. **Synthesize information** from multiple sources when needed
6. **If information is insufficient**: Clearly state what's missing rather than guessing
7. **Format your answer** clearly with proper structure when appropriate

## Retrieved Context
{context_text}

## User Question
{query}

## Answer
Provide a comprehensive, accurate answer based on the context above:"""
    
    # Generate response
    if include_images and images:
        # Multimodal generation with images
        content = [prompt] + images
        response = gemini_model.generate_content(content)
    else:
        # Text-only generation
        response = gemini_model.generate_content(prompt)
    
    return response.text


print("Gemini LLM functions defined!")

Gemini LLM functions defined!


In [34]:
# Main RAG Pipeline

def rag_query(
    query: str,
    top_k: int = TOP_K,
    include_images: bool = True,
    collection_name: str = COLLECTION_NAME
) -> Dict[str, Any]:
    """
    Main RAG pipeline: Search -> Retrieve -> Generate
    
    Args:
        query: User's question
        top_k: Number of results to retrieve
        include_images: Whether to include images in context
        collection_name: Qdrant collection to search
    
    Returns:
        Dict with answer, sources, and metadata
    """
    print(f"üîç Query: {query}")
    
    # Step 1: Retrieve relevant content
    print("   Searching for relevant content...")
    search_results = search_similar(query, top_k=top_k, collection_name=collection_name)
    
    if not search_results:
        return {
            "answer": "No relevant information found in the knowledge base.",
            "sources": [],
            "num_results": 0
        }
    
    # Step 2: Generate response with Gemini
    print("   Generating response with Gemini...")
    answer = generate_response(query, search_results, include_images=include_images)
    
    # Step 3: Prepare sources for reference
    sources = []
    for result in search_results:
        sources.append({
            "type": result["type"],
            "score": round(result["score"], 4),
            "source": result["source"],
            "preview": result["content"][:200] if result["content"] else "[Image]"
        })
    
    print("Response generated!")
    
    return {
        "answer": answer,
        "sources": sources,
        "num_results": len(search_results)
    }


def display_results(result: Dict):
    """Pretty print RAG results"""
    print("\n" + "="*60)
    print("üìù ANSWER:")
    print("="*60)
    print(result["answer"])
    print("\n" + "="*60)
    print(f"üìö SOURCES ({result['num_results']} results):")
    print("="*60)
    for i, source in enumerate(result["sources"], 1):
        print(f"\n{i}. [{source['type'].upper()}] Score: {source['score']}")
        print(f"   Source: {source['source']}")
        print(f"   Preview: {source['preview'][:100]}...")


print("RAG pipeline ready!")

RAG pipeline ready!


In [35]:
# Step 1: Create Collection

create_collection(COLLECTION_NAME, recreate=True)

 Created collection: multimodal_rag (with Binary Quantization)


In [36]:
# Step 2: Index PDF Documents

pdf_path = "data/attention.pdf"

# Index the PDF (extracts text + images and stores in Qdrant)
index_pdf(pdf_path)

Processing: data/attention.pdf
 Table extraction failed: module 'camelot' has no attribute 'read_pdf'
   ‚úÖ Extracted 14 text chunks, 3 images, 0 tables
 Table extraction failed: module 'camelot' has no attribute 'read_pdf'
   ‚úÖ Extracted 14 text chunks, 3 images, 0 tables
 Indexed 14 text chunks
 Indexed 14 text chunks
 Indexed 3 images
 Indexed 0 tables
 Indexed 3 images
 Indexed 0 tables


{'text_chunks': 14, 'images': 3, 'tables': 0}

In [37]:
# Step 3: Query the RAG System

# Ask a question
query = "What is the main topic of the document?"

# Get RAG response
result = rag_query(query, top_k=3)

# Display results
display_results(result)

üîç Query: What is the main topic of the document?
   Searching for relevant content...


  results = qdrant_client.search(


   Generating response with Gemini...
Response generated!

üìù ANSWER:
The document is about the Transformer, a new network architecture based solely on attention mechanisms, dispensing with recurrence and convolutions entirely (Source 1). The paper discusses the Transformer's architecture, including scaled dot-product attention and multi-head attention (Source 3). It also presents the results of applying the Transformer to machine translation tasks, where it achieves superior quality and requires less training time compared to recurrent or convolutional neural networks (Source 1). Additionally, the document touches on the application of the Transformer to English constituency parsing (Source 1). Furthermore, some attention heads exhibit behavior related to the structure of the sentence and anaphora resolution (Source 2).


üìö SOURCES (3 results):

1. [TEXT] Score: 0.7696
   Source: data/attention.pdf
   Preview: Provided proper attribution is provided, Google hereby grants permissi

In [38]:
query = "Can you explain about the Scaled Dot-Product Attention?"

# Get RAG response
result = rag_query(query, top_k=3)

# Display results
display_results(result)

üîç Query: Can you explain about the Scaled Dot-Product Attention?
   Searching for relevant content...


  results = qdrant_client.search(


   Generating response with Gemini...
Response generated!

üìù ANSWER:
Scaled Dot-Product Attention, as explained in Source 2, involves the following steps:

1.  **Input:** The input consists of queries (Q) and keys (K) of dimension dk, and values (V) of dimension dv. These are packed into matrices.
2.  **Dot Products:** Compute the dot products of the query with all keys (Q * KT).
3.  **Scaling:** Divide each dot product by the square root of dk (‚àödk). This scaling mitigates the issue of large dot product magnitudes for larger dk values, which can push the softmax function into regions with extremely small gradients.
4.  **Softmax:** Apply a softmax function to obtain the weights on the values.
5.  **Output:** Compute the matrix of outputs as Attention(Q, K, V ) = softmax(QKT /‚àödk )V.


üìö SOURCES (3 results):

1. [TEXT] Score: 0.77
   Source: data/attention.pdf
   Preview: the representation dimension, k is the kernel size of convolutions and r the size of the neighborhoo...



In [40]:
query = "Can you explain images related to attention mechanism?"

# Get RAG response
result = rag_query(query, top_k=3)

# Display results
display_results(result)

üîç Query: Can you explain images related to attention mechanism?
   Searching for relevant content...


  results = qdrant_client.search(


   Generating response with Gemini...
Response generated!

üìù ANSWER:
Source 1 mentions Figure 4 and Figure 5, both related to attention heads. Figure 4 illustrates two attention heads involved in anaphora resolution, specifically focusing on the word "its." It shows full attentions for head 5 and isolated attentions from the word "its" for attention heads 5 and 6, noting the sharpness of the attentions for this word. Figure 5 provides examples of attention heads exhibiting behavior related to sentence structure, showcasing different tasks learned by different heads from the encoder self-attention at layer 5 of 6.

Source 3 refers to Figure 2, which illustrates "Scaled Dot-Product Attention" on the left and "Multi-Head Attention" on the right, explaining that multi-head attention consists of several attention layers running in parallel.


üìö SOURCES (3 results):

1. [TEXT] Score: 0.8045
   Source: data/attention.pdf
   Preview: should be just - this is what we are missing , in my o