In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
pip install pymupdf sentence-transformers faiss-cpu


Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.

In [4]:
import os
import fitz  # PyMuPDF
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer, util

# Global model initialization (fast, small, and decent quality)
model = SentenceTransformer('all-MiniLM-L6-v2')

def extract_text_from_pdf(pdf_path):
    """Extract text from PDF and return as list of (chunk, source_pdf) tuples"""
    doc = fitz.open(pdf_path)
    text_chunks = []
    for page in doc:
        text = page.get_text()
        # Split into chunks (you can make this smarter with NLP techniques)
        chunks = [chunk.strip() for chunk in text.split('\n\n') if chunk.strip()]
        text_chunks.extend([(chunk, os.path.basename(pdf_path)) for chunk in chunks])
    return text_chunks

def build_faiss_index(text_chunks):
    """Create FAISS index from list of (chunk, source_pdf)"""
    texts = [chunk for chunk, _ in text_chunks]
    embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)

    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)  # Inner product is equivalent to cosine with normalized vectors
    index.add(embeddings)

    return index, embeddings, text_chunks

def semantic_search(query, index, text_chunks, top_k=5):
    """Perform semantic search and return top_k results"""
    query_embedding = model.encode(query, convert_to_numpy=True, normalize_embeddings=True)
    query_embedding = np.expand_dims(query_embedding, axis=0)

    scores, indices = index.search(query_embedding, top_k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        text, source = text_chunks[idx]
        results.append({
            "chunk": text,
            "source": source,
            "similarity": float(score)
        })

    return results

# --- Main Script ---

def run_semantic_search(pdf_files, query, top_k=5):
    all_chunks = []
    for pdf_file in pdf_files:
        chunks = extract_text_from_pdf(pdf_file)
        all_chunks.extend(chunks)

    index, embeddings, text_chunks = build_faiss_index(all_chunks)
    results = semantic_search(query, index, text_chunks, top_k=top_k)

    print(f"\nTop {top_k} Results for Query: \"{query}\"\n")
    for i, res in enumerate(results, 1):
        print(f"Rank {i}:")
        print(f"PDF Source   : {res['source']}")
        print(f"Similarity   : {res['similarity']:.4f}")
        print(f"Text Chunk   :\n{res['chunk']}\n{'-'*60}")


2025-05-07 19:25:57.841992: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746645957.994401      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746645958.039187      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
pdf_files = ['/kaggle/input/pdf-updated/General/ISLPED_1995_224081.224083.pdf', '/kaggle/input/pdf-updated/General/ISLPED_1995_224081.224084.pdf', '/kaggle/input/pdf-updated/General/ISLPED_1995_224081.224085.pdf']
query = "machine learning optimization techniques"
run_semantic_search(pdf_files, query, top_k=3)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Top 3 Results for Query: "machine learning optimization techniques"

Rank 1:
PDF Source   : ISLPED_1995_224081.224084.pdf
Similarity   : 0.2596
Text Chunk   :
Benc
h-
Timing
P
o
w
er
P
o
w
er
x
Avg.
%
P
o
w
er
using
x
Avg.
%
mark
Constrain
t
k
using
V
using
V,
V
%
reduc.
reduc.
V,
V,
.V
%
reduc.
reduc.

y

	
	.
	
	.
Dieq


	
.	
.
.



.
0.
0
.
.



.
.
	.
	
y

	
.
.

FIR
0


	.

.


0
.
0.
.
	.
.



.
.
.

y
00
0
.
.
.	
AR-Lattice
	
00
0

0.
.
Filter
0
00

.0
.
.
.
0.0

00

.
	.
.	

y
0
	0

.0
0.
EWFilter

0

.
	.0
.		

0
0
.

	0.
0.


0

.00
.
.
T
able
:
P
o
w
er
Consumption
Results
for
smaller
Timing
Constrain
ts
y:
Corresp
onds
to
the
longest
path
length
for
the
D
F
G.
Benc
h-
Timing
P
o
w
er
P
o
w
er


In [11]:
# semantic_search_flat.py (Single Script Version)
import os
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# PDF Parsing

def extract_text(pdf_path):
    """Try extracting text using fitz, fallback to PyPDF2."""
    try:
        import fitz
        doc = fitz.open(pdf_path)
        return "\n".join(page.get_text() for page in doc)
    except Exception:
        try:
            from PyPDF2 import PdfReader
            reader = PdfReader(pdf_path)
            return "\n".join(page.extract_text() or '' for page in reader.pages)
        except Exception as e:
            print(f"[Error] Cannot extract text from {pdf_path}: {e}")
            return ""

# Text Chunking

def chunk_text(text, chunk_size=512, overlap=100):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

# Embedding

def get_embeddings(text_chunks):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(text_chunks, convert_to_numpy=True)
    return embeddings, model

# FAISS Indexing

def build_faiss_index(embeddings):
    d = embeddings.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(np.array(embeddings))
    return index

def search_faiss_index(index, model, query, k=5):
    query_embedding = model.encode([query])
    D, I = index.search(np.array(query_embedding), k)
    return I[0], D[0]

# Main Script

def run_semantic_search(pdf_folder, query, top_k=5):
    pdf_files = [os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.endswith(".pdf")]
    all_chunks = []
    chunk_sources = []

    for file in pdf_files:
        text = extract_text(file)
        chunks = chunk_text(text)
        all_chunks.extend(chunks)
        chunk_sources.extend([os.path.basename(file)] * len(chunks))

    embeddings, model = get_embeddings(all_chunks)
    index = build_faiss_index(embeddings)
    indices, distances = search_faiss_index(index, model, query, k=top_k)

    for idx, dist in zip(indices, distances):
        print(f"\nScore: {1 - dist:.4f}")
        print(f"Source: {chunk_sources[idx]}")
        print(f"Text: {all_chunks[idx][:500]}\n{'-'*60}")

# Example usage
if __name__ == "__main__":
    pdf_folder = "/kaggle/input/pdf-updated/General"
    query = "machine learning optimization techniques"
    run_semantic_search(pdf_folder, query, top_k=5)


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Score: -0.2223
Source: ISLPED_1995_224081.224111.pdf
Text: order, starting with the gates generating the primary outputs. A simple algorithm for p o w er optimal sizing is giv en in gure. . P o w er optimization under dela y con- strain t Our optimization metho d starts with a p o w er minima l la y out con guration and attempts to pro ceed along a p o w er optimal path to meet the required dela y based A lgorithm p ower optimal initial sizing() compute load(); to do list = [gates driving the primary outputs]; while (to do list <> ) do g = remo v e 
------------------------------------------------------------

Score: -0.2786
Source: ISLPED_1995_224081.224091.pdf
Text: Fig. 6. The graph is again sorted by the value of the power cost function for every solution. The best realisation this time is a pointer array accessed by key A, which points to an array of records which is accessed by key B. This solution is about 30 % bet- ter than the pointer array to pointer array solution obtaine