In [1]:
import os
import re
import requests
import fitz  # PyMuPDF
import chromadb
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [3]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [4]:
GROQ_API_KEY=os.getenv("GROQ_API_KEY")
SERP_API_KEY=os.getenv("SERP_API_KEY")

In [5]:
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text() for page in doc])
    return text

In [6]:
sample_pdf=r"C:\Users\harsh\Downloads\MCP.pdf"
text=extract_text_from_pdf(sample_pdf)
text


'MCP\n2025 EDITION\nTHE ILLUSTRATED\nGUIDEBOOK \nAvi Chawla & Akshay Pachaar\nDailyDoseofDS.com\nDaily Dose of\nData Science\nFREE\n\n \nDailyDoseofDS.com \nHow to make the most out of \nthis book and your time? \nThe reading time of this book is about 3 hours. But not all chapters will be of \nrelevance to you. This 2-minute assessment will test your current expertise and \nrecommend chapters that will be most useful to you. \n \nScan the QR code below or open this link to start the assessment. It will only take \n2 minutes to complete. \n \n \nhttps://bit.ly/mcp-assessment \n \n1 \n\n \nDailyDoseofDS.com \nTable of contents \nSection #1) Model Context Protocol…………….3 \n1.1) What is MCP?..................................................................................4-5 \n   Introduction……………………..……………………………………………………………………………4-5 \n \n1.2) Why was MCP created?...............................................................6-8 \n   The problem…………………………………………………………...………………………………………6-7 

In [7]:
# Step 2: Text Chunking
def chunk_text(text, max_words=100):
    words = text.split()
    chunks = [" ".join(words[i:i+max_words]) for i in range(0, len(words), max_words)]
    return chunks

In [8]:
chunks=chunk_text(text)
len(chunks)
chunks[0]

'MCP 2025 EDITION THE ILLUSTRATED GUIDEBOOK Avi Chawla & Akshay Pachaar DailyDoseofDS.com Daily Dose of Data Science FREE DailyDoseofDS.com How to make the most out of this book and your time? The reading time of this book is about 3 hours. But not all chapters will be of relevance to you. This 2-minute assessment will test your current expertise and recommend chapters that will be most useful to you. Scan the QR code below or open this link to start the assessment. It will only take 2 minutes to complete. https://bit.ly/mcp-assessment 1 DailyDoseofDS.com Table of contents Section #1) Model Context'

In [9]:
# Step 3: Embed and Store in Vector DB
def store_chunks(chunks):
    client = chromadb.Client()
    collection = client.create_collection("pdf_chunks")
    for i, chunk in enumerate(chunks):
        emb = embedding_model.encode(chunk)
        collection.add(documents=[chunk], ids=[f"chunk_{i}"], embeddings=[emb.tolist()])
    return collection

In [17]:
import os
import requests

SERP_API_KEY = os.getenv("SERP_API_KEY")  # Or hardcode it for testing

def web_search_serper(query):
    if not SERP_API_KEY:
        raise ValueError("❌ SERP_API_KEY is missing or not set.")
    
    headers = {
        "X-API-KEY": SERP_API_KEY,
        "Content-Type": "application/json"
    }
    payload = {"q": query, "num": 5}
    
    res = requests.post("https://google.serper.dev/search", headers=headers, json=payload)
    print(res.status_code)
    
    if res.status_code == 200:
        return res.json().get("organic", [])
    else:
        print("❌ Error:", res.text)
        return []


In [21]:
# Step 5: Sparse Retrieval (TF-IDF)
def sparse_search(query, chunks):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(chunks)
    q_vec = vectorizer.transform([query])
    scores = cosine_similarity(q_vec, X).flatten()
    ranked = sorted(zip(chunks, scores), key=lambda x: x[1], reverse=True)
    return ranked[:3]

In [25]:
query='what type of data you have?'
res=sparse_search(query,chunks)
res

[("as the transport mechanism: #6) Connect to Cursor Inside you Cursor IDE follow this: Cursor → Settings → Cursor Settings → MCP Then add and start your server like this: The code is available here: https://www.dailydoseofds.com/p/mcp- powered-rag-over-complex-docs/ 51 DailyDoseofDS.com #8) MCP-powered synthetic data generator Learn how to build an MCP server that can generate any type of synthetic dataset. It uses Cursor as the MCP host and SDV to generate realistic tabular synthetic data. Tech Stack ● Cursor as the MCP host ● Datacebo's SDV to generate realistic tabular synthetic data Workﬂow ● User submits a query ● Agent connects",
  0.16040518476246288),
 ("MCP? Imagine you only know English. To get info from a person who only knows: ● French, you must learn French. ● German, you must learn German. ● And so on. In this setup, learning even 5 languages will be a nightmare for you. But what if you add a translator that understands all languages? 4 DailyDoseofDS.com This is simple, 

In [26]:
# Step 6: Hybrid Retrieval (Dense + Sparse Fusion)
def hybrid_retrieval(query, chunks, collection):
    dense_emb = embedding_model.encode(query)
    dense_result = collection.query(query_embeddings=[dense_emb.tolist()], n_results=3)
    dense_chunks = dense_result['documents'][0]
    dense_scores = [0.7] * len(dense_chunks)  # mock scores

    sparse_chunks = sparse_search(query, chunks)
    sparse_texts = [s[0] for s in sparse_chunks]
    sparse_scores = [0.3] * len(sparse_texts)  # mock scores

    combined = list(zip(dense_chunks, dense_scores)) + list(zip(sparse_texts, sparse_scores))
    combined.sort(key=lambda x: x[1], reverse=True)
    return [c[0] for c in combined[:3]]

In [29]:
# Step 7: Generate Answer with Groq
def generate_response(query, pdf_sources, web_sources):
    context = "\n\n".join([f"[PDF] {s}" for s in pdf_sources] + [f"[Web] {s['snippet']} (URL: {s['link']})" for s in web_sources])
    messages = [
        {"role": "system", "content": "You are a helpful research assistant. Answer using sources provided."},
        {"role": "user", "content": f"Query: {query}\n\nSources:\n{context}\n\nAnswer using the most relevant and trustworthy information. Cite source type and link or PDF."}
    ]
    res = requests.post(
        url="https://api.groq.com/openai/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {GROQ_API_KEY}",
            "Content-Type": "application/json"
        },
        json={
            "model": "llama3-70b-8192",
            "messages": messages,
            "temperature": 0.3
        }
    )
    return res.json()['choices'][0]['message']['content']

In [30]:
Execution
def run_pipeline(pdf_path, query):
    print("\nExtracting PDF...")
    pdf_text = extract_text_from_pdf(pdf_path)
    chunks = chunk_text(pdf_text)

    print("\nStoring Chunks in Vector DB...")
    collection = store_chunks(chunks)

    print("\nRunning Hybrid Retrieval...")
    top_pdf_chunks = hybrid_retrieval(query, chunks, collection)

    print("\nSearching Web...")
    top_web_results = web_search_serper(query)

    print("\nGenerating Answer...")
    response = generate_response(query, top_pdf_chunks, top_web_results)
    print("\nFinal Answer:\n")
    print(response)


run_pipeline(sample_pdf, "What are recent methods for fine-tuning GPT models?")


NameError: name 'Execution' is not defined