# WEB --> RAG PIPELINE

User Query → Clean → Optimize → Web Search → Extract → Parse → Clean → Embed → Store in FAISS

In [1]:
### USER QUERIES
queries = [
    "best free vector databases for RAG projects in 2026",
    "python faiss installation error windows fix",
    "how transformers work in deep learning explained simply",
    "iphone 15 vs samsung s24 battery performance test",
    "symptoms of vitamin b12 deficiency in young adults",
    "best mutual funds for long term investment india 2026",
    "Budget trip plan for manali 3 days itinerary",
    "roadmap to become data scientis after computer science degree",
    "How to renew driving license online india steps",
    "Open source llm models under 7b parameters for local use"
]
queries

['best free vector databases for RAG projects in 2026',
 'python faiss installation error windows fix',
 'how transformers work in deep learning explained simply',
 'iphone 15 vs samsung s24 battery performance test',
 'symptoms of vitamin b12 deficiency in young adults',
 'best mutual funds for long term investment india 2026',
 'Budget trip plan for manali 3 days itinerary',
 'roadmap to become data scientis after computer science degree',
 'How to renew driving license online india steps',
 'Open source llm models under 7b parameters for local use']

Importing and downloading libraries

In [2]:
import nltk 
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saras\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saras\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saras\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\saras\AppData\Roaming\nltk_data...


True

# SPACY

In [2]:
import spacy
import spacy_cleaner
from spacy_cleaner.processing import removers, replacers, mutators

query = 'HeLLo!!??? My Name IS JAi SAra,.swAT,,!'

model = spacy.load("en_core_web_sm")
pipeline = spacy_cleaner.Cleaner(
    model,
    replacers.replace_punctuation_token,
)
pipeline.clean(query)


Cleaning Progress: 100%|██████████| 39/39 [00:00<00:00, 3400.72it/s]


['H',
 'e',
 'L',
 'L',
 'o',
 '_IS_PUNCT_',
 '_IS_PUNCT_',
 '_IS_PUNCT_',
 '_IS_PUNCT_',
 '_IS_PUNCT_',
 '',
 'M',
 'y',
 '',
 'N',
 'a',
 'm',
 'e',
 '',
 'I',
 'S',
 '',
 'J',
 'A',
 'i',
 '',
 'S',
 'A',
 'r',
 'a',
 '_IS_PUNCT_',
 '_IS_PUNCT_',
 's',
 'w',
 'A',
 'T',
 '_IS_PUNCT_',
 '_IS_PUNCT_',
 '_IS_PUNCT_']

**Name-Entity**

In [21]:
sent1 = "What is Machine Learning? What are it's types? How is it related to Artificial Intelligence?"
sent1_doc = model(sent1)
[(token.text, token.label_) for token in sent1_doc.ents]

[('Machine Learning', 'PERSON'), ('Artificial Intelligence', 'ORG')]

**Noun Chunks**

In [22]:
[chunk.text for chunk in sent1_doc.noun_chunks]

['What',
 'Machine Learning',
 'What',
 "it's types",
 'it',
 'Artificial Intelligence']

**Part-Of-Speech Mapping (POS)**

In [23]:
[(token.text, token.pos_) for token in sent1_doc]

[('What', 'PRON'),
 ('is', 'AUX'),
 ('Machine', 'PROPN'),
 ('Learning', 'PROPN'),
 ('?', 'PUNCT'),
 ('What', 'PRON'),
 ('are', 'AUX'),
 ('it', 'PRON'),
 ("'s", 'AUX'),
 ('types', 'NOUN'),
 ('?', 'PUNCT'),
 ('How', 'SCONJ'),
 ('is', 'AUX'),
 ('it', 'PRON'),
 ('related', 'VERB'),
 ('to', 'ADP'),
 ('Artificial', 'PROPN'),
 ('Intelligence', 'PROPN'),
 ('?', 'PUNCT')]

In [24]:
[token.text for token in sent1_doc if token.head == token]

['is', 'are', 'related']

In [25]:
def semantic_data(sent1_doc):    
    return {
    "entities": [(ent.text, ent.label_) for ent in sent1_doc.ents],
    "noun_chunks": [chunk.text for chunk in sent1_doc.noun_chunks],
    "pos_tags": [(token.text, token.pos_) for token in sent1_doc],
    "root_verbs": [token.text for token in sent1_doc if token.head == token],
    }


In [27]:
doc = semantic_data(sent1_doc)
doc

{'entities': [('Machine Learning', 'PERSON'),
  ('Artificial Intelligence', 'ORG')],
 'noun_chunks': ['What',
  'Machine Learning',
  'What',
  "it's types",
  'it',
  'Artificial Intelligence'],
 'pos_tags': [('What', 'PRON'),
  ('is', 'AUX'),
  ('Machine', 'PROPN'),
  ('Learning', 'PROPN'),
  ('?', 'PUNCT'),
  ('What', 'PRON'),
  ('are', 'AUX'),
  ('it', 'PRON'),
  ("'s", 'AUX'),
  ('types', 'NOUN'),
  ('?', 'PUNCT'),
  ('How', 'SCONJ'),
  ('is', 'AUX'),
  ('it', 'PRON'),
  ('related', 'VERB'),
  ('to', 'ADP'),
  ('Artificial', 'PROPN'),
  ('Intelligence', 'PROPN'),
  ('?', 'PUNCT')],
 'root_verbs': ['is', 'are', 'related']}

In [32]:
import contractions
contractions_dict = contractions.contractions_dict
contractions_dict

{"I'm": 'I am',
 "I'm'a": 'I am about to',
 "I'm'o": 'I am going to',
 "I've": 'I have',
 "I'll": 'I will',
 "I'll've": 'I will have',
 "I'd": 'I would',
 "I'd've": 'I would have',
 'Whatcha': 'What are you',
 "amn't": 'am not',
 "ain't": 'are not',
 "aren't": 'are not',
 "'cause": 'because',
 "can't": 'cannot',
 "can't've": 'cannot have',
 "could've": 'could have',
 "couldn't": 'could not',
 "couldn't've": 'could not have',
 "daren't": 'dare not',
 "daresn't": 'dare not',
 "dasn't": 'dare not',
 "didn't": 'did not',
 'didn’t': 'did not',
 "don't": 'do not',
 'don’t': 'do not',
 "doesn't": 'does not',
 "e'er": 'ever',
 "everyone's": 'everyone is',
 'finna': 'fixing to',
 'gimme': 'give me',
 "gon't": 'go not',
 'gonna': 'going to',
 'gotta': 'got to',
 "hadn't": 'had not',
 "hadn't've": 'had not have',
 "hasn't": 'has not',
 "haven't": 'have not',
 "he've": 'he have',
 "he's": 'he is',
 "he'll": 'he will',
 "he'll've": 'he will have',
 "he'd": 'he would',
 "he'd've": 'he would have',
 

In [43]:
doc

{'entities': [('Machine Learning', 'PERSON'),
  ('Artificial Intelligence', 'ORG')],
 'noun_chunks': ['What',
  'Machine Learning',
  'What',
  "it's types",
  'it',
  'Artificial Intelligence'],
 'pos_tags': [('What', 'PRON'),
  ('is', 'AUX'),
  ('Machine', 'PROPN'),
  ('Learning', 'PROPN'),
  ('?', 'PUNCT'),
  ('What', 'PRON'),
  ('are', 'AUX'),
  ('it', 'PRON'),
  ("'s", 'AUX'),
  ('types', 'NOUN'),
  ('?', 'PUNCT'),
  ('How', 'SCONJ'),
  ('is', 'AUX'),
  ('it', 'PRON'),
  ('related', 'VERB'),
  ('to', 'ADP'),
  ('Artificial', 'PROPN'),
  ('Intelligence', 'PROPN'),
  ('?', 'PUNCT')],
 'root_verbs': ['is', 'are', 'related']}

In [45]:
from nltk import TreebankWordTokenizer, sent_tokenize
import re

tokenizer = TreebankWordTokenizer()

# --- Step 1: Prepare protected phrases ---
entities = [e[0] for e in doc["entities"]]
noun_chunks = [c for c in doc["noun_chunks"] if len(c.split()) > 1]

protected_phrases = set(entities + noun_chunks)

# --- Step 2: Mask phrases ---
phrase_map = {}
masked_text = sent1

for i, phrase in enumerate(protected_phrases):
    key = f"__ENT{i}__"
    phrase_map[key] = phrase.replace(" ", "_")  # join words
    masked_text = re.sub(re.escape(phrase), key, masked_text)

# --- Step 3: Sentence split ---
sentences = sent_tokenize(masked_text)

tokenized_query = []

for sentence in sentences:
    temp = []
    
    # --- Step 4: Contraction expand ---
    for word in sentence.split():
        lw = word.lower()
        if lw in contractions_dict:
            temp.append(contractions_dict[lw])
        else:
            temp.append(word)

    # --- Step 5: Tokenize ---
    tokens = tokenizer.tokenize(" ".join(temp))

    # --- Step 6: Restore phrases ---
    restored = [phrase_map.get(tok, tok) for tok in tokens]

    tokenized_query.append(restored)

tokenized_query


[['What', 'is', 'Machine_Learning', '?'],
 ['What', 'are', "it's_types", '?'],
 ['How', 'is', 'it', 'related', 'to', 'Artificial_Intelligence', '?']]

In [47]:
original_query = sent1
semantics_data = doc

In [49]:
import json
json.dumps(semantics_data, indent=2)

'{\n  "entities": [\n    [\n      "Machine Learning",\n      "PERSON"\n    ],\n    [\n      "Artificial Intelligence",\n      "ORG"\n    ]\n  ],\n  "noun_chunks": [\n    "What",\n    "Machine Learning",\n    "What",\n    "it\'s types",\n    "it",\n    "Artificial Intelligence"\n  ],\n  "pos_tags": [\n    [\n      "What",\n      "PRON"\n    ],\n    [\n      "is",\n      "AUX"\n    ],\n    [\n      "Machine",\n      "PROPN"\n    ],\n    [\n      "Learning",\n      "PROPN"\n    ],\n    [\n      "?",\n      "PUNCT"\n    ],\n    [\n      "What",\n      "PRON"\n    ],\n    [\n      "are",\n      "AUX"\n    ],\n    [\n      "it",\n      "PRON"\n    ],\n    [\n      "\'s",\n      "AUX"\n    ],\n    [\n      "types",\n      "NOUN"\n    ],\n    [\n      "?",\n      "PUNCT"\n    ],\n    [\n      "How",\n      "SCONJ"\n    ],\n    [\n      "is",\n      "AUX"\n    ],\n    [\n      "it",\n      "PRON"\n    ],\n    [\n      "related",\n      "VERB"\n    ],\n    [\n      "to",\n      "ADP"\n    ],\n  

In [58]:
def compress_semantic_data(semantic_data):
    entities = list({e[0] for e in semantic_data["entities"]})
    noun_chunks = [c for c in semantic_data["noun_chunks"] if len(c.split()) > 1]
    root_verbs = semantic_data["root_verbs"][:3]

    return {
        "entities": entities,
        "key_phrases": noun_chunks[:5],
        "intent_verbs": root_verbs
    }


In [None]:
import os
from groq import Groq
import json
from dotenv import load_dotenv

load_dotenv()

client = Groq(api_key=os.getenv("GROQ_API_KEY"))

import re

def extract_json_array(text):
    match = re.search(r'\[.*\]', text, re.S)
    if match:
        return json.loads(match.group())
    return []

def generate_query_variations(original_query, tokenized_query, semantic_data):

    compact_semantic = compress_semantic_data(semantic_data)

    system_prompt = """
You are a Query Reformulation Engine.

OUTPUT RULES:
- Output ONLY a valid JSON array of exactly 5 strings.
- No explanations.
- No markdown.
- No backticks.
- Preserve named entities exactly.
- Maintain original intent.
- Do NOT introduce new topics.
- Each query must be <= 20 words.
- Make sure to divide the inten
- If the original query contains multiple distinct questions or intents that cannot be naturally combined into one clear query, split them into separate focused queries instead of forcing a single long sentence.
- However, still return exactly 5 total queries.
- Each query must remain semantically equivalent to a part of the original intent.
"""

    user_prompt = f"""
Original Query: {original_query}

Semantic Anchors:
{json.dumps(compact_semantic)}

Task: Generate 5 equivalent search queries.
Return ONLY JSON array.
"""

    response = client.chat.completions.create(
        model="openai/gpt-oss-120b",
        temperature=0.25,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )

    text = response.choices[0].message.content.strip()
    # print("RAW LLM OUTPUT:\n", text)

    try:
        return extract_json_array(text)
    except:
        return ["LLM_OUTPUT_PARSE_ERROR", text]


In [73]:
variations = generate_query_variations(
    original_query=original_query,
    tokenized_query=tokenized_query,
    semantic_data=semantics_data
)

print(variations)


['What is Machine Learning?', 'What are the types of Machine Learning?', 'How is Machine Learning related to Artificial Intelligence?', 'Define Machine Learning in the context of Artificial Intelligence.', 'List categories of Machine Learning and their connection to Artificial Intelligence.']


In [84]:
import requests
from bs4 import BeautifulSoup
from readability import Document
from urllib.parse import urlparse, parse_qs, unquote
import base64
import time

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

# ---------- BING REDIRECT DECODER ----------
def decode_bing_url(url):
    try:
        parsed = urlparse(url)

        # already real link
        if "bing.com" not in parsed.netloc:
            return url

        qs = parse_qs(parsed.query)
        if "u" not in qs:
            return url

        encoded = qs["u"][0]
        encoded = unquote(encoded)

        # remove leading "a1"
        if encoded.startswith("a1"):
            encoded = encoded[2:]

        # fix base64 padding
        encoded += "=" * (-len(encoded) % 4)

        decoded = base64.b64decode(encoded).decode("utf-8", errors="ignore")

        if decoded.startswith("http"):
            return decoded

    except Exception as e:
        print("Decode error:", e)

    return url


# ---------- SEARCH USING BING HTML ----------
def search_urls(query, max_results=5):
    urls = []
    try:
        r = requests.get(
            "https://www.bing.com/search",
            headers=HEADERS,
            params={"q": query},
            timeout=10
        )

        soup = BeautifulSoup(r.text, "lxml")

        for a in soup.select("li.b_algo h2 a"):
            link = a.get("href")
            if not link:
                continue

            link = decode_bing_url(link)

            # skip tracking or bad links
            if not link.startswith("http") or "bing.com" in link:
                continue

            urls.append(link)

            if len(urls) >= max_results:
                break

    except Exception as e:
        print("Search error:", e)

    return urls


# ---------- FETCH HTML ----------
def fetch_html(url, timeout=10):
    try:
        r = requests.get(url, headers=HEADERS, timeout=timeout)
        if r.status_code == 200:
            return r.text
    except:
        pass
    return None


# ---------- EXTRACT MAIN TEXT ----------
def extract_main_text(html):
    try:
        doc = Document(html)
        summary_html = doc.summary(html_partial=True)

        soup = BeautifulSoup(summary_html, "lxml")

        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()

        text = soup.get_text(separator=" ")
        return " ".join(text.split())
    except:
        return ""


# ---------- NORMALIZE URL ----------
def normalize_url(url):
    try:
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
    except:
        return url


# ---------- MAIN SCRAPER ----------
def scrape_variations(variations, results_per_query=3, max_chars=6000):
    seen_urls = set()
    documents = []

    for query in variations:
        print(f"\nSearching: {query}")
        urls = search_urls(query, results_per_query)
        print("URLs found:", urls)

        for url in urls:
            nurl = normalize_url(url)
            if nurl in seen_urls:
                continue

            seen_urls.add(nurl)
            print("Fetching:", nurl)

            html = fetch_html(nurl)
            if not html:
                continue

            text = extract_main_text(html)

            # skip junk pages
            if len(text) < 150:
                continue

            documents.append({
                "query": query,
                "url": nurl,
                "content": text[:max_chars]
            })

            time.sleep(1)  # polite delay

    return documents

In [86]:
import json

documents = scrape_variations(variations)

print(len(documents))
print(documents[0]["url"])
print(documents[0]["content"][:500])

# ---- WRITE TO FILE ----
with open("scraped_documents.json", "w", encoding="utf-8") as f:
    json.dump(documents, f, indent=2, ensure_ascii=False)

print("Data written to scraped_documents.json")



Searching: What is Machine Learning?
URLs found: ['https://research.ibm.com/topics/machine-learning', 'https://research.ibm.com/publications/quantum-machine-learning-an-interplay-between-quantum-computing-and-machine-learning', 'https://research.ibm.com/blog/ai-fairness-360']
Fetching: https://research.ibm.com/topics/machine-learning
Fetching: https://research.ibm.com/publications/quantum-machine-learning-an-interplay-between-quantum-computing-and-machine-learning
Fetching: https://research.ibm.com/blog/ai-fairness-360

Searching: What are the types of Machine Learning?
URLs found: ['https://ask.csdn.net/questions/8894904', 'https://ask.csdn.net/questions/8233058', 'https://ask.csdn.net/questions/8293218']
Fetching: https://ask.csdn.net/questions/8894904
Fetching: https://ask.csdn.net/questions/8233058
Fetching: https://ask.csdn.net/questions/8293218

Searching: How is Machine Learning related to Artificial Intelligence?
URLs found: ['https://www.geeksforgeeks.org/artificial-intellige