<a href="https://colab.research.google.com/github/MeghnaGup/AIChatBot_DRDO/blob/main/AIChatbot_DRDO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4 transformers flask



In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time

In [None]:
BASE_URL = "https://www.drdo.gov.in"

In [None]:
def get_all_links(start_url):
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(start_url, headers=headers)

    if response.status_code != 200:
        print(f"Failed to access {start_url}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    links = set()

    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]

        if href.startswith("/") and "http" not in href:
            full_link = BASE_URL + href
            links.add(full_link)

    return list(links)

In [None]:
def scrape_page(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    content = soup.get_text(separator=" ", strip=True)

    return content


In [None]:
def scrape_drdo_website(start_url, limit=20):
    scraped_data = {}
    links_to_scrape = get_all_links(start_url)

    print(f"Found {len(links_to_scrape)} pages to scrape.")

    for i, link in enumerate(links_to_scrape[:limit]):
        print(f"Scraping {i+1}/{len(links_to_scrape)}: {link}")
        content = scrape_page(link)

        if content:
            scraped_data[link] = content

        time.sleep(2)

    return scraped_data

In [None]:
drdo_data = scrape_drdo_website(BASE_URL, limit=100)

with open("drdo_scraped_data.json", "w", encoding="utf-8") as file:
    json.dump(drdo_data, file, indent=4, ensure_ascii=False)

print("Scraping complete! Data saved.")

Found 63 pages to scrape.
Scraping 1/63: https://www.drdo.gov.in/drdo/accessibility-statement
Scraping 2/63: https://www.drdo.gov.in/drdo/dr-sanjai-k-dwivedi
Scraping 3/63: https://www.drdo.gov.in/drdo/press-release/indias-security-apparatus-must-remain-adaptive-emerging-threats-such-cyber-warfare
Scraping 4/63: https://www.drdo.gov.in/drdo/panchi
Scraping 5/63: https://www.drdo.gov.in/drdo/message-board/new-version-drdo-industry-partner-registration-20-will-be-launched-soon-till-time
Scraping 6/63: https://www.drdo.gov.in/drdo/rti-third-party-audit
Scraping 7/63: https://www.drdo.gov.in/drdo/procurement
Scraping 8/63: https://www.drdo.gov.in/drdo/sitemap
Scraping 9/63: https://www.drdo.gov.in/drdo/rti-cell/home
Scraping 10/63: https://www.drdo.gov.in/drdo/computerized-pilot-selection-system-cpss
Scraping 11/63: https://www.drdo.gov.in/drdo/copyright-policy
Scraping 12/63: https://www.drdo.gov.in/drdo/faqs
Scraping 13/63: https://www.drdo.gov.in/drdo/orgchart
Scraping 14/63: https://ww

In [None]:
!ls

cleaned_drdo_scraped_data.json	drdo_scraped_data.json	drive  sample_data


In [None]:
import os
os.listdir()

['.config',
 'cleaned_drdo_scraped_data.json',
 'drive',
 'drdo_scraped_data.json',
 'sample_data']

In [None]:
from google.colab import files
files.download("drdo_scraped_data.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import re

def clean_and_process_data(data):
    cleaned_data = {}
    seen_data = set()  # Keep track of seen data

    for url, content in data.items():
        if content:
            # Remove whitespace characters
            cleaned_content = re.sub(r'\s+', ' ', content).strip()

            # Handle duplicates
            if cleaned_content not in seen_data:
                cleaned_data[url] = cleaned_content
                seen_data.add(cleaned_content)
            else:
                print(f"Duplicate content found and removed for URL: {url}")

        else:
            print(f"Missing content for URL: {url}")

    return cleaned_data

# Load the JSON data (assuming it's in a variable named 'drdo_data')
# Replace 'drdo_data' with the actual variable name if it's different.

# Example usage (assuming drdo_data is already populated)
cleaned_drdo_data = clean_and_process_data(drdo_data)

with open("cleaned_drdo_scraped_data.json", "w", encoding="utf-8") as file:
    json.dump(cleaned_drdo_data, file, indent=4, ensure_ascii=False)

print("Data cleaning complete! Cleaned data saved to cleaned_drdo_scraped_data.json")


Data cleaning complete! Cleaned data saved to cleaned_drdo_scraped_data.json


In [None]:
with open("cleaned_drdo_scraped_data.json", "r", encoding="utf-8") as file:
    cleaned_data = json.load(file)

print("Sample cleaned data:", list(cleaned_data.items())[:5])  # Show first 5 items

Sample cleaned data: [('https://www.drdo.gov.in/drdo/accessibility-statement', "Accessibility Statement | DRDO This page uses Javascript. Your browser either doesn't support Javascript or you have it turned off. To see this page as it is meant to appear please use a Javascript enabled browser. Feedback Sitemap FAQs User account menu Login Skip to Main Content Screen Reader Access English हिंदी facebook twitter instagram Search Corporate Directory Home DRDO About DRDO Citizen Charter Who's who Nodal Officer DRDO Logo Organisation Organisation Chart Technology Clusters Laboratories & Establishments Corporate Clusters Corporate Directorates Outreach Product for Export Industry Support Products for Industry TOT DIA-CoEs ER & IPR Research Boards TDF Test Facilities Technology Foresight Careers Publications RTI Contact Us Accessibility Statement Home Accessibility Statement We are committed to ensure that the DRDO website is accessible to all users irrespective of device in use, technology o

In [None]:
def clean_and_process_data(data):
    if not data:
        print("Error: No data found!")
        return {}

    cleaned_data = {}
    seen_data = set()

    for url, content in data.items():
        if content:
            cleaned_content = re.sub(r'\s+', ' ', content).strip()

            if cleaned_content not in seen_data:
                cleaned_data[url] = cleaned_content
                seen_data.add(cleaned_content)
            else:
                print(f"❌ Duplicate removed: {url}")
        else:
            print(f"⚠️ Empty content for URL: {url}")  # Debug message

    print(f"✅ Total cleaned entries: {len(cleaned_data)}")
    return cleaned_data


In [None]:
print(f"Raw data received: {drdo_data}")
print(f"Number of entries: {len(drdo_data)}")

Buffered data was truncated after reaching the output size limit.

In [15]:
if drdo_data is None:
    print("drdo_data is None. Check if the file was loaded properly.")
elif not drdo_data:
    print("drdo_data is empty. Check if the scraping was successful.")
else:
    print(f"Number of entries: {len(drdo_data)}")

Number of entries: 63


In [16]:
cleaned_drdo_data = clean_and_process_data(drdo_data)

✅ Total cleaned entries: 63


In [17]:
print(f"Total cleaned entries: {len(cleaned_drdo_data)}")

Total cleaned entries: 63


In [18]:
import json

with open("cleaned_drdo_scraped_data.json", "w", encoding="utf-8") as file:
    json.dump(cleaned_drdo_data, file, indent=4, ensure_ascii=False)

print("Data cleaning complete! Cleaned data saved successfully.")


Data cleaning complete! Cleaned data saved successfully.


In [19]:
from google.colab import files
files.download("cleaned_drdo_scraped_data.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install faiss-cpu



In [20]:
import json

with open("cleaned_drdo_scraped_data.json", "r", encoding="utf-8") as file:
    drdo_data = json.load(file)

print(f"Loaded {len(drdo_data)} entries from the cleaned dataset.")

Loaded 63 entries from the cleaned dataset.


In [21]:
!pip install onedrivesdk
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
print(os.listdir("/content/drive/My Drive/Colab Notebooks/"))

['Untitled0.ipynb', 'Untitled1.ipynb', 'Meghna Gupta, 023 CSE.ipynb', 'Copy of Project 7. Wine Quality Prediction.ipynb', 'Practice.ipynb', 'Untitled2.ipynb', 'Flipkart_Reviews_Sentiment_Analysis_using_Python (1).ipynb', 'Untitled3.ipynb', 'Untitled4.ipynb', 'Untitled5.ipynb', 'classification(1).ipynb', 'Flipkart_Reviews_Sentiment_Analysis_using_Python.ipynb', 'House_Price_Prediction_using_Machine_Learning_.ipynb', 'Customer_Segmentation_in_Python.ipynb', 'customer_churn (2).ipynb', 'customer_churn (1).ipynb', 'Untitled6.ipynb', 'customer_churn.ipynb', 'Untitled', 'credit.ipynb', 'Untitled7.ipynb', 'Experiment - 2.ipynb', 'experiment - 1.ipynb', 'experiment - 2.ipynb', 'experiment - 3.ipynb', 'Experiment - 5.ipynb', 'experiment - 4.ipynb', 'Experiment - 6.ipynb', 'Untitled9.ipynb', 'WebScrapping.ipynb', 'AIChatbot_DRDO.ipynb']


In [23]:
!pip install sentence-transformers
import numpy as np
import gc
import torch
import pickle  # To save embeddings
from sentence_transformers import SentenceTransformer

# Free up memory
gc.collect()
torch.cuda.empty_cache()

# Define drdo_data (Replace this with your actual data)
drdo_data = {
    "doc1": "This is the first document.",
    "doc2": "This is the second document.",
    "doc3": "Here is another example of a document."
}

texts = list(drdo_data.values())

batch_size = 10  # Adjust batch size based on memory
embeddings_list = []

# Load a lightweight model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Process and save embeddings in batches
for i in range(0, len(texts), batch_size):
    batch = texts[i : i + batch_size]
    batch_embeddings = model.encode(batch, convert_to_numpy=True)
    embeddings_list.append(batch_embeddings)

# Combine all embeddings into a single array
embeddings = np.vstack(embeddings_list)

# Define file path in OneDrive (Change to your directory)
file_path = "/content/drive/My Drive/embeddings.pkl"

# Save embeddings to OneDrive
with open(file_path, "wb") as f:
    pickle.dump(embeddings, f)

print("Embeddings saved to OneDrive:", file_path)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Embeddings saved to OneDrive: /content/drive/My Drive/embeddings.pkl


In [24]:
with open(file_path, "rb") as f:
    embeddings = pickle.load(f)

print("Loaded embeddings shape:", embeddings.shape)

Loaded embeddings shape: (3, 384)


In [25]:
import json

# File paths
data_path = "/content/drive/My Drive/drdo_data/"
scraped_file = f"/content/drdo_scraped_data.json"
cleaned_file = f"/content/cleaned_drdo_scraped_data.json"

# Load both JSON files
with open(scraped_file, "r", encoding="utf-8") as file:
    scraped_data = json.load(file)

with open(cleaned_file, "r", encoding="utf-8") as file:
    cleaned_data = json.load(file)

# Convert both files into dictionary format
drdo_scraped = {f"doc{i+1}": content for i, content in enumerate(scraped_data.values())}
drdo_cleaned = {f"doc{i+1}": content for i, content in enumerate(cleaned_data.values())}

# Print sample to verify format
print("Scraped Data Sample:", list(drdo_scraped.items())[:3])
print("Cleaned Data Sample:", list(drdo_cleaned.items())[:3])

Scraped Data Sample: [('doc1', "Accessibility Statement | DRDO This page uses Javascript. Your browser either doesn't support Javascript or you have it turned off. To see this page as it is meant to appear please use a Javascript enabled browser. Feedback Sitemap FAQs User account menu Login Skip to Main Content Screen Reader Access English हिंदी facebook twitter instagram Search Corporate Directory Home DRDO About DRDO Citizen Charter Who's who Nodal Officer DRDO Logo Organisation Organisation Chart Technology Clusters Laboratories & Establishments Corporate Clusters Corporate Directorates Outreach Product for Export Industry Support Products for Industry TOT DIA-CoEs ER & IPR Research Boards TDF Test Facilities Technology Foresight Careers Publications RTI Contact Us Accessibility Statement Home Accessibility Statement We are committed to ensure that the DRDO website is accessible to all users irrespective of device in use, technology or ability. It has been built, with an aim, to pr

In [None]:
%cd /content/AIChatbot_DRDO

/content/AIChatbot_DRDO


In [None]:
!mv /content/drdo_scraped_data.json /content/AIChatbot_DRDO/

In [None]:
!mv /content/cleaned_drdo_scraped_data.json /content/AIChatbot_DRDO/

In [None]:
!ls

cleaned_drdo_scraped_data.json	drdo_scraped_data.json


In [None]:
!git add .
!git commit -m "Added internal files"

[main 5613099] Added internal files
 2 files changed, 120 insertions(+), 120 deletions(-)


In [None]:
!git branch

* [32mmain[m


In [None]:
%cd /content/AIChatbot_DRDO

/content/AIChatbot_DRDO


In [None]:
!git add .

In [None]:
!git commit -m "Added more files"

On branch main
Your branch is based on 'origin/main', but the upstream is gone.
  (use "git branch --unset-upstream" to fixup)

nothing to commit, working tree clean


In [None]:
!git pull origin main --rebase

remote: Enumerating objects: 6, done.[K
remote: Counting objects:  16% (1/6)[Kremote: Counting objects:  33% (2/6)[Kremote: Counting objects:  50% (3/6)[Kremote: Counting objects:  66% (4/6)[Kremote: Counting objects:  83% (5/6)[Kremote: Counting objects: 100% (6/6)[Kremote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects:  25% (1/4)[Kremote: Compressing objects:  50% (2/4)[Kremote: Compressing objects:  75% (3/4)[Kremote: Compressing objects: 100% (4/4)[Kremote: Compressing objects: 100% (4/4), done.[K
Unpacking objects:  16% (1/6)Unpacking objects:  33% (2/6)Unpacking objects:  50% (3/6)remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Unpacking objects:  66% (4/6)Unpacking objects:  83% (5/6)Unpacking objects: 100% (6/6)Unpacking objects: 100% (6/6), 13.91 KiB | 890.00 KiB/s, done.
From https://github.com/MeghnaGup/AIChatbot_DRDO
 * branch            main       -> FETCH_HEAD
 * [new branch]      main       -> orig

In [None]:
!git push origin main

Enumerating objects: 9, done.
Counting objects:  11% (1/9)Counting objects:  22% (2/9)Counting objects:  33% (3/9)Counting objects:  44% (4/9)Counting objects:  55% (5/9)Counting objects:  66% (6/9)Counting objects:  77% (7/9)Counting objects:  88% (8/9)Counting objects: 100% (9/9)Counting objects: 100% (9/9), done.
Delta compression using up to 2 threads
Compressing objects: 100% (8/8), done.
error: RPC failed; HTTP 408 curl 22 The requested URL returned error: 408
send-pack: unexpected disconnect while reading sideband packet
Writing objects: 100% (8/8), 129.44 MiB | 1.76 MiB/s, done.
Total 8 (delta 4), reused 0 (delta 0), pack-reused 0
fatal: the remote end hung up unexpectedly
Everything up-to-date


In [None]:
import shutil

# Compress only the two large JSON files
shutil.make_archive("drdo_data", 'zip', ".", "cleaned_drdo_scraped_data.json")
shutil.make_archive("drdo_scraped", 'zip', ".", "drdo_scraped_data.json")

'/content/AIChatbot_DRDO/drdo_scraped.zip'

In [None]:
!git config --global user.email "meghu.gup@gmail.com"
!git config --global user.name "MeghnaGup"

In [None]:
!git add drdo_data.zip
!git commit -m "Added ZIP file"

[main 3cb22fb] Added ZIP file
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 drdo_data.zip


In [None]:
!git add drdo_scraped.zip
!git commit -m "Added ZIP file"

[main fcd28fb] Added ZIP file
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 drdo_scraped.zip


In [26]:
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer

def process_in_chunks(text_list, model, chunk_size, file_prefix, output_path):
    """
    Splits text data into chunks, processes embeddings, and saves them separately.

    Parameters:
        text_list (list): The list of texts to be embedded.
        model (SentenceTransformer): The embedding model.
        chunk_size (int): Number of texts per chunk.
        file_prefix (str): Prefix for output file names.
        output_path (str): Path to save the files.
    """
    total_chunks = (len(text_list) + chunk_size - 1) // chunk_size  # Calculate number of chunks

    for i in range(total_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, len(text_list))

        chunk_texts = text_list[start_idx:end_idx]  # Extract chunk
        chunk_embeddings = model.encode(chunk_texts, convert_to_numpy=True)  # Generate embeddings

        # Save each chunk separately
        file_name = f"{output_path}{file_prefix}_part{i+1}.pkl"
        with open(file_name, "wb") as f:
            pickle.dump(chunk_embeddings, f)

        print(f"Saved {file_name} ({len(chunk_texts)} embeddings)")


In [None]:
# Initialize model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Define paths
output_path = "/content/drive/My Drive/drdo_data/"
num_parts = 20  # Divide into 20 parts

# Process Scraped Data in Chunks
scraped_texts = list(drdo_scraped.values())
chunk_size = max(1, len(scraped_texts) // num_parts)  # Ensure chunk_size is at least 1
process_in_chunks(scraped_texts, model, chunk_size, "scraped_embeddings", output_path)

# Process Cleaned Data in Chunks
cleaned_texts = list(drdo_cleaned.values())
chunk_size = max(1, len(cleaned_texts) // num_parts)  # Ensure chunk_size is at least 1
process_in_chunks(cleaned_texts, model, chunk_size, "cleaned_embeddings", output_path)

print("All embeddings processed and saved in 5 parts.")



Saved /content/drive/My Drive/drdo_data/scraped_embeddings_part1.pkl (3 embeddings)
Saved /content/drive/My Drive/drdo_data/scraped_embeddings_part2.pkl (3 embeddings)
Saved /content/drive/My Drive/drdo_data/scraped_embeddings_part3.pkl (3 embeddings)
Saved /content/drive/My Drive/drdo_data/scraped_embeddings_part4.pkl (3 embeddings)
Saved /content/drive/My Drive/drdo_data/scraped_embeddings_part5.pkl (3 embeddings)
Saved /content/drive/My Drive/drdo_data/scraped_embeddings_part6.pkl (3 embeddings)
Saved /content/drive/My Drive/drdo_data/scraped_embeddings_part7.pkl (3 embeddings)
Saved /content/drive/My Drive/drdo_data/scraped_embeddings_part8.pkl (3 embeddings)
Saved /content/drive/My Drive/drdo_data/scraped_embeddings_part9.pkl (3 embeddings)
Saved /content/drive/My Drive/drdo_data/scraped_embeddings_part10.pkl (3 embeddings)
Saved /content/drive/My Drive/drdo_data/scraped_embeddings_part11.pkl (3 embeddings)
Saved /content/drive/My Drive/drdo_data/scraped_embeddings_part12.pkl (3 e

In [None]:
import pickle
import numpy as np
import gc
from sentence_transformers import SentenceTransformer

def process_in_chunks(text_list, model, chunk_size, file_prefix, output_path):
    """
    Splits text data into chunks, processes embeddings, and saves them separately.

    Parameters:
        text_list (list): The list of texts to be embedded.
        model (SentenceTransformer): The embedding model.
        chunk_size (int): Number of texts per chunk.
        file_prefix (str): Prefix for output file names.
        output_path (str): Path to save the files.
    """
    total_chunks = (len(text_list) + chunk_size - 1) // chunk_size  # Number of chunks

    for i in range(total_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, len(text_list))

        chunk_texts = text_list[start_idx:end_idx]  # Get chunk
        chunk_embeddings = model.encode(chunk_texts, convert_to_numpy=True)  # Generate embeddings

        # Save each chunk separately
        file_name = f"{output_path}{file_prefix}_part{i+1}.pkl"
        with open(file_name, "wb") as f:
            pickle.dump(chunk_embeddings, f)

        print(f"Saved {file_name} ({len(chunk_texts)} embeddings)")

        # Free up memory
        del chunk_embeddings
        gc.collect()

# Reinitialize model (to free up memory)
model = SentenceTransformer("all-MiniLM-L6-v2")

# Define paths
output_path = "/content/drive/My Drive/drdo_data/"
num_parts = 30  # Adjust based on memory availability

# Process Cleaned Data in Chunks
cleaned_texts = list(drdo_cleaned.values())
chunk_size = max(1, len(cleaned_texts) // num_parts)  # Ensure at least 1 item per chunk
process_in_chunks(cleaned_texts, model, chunk_size, "cleaned_embeddings", output_path)

print("Cleaned data embeddings processed successfully.")

# Free up memory at the end
del model
gc.collect()


Saved /content/drive/My Drive/drdo_data/cleaned_embeddings_part1.pkl (2 embeddings)
Saved /content/drive/My Drive/drdo_data/cleaned_embeddings_part2.pkl (2 embeddings)
Saved /content/drive/My Drive/drdo_data/cleaned_embeddings_part3.pkl (2 embeddings)
Saved /content/drive/My Drive/drdo_data/cleaned_embeddings_part4.pkl (2 embeddings)
Saved /content/drive/My Drive/drdo_data/cleaned_embeddings_part5.pkl (2 embeddings)
Saved /content/drive/My Drive/drdo_data/cleaned_embeddings_part6.pkl (2 embeddings)
Saved /content/drive/My Drive/drdo_data/cleaned_embeddings_part7.pkl (2 embeddings)
Saved /content/drive/My Drive/drdo_data/cleaned_embeddings_part8.pkl (2 embeddings)
Saved /content/drive/My Drive/drdo_data/cleaned_embeddings_part9.pkl (2 embeddings)
Saved /content/drive/My Drive/drdo_data/cleaned_embeddings_part10.pkl (2 embeddings)
Saved /content/drive/My Drive/drdo_data/cleaned_embeddings_part11.pkl (2 embeddings)
Saved /content/drive/My Drive/drdo_data/cleaned_embeddings_part12.pkl (2 e