<a href="https://colab.research.google.com/github/MeghnaGup/AIChatBot_DRDO/blob/main/AIChatbot_DRDO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4 transformers flask



In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time

In [None]:
# Root URL of the DRDO website. It is the crawl's starting page and the prefix
# that site-relative hrefs are appended to when building absolute links.
BASE_URL = "https://www.drdo.gov.in"

In [None]:
def get_all_links(start_url, timeout=15):
    """Collect the site-internal links found on a single page.

    Fetches ``start_url``, parses the HTML and gathers every ``<a href>`` whose
    target is a site-relative path (starts with a single ``/``). Each path is
    appended to the module-level ``BASE_URL`` to form an absolute URL.

    Args:
        start_url: Address of the page to scan for links.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A sorted list of unique absolute URLs. The list is empty when the page
        cannot be fetched or does not answer with HTTP 200.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # An explicit timeout keeps a stalled server from hanging the notebook.
        response = requests.get(start_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to access {start_url}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to access {start_url}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    links = set()

    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]

        # "//host/path" is protocol-relative and points at another host, so it
        # must not be glued onto BASE_URL.
        if not href.startswith("/") or href.startswith("//"):
            continue
        # Same filter as before: skip paths that embed "http" anywhere.
        if "http" in href:
            continue

        links.add(BASE_URL + href)

    # Sorted so repeated runs (and any later slicing) see the same order.
    return sorted(links)

In [None]:
def scrape_page(url, timeout=15):
    """Download one page and return its visible text.

    Args:
        url: Absolute address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text with tags removed and text fragments joined by single
        spaces, or ``None`` when the request fails or the status is not 200.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # A single unreachable page must not abort the whole crawl.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch {url}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    return soup.get_text(separator=" ", strip=True)


In [None]:
def scrape_drdo_website(start_url, limit=20, delay=2):
    """Scrape the text of pages linked from ``start_url``.

    Args:
        start_url: Page whose internal links are collected via ``get_all_links``.
        limit: Maximum number of linked pages to download.
        delay: Seconds to pause between two consecutive downloads.

    Returns:
        A dict mapping each successfully scraped URL to its page text. Pages
        for which ``scrape_page`` returns nothing are left out.
    """
    scraped_data = {}
    links_to_scrape = get_all_links(start_url)
    targets = links_to_scrape[:limit]

    print(f"Found {len(links_to_scrape)} pages to scrape.")

    for i, link in enumerate(targets, start=1):
        # Report progress against the pages actually being fetched, so the
        # counter reaches its denominator even when `limit` truncates the list.
        print(f"Scraping {i}/{len(targets)}: {link}")
        content = scrape_page(link)

        if content:
            scraped_data[link] = content

        # Pause only between requests; there is nothing to wait for after the last.
        if delay > 0 and i < len(targets):
            time.sleep(delay)

    return scraped_data

In [None]:
# Crawl up to 100 pages reachable from the site root.
drdo_data = scrape_drdo_website(BASE_URL, limit=100)

# Persist the raw result as indented UTF-8 JSON (non-ASCII text kept as-is).
with open("drdo_scraped_data.json", "w", encoding="utf-8") as raw_out:
    raw_out.write(json.dumps(drdo_data, indent=4, ensure_ascii=False))

print("Scraping complete! Data saved.")

Found 63 pages to scrape.
Scraping 1/63: https://www.drdo.gov.in/drdo/archives
Scraping 2/63: https://www.drdo.gov.in/drdo/message-board/public-notice-requirement-valid-gate-score-direct-recruitment-scientist-b-drdo
Scraping 3/63: https://www.drdo.gov.in/drdo/about-drdo
Scraping 4/63: https://www.drdo.gov.in/drdo/headquarter-directorates
Scraping 5/63: https://www.drdo.gov.in/drdo/brahmos-0
Scraping 6/63: https://www.drdo.gov.in/drdo/citizen-charter
Scraping 7/63: https://www.drdo.gov.in/drdo/press-release/drdo-conducts-high-altitude-trials-indigenous-integrated-life-support-system-lca
Scraping 8/63: https://www.drdo.gov.in/drdo/publications
Scraping 9/63: https://www.drdo.gov.in/drdo/terms-conditions
Scraping 10/63: https://www.drdo.gov.in/drdo/computerized-pilot-selection-system-cpss
Scraping 11/63: https://www.drdo.gov.in/drdo/varunastra
Scraping 12/63: https://www.drdo.gov.in/drdo/systems-and-subsystems-industry-design-development-and-manufacture
Scraping 13/63: https://www.drdo.go

In [None]:
# Shell escape: list the entries of the current working directory.
!ls

AIChatbot_DRDO	cleaned_drdo_scraped_data.json	drdo_scraped_data.json	drive  sample_data


In [None]:
import os
# Bare last expression: the notebook displays the working-directory entries
# (this listing also includes hidden names such as '.config').
os.listdir()

['.config',
 'cleaned_drdo_scraped_data.json',
 'drive',
 'AIChatbot_DRDO',
 'drdo_scraped_data.json',
 'sample_data']

In [None]:
from google.colab import files
# Colab-only helper: sends the raw scrape file to the browser as a download.
files.download("drdo_scraped_data.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import re

def clean_and_process_data(data):
    """Normalise whitespace in scraped page text and drop repeated pages.

    Args:
        data: Mapping of URL to the text scraped from that URL.

    Returns:
        A new dict of URL to text in which every run of whitespace is a single
        space and leading/trailing whitespace is removed. A URL whose
        normalised text matches an earlier entry is skipped, as is a URL with
        falsy content; a message is printed for each skipped URL.
    """
    unique_texts = set()  # normalised texts already emitted
    result = {}

    for page_url, raw_text in data.items():
        if not raw_text:
            print(f"Missing content for URL: {page_url}")
            continue

        # Collapse each whitespace run to one space, then trim the ends.
        normalised = re.sub(r'\s+', ' ', raw_text).strip()

        if normalised in unique_texts:
            print(f"Duplicate content found and removed for URL: {page_url}")
            continue

        unique_texts.add(normalised)
        result[page_url] = normalised

    return result

# `drdo_data` must already be in memory (the scraping cell assigns it);
# nothing is read from disk here.
cleaned_drdo_data = clean_and_process_data(drdo_data)

# Write the de-duplicated text as indented UTF-8 JSON.
with open("cleaned_drdo_scraped_data.json", "w", encoding="utf-8") as cleaned_out:
    cleaned_out.write(json.dumps(cleaned_drdo_data, indent=4, ensure_ascii=False))

print("Data cleaning complete! Cleaned data saved to cleaned_drdo_scraped_data.json")


Data cleaning complete! Cleaned data saved to cleaned_drdo_scraped_data.json


In [None]:
# Read the cleaned dataset back from disk.
with open("cleaned_drdo_scraped_data.json", "r", encoding="utf-8") as cleaned_in:
    cleaned_data = json.load(cleaned_in)

# Print only the first five (url, text) pairs as a spot check.
print("Sample cleaned data:", list(cleaned_data.items())[:5])

Sample cleaned data: [('https://www.drdo.gov.in/drdo/archives', "Archives | Defence Research and Development Organisation - DRDO, Ministry of Defence, Government of India This page uses Javascript. Your browser either doesn't support Javascript or you have it turned off. To see this page as it is meant to appear please use a Javascript enabled browser. Feedback Sitemap FAQs User account menu Login Skip to Main Content Screen Reader Access English हिंदी facebook twitter instagram Search Corporate Directory Home DRDO About DRDO Citizen Charter Who's who Nodal Officer DRDO Logo Organisation Organisation Chart Technology Clusters Laboratories & Establishments Corporate Clusters Corporate Directorates Outreach Product for Export Industry Support Products for Industry TOT DIA-CoEs ER & IPR Research Boards TDF Test Facilities Technology Foresight Careers Publications RTI Contact Us Archive Listing Home Archives Career Events Message Board DRDO in News Press Release What's New Connect with us 

In [None]:
def clean_and_process_data(data):
    """Normalise whitespace in scraped page text and drop duplicates/empties.

    Args:
        data: Mapping of URL to scraped text. ``None`` or an empty mapping is
            reported and yields an empty result.

    Returns:
        A new dict of URL to text in which every run of whitespace is a single
        space and the ends are trimmed. Entries whose text is missing or
        consists only of whitespace are skipped, as are entries whose
        normalised text repeats an earlier one.
    """
    if not data:
        print("Error: No data found!")
        return {}

    cleaned_data = {}
    seen_data = set()

    for url, content in data.items():
        # Normalise before the emptiness check so that whitespace-only pages
        # are reported as empty instead of being stored as "".
        cleaned_content = re.sub(r'\s+', ' ', content).strip() if content else ""

        if not cleaned_content:
            print(f"⚠️ Empty content for URL: {url}")  # Debug message
            continue

        if cleaned_content in seen_data:
            print(f"❌ Duplicate removed: {url}")
            continue

        cleaned_data[url] = cleaned_content
        seen_data.add(cleaned_content)

    print(f"✅ Total cleaned entries: {len(cleaned_data)}")
    return cleaned_data


In [None]:
# Print the size first and only a short preview of one entry. Dumping the
# whole dict floods the cell output, and the count line never becomes visible.
print(f"Number of entries: {len(drdo_data)}")
preview_url, preview_text = next(iter(drdo_data.items()), (None, ""))
print(f"Raw data received (first entry): {preview_url} -> {(preview_text or '')[:200]!r}")

Buffered data was truncated after reaching the output size limit.

In [None]:
# Report the size when there is data; otherwise say whether the variable was
# never populated (None) or is an empty container.
if drdo_data:
    print(f"Number of entries: {len(drdo_data)}")
elif drdo_data is None:
    print("drdo_data is None. Check if the file was loaded properly.")
else:
    print("drdo_data is empty. Check if the scraping was successful.")

Number of entries: 63


In [None]:
# Clean the in-memory `drdo_data`; the function prints its own summary line.
cleaned_drdo_data = clean_and_process_data(drdo_data)

✅ Total cleaned entries: 63


In [None]:
# Report how many entries `cleaned_drdo_data` holds.
print(f"Total cleaned entries: {len(cleaned_drdo_data)}")

Total cleaned entries: 63


In [None]:
import json

# Serialise the cleaned mapping as indented UTF-8 JSON, overwriting any
# existing file of the same name.
with open("cleaned_drdo_scraped_data.json", "w", encoding="utf-8") as cleaned_out:
    cleaned_out.write(json.dumps(cleaned_drdo_data, indent=4, ensure_ascii=False))

print("Data cleaning complete! Cleaned data saved successfully.")


Data cleaning complete! Cleaned data saved successfully.


In [None]:
from google.colab import files
# Colab-only helper: sends the cleaned dataset to the browser as a download.
files.download("cleaned_drdo_scraped_data.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install faiss-cpu



In [None]:
import json

# Note: this rebinds `drdo_data` to the contents of the *cleaned* file.
with open("cleaned_drdo_scraped_data.json", "r", encoding="utf-8") as cleaned_in:
    drdo_data = json.load(cleaned_in)

print(f"Loaded {len(drdo_data)} entries from the cleaned dataset.")

Loaded 63 entries from the cleaned dataset.


In [None]:
# NOTE(review): nothing in the visible notebook imports `onedrivesdk`; file
# storage below goes through the Google Drive mount. Confirm whether this
# install is still needed.
!pip install onedrivesdk
from google.colab import drive
# Mount Google Drive at /content/drive so later cells can read and write there.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# List the "Colab Notebooks" folder on the mounted Drive.
# Relies on `os` having been imported by an earlier cell.
print(os.listdir("/content/drive/My Drive/Colab Notebooks/"))

['Untitled0.ipynb', 'Untitled1.ipynb', 'Meghna Gupta, 023 CSE.ipynb', 'Copy of Project 7. Wine Quality Prediction.ipynb', 'Practice.ipynb', 'Untitled2.ipynb', 'Flipkart_Reviews_Sentiment_Analysis_using_Python (1).ipynb', 'Untitled3.ipynb', 'Untitled4.ipynb', 'Untitled5.ipynb', 'classification(1).ipynb', 'Flipkart_Reviews_Sentiment_Analysis_using_Python.ipynb', 'House_Price_Prediction_using_Machine_Learning_.ipynb', 'Customer_Segmentation_in_Python.ipynb', 'customer_churn (2).ipynb', 'customer_churn (1).ipynb', 'Untitled6.ipynb', 'customer_churn.ipynb', 'Untitled', 'credit.ipynb', 'Untitled7.ipynb', 'Experiment - 2.ipynb', 'experiment - 1.ipynb', 'experiment - 2.ipynb', 'experiment - 3.ipynb', 'Experiment - 5.ipynb', 'experiment - 4.ipynb', 'Experiment - 6.ipynb', 'Untitled9.ipynb', 'WebScrapping.ipynb', 'Untitled8.ipynb']


In [None]:
!pip install sentence-transformers
import numpy as np
import gc
import torch
import pickle  # To save embeddings
from sentence_transformers import SentenceTransformer

# Free up memory left over from earlier cells
gc.collect()
torch.cuda.empty_cache()

# Embed the `drdo_data` mapping that is already in memory (an earlier cell
# loads it from cleaned_drdo_scraped_data.json). It must not be overwritten
# with sample sentences here, or the saved embeddings describe dummy text.
texts = list(drdo_data.values())
if not texts:
    raise ValueError("drdo_data is empty - there is nothing to embed.")

batch_size = 10  # Adjust batch size based on memory

# Load a lightweight model
model = SentenceTransformer("all-MiniLM-L6-v2")

# encode() splits the input into batches itself, so no manual slicing and
# stacking is required; the result is one row per input text.
embeddings = model.encode(texts, batch_size=batch_size, convert_to_numpy=True)

# Target file on the mounted Google Drive (change to your directory)
file_path = "/content/drive/My Drive/embeddings.pkl"

# Save the embedding matrix
with open(file_path, "wb") as f:
    pickle.dump(embeddings, f)

print("Embeddings saved to Google Drive:", file_path)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Embeddings saved to OneDrive: /content/drive/My Drive/embeddings.pkl


In [None]:
# Round-trip check: read the pickle written above and report its dimensions.
# (Only unpickle files you created yourself; pickle can execute code on load.)
with open(file_path, "rb") as pkl_in:
    embeddings = pickle.load(pkl_in)

print("Loaded embeddings shape:", embeddings.shape)

Loaded embeddings shape: (3, 384)


In [None]:
import json

# Locations of the Drive folder and of the two JSON datasets in /content
data_path = "/content/drive/My Drive/drdo_data/"
scraped_file = "/content/drdo_scraped_data.json"
cleaned_file = "/content/cleaned_drdo_scraped_data.json"

# Read the raw scrape, then the cleaned version
with open(scraped_file, "r", encoding="utf-8") as raw_in:
    scraped_data = json.load(raw_in)

with open(cleaned_file, "r", encoding="utf-8") as cleaned_in:
    cleaned_data = json.load(cleaned_in)

# Re-key each dataset: URLs are replaced by sequential ids doc1, doc2, ...
drdo_scraped = {
    f"doc{n}": text for n, text in enumerate(scraped_data.values(), start=1)
}
drdo_cleaned = {
    f"doc{n}": text for n, text in enumerate(cleaned_data.values(), start=1)
}

# Show the first three entries of each mapping to verify the format
print("Scraped Data Sample:", list(drdo_scraped.items())[:3])
print("Cleaned Data Sample:", list(drdo_cleaned.items())[:3])

Scraped Data Sample: [('doc1', "Archives | Defence Research and Development Organisation - DRDO, Ministry of Defence, Government of India This page uses Javascript. Your browser either doesn't support Javascript or you have it turned off. To see this page as it is meant to appear please use a Javascript enabled browser. Feedback Sitemap FAQs User account menu Login Skip to Main Content Screen Reader Access English हिंदी facebook twitter instagram Search Corporate Directory Home DRDO About DRDO Citizen Charter Who's who Nodal Officer DRDO Logo Organisation Organisation Chart Technology Clusters Laboratories & Establishments Corporate Clusters Corporate Directorates Outreach Product for Export Industry Support Products for Industry TOT DIA-CoEs ER & IPR Research Boards TDF Test Facilities Technology Foresight Careers Publications RTI Contact Us Archive Listing Home Archives Career Events Message Board DRDO in News Press Release What's New Connect with us facebook twitter instagram Relate

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pickle
import torch

# Initialize the model with GPU if available
# (`device` is "cuda" when torch reports a usable GPU, otherwise "cpu").
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the all-MiniLM-L6-v2 sentence encoder and move it to the chosen device.
model = SentenceTransformer("all-MiniLM-L6-v2").to(device)

# Function to encode text in batches with memory optimization
def encode_in_batches(texts, batch_size=16):
    """Encode ``texts`` with the module-level ``model``, one slice at a time.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts handed to ``model.encode`` per call.

    Returns:
        A 2-D NumPy array with one row per input text, in input order. For an
        empty ``texts`` the result has zero rows and the model's embedding
        width as its second dimension.
    """
    if len(texts) == 0:
        # np.vstack([]) raises ValueError; return a well-formed empty matrix.
        # float32 is assumed to match the encoder's output dtype (TODO confirm).
        width = model.get_sentence_embedding_dimension()
        return np.empty((0, width), dtype=np.float32)

    embeddings = []
    with torch.no_grad():  # inference only, so no gradient tracking is needed
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            batch_embeddings = model.encode(batch, convert_to_numpy=True)
            embeddings.append(batch_embeddings)
    return np.vstack(embeddings)  # Combine batch results

import os  # local import so this cell does not depend on an earlier cell

# Create embeddings for the scraped data
scraped_texts = list(drdo_scraped.values())
scraped_embeddings = encode_in_batches(scraped_texts, batch_size=16)

# Clear memory after processing
del scraped_texts
torch.cuda.empty_cache()

# Create embeddings for the cleaned data
cleaned_texts = list(drdo_cleaned.values())
cleaned_embeddings = encode_in_batches(cleaned_texts, batch_size=16)

# Clear memory after processing
del cleaned_texts
torch.cuda.empty_cache()

# Save embeddings to Google Drive
output_path = "/content/drive/My Drive/drdo_data/"
# open(..., "wb") does not create missing folders, so make sure the target
# directory exists before writing.
os.makedirs(output_path, exist_ok=True)

with open(f"{output_path}scraped_embeddings.pkl", "wb") as f:
    pickle.dump(scraped_embeddings, f)

with open(f"{output_path}cleaned_embeddings.pkl", "wb") as f:
    pickle.dump(cleaned_embeddings, f)

print("Embeddings saved successfully!")
