## Installing dependencies

In [1]:
%pip install selenium

Note: you may need to restart the kernel to use updated packages.


### Importing Selenium

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.service import Service

### Other imports

In [3]:
%pip install webdriver_manager
%pip install wget

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import wget
import time
import json
from urllib.parse import urljoin
from webdriver_manager.chrome import ChromeDriverManager

### ChromeDriver path


In [11]:
# Path to a locally installed ChromeDriver binary.
# Set the CHROMEDRIVER_PATH environment variable to override the machine-specific
# default below instead of editing the notebook. An unset or empty variable falls
# back to the original hard-coded location.
# NOTE(review): the driver-launch cell in this notebook obtains its driver through
# webdriver_manager and does not read this variable — confirm whether it is still needed.
chrome_driver_path = (
    os.environ.get("CHROMEDRIVER_PATH")
    or r"C:\Users\Chetan\chromedriver-win32\chromedriver.exe"
)

In [None]:
# Scrape the MOSDAC mission pages.
#
# For every mission, the landing page and each of its subpages is opened in Chrome;
# the text, image URLs, raw table HTML and PDF links found inside the page's
# ".field-item.even" element are collected into `mission_data_store`, keyed by the
# page slug (e.g. "insat-3d" or "insat-3d-payloads").

missions = ["insat-3d", "insat-3dr","megha-tropiques", "scatsat-1", "kalpana-1", "saral-altika", "oceansat-2", "oceansat-3"]
mission_subpages = ["introduction", "objectives", "spacecraft", "payloads", "references"]

# Base URL
base_url = "https://www.mosdac.gov.in"
# Icon shown next to PDF links; it is excluded from the collected images.
pdf_icon_url = "https://www.mosdac.gov.in/images/PDF.gif"

# Dictionary to store extracted data
mission_data_store = {}


def _scrape_content_area(driver, page_url, label):
    """Load ``page_url`` and extract the contents of its ``.field-item.even`` element.

    Args:
        driver: The Selenium WebDriver used to load the page.
        page_url: Absolute URL of the page to open.
        label: Page slug used in the progress / error messages.

    Returns:
        A dict with the keys ``"text"``, ``"images"``, ``"tables"`` and ``"pdfs"``,
        or ``None`` when waiting for or reading the content area raised an
        exception (the error is printed, not re-raised, so that one bad page
        does not abort the whole run).

    Reads the module-level ``base_url`` and ``pdf_icon_url``.
    """
    # Navigation errors are deliberately NOT caught here: they propagate to the
    # caller, whose ``finally`` clause still closes the browser.
    driver.get(page_url)

    try:
        # Wait for the content area to load
        content_area = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".field-item.even"))
        )
        print(f"Element found for: {label}")

        page_data = {
            "text": content_area.text,
            "images": [],
            "tables": [],
            "pdfs": []
        }

        # Images: skip <img> elements without a src (urljoin would otherwise return
        # the bare base URL) and the decorative PDF icon.
        for img in content_area.find_elements(By.TAG_NAME, "img"):
            src = img.get_attribute("src")
            if not src:
                continue
            img_src = urljoin(base_url, src)
            if img_src != pdf_icon_url:
                page_data["images"].append(img_src)

        # Tables are stored as raw HTML; table parsing will be done later.
        for table in content_area.find_elements(By.TAG_NAME, "table"):
            page_data["tables"].append(table.get_attribute("outerHTML"))

        # PDF links: anchors inside paragraphs whose href ends in ".pdf".
        for pdf in content_area.find_elements(By.CSS_SELECTOR, "p a[href$='.pdf']"):
            page_data["pdfs"].append(urljoin(base_url, pdf.get_attribute("href")))

        return page_data

    except Exception as e:
        print(f"Error: Element not found for {label}: {e}")
        return None


service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

try:
    for mission in missions:
        # The landing page first, then every "<mission>-<subpage>" page.
        page_slugs = [mission] + [f"{mission}-{subpage}" for subpage in mission_subpages]

        for page_slug in page_slugs:
            page_data = _scrape_content_area(driver, f"{base_url}/{page_slug}", page_slug)
            if page_data is not None:
                # Store extracted data
                mission_data_store[page_slug] = page_data

            # Pause between page loads.
            time.sleep(2)
finally:
    # Close the browser even if the run fails or is interrupted.
    driver.quit()

Element found for: insat-3d
Element found for: insat-3d-introduction
Element found for: insat-3d-objectives
Element found for: insat-3d-spacecraft
Element found for: insat-3d-payloads
Element found for: insat-3d-references
Element found for: insat-3dr
Element found for: insat-3dr-introduction
Element found for: insat-3dr-objectives
Element found for: insat-3dr-spacecraft
Element found for: insat-3dr-payloads
Element found for: insat-3dr-references
Element found for: megha-tropiques
Element found for: megha-tropiques-introduction
Element found for: megha-tropiques-objectives
Element found for: megha-tropiques-spacecraft
Element found for: megha-tropiques-payloads
Element found for: megha-tropiques-references
Element found for: scatsat-1
Element found for: scatsat-1-introduction
Element found for: scatsat-1-objectives
Element found for: scatsat-1-spacecraft
Element found for: scatsat-1-payloads
Element found for: scatsat-1-references
Element found for: kalpana-1
Element found for: kalpan

In [15]:
# Persist the scraped mission data as pretty-printed, UTF-8 encoded JSON
# (non-ASCII characters are written as-is rather than \u-escaped).
with open("mosdac_mission_data.json", "w", encoding="utf-8") as json_file:
    json_file.write(
        json.dumps(mission_data_store, ensure_ascii=False, indent=4)
    )

print("\n✅ Data extraction completed! Saved as 'mosdac_mission_data.json'.")


✅ Data extraction completed! Saved as 'mosdac_mission_data.json'.


In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

In [6]:
# Scrape the MOSDAC data catalog.
#
# For every catalog page, each option of the "satellite" dropdown is selected in
# turn, then each option of the "sensor" dropdown, and the rows of the table with
# id "tabledata" are collected. Each collected row is prefixed with the category,
# satellite name and sensor name. The result is exposed as `df` (DataFrame) and
# `json_data` (list of records); writing to disk happens in the following cell.

# Kept in this cell so the following cell's json.dump works when this section is
# run without the earlier import cell.
import json

catalog_pages = {
    "satellite": "https://www.mosdac.gov.in/catalog/satellite.php",
    "in-situ": "https://www.mosdac.gov.in/catalog/insitu.php",
    "radar": "https://www.mosdac.gov.in/internal/catalog-radar"
}

data_list = []

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)

try:
    for category, url in catalog_pages.items():
        driver.get(url)
        time.sleep(3)

        # Locate the Satellite dropdown
        satellite_dropdown = wait.until(EC.presence_of_element_located((By.NAME, "satellite")))
        satellite_select = Select(satellite_dropdown)

        # Locate the Sensor dropdown
        sensor_dropdown = wait.until(EC.presence_of_element_located((By.NAME, "sensor")))
        sensor_select = Select(sensor_dropdown)

        # Loop through each Satellite option
        for satellite_option in satellite_select.options:
            satellite_name = satellite_option.text.strip()
            satellite_option.click()
            time.sleep(2)  # Wait for the Sensor dropdown to update

            # Loop through each Sensor option. `.options` is read here, after the
            # satellite has been selected.
            for sensor_option in sensor_select.options:
                sensor_name = sensor_option.text.strip()
                sensor_option.click()
                time.sleep(3)  # Wait for table to update

                # Locate table and extract rows
                table = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="tabledata"]')))
                rows = table.find_elements(By.TAG_NAME, "tr")

                for row in rows:
                    cols = row.find_elements(By.TAG_NAME, "td")
                    if cols:  # rows without <td> cells (e.g. header rows) are skipped
                        data_list.append(
                            [category, satellite_name, sensor_name]
                            + [col.text.strip() for col in cols]
                        )

                print(f"Scraped: {category} > {satellite_name} > {sensor_name}")
finally:
    # Close the browser even if scraping fails part-way through.
    driver.quit()

# Determine the maximum number of columns in data_list. The default of 3 (the
# Category/Satellite/Sensor prefix) keeps an empty scrape from raising ValueError.
max_cols = max((len(row) for row in data_list), default=3)

# Define column names dynamically
column_names = ["Category", "Satellite", "Sensor"] + [f"Col_{i}" for i in range(max_cols - 3)]

# Pad shorter rows with empty values to match the max column length
normalized_data = [row + [""] * (max_cols - len(row)) for row in data_list]

# Convert to DataFrame
df = pd.DataFrame(normalized_data, columns=column_names)

# Convert to a list of per-row dicts
json_data = df.to_dict(orient="records")

Scraped: satellite > EOS-06 > OCM
Scraped: satellite > EOS-06 > SCATTEROMETER
Scraped: satellite > INSAT-3DR > IMAGER
Scraped: satellite > INSAT-3DR > SOUNDER
Scraped: satellite > INSAT-3DS > IMAGER
Scraped: satellite > INSAT-3DS > SOUNDER
Scraped: satellite > SARAL > ALTIMETER
Scraped: satellite > EOS-07 > MHS
Scraped: satellite > INSAT-3A > CCD
Scraped: satellite > INSAT-3A > VHRR
Scraped: satellite > INSAT-3D > IMAGER
Scraped: satellite > INSAT-3D > SOUNDER
Scraped: satellite > KALPANA-1 > VHRR
Scraped: satellite > MEGHATROPIQUES > MADRAS
Scraped: satellite > MEGHATROPIQUES > ROSA
Scraped: satellite > MEGHATROPIQUES > SAPHIR
Scraped: satellite > MEGHATROPIQUES > SCARAB
Scraped: satellite > OCEANSAT-2 > SCATTEROMETER
Scraped: satellite > SCATSAT-1 > SCATTEROMETER
Scraped: in-situ > In-situ > AMS
Scraped: in-situ > In-situ > AWS
Scraped: in-situ > In-situ > AWSAGRI
Scraped: in-situ > In-situ > AWSUPG
Scraped: in-situ > In-situ > IMDAWS
Scraped: radar > RADAR > S BAND DWR CHERRAPUNJI
S

In [7]:
# Build one dict per DataFrame row, leaving out every field whose value is the
# empty string (this covers the "" padding added to short rows as well as cells
# that were empty on the page).
json_data = [
    {column: cell for column, cell in record.items() if cell != ""}
    for record in df.to_dict(orient="records")
]

# Write the cleaned records to disk as indented JSON
with open("mosdac_data.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(json_data, indent=4))