In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from astroquery.gaia import Gaia
from astroquery.simbad import Simbad
from astropy.coordinates import SkyCoord
import astropy.units as u
import numpy as np
import pyvo as vo
import pickle
from astroquery.vizier import Vizier
import concurrent.futures
import requests
import os
from tqdm import tqdm
import concurrent.futures
import requests
import time
from tqdm import tqdm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

## Defining the main functions to download the spectra

In [2]:
# Function to download a file with retries
def download_file(url):
    file_name = os.path.join(save_folder, url.split('/')[-1])
    
    # If the file already exists, return success
    if os.path.exists(file_name):
        return True, file_name
    
    # Retry mechanism
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
    session = requests.Session()
    session.mount('https://', HTTPAdapter(max_retries=retries))

    try:
        response = session.get(url)
        response.raise_for_status()  # Check for HTTP errors
        
        # Save the file content
        with open(file_name, 'wb') as f:
            f.write(response.content)
        
        return True, file_name  # Success
    except requests.HTTPError as e:
        return False, f"Failed to download {url}: {e}"  # Failure
    except Exception as e:
        return False, f"Other error occurred: {e}"

# Main function to download all files
def download_all_files(urls):
    success_count = 0
    failure_count = 0

    # Reduce the number of threads to avoid overwhelming the network
    with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
        # Create a progress bar with the total number of URLs
        with tqdm(total=len(urls), desc="Downloading", unit="file") as pbar:
            futures = {executor.submit(download_file, url): url for url in urls}
            
            for future in concurrent.futures.as_completed(futures):
                success, result = future.result()
                pbar.update(1)  # Update the progress bar for each completed download
                
                if success:
                    success_count += 1
                else:
                    failure_count += 1
                    #print(f"Error: {result}")

    # Print final counts for successful and failed downloads
    print(f"\nDownload complete! Successfully downloaded: {success_count}, Failed: {failure_count}")

##

## Applying function to non-AGN Galaxies

In [8]:
# Load your GAL data and LAMOST catalog (assuming you have a local CSV or FITS file for LAMOST)
gal_data = pd.read_pickle("gal_data.pkl")  # Loaded GAL data
lamost_catalog = pd.read_csv("dr9_v2.0_LRS_catalogue.csv")  # Assuming CSV format for LAMOST catalog

# Ensure that RA and Dec columns are numeric and have units
gal_data['ra'] = pd.to_numeric(gal_data['ra'], errors='coerce')  # Coerce non-numeric to NaN
gal_data['dec'] = pd.to_numeric(gal_data['dec'], errors='coerce')
lamost_catalog['ra'] = pd.to_numeric(lamost_catalog['ra'], errors='coerce')
lamost_catalog['dec'] = pd.to_numeric(lamost_catalog['dec'], errors='coerce')

# Drop rows with NaN values in RA or Dec
gal_data = gal_data.dropna(subset=['ra', 'dec'])
lamost_catalog = lamost_catalog.dropna(subset=['ra', 'dec'])

# Convert GAL and LAMOST data to SkyCoord objects for crossmatching
gal_coords = SkyCoord(ra=gal_data['ra'].values*u.deg, dec=gal_data['dec'].values*u.deg)
lamost_coords = SkyCoord(ra=lamost_catalog['ra'].values*u.deg, dec=lamost_catalog['dec'].values*u.deg)

# Perform the crossmatch using astropy's match_to_catalog_sky function
idx, d2d, _ = gal_coords.match_to_catalog_sky(lamost_coords)

# Define a matching radius
match_radius = 1 * u.arcsec
matches = d2d < match_radius

# Filter the matches
matched_gal = gal_data.iloc[matches]
matched_lamost = lamost_catalog.iloc[idx[matches]]

# Combine matched data
gal_lamost_data = pd.concat([matched_gal.reset_index(drop=True), matched_lamost.reset_index(drop=True)], axis=1)

# Save the crossmatched data
gal_lamost_data.to_pickle("gal_lamost_data.pkl")

print(f"Number of matches: {gal_lamost_data.shape[0]}")
print("out of ", gal_data.shape[0])
del gal_data, lamost_catalog

Number of matches: 2533
out of  33531


In [4]:
# Load the crossmatched data
gal_lamost_data = pd.read_pickle("gal_lamost_data.pkl")
obsid_list = gal_lamost_data['obsid'].values

# Specify the folder to save the files
save_folder = "gal_spectra"

# Create the folder if it doesn't exist
if not os.path.exists(save_folder):
    os.makedirs(save_folder)

# List of URLs to download (assuming obsid_list is available)
urls = [f"https://www.lamost.org/dr7/v2.0/spectrum/fits/{obsid}" for obsid in obsid_list]

# Call the function to start downloading
download_all_files(urls)

# Free up memory by deleting the large DataFrame
del gal_lamost_data

Downloading: 100%|██████████| 2533/2533 [01:21<00:00, 31.17file/s]


Download complete! Successfully downloaded: 2099, Failed: 434





## Applying function to Binaries data

In [8]:
# Load your BIN data and LAMOST catalog (assuming you have a local CSV or FITS file for LAMOST)
bin_data = pd.read_pickle("bin_data.pkl")  # Loaded BIN data
lamost_catalog = pd.read_csv("dr9_v2.0_LRS_catalogue.csv")  # Assuming CSV format for LAMOST catalog

# Ensure that RA and Dec columns are numeric and have units
bin_data['ra'] = pd.to_numeric(bin_data['ra'], errors='coerce')  # Coerce non-numeric to NaN
bin_data['dec'] = pd.to_numeric(bin_data['dec'], errors='coerce')   
lamost_catalog['ra'] = pd.to_numeric(lamost_catalog['ra'], errors='coerce')
lamost_catalog['dec'] = pd.to_numeric(lamost_catalog['dec'], errors='coerce')

# Drop rows with NaN values in RA or Dec
bin_data = bin_data.dropna(subset=['ra', 'dec'])
lamost_catalog = lamost_catalog.dropna(subset=['ra', 'dec'])

# Convert BIN and LAMOST data to SkyCoord objects for crossmatching
bin_coords = SkyCoord(ra=bin_data['ra'].values*u.deg, dec=bin_data['dec'].values*u.deg)
lamost_coords = SkyCoord(ra=lamost_catalog['ra'].values*u.deg, dec=lamost_catalog['dec'].values*u.deg)

# Perform the crossmatch using astropy's match_to_catalog_sky function
idx, d2d, _ = bin_coords.match_to_catalog_sky(lamost_coords)

# Define a matching radius
match_radius = 1 * u.arcsec
matches = d2d < match_radius

# Filter the matches
matched_bin = bin_data.iloc[matches]
matched_lamost = lamost_catalog.iloc[idx[matches]]

# Combine matched data
bin_lamost_data = pd.concat([matched_bin.reset_index(drop=True), matched_lamost.reset_index(drop=True)], axis=1)

# Save the crossmatched data
bin_lamost_data.to_pickle("bin_lamost_data.pkl")

print(f"Number of matches: {bin_lamost_data.shape[0]}")
print("out of ", bin_data.shape[0])
del bin_data, lamost_catalog


Number of matches: 45070
out of  1700440


In [9]:
# Load the crossmatched data
bin_lamost_data = pd.read_pickle("bin_lamost_data.pkl")
obsid_list = bin_lamost_data['obsid'].values

# Specify the folder to save the files
save_folder = "bin_spectra"

# Create the folder if it doesn't exist
if not os.path.exists(save_folder):
    os.makedirs(save_folder)

# List of URLs to download (assuming obsid_list is available)
urls = [f"https://www.lamost.org/dr7/v2.0/spectrum/fits/{obsid}" for obsid in obsid_list]

# Call the function to start downloading
download_all_files(urls)

# Free up memory by deleting the large DataFrame
del bin_lamost_data

Downloading: 100%|██████████| 45070/45070 [07:49<00:00, 96.03file/s] 


Download complete! Successfully downloaded: 41079, Failed: 3991





## Applying function to Star data

In [10]:
# Load your STAR data and LAMOST catalog (assuming you have a local CSV or FITS file for LAMOST)
star_data = pd.read_pickle("star_data.pkl")  # Loaded STAR data
lamost_catalog = pd.read_csv("dr9_v2.0_LRS_catalogue.csv")  # Assuming CSV format for LAMOST catalog

# Ensure that RA and Dec columns are numeric and have units
star_data['ra'] = pd.to_numeric(star_data['ra'], errors='coerce')  # Coerce non-numeric to NaN
star_data['dec'] = pd.to_numeric(star_data['dec'], errors='coerce')
lamost_catalog['ra'] = pd.to_numeric(lamost_catalog['ra'], errors='coerce')
lamost_catalog['dec'] = pd.to_numeric(lamost_catalog['dec'], errors='coerce')

# Drop rows with NaN values in RA or Dec
star_data = star_data.dropna(subset=['ra', 'dec'])
lamost_catalog = lamost_catalog.dropna(subset=['ra', 'dec'])

# Convert STAR and LAMOST data to SkyCoord objects for crossmatching
star_coords = SkyCoord(ra=star_data['ra'].values*u.deg, dec=star_data['dec'].values*u.deg)
lamost_coords = SkyCoord(ra=lamost_catalog['ra'].values*u.deg, dec=lamost_catalog['dec'].values*u.deg)

# Perform the crossmatch using astropy's match_to_catalog_sky function
idx, d2d, _ = star_coords.match_to_catalog_sky(lamost_coords)

# Define a matching radius
match_radius = 1 * u.arcsec
matches = d2d < match_radius

# Filter the matches
matched_star = star_data.iloc[matches]
matched_lamost = lamost_catalog.iloc[idx[matches]]

# Combine matched data
star_lamost_data = pd.concat([matched_star.reset_index(drop=True), matched_lamost.reset_index(drop=True)], axis=1)

# Save the crossmatched data
star_lamost_data.to_pickle("star_lamost_data.pkl")

print(f"Number of matches: {star_lamost_data.shape[0]}")
print("out of ", star_data.shape[0])
del star_data, lamost_catalog

Number of matches: 94651
out of  1499508


In [11]:
# Load the crossmatched data
star_lamost_data = pd.read_pickle("star_lamost_data.pkl")
obsid_list = star_lamost_data['obsid'].values

# Specify the folder to save the files
save_folder = "star_spectra"

# Create the folder if it doesn't exist
if not os.path.exists(save_folder):
    os.makedirs(save_folder)

# List of URLs to download (assuming obsid_list is available)
urls = [f"https://www.lamost.org/dr7/v2.0/spectrum/fits/{obsid}" for obsid in obsid_list]

# Call the function to start downloading
download_all_files(urls)

# Free up memory by deleting the large DataFrame
del star_lamost_data

Downloading: 100%|██████████| 94651/94651 [07:30<00:00, 209.87file/s] 


Download complete! Successfully downloaded: 86484, Failed: 8167





## Applying function to AGN data

In [12]:
# Load your AGN data and LAMOST catalog (assuming you have a local CSV or FITS file for LAMOST)
agn_data = pd.read_pickle("agn_data.pkl")  # Loaded AGN data
lamost_catalog = pd.read_csv("dr9_v2.0_LRS_catalogue.csv")  # Assuming CSV format for LAMOST catalog

# Ensure that RA and Dec columns are numeric and have units
agn_data['ra'] = pd.to_numeric(agn_data['ra'], errors='coerce')  # Coerce non-numeric to NaN
agn_data['dec'] = pd.to_numeric(agn_data['dec'], errors='coerce')
lamost_catalog['ra'] = pd.to_numeric(lamost_catalog['ra'], errors='coerce')
lamost_catalog['dec'] = pd.to_numeric(lamost_catalog['dec'], errors='coerce')

# Drop rows with NaN values in RA or Dec
agn_data = agn_data.dropna(subset=['ra', 'dec'])
lamost_catalog = lamost_catalog.dropna(subset=['ra', 'dec'])

# Convert AGN and LAMOST data to SkyCoord objects for crossmatching
agn_coords = SkyCoord(ra=agn_data['ra'].values*u.deg, dec=agn_data['dec'].values*u.deg)
lamost_coords = SkyCoord(ra=lamost_catalog['ra'].values*u.deg, dec=lamost_catalog['dec'].values*u.deg)

# Perform the crossmatch using astropy's match_to_catalog_sky function
idx, d2d, _ = agn_coords.match_to_catalog_sky(lamost_coords)

# Define a matching radius
match_radius = 1 * u.arcsec
matches = d2d < match_radius

# Filter the matches
matched_agn = agn_data.iloc[matches]
matched_lamost = lamost_catalog.iloc[idx[matches]]

# Combine matched data
agn_lamost_data = pd.concat([matched_agn.reset_index(drop=True), matched_lamost.reset_index(drop=True)], axis=1)

# Save the crossmatched data
agn_lamost_data.to_pickle("agn_lamost_data.pkl")

print(f"Number of matches: {agn_lamost_data.shape[0]}")
print("out of ", agn_data.shape[0])
del agn_data, lamost_catalog

Number of matches: 40138
out of  412025


In [13]:
# Load the crossmatched data
agn_lamost_data = pd.read_pickle("agn_lamost_data.pkl")
obsid_list = agn_lamost_data['obsid'].values

# Specify the folder to save the files
save_folder = "agn_spectra"

# Create the folder if it doesn't exist
if not os.path.exists(save_folder):
    os.makedirs(save_folder)

# List of URLs to download (assuming obsid_list is available)
urls = [f"https://www.lamost.org/dr7/v2.0/spectrum/fits/{obsid}" for obsid in obsid_list]

# Call the function to start downloading
download_all_files(urls)

# Free up memory by deleting the large DataFrame
del agn_lamost_data

Downloading: 100%|██████████| 40138/40138 [09:01<00:00, 74.06file/s]  


Download complete! Successfully downloaded: 36336, Failed: 3802



