In [2]:
#imports

# requests module allows use of http GET requests to scrape from the NED database
from requests import get as get_request
from requests.models import Response

# bs4 is a web scraping parser for dissecting html
from bs4 import BeautifulSoup

# pandas for manipulating dataframes
from pandas import DataFrame, concat as df_concat

# other standard library utilities
from concurrent.futures import ThreadPoolExecutor, as_completed
from re import search as regex_search
from urllib.parse import urlencode
import logging

In [16]:
NED_URL_STEM = 'https://ned.ipac.caltech.edu/cgi-bin/OBJatt?'
def make_ned_search_url(values: list[tuple[str, str]]) -> str:
    """Build a NED ``OBJatt`` search URL from ``(key, value)`` query pairs.

    The pairs are rendered as ``key=value``, joined with ``&`` in the order
    given, and appended to ``NED_URL_STEM``. Keys and values are
    percent-encoded, so a value containing ``&``, ``=``, ``#`` or a space
    cannot corrupt the query string. Pairs made only of URL-safe characters
    produce exactly the same URL as plain string interpolation.

    Parameters
    ----------
    values : list[tuple[str, str]]
        Query parameters as two-element ``(key, value)`` pairs. Repeated keys
        are allowed and are emitted once per pair.

    Returns
    -------
    str
        The complete search URL; just ``NED_URL_STEM`` when ``values`` is empty.
    """
    # urlencode only accepts a sized sequence of tuples, so normalise any
    # iterable of pairs (lists, generators) into that shape first.
    query_pairs = [tuple(pair) for pair in values]
    return NED_URL_STEM + urlencode(query_pairs)

In [17]:
# Query parameters for the NED search, as (key, value) pairs in request order.
# NOTE(review): the first key is spelled 'delimeter' -- confirm whether NED
# expects 'delimiter'. Left unchanged because it is sent to the server as-is.
search_vals = [
    ('delimeter', 'bar'),
    ('NO_LINKS', '1'),
    ('crosid', 'objname'),
    ('position', 'z'),
    ('gadata', 'morphol'),
    ('M','2121')
]

In [18]:
# Full query URL for the search described by `search_vals`.
url = make_ned_search_url(search_vals)

In [6]:
def get_page_count(string: str):
    """Return the total page count from the "Page X of Y" marker in ``string``.

    ``string`` is parsed as HTML with the ``lxml`` parser. The text of the
    first ``<form>`` element is taken, non-breaking spaces are removed and the
    result is stripped. The first "Page X of Y" marker is then located and its
    trailing number returned as an ``int``.

    If the marker is absent, ``regex_search`` returns ``None`` and the
    following ``.group`` call raises ``AttributeError``.
    """
    form_text = BeautifulSoup(string, 'lxml').find("form").get_text()
    cleaned_text = form_text.replace("\xa0", "").strip()
    # First isolate the whole marker, then pick the number it ends with.
    page_marker = regex_search(r"Page [0-9]+ of [0-9]+", cleaned_text).group(0)
    total_pages = regex_search(r"[0-9]+$", page_marker).group(0)
    return int(total_pages)

In [10]:
def find_table(response: Response):
    """Return the ``<pre>`` element inside the first ``<table>`` of the response body.

    The response text is parsed as HTML with the ``lxml`` parser.
    """
    page = BeautifulSoup(response.text, "lxml")
    first_table = page.find("table")
    return first_table.find("pre")

def remove_header(soup: BeautifulSoup):
    """Remove the first ``<strong>`` element from ``soup`` and return ``soup``.

    The element is removed in place; the same object is returned so the
    function can be used as a pipeline stage.
    """
    header_tag = soup.find("strong")
    header_tag.decompose()
    return soup

def parse_page(soup: BeautifulSoup):
    """Return the text content of ``soup`` split into a list of lines."""
    page_text = soup.get_text()
    return page_text.splitlines()

def extract_data(table_rows: list[str]) -> list[list[str]]:
    """Split bar-delimited text rows into lists of stripped cell strings.

    The first row is discarded. Every remaining row is split on ``"|"`` and
    each cell has its surrounding whitespace stripped.

    Each row is returned as a concrete ``list`` rather than a lazy ``map``
    object, so the result matches the ``list[list[str]]`` that ``data_to_df``
    declares and can be compared, indexed and measured with ``len``.

    Parameters
    ----------
    table_rows : list[str]
        Raw text lines; any iterable of strings is accepted.

    Returns
    -------
    list[list[str]]
        One list of cells per input row after the first; empty when there is
        at most one input row.
    """
    # Materialise first so that one-shot iterables can be sliced.
    data_rows = list(table_rows)[1:]
    return [
        [cell.strip() for cell in row.split("|")]
        for row in data_rows
    ]

# Column labels of the scraped table, in the order the cells appear in a row.
DF_COLS = ["Name", "Redshift", "Classification"]
def data_to_df(table_rows: list[list[str]]):
    """Wrap the parsed rows in a DataFrame whose columns are ``DF_COLS``."""
    return DataFrame(data=table_rows, columns=DF_COLS)

def url_request(url: str):
    """Fetch one results page from ``url`` and return its rows as a DataFrame.

    Steps, in order: HTTP GET, locate the ``<pre>`` listing, remove its
    header element, split the text into lines, split the lines into cells,
    and build the DataFrame.
    """
    response = get_request(url)
    listing = remove_header(find_table(response))
    cell_rows = extract_data(parse_page(listing))
    return data_to_df(cell_rows)

def url_request_pages(url: str, pages: int):
    """Scrape ``pages`` result pages of a search and return one sorted DataFrame.

    Parameters
    ----------
    url : str
        Search URL without a ``page`` parameter.
    pages : int
        Number of result pages to fetch.

    Returns
    -------
    DataFrame
        All rows from all pages, sorted by ``"Name"`` with a fresh 0..n-1 index.

    Notes
    -----
    With ``pages == 1`` the URL is requested unchanged. Otherwise
    ``&page=1`` through ``&page=<pages>`` are requested via ``process_map``
    and the per-page frames are concatenated.

    The frames are concatenated without ``keys``. A fixed three-element
    ``keys`` list makes ``concat`` truncate to three frames, which drops
    every page after the third, and the resulting index level is thrown away
    by ``ignore_index=True`` in any case.
    """
    match pages:
        case 1:
            return (
                url
                |> url_request
                |> .sort_values("Name", ignore_index=True)
            )
        case _:
            return (
                range(1, pages + 1)
                |> map$(page_no => f"{url}&page={page_no}")
                |> process_map$(url_request)
                |> list
                |> df_concat$(?, ignore_index=True)
                |> .sort_values("Name", ignore_index=True)
            )

In [11]:
# Fetch the first results page once, only to read how many pages exist.
pages = get_page_count(get_request(url).text)

In [12]:
# Scrape every results page and display the combined table sorted by name.
url_request_pages(url, pages)

Unnamed: 0,Name,Redshift,Classification
0,ABELL 1139:KYDISC 00016,0.039943,(R)SB(r)0/a
1,ARP 321 NED03,0.021655,SB(s)0/a pec HII
2,CGCG 011-064,0.025708,(R)SB(r)0/a
3,CGCG 012-024,0.022561,(R')SB(r)0/a
4,CGCG 039-041,0.028844,(R)SB(r)0/a
...,...,...,...
250,UGC 12433,0.023299,SB0/a
251,VCC 0513,0.006105,SB0/a pec? HII
252,WISEA J001826.80+162544.6,0.563520,
253,WISEA J002634.25+170821.7,0.404000,SBa/S0
