In [5]:
#imports

# requests module allows use of http GET requests to scrape from the NED database
from requests import get as get_request
from requests.models import Response

# bs4 is a web scraping parser for dissecting html
from bs4 import BeautifulSoup

# pandas for manipulating dataframes
from pandas import DataFrame, concat as df_concat

# other standard library utilities
from concurrent.futures import ThreadPoolExecutor, as_completed
from re import search as regex_search
from itertools import starmap
from typing import Any
from urllib.parse import urlencode
import logging

In [6]:
def make_ned_search_url(values: list[tuple[str, str]]) -> str:
    """Build an NED ``OBJatt`` query URL from ``(parameter, value)`` pairs.

    Parameters
    ----------
    values
        Ordered ``(name, value)`` pairs; the order is preserved in the URL.

    Returns
    -------
    str
        The base endpoint followed by ``?`` and the encoded query string.
        An empty ``values`` list yields the bare endpoint ending in ``?``.
    """
    # urlencode percent-escapes names and values, so spaces, '&' or '='
    # inside a value can no longer corrupt the query string. For plain
    # alphanumeric values the result is identical to a manual '&'.join.
    return "https://ned.ipac.caltech.edu/cgi-bin/OBJatt?" + urlencode(values)

In [7]:
# Query parameters for the NED search, kept as ordered (name, value) pairs.
# NOTE(review): 'delimeter' is spelled differently from the 'delimiter' used in
# the hand-written new_url elsewhere in this notebook - confirm which one NED expects.
search_vals = list({
    'delimeter': 'bar',
    'NO_LINKS': '1',
    'crosid': 'objname',
    'position': 'z',
    'gadata': 'morphol',
    'M': '2121',
}.items())

In [8]:
# Build the query URL from search_vals; the bare name on the last line makes
# the notebook echo the resulting string.
search_url = make_ned_search_url(search_vals)
search_url

'https://ned.ipac.caltech.edu/cgi-bin/OBJatt?delimeter=bar&NO_LINKS=1&crosid=objname&position=z&gadata=morphol&M=2121'

In [9]:
def get_page_count(string: str) -> int:
    """Read the total number of result pages from an NED results page.

    The first ``<form>`` element of the HTML is searched for a marker of the
    shape ``Page N of M``; ``M`` is returned.

    Parameters
    ----------
    string
        HTML text of a results page.

    Returns
    -------
    int
        The ``M`` from the first ``Page N of M`` marker.

    Raises
    ------
    ValueError
        If the HTML has no ``<form>`` element, or the form text contains no
        ``Page N of M`` marker.
    """
    soup = BeautifulSoup(string, "lxml")
    form = soup.find("form")
    if form is None:
        raise ValueError("no <form> element found; cannot determine the page count")

    # Non-breaking spaces are stripped out before matching, as before.
    text = form.get_text().replace("\xa0", "").strip()

    # One search with a capture group replaces the previous two-step search.
    match = regex_search(r"Page [0-9]+ of ([0-9]+)", text)
    if match is None:
        raise ValueError('no "Page N of M" marker found in the results form')
    return int(match.group(1))

In [10]:
def request_page(url: str, timeout: float = 30.0) -> Response:
    """Issue an HTTP GET for ``url`` and return the response.

    Parameters
    ----------
    url
        Address to fetch.
    timeout
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the calling thread indefinitely.

    Returns
    -------
    Response
        The successful response.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.exceptions.RequestException
        For timeouts and connection-level failures raised by ``requests``.
    """
    # Lazy %-formatting: the message is only built if INFO logging is enabled.
    logging.info("requesting: %s", url)
    response = get_request(url, timeout=timeout)
    # Fail here with a clear HTTP error instead of handing an error page to the parser.
    response.raise_for_status()
    return response

def parse_page(response: Response) -> list[str]:
    """Pull the raw text rows out of a results page.

    Looks up the first ``<pre>`` inside the first ``<table>`` of the response
    HTML, removes the first ``<strong>`` element found inside it, and returns
    the remaining text split into lines with the first line dropped.
    """
    document = BeautifulSoup(response.text, "lxml")
    preformatted = document.find("table").find("pre")

    # Remove the <strong> element so its text is not part of the rows
    # (presumably the column heading - confirm against a live page).
    heading = preformatted.find("strong")
    heading.decompose()

    lines = preformatted.get_text().splitlines()
    return lines[1:]
    
def extract_data(table_rows: list[str]) -> list[list[str]]:
    """Split each ``|``-delimited row into a list of whitespace-trimmed fields."""
    parsed = []
    for line in table_rows:
        parsed.append([field.strip() for field in line.split("|")])
    return parsed

# Column names assigned, in order, to the fields of each parsed row.
df_cols = ["Name", "Redshift", "Classification"]
def data_to_df(table_rows: list[list[str]]) -> DataFrame:
    """Convert parsed rows into a DataFrame with the ``df_cols`` columns.

    Parameters
    ----------
    table_rows
        One list of field strings per row. Fields are paired with ``df_cols``
        positionally; extra fields are ignored and missing ones become NaN.

    Returns
    -------
    DataFrame
        One row per input row, columns ``df_cols``, default integer index.
        An empty input yields an empty frame that still has the columns.
    """
    # Build every record first and construct the frame once; appending with
    # df.loc[len(df)] re-allocates the frame for each row (quadratic cost).
    records = [dict(zip(df_cols, row)) for row in table_rows]
    return DataFrame(records, columns=df_cols)

def pipeline(*funcs: Any) -> Any:
    """Thread a value through a sequence of single-argument callables.

    The first positional argument is the initial value; every following
    argument is called with the result of the previous step.

    Returns
    -------
    Any
        The output of the last callable, or the initial value unchanged when
        no callables are given.

    Raises
    ------
    IndexError
        If called with no arguments at all.
    """
    val = funcs[0]
    for func in funcs[1:]:
        val = func(val)
    return val

def request_pipeline(url: str) -> DataFrame:
    """Download one results page and turn it into a DataFrame.

    Runs the four stages in order: request the page, pull out its text rows,
    split the rows into fields, and load the fields into a DataFrame.
    """
    response = request_page(url)
    raw_rows = parse_page(response)
    fields = extract_data(raw_rows)
    return data_to_df(fields)

# https://docs.python.org/3/library/concurrent.futures.html
def request_all_pages(url: str, pages: int, max_workers: int = 5) -> DataFrame:
    """Download every results page for ``url`` and merge them into one table.

    Parameters
    ----------
    url
        Base search URL, without a ``page`` parameter.
    pages
        Number of result pages. When it is 1 the URL is requested as-is;
        otherwise ``&page=N`` is appended for ``N`` in ``1..pages``.
    max_workers
        Size of the thread pool used to fetch pages concurrently.

    Returns
    -------
    DataFrame
        All successfully fetched rows, sorted by ``Name`` with a fresh
        integer index. Empty (with the ``df_cols`` columns) if no page could
        be fetched.

    Notes
    -----
    A page that fails to download or parse is reported and skipped, so the
    result may be incomplete.
    """
    if pages == 1:
        # Single page: no &page= suffix. Still goes through the common
        # concat/sort below so the return type is always a DataFrame.
        page_frames = [request_pipeline(url)]
    else:
        page_urls = [f"{url}&page={page}" for page in range(1, pages + 1)]
        page_frames = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {
                executor.submit(request_pipeline, page_url): page_url
                for page_url in page_urls
            }
            for future in as_completed(future_to_url):
                page_url = future_to_url[future]
                try:
                    page_frames.append(future.result())
                except Exception as exc:
                    # Best-effort: report the failing page and keep the rest.
                    print(f"Bad response from {page_url}:\n\n{str(exc)}")

    if not page_frames:
        # concat raises on an empty list; hand back an empty table instead.
        return DataFrame(columns=df_cols)

    # No `keys=`: a fixed three-element key list only matches exactly three
    # pages, and the resulting index level is discarded by ignore_index anyway.
    return df_concat(page_frames).sort_values("Name", ignore_index=True)

In [3]:
# Hand-written query URL. Compared with search_url it spells the parameter
# "delimiter", adds attdat_CON=M and has no gadata=morphol.
new_url = "https://ned.ipac.caltech.edu/cgi-bin/OBJatt?delimiter=bar&NO_LINKS=1&crosid=objname&position=z&attdat_CON=M&M=2121"

In [17]:
# Fetch new_url once to read the page count, then download every page.
# Despite its name, `pages` receives whatever request_all_pages returns
# (the scraped rows), not a page count.
pages = request_all_pages(new_url, get_page_count(get_request(new_url).text))

In [18]:
# Echo the scraped result as the cell output.
pages

Unnamed: 0,Name,Redshift,Classification
0,ABELL 1139:KYDISC 00016,0.039943,(R)SB0/a(r)
1,ARP 321 NED03,0.021655,SB0/a(s) pec?
2,CGCG 011-064,0.025708,(R)SB0/a(r)
3,CGCG 012-024,0.022561,(R')SB0/a(r)
4,CGCG 039-041,0.028844,(R)SB0/a(r)
...,...,...,...
250,UGC 12433,0.023299,SB0/a
251,VCC 0513,0.006105,SB0/a pec?
252,WISEA J001826.80+162544.6,0.563520,SB0/a
253,WISEA J002634.25+170821.7,0.404000,SB0/a


In [14]:
# Fetch a single page for the search_vals-based URL (no &page= suffix).
response = request_page(search_url)

In [16]:
# Parse that response into rows of stripped fields; the notebook echoes the
# resulting list of lists.
extract_data(parse_page(response))

[['UGC 00013', '0.025748', "(R')SB(s)0/a   LINER"],
 ['UGC 00033', '0.017869', 'SB0/a'],
 ['UGC 00141', '0.022729', 'SB0/a'],
 ['ESO 350- G 007', '0.025304', "(R)SB(r'l)0/a"],
 ['WISEA J001826.80+162544.6', '0.563520', ''],
 ['CL 0016+1609:[DG92] 103', '', 'Sa/S0'],
 ['CL 0016+1609:[DG92] 057', '', ''],
 ['MCG -02-01-050', '0.022579', "(R')SB(r)0/a?"],
 ['NGC 0078A', '0.016921', 'SB(r)0/a:'],
 ['NGC 0088', '0.011284', 'SB(rs)0/a: pec LINER'],
 ['NGC 0089', '0.010964', 'SB(s)0/a? pec  LINER'],
 ['WISEA J002634.25+170821.7', '0.404000', 'SBa/S0'],
 ['ZwCl 0024.0+1652:[DG92] 197', '0.388000', 'SBa/0'],
 ['UGC 00274', '0.022285', 'SB0/a'],
 ['NGC 0159', '0.027976', '(R)SB(r)0/a'],
 ['NGC 0174', '0.011905', 'SB(rs)0/a      HII'],
 ['NGC 0186', '0.015551', 'SB0+ pec:'],
 ['ESO 242- G 024', '0.012322', "(R'_1)SB(s)a"],
 ['NGC 0223', '0.017876', "(R')SB(r)0/a:"],
 ['NGC 0314', '0.018730', '(R)SB(r)0^+'],
 ['NGC 0320', '0.018379', '(R)SB(rs)0/a:  HII'],
 ['NGC 0351', '0.013874', "(R')SB(r)0/a: 