In [None]:
import os
import json
import requests

from tqdm import tqdm
from bs4 import BeautifulSoup
from datetime import datetime
from dotenv import load_dotenv

# Load settings from a local .env file, if present. NOTE(review): presumably this
# is where GOOGLE_API_KEY (read via os.getenv further down) is kept — confirm.
load_dotenv()

## Set up the Gemini model

In [None]:
import google.generativeai as genai

# Configure the Gemini client with the key read from the GOOGLE_API_KEY
# environment variable; os.getenv returns None when the variable is unset.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

In [None]:
import pprint

# Print every model exposed to this API key. The loop variable is deliberately
# NOT called `model`: that name is used by the notebook for the GenerativeModel
# instance, and re-running this cell would otherwise overwrite it with a model
# description object.
for available_model in genai.list_models():
    pprint.pprint(available_model)

In [None]:
# Create the Gemini model handle used by the prompt experiment in the next cell.
# The bare `model` expression on the last line displays its repr as cell output.
model = genai.GenerativeModel("models/gemini-2.5-flash")
model

In [None]:
import textwrap

# Sample credit block used to try out the extraction prompt.
html_code_to_process = textwrap.dedent("""
    <center>
    <b> A Double Detonation Supernova </b> <br>
    <b> Image Credit: </b>
    <a href="https://www.eso.org/">ESO</a>,
    <a href="https://www.unsw.edu.au/hdr/priyam-das">P. Das</a> et al.;
    Background stars
    (<a href="https://www.nasa.gov/">NASA</a>/<a href="https://science.nasa.gov/mission/hubble/">Hubble</a>): K. Noll et al.
    </center>
    """)

# The template is dedented BEFORE the HTML is inserted. Dedenting after
# interpolation does nothing, because the inserted HTML has lines at column 0
# and therefore no common leading whitespace remains to strip.
prompt_template = textwrap.dedent("""
    You are an expert text extraction AI. Your sole purpose is to extract author and credit information from a snippet of HTML code.

    Follow these rules precisely:
    1.  Find the text that comes **after** the "Image Credit" or "Illustration Credit" line.
    2.  Extract all names and sources, including the text inside `<a>` tags.
    3.  Combine everything into a **single string**.
    4.  Replace any semicolons (`;`) with commas (`,`).

    **CRITICAL:** Your response must **only** be the final extracted string. Do not write explanations, code, or any other text.

    ---

    ### Example 1

    **Input HTML:**
    ```html
    <center>
    <b> The Great Globular Cluster in Hercules </b> <br/>
    <b>Image Credit &amp;
    <a href="lib/about_apod.html#srapply">Copyright</a>:</b>
    <a href="https://www.distant-luminosity.com/about.html">Jan Beckmann, Julian Zoller, Lukas Eisert, Wolfgang Hummel</a>
    </center>
    ```

    **Output:**
    `Jan Beckmann, Julian Zoller, Lukas Eisert, Wolfgang Hummel`

    -----

    ### Example 2

    **Input HTML:**

    ```html
    <center>
    <b> NGC 602: Oyster Star Cluster </b> <br>
    <b> Image Credit: </b>
    X-ray: Chandra: NASA/CXC/Univ.Potsdam/L.Oskinova et al; <br>
    Optical: Hubble: NASA/STScI; Infrared: Spitzer: NASA/JPL-Caltech
    </center>
    ```

    **Output:**
    `X-ray: Chandra: NASA/CXC/Univ.Potsdam/L.Oskinova et al, Optical: Hubble: NASA/STScI, Infrared: Spitzer: NASA/JPL-Caltech`

    -----

    Now, process the following HTML.

    **Input HTML:**

    ```html
    {html_code_to_process}
    ```

    **Output:**
    """)

# str.format only parses the template, so braces inside the HTML are safe.
system_prompt = prompt_template.format(html_code_to_process=html_code_to_process)

response = model.generate_content(system_prompt)
print(response.text)

In [None]:
import time


def scrapeAPODWebsite():
    """Scrape new APOD entries and append them to ``../data/apod_data.json``.

    Archive links are processed in page order. Entries without an image
    (``image_url`` is None, i.e. video posts) are skipped. Scraping stops at the
    first entry whose date is already stored. The file is rewritten after every
    new entry so an interrupted run keeps what it has collected.

    Each scrape is throttled with a sliding 60-second window of at most 10
    scrapes, because every scrape may call the Gemini API, which is limited to
    15 requests per minute. Errors for a single entry are printed and the loop
    moves on to the next entry.
    """
    print("Scraping the APOD website.")
    createOutputFileIfNotExists()

    data_path = "../data/apod_data.json"
    max_calls_per_window = 10  # limit is 15 requests per minute; keep a margin
    window_seconds = 60

    # Load the stored entries once instead of re-reading the file per link.
    with open(data_path, "r") as file:
        data = json.load(file)

    a_tags = get_a_tags()
    call_times = []  # start times of the scrapes made within the current window
    for tag in tqdm(a_tags, total=len(a_tags)):
        # Throttle BEFORE scraping so skipped entries and failures are counted
        # too: they may have reached the Gemini API as well.
        while True:
            now = time.time()
            call_times = [t for t in call_times if now - t < window_seconds]
            if len(call_times) < max_calls_per_window:
                break
            # Wait until the oldest call in the window is a full window old.
            time.sleep(window_seconds - (now - call_times[0]))
        call_times.append(now)

        try:
            item = scrape_a_tag(tag)
            if item["image_url"] is None:
                # This means that this is a video link, we don't want to include it
                continue

            if itemExistsInData(data, item):
                print(f"Item for date {item['date']} already exists in the data. Stop scraping.")
                break

            data.append(item)

            # Write the updated data back to the file
            with open(data_path, "w") as file:
                json.dump(data, file, indent=4)
        except Exception as e:
            # Best effort: report the problem and carry on with the next link.
            print(f"An error occurred while scraping: {e}")
            continue


def createOutputFileIfNotExists(path="../data/apod_data.json"):
    """Create ``path`` containing an empty JSON list unless it already exists.

    Missing parent directories are created first, so a fresh checkout without a
    data directory does not fail with FileNotFoundError. An existing file is
    left untouched.

    Args:
        path: Location of the JSON output file. Defaults to the notebook's
            data file.
    """
    if os.path.exists(path):
        return

    parent_dir = os.path.dirname(path)
    if parent_dir:  # empty when `path` is a bare file name in the current directory
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, "w") as file:
        file.write("[]")


def get_a_tags():
    """Download the APOD archive index and return its entry links.

    Returns:
        The ``<a>`` tags found inside the second ``<b>`` element of the page.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-success HTTP status.
        IndexError: If the page contains fewer than two ``<b>`` elements.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get("https://apod.nasa.gov/apod/archivepix.html", timeout=30)
    # Fail loudly on an error page instead of parsing it as if it were the archive.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # NOTE(review): presumably the second <b> wraps the archive listing —
    # verify against the current page markup.
    b_tag = soup.find_all("b")[1]
    return b_tag.find_all("a")


def scrape_a_tag(a_tag):
    """Build the record for one archive link.

    Args:
        a_tag: An ``<a>`` tag from the archive index. The text node right before
            it is used as the date and its ``href`` is relative to the APOD
            base URL.

    Returns:
        A dict with the keys ``date``, ``title``, ``url``, ``image_url``,
        ``explanation`` and ``authors``. When the page has no image,
        ``image_url`` and ``authors`` are both None.
    """
    date = a_tag.find_previous(string=True).strip()
    title = a_tag.text.strip()
    url = f"https://apod.nasa.gov/apod/{a_tag['href']}"
    image, explanation = get_image_and_explanation(url)

    # Entries without an image are discarded by the scraping loop, so do not
    # spend a second page download and a rate-limited Gemini call on them.
    authors = get_authors(url) if image is not None else None

    return {
        "date": date,
        "title": title,
        "url": url,
        "image_url": image,
        "explanation": explanation,
        "authors": authors,
    }


def get_image_and_explanation(url):
    """Fetch an APOD page and return its image URL and explanation text.

    Args:
        url: Absolute URL of the APOD entry page.

    Returns:
        A tuple ``(img_url, explanation)``. ``img_url`` is None when the page
        has no ``<img>`` tag or the tag has no usable ``src``. ``explanation``
        is the text of the third ``<p>`` element.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-success HTTP status.
        IndexError: If the page has fewer than three ``<p>`` elements.
    """
    response = requests.get(url, timeout=30)  # avoid hanging on a stalled connection
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    explanation = soup.find_all("p")[2].get_text()

    # Explicit checks instead of a catch-all except: only a missing tag or a
    # missing src should turn into "no image".
    img_tag = soup.find("img")
    img_src = img_tag.get("src") if img_tag is not None else None
    img_url = f"https://apod.nasa.gov/apod/{img_src}" if img_src else None

    return img_url, explanation


def get_authors(url):
    """Fetch an APOD page and return the credit string extracted by Gemini.

    Args:
        url: Absolute URL of the APOD entry page.

    Returns:
        Whatever ``extract_authors_with_gemini`` returns for the second
        ``<center>`` element of the page.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-success HTTP status.
        IndexError: If the page has fewer than two ``<center>`` elements.
    """
    response = requests.get(url, timeout=30)  # avoid hanging on a stalled connection
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # NOTE(review): presumably the second <center> holds the title/credit
    # block — verify against the page markup.
    credit_center_tag = soup.find_all("center")[1]
    return extract_authors_with_gemini(credit_center_tag)


def extract_authors_with_gemini(html_code_to_process):
    """Ask Gemini to extract the credit line from an APOD credit block.

    Args:
        html_code_to_process: The HTML snippet to analyse. It is converted with
            ``str()``, so a BeautifulSoup tag or a plain string both work.

    Returns:
        The model's reply with surrounding whitespace and backticks removed.
    """
    # The template is dedented BEFORE the HTML is inserted. Dedenting after
    # interpolation does nothing whenever the HTML has lines at column 0,
    # because no common leading whitespace remains to strip.
    prompt_template = textwrap.dedent("""
    You are an expert text extraction AI. Your sole purpose is to extract author and credit information from a snippet of HTML code.

    Follow these rules precisely:
    1.  Find the text that comes **after** the "Image Credit" or "Illustration Credit" line.
    2.  Extract all names and sources, including the text inside `<a>` tags.
    3.  Combine everything into a **single string**.
    4.  Replace any semicolons (`;`) with commas (`,`).

    **CRITICAL:** Your response must **only** be the final extracted string. Do not write explanations, code, or any other text.

    ---

    ### Example 1

    **Input HTML:**
    ```html
    <center>
    <b> The Great Globular Cluster in Hercules </b> <br/>
    <b>Image Credit &amp;
    <a href="lib/about_apod.html#srapply">Copyright</a>:</b>
    <a href="https://www.distant-luminosity.com/about.html">Jan Beckmann, Julian Zoller, Lukas Eisert, Wolfgang Hummel</a>
    </center>
    ```

    **Output:**
    `Jan Beckmann, Julian Zoller, Lukas Eisert, Wolfgang Hummel`

    -----

    ### Example 2

    **Input HTML:**

    ```html
    <center>
    <b> NGC 602: Oyster Star Cluster </b> <br>
    <b> Image Credit: </b>
    X-ray: Chandra: NASA/CXC/Univ.Potsdam/L.Oskinova et al; <br>
    Optical: Hubble: NASA/STScI; Infrared: Spitzer: NASA/JPL-Caltech
    </center>
    ```

    **Output:**
    `X-ray: Chandra: NASA/CXC/Univ.Potsdam/L.Oskinova et al, Optical: Hubble: NASA/STScI, Infrared: Spitzer: NASA/JPL-Caltech`

    -----

    Now, process the following HTML.

    **Input HTML:**

    ```html
    {html_code_to_process}
    ```

    **Output:**
    """)

    # str.format only parses the template, so braces inside the HTML are safe.
    system_prompt = prompt_template.format(html_code_to_process=str(html_code_to_process))

    model = genai.GenerativeModel("models/gemini-2.5-flash")
    response = model.generate_content(system_prompt)

    # The examples show the answer wrapped in backticks, so the model may echo
    # them; drop those and any stray whitespace before the value is stored.
    return response.text.strip().strip("`").strip()


def convert_date(date):
    """Parse a date string such as ``"2025 July 14:"`` into a ``datetime.date``.

    Every colon is removed first; the remainder must match ``%Y %B %d``,
    otherwise ``datetime.strptime`` raises ValueError.
    """
    without_colons = "".join(date.split(":"))
    parsed = datetime.strptime(without_colons, "%Y %B %d")
    return parsed.date()


def itemExistsInData(data, item):
    """Return True when some entry in ``data`` has the same ``"date"`` as ``item``.

    Args:
        data: Iterable of dicts, each with a ``"date"`` key.
        item: Dict whose ``"date"`` value is looked up.
    """
    return any(entry["date"] == item["date"] for entry in data)

In [None]:
# Run the scraper: new entries are appended to ../data/apod_data.json and the
# run stops at the first date that is already stored there.
scrapeAPODWebsite()