In [1]:
import os
import json
import requests

from tqdm import tqdm
from bs4 import BeautifulSoup
from datetime import datetime
from dotenv import load_dotenv

# Load variables from a local .env file into the environment; the Gemini
# setup cell below reads GOOGLE_API_KEY via os.getenv.
load_dotenv()

True

## Set up the Gemini model

In [None]:
import google.generativeai as genai

# Hand the API key from the environment to the Gemini SDK.
# os.getenv returns None when GOOGLE_API_KEY is not set.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

In [3]:
import pprint

# Print the metadata of every model the API key can access.
# The loop variable is deliberately NOT called `model`: that name is the
# notebook-global GenerativeModel used by extract_authors_with_gemini(), and
# re-running this cell after the model is created would otherwise overwrite it.
for available_model in genai.list_models():
    pprint.pprint(available_model)

Model(name='models/chat-bison-001',
      base_model_id='',
      version='001',
      display_name='PaLM 2 Chat (Legacy)',
      description='A legacy text-only model optimized for chat conversations',
      input_token_limit=4096,
      output_token_limit=1024,
      supported_generation_methods=['generateMessage', 'countMessageTokens'],
      temperature=0.25,
      max_temperature=None,
      top_p=0.95,
      top_k=40)
Model(name='models/text-bison-001',
      base_model_id='',
      version='001',
      display_name='PaLM 2 (Legacy)',
      description='A legacy model that understands text and generates text as an output',
      input_token_limit=8196,
      output_token_limit=1024,
      supported_generation_methods=['generateText', 'countTextTokens', 'createTunedTextModel'],
      temperature=0.7,
      max_temperature=None,
      top_p=0.95,
      top_k=40)
Model(name='models/embedding-gecko-001',
      base_model_id='',
      version='001',
      display_name='Embedding Gecko

In [4]:
# Notebook-global Gemini model handle. extract_authors_with_gemini() further
# down calls generate_content() on this global, so this cell must have run
# before any scraping starts.
model = genai.GenerativeModel("models/gemini-2.0-flash")
model

genai.GenerativeModel(
    model_name='models/gemini-2.0-flash',
    generation_config={},
    safety_settings={},
    tools=None,
    system_instruction=None,
    cached_content=None
)

In [5]:
# Sample credit block (credit names wrapped in a link) used to try out the
# extraction prompt below.
html_code = """
<center>
<b> The Great Globular Cluster in Hercules </b> <br/>
<b>Image Credit &amp;
<a href="lib/about_apod.html#srapply">Copyright</a>:</b>
<a href="https://www.distant-luminosity.com/about.html">Jan Beckmann, Julian Zoller, Lukas Eisert, Wolfgang Hummel</a>
</center>
"""

# Alternative sample: credit given as plain text without any links.
# Uncomment to try the prompt against it.
# html_code = """
# <center>
# <b> NGC 602: Oyster Star Cluster </b> <br>
# <b> Image Credit: </b>
# X-ray: Chandra: NASA/CXC/Univ.Potsdam/L.Oskinova et al; <br>
# Optical: Hubble: NASA/STScI; Infrared: Spitzer: NASA/JPL-Caltech
# </center> <p>
# """

# Alternative sample: "Illustration Credit" prefix with two links joined by "via".
# Uncomment to try the prompt against it.
# html_code = """
# <center>
# <b> Time Spiral </b> <br>
# <b> Illustration Credit: </b>
# <a href="https://commons.wikimedia.org/wiki/User:Unmismoobjetivo"
# >Pablo Carlos Budassi</a> via
# <a href="https://en.wikipedia.org/wiki/Wikipedia">Wikipedia</a>
# </center> <p>
# """

# Prompt asking the model for a comma-separated credit string. The same
# wording is used by extract_authors_with_gemini() in the scraping cell.
query = f"""Given the following HTML code snippet, your role is to extract the credit information from the center tag.
The extracted credit information should be returned as a string and separated by a comma to denote multiple authors.
Dont't include prefix text like "Image Credit" or "Illustration Credit".

The HTML code snippet is as follows:
{html_code}
"""

# Send the prompt to the global `model` and show the raw text answer.
response = model.generate_content(query)
print(response.text)

Jan Beckmann, Julian Zoller, Lukas Eisert, Wolfgang Hummel



In [None]:
import time


def _wait_for_request_slot(request_times, max_requests=10, window_seconds=60.0):
    """Block until one more request fits in the sliding rate-limit window.

    Args:
        request_times: List of ``time.monotonic()`` stamps of earlier requests.
            It is pruned in place and the stamp of the new request is appended
            before returning.
        max_requests: Maximum number of requests allowed per window.
        window_seconds: Length of the sliding window in seconds.
    """
    while True:
        now = time.monotonic()
        # Forget requests that have already left the window.
        request_times[:] = [t for t in request_times if now - t < window_seconds]
        if len(request_times) < max_requests:
            request_times.append(now)
            return
        # Window is full: sleep until the oldest remaining request expires.
        time.sleep(window_seconds - (now - request_times[0]))


def scrapeAPODWebsite():
    """Scrape the APOD archive and append new entries to ``../data/apod_data.json``.

    Archive links are processed in the order returned by ``get_a_tags()``.
    Entries without an image (``image_url`` is ``None``) are skipped. Scraping
    stops at the first entry whose date is already stored in the file. The
    file is rewritten after every new entry so progress survives interruption.

    Every scrape attempt is throttled to at most 10 per 60 seconds (a margin
    below the 15 requests/minute limit), because each attempt may trigger a
    Gemini call even when the item is later skipped or fails. Errors raised
    while handling a single entry are printed and that entry is skipped.
    """
    print("Scraping the APOD website.")
    createOutputFileIfNotExists()

    a_tags = get_a_tags()
    request_times = []
    for tag in tqdm(a_tags, total=len(a_tags)):
        # Throttle before the scrape so skipped and failed items are counted
        # too, and so the limiter can never switch itself off.
        _wait_for_request_slot(request_times)
        try:
            item = scrape_a_tag(tag)
            if item["image_url"] is None:
                # This means that this is a video link, we don't want to include it
                continue

            with open("../data/apod_data.json", "r") as file:
                data = json.load(file)

            if itemExistsInData(data, item):
                print(f"Item for date {item['date']} already exists in the data. Stop scraping.")
                break
            data.append(item)

            # Write the updated data back to the file
            with open("../data/apod_data.json", "w") as file:
                json.dump(data, file, indent=4)
        except Exception as e:
            print(f"An error occurred while scraping: {e}")


def createOutputFileIfNotExists():
    """Ensure ``../data/apod_data.json`` exists, seeding it with an empty JSON list.

    The parent directory is created when it is missing. An existing file is
    left untouched.
    """
    output_path = "../data/apod_data.json"
    # Without the directory, open(..., "w") raises FileNotFoundError.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    if not os.path.exists(output_path):
        with open(output_path, "w") as file:
            file.write("[]")


def get_a_tags():
    """Fetch the APOD archive index page and return its entry links.

    Returns:
        The ``<a>`` tags found inside the second ``<b>`` element of the page.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
        IndexError: If the page contains fewer than two ``<b>`` elements.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get("https://apod.nasa.gov/apod/archivepix.html", timeout=30)
    # Fail loudly on an error page instead of parsing it as the archive.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    b_tag = soup.find_all("b")[1]
    return b_tag.find_all("a")


def scrape_a_tag(a_tag):
    """Build the record for one archive entry.

    Args:
        a_tag: An ``<a>`` tag from the archive index. The text node preceding
            it is used as the date and its ``href`` as the page path.

    Returns:
        A dict with the keys ``date``, ``title``, ``url``, ``image_url``,
        ``explanation`` and ``authors`` (in that order). ``image_url`` is
        ``None`` when the page has no usable image; in that case ``authors``
        is ``None`` as well because no extraction is attempted.
    """
    date = a_tag.find_previous(string=True).strip()
    title = a_tag.text.strip()
    url = f"https://apod.nasa.gov/apod/{a_tag['href']}"
    image, explanation = get_image_and_explanation(url)
    # scrapeAPODWebsite() discards entries without an image, so don't spend a
    # rate-limited Gemini request (and a second page download) on them.
    authors = get_authors(url) if image is not None else None

    return {
        "date": date,
        "title": title,
        "url": url,
        "image_url": image,
        "explanation": explanation,
        "authors": authors,
    }


def get_image_and_explanation(url):
    """Download one APOD page and pull out its image URL and explanation text.

    Args:
        url: Absolute URL of the APOD entry page.

    Returns:
        A tuple ``(img_url, explanation)``. ``img_url`` is ``None`` when the
        page has no ``<img>`` tag or the tag has no ``src`` attribute.
        ``explanation`` is the text of the third ``<p>`` element.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
        IndexError: If the page has fewer than three ``<p>`` elements.
    """
    # A timeout keeps one stalled page from blocking the whole scrape.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    p_tags = soup.find_all("p")
    img_tag = soup.find("img")
    explanation = p_tags[2].get_text()

    # Handle the two expected "no image" cases explicitly rather than hiding
    # every possible error behind a broad except.
    src = img_tag.get("src") if img_tag is not None else None
    img_url = f"https://apod.nasa.gov/apod/{src}" if src is not None else None

    return img_url, explanation


def get_authors(url):
    """Download one APOD page and extract its credit line with Gemini.

    Args:
        url: Absolute URL of the APOD entry page.

    Returns:
        Whatever ``extract_authors_with_gemini`` returns for the second
        ``<center>`` element of the page.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
        IndexError: If the page has fewer than two ``<center>`` elements.
    """
    # A timeout keeps one stalled page from blocking the whole scrape.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # The credit block is taken from the second <center> element.
    credit_center_tag = soup.find_all("center")[1]
    return extract_authors_with_gemini(credit_center_tag)


def extract_authors_with_gemini(center_tag):
    """Ask the global Gemini ``model`` to extract the credit names from a ``<center>`` tag.

    Args:
        center_tag: The credit ``<center>`` element (or its HTML as a string);
            it is interpolated into the prompt as text.

    Returns:
        The model's answer with surrounding whitespace removed.
    """
    query = f"""Given the following HTML code snippet, your role is to extract the credit information from the center tag.
The extracted credit information should be returned as a string and separated by a comma to denote multiple authors.
Dont't include prefix text like "Image Credit" or "Illustration Credit".

The HTML code snippet is as follows:
{center_tag}
"""

    response = model.generate_content(query)
    # The raw answer ends with a newline; strip it so it doesn't end up in the
    # stored "authors" value.
    return response.text.strip()


def convert_date(date):
    """Parse a ``"<year> <month name> <day>"`` label into a ``datetime.date``.

    Every colon in the input is removed before parsing with the format
    ``%Y %B %d``. ``datetime.strptime`` raises ``ValueError`` if the remaining
    text does not match that format.
    """
    without_colons = "".join(date.split(":"))
    parsed = datetime.strptime(without_colons, "%Y %B %d")
    return parsed.date()


def itemExistsInData(data, item):
    """Return ``True`` when some entry in ``data`` has the same ``"date"`` as ``item``.

    ``data`` is an iterable of dicts and ``item`` is a dict; both are looked up
    by the ``"date"`` key. An empty ``data`` yields ``False``.
    """
    return any(stored["date"] == item["date"] for stored in data)

In [None]:
# Run the scrape; new entries are appended to ../data/apod_data.json and the
# run stops at the first date that is already stored there.
scrapeAPODWebsite()