In [1]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [2]:
# Pull settings from a local .env file into the process environment
# (override=True is passed so the file's values are allowed to replace existing ones).
load_dotenv(override=True)
# None when OPENAI_API_KEY is not set anywhere
api_key = os.environ.get('OPENAI_API_KEY')

In [3]:
# Shared API client; constructed with no explicit arguments
openai = OpenAI()
# Model name passed as `model=` to the chat completion calls below
MODEL = "gpt-4o-mini"

In [None]:
# Request headers sent with every page fetch made by Website;
# the User-Agent value is a desktop Chrome-on-Windows string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.

    Attributes set by the constructor:
        url:   the address that was requested.
        body:  the raw response content (bytes) returned by the server.
        title: text of the <title> element, or "No title found".
        text:  visible text of <body> with script/style/img/input removed,
               one fragment per line; "" when the page has no <body>.
        links: every non-empty href found on <a> tags, in document order
               (relative paths, fragments and duplicates are kept as-is).
    """

    def __init__(self, url, timeout=10):
        """
        Download `url` and extract its title, visible text and hyperlinks.

        Args:
            url: address of the page to fetch.
            timeout: seconds passed to requests.get as its timeout, so a
                stalled server cannot hang the notebook indefinitely.

        Raises:
            requests.exceptions.RequestException: if the request fails or
                times out (propagated unchanged from requests.get).
        """
        self.url = url
        # Uses the module-level `headers` dict for the request headers.
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # soup.title.string is None when <title> is empty or contains nested
        # tags, so fall back in that case as well as when <title> is absent.
        title_text = soup.title.string if soup.title else None
        self.title = title_text if title_text else "No title found"

        if soup.body:
            # Drop elements that contribute no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Collect hrefs from all anchors, skipping anchors with no/empty href.
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        """Return the page title and visible text formatted as one prompt-ready string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [5]:
# Scrape Apple's home page once and keep the result for the cells below
site = Website("https://www.apple.com/")
print(site.get_contents())

Webpage Title:
Apple
Webpage Contents:
Apple
Apple
Store
Mac
iPad
iPhone
Watch
Vision
AirPods
TV & Home
Entertainment
Accessories
Support
0
+
Last chance to get AirPods or an eligible accessory of your choice when you buy Mac or iPad with education savings. Ends 9.30.
1
Shop
iPhone 17 Pro
All out Pro.
Available starting 9.19
Learn more
Pre-order
iPhone Air
The thinnest iPhone ever. With the power of pro inside.
Available starting 9.19
Learn more
Pre-order
iPhone 17
Magichromatic.
Available starting 9.19
Learn more
Pre-order
AirPods Pro 3
The world’s best in-ear Active Noise Cancellation.
Available starting 9.19
Learn more
Pre-order
Apple Watch Series 11
The ultimate way to watch your health.
Available starting 9.19
Learn more
Pre-order
Apple Watch SE 3
Walk it. Talk it. Track it. Love it.
Available starting 9.19
Learn more
Pre-order
Apple Watch Ultra 3
Personal beast.
Available starting 9.19
Learn more
Pre-order
Any condition carrier deals are here.
Select carriers accept eligible trad

In [6]:
# Inspect the raw hrefs collected from the page (relative paths, fragments and duplicates included)
print(site.links)

['/', '/us/shop/goto/store', '/mac/', '/ipad/', '/iphone/', '/watch/', '/apple-vision-pro/', '/airpods/', '/tv-home/', '/entertainment/', '/us/shop/goto/buy_accessories', 'https://support.apple.com/?cid=gn-ols-home-hp-tab', '/us/search', '/us/shop/goto/bag', '#footnote-1', '/us-edu/shop/goto/edu_store', '/iphone-17-pro/', '/iphone-17-pro/', '/us/shop/goto/buy_iphone/iphone_17_pro', '/iphone-air/', '/iphone-air/', '/us/shop/goto/buy_iphone/iphone_air', '/iphone-17/', '/iphone-17/', '/us/shop/goto/buy_iphone/iphone_17', '/airpods-pro/', '/airpods-pro/', '/us/shop/goto/buy_airpods/airpods_pro_3', '/apple-watch-series-11/', '/apple-watch-series-11/', '/us/shop/goto/buy_watch/apple_watch_series_11', '/apple-watch-se-3/', '/apple-watch-se-3/', '/us/shop/goto/buy_watch/apple_watch_se', '/apple-watch-ultra-3/', '/apple-watch-ultra-3/', '/us/shop/goto/buy_watch/apple_watch_ultra_3', '/us/shop/goto/buy_iphone/carrier_offers', '/us/shop/goto/buy_iphone/carrier_offers', '/us/shop/goto/trade_in', '

In [None]:
# System prompt for the link-selection call: describes the task and shows the
# JSON shape the model should answer with. Adjacent string literals are joined
# by the parser, so this is a single string.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    "You should respond in JSON as in this example:"
    """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""
)

In [8]:
# Show the assembled system prompt to check its wording and the JSON example
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}



In [9]:
def get_links_user_prompt(website):
    """
    Build the user message that asks the model to pick brochure-worthy links.

    Args:
        website: an object exposing `url` (str) and `links` (list of str),
            such as a Website instance.

    Returns:
        The instruction text followed by the links, one per line.
    """
    instructions = (
        f"Here is the list of links on the website of {website.url} - "
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
    )
    heading = "Links (some might be relative links):\n"
    link_lines = "\n".join(website.links)
    return instructions + heading + link_lines

In [10]:
# Preview the user prompt that would be sent for the scraped page
print(get_links_user_prompt(site))

Here is the list of links on the website of https://www.apple.com/ - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
/
/us/shop/goto/store
/mac/
/ipad/
/iphone/
/watch/
/apple-vision-pro/
/airpods/
/tv-home/
/entertainment/
/us/shop/goto/buy_accessories
https://support.apple.com/?cid=gn-ols-home-hp-tab
/us/search
/us/shop/goto/bag
#footnote-1
/us-edu/shop/goto/edu_store
/iphone-17-pro/
/iphone-17-pro/
/us/shop/goto/buy_iphone/iphone_17_pro
/iphone-air/
/iphone-air/
/us/shop/goto/buy_iphone/iphone_air
/iphone-17/
/iphone-17/
/us/shop/goto/buy_iphone/iphone_17
/airpods-pro/
/airpods-pro/
/us/shop/goto/buy_airpods/airpods_pro_3
/apple-watch-series-11/
/apple-watch-series-11/
/us/shop/goto/buy_watch/apple_watch_series_11
/apple-watch-se-3/
/apple-watch-se-3/
/us/shop/goto/buy_watch/apple_watch_se
/apple-watch-ultr

In [None]:
def get_links(url):
    """
    Scrape `url` and ask the model which of its links belong in a company brochure.

    Args:
        url: address of the page whose links should be classified.

    Returns:
        The model's reply parsed from JSON. The prompt requests the shape
        {"links": [{"type": ..., "url": ...}, ...]}, but the structure is
        not validated here.

    Raises:
        ValueError: if the model returns no content, or content that is not
            valid JSON (json.JSONDecodeError is a ValueError subclass).
    """
    website = Website(url)
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)},
        ],
        # Ask the API for a JSON object so the reply can be parsed directly.
        response_format={"type": "json_object"},
    )
    result = response.choices[0].message.content
    # content is optional in the API response; fail with a clear message
    # instead of letting json.loads(None) raise an opaque TypeError.
    if not result:
        raise ValueError(f"Model returned no content when selecting links for {url}")
    return json.loads(result)

In [12]:
# Ask the model to pick brochure-relevant links for Apple's home page; the returned dict is the cell output
get_links("https://www.apple.com/")

{'links': [{'type': 'about page', 'url': 'https://www.apple.com/about/'},
  {'type': 'careers page', 'url': 'https://www.apple.com/careers/us/'},
  {'type': 'company news', 'url': 'https://www.apple.com/newsroom/'},
  {'type': 'leadership page', 'url': 'https://www.apple.com/leadership/'},
  {'type': 'investor relations', 'url': 'https://investor.apple.com/'}]}