In [8]:
import os
import requests
import ollama
import json
import openai 
from dotenv import load_dotenv
from IPython.display import Markdown,display
from bs4 import BeautifulSoup

In [9]:
load_dotenv(override=True)
openai=OpenAI()
openai.api_key=os.getenv('OPENAI_API_KEY')
OLLAMA_API="http:///locahost:11434/api/chat"
HEADERS={"Content-Type":"application/json"}
MODEL="LLAMA3.2"

In [81]:
headers={"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"}
class Website:
    def __init__(self,url):
        self.url=url
        response=requests.get(url,headers=headers)
        self.body = response.content
        soup=BeautifulSoup(response.content,'html.parser')
        self.title=soup.title.string if soup.title else "No title found"
        for irrelevant in soup.body(["script","style","img","input"]):
                           irrelevant.decompose()
        self.text=soup.body.get_text(separator="\n",strip=True)

        self.links=[link.get('href') for link in soup.find_all('a') if link.get('href')]
        
    def get_contents(self):
        return self.text

In [82]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages,or announcements page.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"},
        {"type": "announcements page":"https://next.full.url/announcements"}
    ]
}
"""
def link_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt
print(link_user_prompt(website))

Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
https://edwarddonner.com/
https://edwarddonner.com/connect-four/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/
https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/
https://edwarddonner.com/2024/12/21/

In [83]:
website=Website("https://edwarddonner.com")
website.links 

['https://edwarddonner.com/',
 'https://edwarddonner.com/connect-four/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'ht

In [84]:
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/Qwen/QwQ-32B',
 '/deepseek-ai/DeepSeek-R1',
 '/microsoft/Phi-4-multimodal-instruct',
 '/SparkAudio/Spark-TTS-0.5B',
 '/CohereForAI/aya-vision-8b',
 '/models',
 '/spaces/ASLP-lab/DiffRhythm',
 '/spaces/Qwen/QwQ-32B-Demo',
 '/spaces/Wan-AI/Wan2.1',
 '/spaces/nanotron/ultrascale-playbook',
 '/spaces/black-forest-labs/FLUX.1-dev',
 '/spaces',
 '/datasets/facebook/natural_reasoning',
 '/datasets/FreedomIntelligence/medical-o1-reasoning-SFT',
 '/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k',
 '/datasets/KodCode/KodCode-V1',
 '/datasets/SynthLabsAI/Big-Math-RL-Verified',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/gramm

In [85]:
def links_display(website):
    response=openai.chat.completions.create(
      model="gpt-4o-mini",
      messages=[
        {"role":"user", "content":link_user_prompt(website)},
        {"role":"system", "content":link_system_prompt}
        ],
      response_format={"type":"json_object"}
        )
    result=response.choices[0].message.content
    return json.loads(result)
print(links_display(website))


{'links': [{'type': 'homepage', 'url': 'https://edwarddonner.com/'}, {'type': 'about page', 'url': 'https://edwarddonner.com/about-me-and-about-nebula/'}, {'type': 'resource page', 'url': 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/'}, {'type': 'resource page', 'url': 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/'}, {'type': 'resource page', 'url': 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/'}, {'type': 'resource page', 'url': 'https://edwarddonner.com/2024/10/16/from-software-engineer-to-ai-data-scientist-resources/'}, {'type': 'blog page', 'url': 'https://edwarddonner.com/posts/'}, {'type': 'LinkedIn profile', 'url': 'https://www.linkedin.com/in/eddonner/'}, {'type': 'Twitter profile', 'url': 'https://twitter.com/edwarddonner'}, {'type': 'Facebook profile', 'url': 'https://www.facebook.com/edward.donner.52'}]}


In [87]:
def links_display(huggingface):
    response=openai.chat.completions.create(
      model="gpt-4o-mini",
      messages=[
        {"role":"user", "content":link_user_prompt(huggingface)},
        {"role":"system", "content":link_system_prompt}
        ],
      response_format={"type":"json_object"}
        )
    result=response.choices[0].message.content
    return json.loads(result)
print(links_display(huggingface))

{'links': [{'type': 'home', 'url': 'https://huggingface.co/'}, {'type': 'models', 'url': 'https://huggingface.co/models'}, {'type': 'datasets', 'url': 'https://huggingface.co/datasets'}, {'type': 'spaces', 'url': 'https://huggingface.co/spaces'}, {'type': 'enterprise', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing', 'url': 'https://huggingface.co/pricing'}, {'type': 'documentation', 'url': 'https://huggingface.co/docs'}, {'type': 'blog', 'url': 'https://huggingface.co/blog'}, {'type': 'discussion forum', 'url': 'https://discuss.huggingface.co'}, {'type': 'status page', 'url': 'https://status.huggingface.co'}, {'type': 'GitHub', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter', 'url': 'https://twitter.com/huggingface'}, {'type': 'LinkedIn', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


In [93]:
#Assembel all the links into another frontier model as the results
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = links_display(website)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result
url = "https://www.example.com"
print(get_all_details(url))

Found links: {'links': [{'type': 'home page', 'url': 'https://edwarddonner.com/'}, {'type': 'about page', 'url': 'https://edwarddonner.com/about-me-and-about-nebula/'}, {'type': 'blog page', 'url': 'https://edwarddonner.com/posts/'}, {'type': 'workshop resources', 'url': 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/'}, {'type': 'LLM resources', 'url': 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/'}, {'type': 'engineering resources', 'url': 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/'}, {'type': 'career development resources', 'url': 'https://edwarddonner.com/2024/10/16/from-software-engineer-to-ai-data-scientist-resources/'}, {'type': 'LinkedIn', 'url': 'https://www.linkedin.com/in/eddonner/'}, {'type': 'Twitter', 'url': 'https://twitter.com/edwarddonner'}, {'type': 'Facebook', 'url': 'https://www.facebook.com/edward.donner.52'}]}
Landing page:
Example Domain
This domain is for use in illustrative examples i

In [97]:
#Assembel all the links into another frontier model as the results
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = links_display(huggingface)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result
url = "https://www.huggingface.co"
print(get_all_details(url))

Found links: {'links': [{'type': 'home page', 'url': 'https://huggingface.co/'}, {'type': 'models page', 'url': 'https://huggingface.co/models'}, {'type': 'datasets page', 'url': 'https://huggingface.co/datasets'}, {'type': 'spaces page', 'url': 'https://huggingface.co/spaces'}, {'type': 'posts page', 'url': 'https://huggingface.co/posts'}, {'type': 'documentation page', 'url': 'https://huggingface.co/docs'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'forum page', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}]}
Landing page:
Hugging Face


In [100]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
 and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
 Include details of company culture, customers and careers/jobs if you have the information."

def user_prompt_bronchure(company_name,url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5000] # Truncate if more than 5,000 characters
    return user_prompt
print(user_prompt_bronchure("huggingface","https://huggingface.co"))

Found links: {'links': [{'type': 'home page', 'url': 'https://huggingface.co/'}, {'type': 'models page', 'url': 'https://huggingface.co/models'}, {'type': 'datasets page', 'url': 'https://huggingface.co/datasets'}, {'type': 'spaces page', 'url': 'https://huggingface.co/spaces'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'documentation page', 'url': 'https://huggingface.co/docs'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'community forum', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}
You are looking at a company called: huggingface
Here are the contents of its landing page and other relevant pages; use this information to build a short brochu

In [None]:
def stream_bronchure(company_name,url):
    ollama_via_openai=OpenAI(base_url="http://localhost:11434/v1" , api_key="ollama")
    ollama_via_openai.chat.completions.create(
        model= "llama3.2",
        messages=[{"role":"system","content":system_prompt} ,
                  {"role":"user", "content":user_prompt_bronchure(company_name,url)}],
        stream=True
    )
    response=" "
    build_display=display(Markdown(" "),display_id=True)

    for chunk in stream:
        response +=chunk.choices[0].delta.content or ""
        bronchure=response.replace("''",""),replace('markdown',"")
        update_display(Markdown(bronchure),display_id=build_display.display_id)
                       
    return chunk

stream_bronchure("HuggingFace", "https://huggingface.co")