Import dependencies

In [27]:
from openai import OpenAI
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from extractor import fetch_website_links, fetch_website_contents
import os
import json

load_dotenv()

True

Load the OpenRouter API key and base URL from the environment

In [28]:
# Credentials come from the process environment (populated by load_dotenv()),
# never from literals in the notebook. Either value is None when unset.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
OPENROUTER_BASE_URL = os.environ.get("OPENROUTER_BASE_URL")



Guardrail check: make sure the API key is available

In [29]:
# Fail fast with a readable error when the key is missing.
# Raising (instead of calling exit(1)) keeps the kernel alive: in a notebook,
# exit() asks the kernel to shut down, which hides the actual problem and
# throws away all state.
if not OPENROUTER_API_KEY:
    raise RuntimeError(
        "No API_KEY found: set OPENROUTER_API_KEY in the environment or in a .env file"
    )
print("API_KEY found")

# Without an explicit base URL the OpenAI client falls back to its default
# endpoint, so the OpenRouter key would be sent to the wrong service.
if not OPENROUTER_BASE_URL:
    print("Warning: OPENROUTER_BASE_URL is not set; the client will use its default endpoint")

API_KEY found


Create the OpenAI client (pointed at the OpenRouter endpoint)

In [30]:
# Single shared client for every request in this notebook; it talks to the
# endpoint given by OPENROUTER_BASE_URL using the OpenRouter key.
client = OpenAI(
    api_key=OPENROUTER_API_KEY,
    base_url=OPENROUTER_BASE_URL,
)

Define the system and user prompts

In [31]:

# System prompt for the link-selection step. It shows the model one example of
# the expected JSON shape: a top-level "links" list whose items carry a "type"
# and a "url". Downstream code reads those keys from the model's reply.
system_one_shot_prompt = """You are provided with a list of links found on a webpage.
You are able to decide which of the links would be most relevant to include in a brochure about the company,
such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:

{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [32]:
def user_prompts(url):
    """Build the user message for the link-selection request.

    The message consists of fixed instructions that mention ``url``, followed
    by the links returned by ``fetch_website_links(url)``, one per line.

    Args:
        url: Address of the website whose links should be listed.

    Returns:
        The complete user prompt as a single string.
    """
    instructions = f"""
    Here is the list of links on the website {url} -
    Please decide which of these are relevant web links for a brochure about the company, 
    respond with the full https URL in JSON format.
    Do not include Terms of Service, Privacy, email links.

    Links (some might be relative links):

    """
    link_lines = "\n".join(fetch_website_links(url))
    return instructions + link_lines
    

Send the link-selection request to the model

In [33]:
def select_relevant_link(url, model="openai/gpt-3.5-turbo"):
    """Ask the model which links on ``url`` belong in a company brochure.

    Sends the system prompt plus the link list built by ``user_prompts(url)``
    and requests a JSON-object response.

    Args:
        url: Address of the website whose links should be triaged.
        model: Model identifier passed to the chat-completions endpoint.

    Returns:
        The decoded JSON reply as a dict. It is guaranteed to contain a
        ``"links"`` key holding a list (empty when the model returned nothing
        usable), so callers can iterate over it without extra checks.

    Raises:
        json.JSONDecodeError: If the model returns non-empty text that is not
            valid JSON.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_one_shot_prompt},
            {"role": "user", "content": user_prompts(url)},
        ],
        response_format={"type": "json_object"},
    )
    result = response.choices[0].message.content

    # The message content may be None/empty; json.loads(None) would raise a
    # TypeError, so treat that case as "no links".
    links = json.loads(result) if result else {}

    # The model is only *asked* to follow the schema; normalise the reply so a
    # missing or malformed "links" entry does not crash here or in callers.
    if not isinstance(links, dict):
        links = {}
    if not isinstance(links.get("links"), list):
        links["links"] = []

    print(f"Found {len(links['links'])} relevant links")
    return links


In [34]:
# Smoke test: performs a live request and shows the links the model selected.
select_relevant_link("https://huggingface.co")

Found 4 relevant links


{'links': [{'url': 'https://huggingface.co/enterprise'},
  {'url': 'https://huggingface.co/pricing'},
  {'url': 'https://endpoints.huggingface.co'},
  {'url': 'https://apply.workable.com/huggingface/'}]}

Brochure Generator


In [35]:
def fetch_page_and_all_relevant_links(url):
    """Collect the landing page text plus the text of each relevant linked page.

    Args:
        url: Address of the company's landing page.

    Returns:
        One markdown-flavoured string: a "Landing Page" section with the
        contents of ``url``, then a "Relevant Links" section with one
        sub-section per link chosen by ``select_relevant_link``.
    """
    contents = fetch_website_contents(url)
    relevant_links = select_relevant_link(url)

    sections = [f"## Landing Page:\n\n{contents}\n## Relevant Links:\n"]
    for link in relevant_links.get("links", []):
        # The model does not always follow the requested schema (it may return
        # entries without "type"), so read the keys defensively instead of
        # indexing and failing with a KeyError.
        if not isinstance(link, dict):
            continue
        link_url = link.get("url")
        if not link_url:
            continue  # nothing to fetch for this entry
        link_type = link.get("type") or link_url  # fall back to the URL as the label
        sections.append(f"\n\n### Link: {link_type}\n")
        sections.append(fetch_website_contents(link_url))
    return "".join(sections)

In [36]:
# Live run: the returned string becomes the cell output and can be very long.
fetch_page_and_all_relevant_links("https://huggingface.co")

Found 7 relevant links


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


'## Landing Page:\n\nHugging Face ‚Äì The AI community building the future.\n\nHugging Face\nModels\nDatasets\nSpaces\nCommunity\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 2M+ models\nTrending on\nthis week\nModels\ntencent/HY-MT1.5-1.8B\nUpdated\n6 days ago\n‚Ä¢\n5.59k\n‚Ä¢\n608\nQwen/Qwen-Image-2512\nUpdated\n7 days ago\n‚Ä¢\n14.3k\n‚Ä¢\n488\nLGAI-EXAONE/K-EXAONE-236B-A23B\nUpdated\nabout 21 hours ago\n‚Ä¢\n2.06k\n‚Ä¢\n396\nIQuestLab/IQuest-Coder-V1-40B-Loop-Instruct\nUpdated\n4 days ago\n‚Ä¢\n5.2k\n‚Ä¢\n255\ntencent/HY-Motion-1.0\nUpdated\n7 days ago\n‚Ä¢\n497\n‚Ä¢\n281\nBrowse 2M+ models\nSpaces\nRunning\nFeatured\n3.71k\nWan2.2 Animate\nüëÅ\n3.71k\nWan2.2 Animate\nRunning\non\nZero\n1.07k\nZ Image Turbo\nüñº\n1.07k\nGenerate images from text prompts\nRunning\non\nZero\nMCP\nFeatured\n234\nQwen-Image-Edit-2511-

In [37]:
# System prompt for the brochure-writing step: sets the audience (customers,
# investors, recruits) and asks for plain markdown without code fences.
brochure_system_prompt = """
You are a competent assistant that analyzes the contents of several relevant pages from a company website
and creates a short brochure about the company for prospective customers, investors and recruits.
Respond in markdown without code blocks.
Include details of company culture, customers and careers/jobs if you have the information.
"""

In [38]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """Build the user message for the brochure-writing request.

    Args:
        company_name: Name of the company, inserted into the instructions.
        url: Address of the company's landing page; its contents and those of
            the relevant linked pages are appended to the instructions.
        max_chars: Maximum length of the returned prompt in characters. The
            prompt is cut off at this length to bound the request size. Pass
            ``None`` to disable truncation.

    Returns:
        The user prompt, truncated to at most ``max_chars`` characters.
    """
    user_prompt = f"""
You are looking at a company called: {company_name}
Here are the contents of its landing page and other relevant pages;
use this information to build a short brochure of the company in markdown without code blocks.\n\n
"""
    user_prompt += fetch_page_and_all_relevant_links(url)
    # Truncation is by raw character count, so content near the end (typically
    # the linked pages) is what gets dropped when the limit is small.
    return user_prompt[:max_chars]

In [None]:
# Live run: fetches the pages and shows the (truncated) prompt that will be sent.
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found 4 relevant links


In [None]:
def create_brochure(company_name, url):
    """Generate a brochure for the company and render it as markdown.

    Builds the user prompt from the site's pages, sends it together with the
    brochure system prompt, and displays the model's reply in the notebook.
    Nothing is returned.

    Args:
        company_name: Name of the company the brochure is about.
        url: Address of the company's landing page.
    """
    messages = [
        {"role": "system", "content": brochure_system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = client.chat.completions.create(
        model="openai/gpt-3.5-turbo",
        messages=messages,
    )
    brochure_md = completion.choices[0].message.content
    display(Markdown(brochure_md))

In [None]:
# End-to-end run: fetches the site, queries the model twice, and renders the brochure.
create_brochure("HuggingFace", "https://huggingface.co")

Found 13 relevant links


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


NameError: name 'Markdown' is not defined