## Importing libraries

In [1]:
import requests
import json
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

## Simple Website Scraper

In [2]:
# Request headers used by Website when downloading a page. The User-Agent is a
# desktop Chrome-on-Windows identifier, written as adjacent string fragments
# that Python joins into one value.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links

    Attributes (all set by ``__init__``):
        url:   the address that was requested.
        body:  raw bytes of the HTTP response.
        title: text of the ``<title>`` tag, or "No title found".
        text:  newline-separated text of ``<body>`` with script, style, img and
               input elements removed; "" when the page has no ``<body>``.
        links: every non-empty ``href`` found on ``<a>`` tags, exactly as
               written in the page (relative links are not resolved).
    """

    def __init__(self, url, timeout=30):
        """
        Download ``url`` and extract its title, text and links.

        Args:
            url: address of the page to fetch.
            timeout: seconds to wait for the server before ``requests`` raises
                a timeout error. Without it a stalled server would block the
                notebook cell indefinitely.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        # .string is None when <title> is empty or holds nested markup, so the
        # text itself must be checked, not only the presence of the tag.
        title_text = soup.title.string if soup.title else None
        self.title = title_text if title_text else "No title found"
        if soup.body:
            # Drop elements that contribute no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        # Anchors without an href (or with an empty one) are skipped.
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and text formatted as one prompt-ready string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [17]:
# Smoke-test the scraper against a live page and print what it extracted
test = Website(url="https://news.google.com")
print(test.get_contents())

Webpage Title:
Google News
Webpage Contents:
News
Google News
Advanced search
Help
Help
Privacy
Terms
About Google
Get the Android app
Get the iOS app
Send feedback
Settings
Settings
Language & region
English (United States)
Sign in
Home
For you
Following
U.S.
World
Local
Business
Technology
Entertainment
Sports
Science
Health
More
News
Google News
Your
briefing
Today
38°
28°
Thu
37°
28°
Fri
37°
27°
Sat
37°
28°
Tehran
36°C
Google Weather
Top stories
The Guardian
More
57 minutes ago
By Jane Clinton, Nick Visser & Kate Lamb
Hawaii News Now
More
7 hours ago
FOX Weather
More
Tsunami waves begin to hit Hawaii after massive 8.8 quake strikes off Russia
6 hours ago
By Scott Sistek
Fox News
More
Tsunami waves arrive on Japan's coast after earthquake in Russia, triggering tsunami alerts for US West Coast
4 hours ago
By Landon Mion
Full Coverage
ABC News
More
NYC Mayor Eric Adams shares Midtown shooting surveillance footage details, calls for gun reform
6 hours ago
By Doc Louallen
BBC
More
Two h

## STEP 1 : Getting relevant links from a webpage
Calling the LLM to decide which of the links on a webpage are relevant for creating a brochure.

In [25]:
# System prompt for the link-selection call: describes the task and shows the
# JSON shape expected back. The reply is decoded with json.loads, so the example
# shown to the model must itself be valid JSON.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
)
link_system_prompt += "You should respond in JSON and nothing more, skip if it's not a link and you don't need provide additional information, respond just JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

def get_links_user_prompt(website):
    """
    Build the user message that asks the model to pick brochure-worthy links.

    Args:
        website: object exposing ``url`` and ``links`` (an iterable of
            strings), such as a ``Website`` instance.

    Returns:
        str: the instructions followed by the page's links, one per line.
    """
    intro = f"Here is the list of links on the website of {website.url} - "
    instructions = (
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
    )
    listing = "\n".join(website.links)
    return intro + instructions + "Links (some might be relative links):\n" + listing

# Talk to a locally served LLaMA model through the openai client library
MODEL = "llama3.2"

ollama_via_openai = OpenAI(
    # NOTE(review): presumably a placeholder key that the local server ignores; confirm
    api_key="ollama",
    base_url="http://localhost:11434/v1",
)

def get_links(url):
    """
    Scrape ``url`` and ask the model which of its links belong in a brochure.

    Args:
        url: address of the page whose links should be classified.

    Returns:
        The model's reply decoded from JSON. When the reply is a JSON object
        it always has a "links" key; an empty list is filled in if the model
        left it out.

    Raises:
        json.JSONDecodeError: if no JSON can be decoded from the reply.
    """
    website = Website(url)
    response = ollama_via_openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)},
        ],
    )
    # A missing (None) content is treated as empty text so the failure shows
    # up as a JSON decoding error rather than a TypeError inside json.loads.
    reply = response.choices[0].message.content or ""
    try:
        parsed = json.loads(reply)
    except json.JSONDecodeError:
        # The reply may wrap the JSON in a markdown code fence or surround it
        # with a sentence; retry on the outermost {...} span before giving up.
        start, end = reply.find("{"), reply.rfind("}")
        if start == -1 or end < start:
            raise
        parsed = json.loads(reply[start:end + 1])
    if isinstance(parsed, dict):
        # Callers index result["links"] directly.
        parsed.setdefault("links", [])
    return parsed

In [24]:
# Try get_links on a real site and print the parsed result
test_get_links = get_links(url="https://huggingface.co")
print(test_get_links)

{'links': [{'type': 'company page', 'url': 'https://huggingface.co'}, {'type': 'about page', 'url': 'https://huggingface.co/'}, {'type': 'model links', 'url': 'https://huggingface.co/models/'}, {'type': 'dataset links', 'url': 'https://huggingface.co/datasets/'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'join/discord link', 'url': 'https://huggingface.co/join/discord'}, {'type': 'github link', 'url': 'https://github.com/huggingface'}, {'type': 'twitter link', 'url': 'https://twitter.com/huggingface'}, {'type': 'linkedin company page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


## STEP 2 : Creating the Brochure
Calling the LLM again, this time to write the brochure.

In [29]:
def get_all_details(url):
    """
    Collect the text of the landing page plus every relevant linked page.

    Args:
        url: landing page of the company website.

    Returns:
        str: the landing page contents followed by one section per selected
        link, each headed by the link's type. Links that cannot be fetched
        are reported with ``print`` and left out.
    """
    from urllib.parse import urljoin

    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    # I'm removing facebook and youtube pages for technical reasons; if you want
    # to keep them, empty this tuple. Both the type label and the URL are
    # checked, because the label is free text chosen by the model.
    blocked_words = ("facebook", "youtube")
    kept = []
    for link in links.get("links") or []:
        if not isinstance(link, dict):
            continue
        link_type = str(link.get("type") or "")
        link_url = link.get("url")
        if not isinstance(link_url, str) or not link_url:
            # Nothing to fetch for an entry without a usable URL.
            continue
        searchable = f"{link_type} {link_url}".lower()
        if any(word in searchable for word in blocked_words):
            continue
        # The model may echo a relative link; resolve it against the landing page.
        kept.append({**link, "type": link_type, "url": urljoin(url, link_url)})
    links["links"] = kept
    print("Found links:", links)
    for link in kept:
        try:
            page = Website(link["url"])
        except requests.RequestException as error:
            # One unreachable page should not discard everything gathered so far.
            print(f"Skipping {link['url']}: {error}")
            continue
        result += f"\n\n{link['type']}\n"
        result += page.get_contents()
    return result

# System prompt for the brochure-writing call. The fragments are joined by
# Python with nothing in between, so each one except the last ends in a space.
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)

def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """
    Build the user message for the brochure-writing call.

    Args:
        company_name: name used to introduce the company to the model.
        url: landing page of the company website; its contents and those of
            the selected linked pages are appended to the prompt.
        max_chars: maximum length of the returned prompt in characters. The
            prompt is cut off at this length; pass ``None`` to disable
            truncation.

    Returns:
        str: the prompt, truncated to ``max_chars`` characters.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    if max_chars is not None:
        # Keeps the prompt small enough for the model by dropping the tail.
        user_prompt = user_prompt[:max_chars]
    return user_prompt

def create_brochure(company_name, url):
    """
    Ask the model for a brochure about a company and render it in the notebook.

    Args:
        company_name: name used in the prompt to introduce the company.
        url: landing page of the company website.

    Returns:
        None. The model's reply is shown as Markdown via ``display``.
    """
    chat_api = ollama_via_openai.chat.completions
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = chat_api.create(model=MODEL, messages=conversation)
    brochure_markdown = completion.choices[0].message.content
    display(Markdown(brochure_markdown))

In [34]:
# Generate and display the brochure for a real company site
create_brochure(company_name="Real Python", url="https://realpython.com")

Found links: {'links': [{'type': 'about page', 'url': 'https://realpython.com/about/'}, {'type': 'mission', 'url': 'https://realpython.com/mission/'}, {'type': 'teams', 'url': 'https://realpython.com/team/'}, {'type': 'newsletter', 'url': 'https://realpython.com/newsletter/'}, {'type': 'media-kit', 'url': 'https://realpython.com/media-kit/'}, {'type': 'sponsorships', 'url': 'https://realpython.com/sponsorships/'}, {'type': 'contact', 'url': 'https://realpython.com/contact/'}]}


# Real Python Brochure

Real Python is a community-driven platform dedicated to the advancement of Python programming and software development.

## Our Mission

At Real Python, our mission is to provide high-quality tutorials, resources, and support to learners of all levels. We aim to bridge the gap between beginners and experienced developers by offering guided learning paths, interactive quizzes, and personalized code assistance.

## Values

*   Community: We believe in building a vibrant community of Pythonistas who can learn from each other, share knowledge, and collaborate on projects.
*   Quality: We are committed to delivering the highest-quality content that is accurate, informative, and engaging.
*   Accessibility: We strive to make programming skills accessible to everyone, regardless of background or experience.

## Customer and Learner Experience

Our customers are learners who want to master Python and its ecosystem. They receive:

*   In-depth tutorials and video courses
*   Guided learning paths for accelerated learning
*   Interactive quizzes to evaluate progress
    Our commitment is to support our customers throughout their learning journey.

## Careers and Jobs

Real Python offers various career opportunities for professionals who share our passion for programming and community development. We are dedicated to nurturing:

*   Talented individuals who can help us grow and improve
*   Employers who can find skilled Python developers on our job board