# Brochure Generator from Website

In [1]:
# All imports
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
import google.generativeai as genai
import ollama
import gradio as gr

## Step 1: Get data and links

In [2]:
# Pull settings from a local .env file; override=True is passed so values
# from that file take precedence over ones already in the environment.
load_dotenv(override=True)

# Read the Gemini key (None when the variable is unset) and hand it to the SDK.
api_key = os.environ.get('GEMINI_API_KEY')
genai.configure(api_key=api_key)

In [3]:

# Browser-like request headers: some sites refuse requests that do not
# carry a realistic User-Agent.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.

    Creating an instance fetches ``url`` straight away. Scraping is
    best-effort: if the request fails, the server answers with a non-200
    status, or the page cannot be processed, a message is printed and any
    attribute not yet filled in keeps its previous (initially empty) value.
    """
    url: str  # address given to the constructor
    title: str  # text of the <title> tag, or "No title found!"
    body: str  # NOTE: holds the raw response bytes after a successful scrape
    text: str  # text of <body> once script/style/img/input tags are removed
    links: List[str]  # non-empty href values of every <a> tag, exactly as written in the page

    def __init__(self, url: str, timeout: float = 10.0):
        """
        Store the address, reset all fields to empty values and scrape the page.

        Args:
            url: Page to fetch.
            timeout: Seconds to wait on the server before the request is
                abandoned, so an unresponsive site cannot block forever.
        """
        self.url = url
        self.timeout = timeout
        self.title = ""
        self.body = ""
        self.text = ""
        self.links = []
        self.scrape()

    def scrape(self):
        """Scrapes the content of the website at the provided URL."""
        try:
            response = requests.get(self.url, headers=headers, timeout=self.timeout)
            if response.status_code != 200:
                print(f"Failed to retrieve the URL. Status code: {response.status_code}")
                return

            soup = BeautifulSoup(response.content, 'html.parser')
            self.body = response.content

            # <title> may be absent, or present without a single text child
            # (in which case .string is None); fall back in both cases.
            page_title = soup.title.string if soup.title else None
            self.title = page_title or "No title found!"

            if soup.body is not None:
                # Remove irrelevant tags as we only need text
                for irrelevant in soup.body.find_all(["script", "style", "img", "input"]):
                    irrelevant.decompose()
                self.text = soup.body.get_text(separator='\n', strip=True)
            else:
                # Documents without <body> have no text, but their links
                # are still collected below.
                self.text = ""

            # Extract links, dropping anchors without an href
            hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
            self.links = [href for href in hrefs if href]

        except Exception as e:
            # Deliberately broad: scraping is best-effort and callers rely on
            # construction never raising.
            print(f"An error occurred while scraping: {e}")

    def get_contents(self):
        """Return the title and page text formatted as a prompt section."""
        return f"Webpage Title:\n{self.title}\nWebpage Content:\n{self.text}\n\n"


In [None]:
# Quick check: scrape a sample site and show the hrefs that were collected.
web = Website("https://edwarddonner.com")
web.links

In [13]:
# System prompt for the link-selection step: it tells the model which kinds
# of links matter and shows the JSON shape the reply must follow. The example
# has to be valid JSON itself, because the model imitates it and the reply is
# later parsed with json.loads.
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [None]:
# Show the assembled system prompt so the embedded JSON example can be inspected.
print(link_system_prompt)

In [5]:
def get_links_user_prompt(website):
    """Build the user message asking the model to pick brochure-worthy links.

    Args:
        website: Object exposing ``url`` (str) and ``links`` (list of str).

    Returns:
        The instruction text followed by the links, one per line, exactly
        as they are stored on ``website``.
    """
    intro = f"Here is the list of links on the website of {website.url} - "
    request = (
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links. Don't include json term in output\n"
    )
    heading = "Links (some might be relative links):\n"
    listing = "\n".join(website.links)
    return "".join([intro, request, heading, listing])

In [6]:
def get_links_gemini(url):
    """Ask Gemini which links found on ``url`` belong in a company brochure.

    Args:
        url: Address of the page whose links should be classified.

    Returns:
        The model's reply parsed from JSON. The system prompt asks for the
        shape ``{"links": [{"type": ..., "url": ...}, ...]}``.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    website = Website(url)
    model = genai.GenerativeModel(
        "gemini-1.5-flash",
        system_instruction=link_system_prompt,
        # Ask for a JSON response body. Without this the model is free to
        # wrap its answer in a Markdown code fence or add commentary, either
        # of which makes json.loads fail.
        generation_config={"response_mime_type": "application/json"},
    )
    response = model.generate_content(get_links_user_prompt(website))
    return json.loads(response.text)

def get_links_ollama(url):
    """Ask a local Ollama model which links found on ``url`` belong in a brochure.

    Args:
        url: Address of the page whose links should be classified.

    Returns:
        The model's reply parsed from JSON. The system prompt asks for the
        shape ``{"links": [{"type": ..., "url": ...}, ...]}``.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    website = Website(url)
    messages = [
        {'role': 'system', 'content': link_system_prompt},
        {'role': 'user', 'content': get_links_user_prompt(website)},
    ]
    # format="json" constrains the reply to a JSON document; a free-form
    # reply with surrounding prose or code fences would break json.loads.
    response = ollama.chat(model="llama3.2", messages=messages, format="json")
    return json.loads(response['message']['content'])

In [7]:
# Try the Ollama-backed link picker against a live site.
get_links_ollama("https://anthropic.com")

{'links': [{'type': 'about page', 'url': 'https://www.anthropic.com'},
  {'type': 'company page', 'url': 'https://www.anthropic.com/company'},
  {'type': 'careers page', 'url': 'https://www.anthropic.com/careers'},
  {'type': 'news page', 'url': 'https://www.anthropic.com/news'},
  {'type': 'research page',
   'url': 'https://www.anthropic.com/research#entry:8@1:url'},
  {'type': 'press contact email', 'url': 'mailto:press@anthropic.com'},
  {'type': 'support webpage', 'url': 'https://support.anthropic.com/'},
  {'type': 'status page', 'url': 'https://status.anthropic.com/'},
  {'type': 'trust page', 'url': 'https://trust.anthropic.com/'},
  {'type': 'twitter handle', 'url': 'https://twitter.com/AnthropicAI'},
  {'type': 'linkedin company page',
   'url': 'https://www.linkedin.com/company/anthropicresearch'},
  {'type': 'youtube channel', 'url': 'https://www.youtube.com/@anthropic-ai'}]}

## Step 2: Generate Brochure

In [23]:
def get_details_gemini(url):
    """Collect the text of the landing page and of the pages Gemini selects.

    Args:
        url: Landing page of the company website.

    Returns:
        One string: a "Landing Page" section followed by a section per
        selected link, each headed by the link's ``type``.
    """
    from urllib.parse import urljoin, urlparse

    result = "Landing Page:\n"
    website = Website(url)
    result += website.get_contents()
    links = get_links_gemini(url)
    for link in links['links']:
        # The model may hand back relative paths or non-web targets such as
        # mailto: links; resolve the former against the landing page and
        # skip the latter, since neither can be fetched as given.
        target = urljoin(url, link['url'])
        if urlparse(target).scheme not in ("http", "https"):
            continue
        result += f"\n\n{link['type']}:\n"
        result += Website(target).get_contents()
    return result

def get_details_ollama(url):
    """Collect the text of the landing page and of the pages Ollama selects.

    Each link returned by the model is printed as it is processed.

    Args:
        url: Landing page of the company website.

    Returns:
        One string: a "Landing Page" section followed by a section per
        selected link, each headed by the link's ``type``.
    """
    from urllib.parse import urljoin, urlparse

    result = "Landing Page:\n"
    website = Website(url)
    result += website.get_contents()
    links = get_links_ollama(url)
    for link in links['links']:
        print(link)
        # The model may hand back relative paths or non-web targets such as
        # mailto: links; resolve the former against the landing page and
        # skip the latter, since neither can be fetched as given.
        target = urljoin(url, link['url'])
        if urlparse(target).scheme not in ("http", "https"):
            continue
        result += f"\n\n{link['type']}:\n"
        result += Website(target).get_contents()
    return result

In [None]:
# Print the combined page text gathered through the Ollama link picker.
print(get_details_ollama("https://anthropic.com"))

In [20]:
# System prompt for the brochure-writing step. Built from adjacent string
# literals so that every sentence boundary keeps its separating space (a
# backslash continuation silently joins the two lines with nothing between).
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. "
    "Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information. "
    "If including links then do not put placeholder for them instead put clickable links."
)

In [15]:
def get_brochure_user_prompt_gemini(company_name, url):
    """Build the brochure request for Gemini from the scraped site content.

    Args:
        company_name: Name used to introduce the company to the model.
        url: Landing page whose content (plus selected sub-pages) is included.

    Returns:
        The prompt text, cut off after 5,000 characters.
    """
    sections = [
        f"You are looking at a company called: {company_name}\n",
        "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n",
        get_details_gemini(url),
    ]
    # Truncate if more than 5,000 characters
    return "".join(sections)[:5_000]

def get_brochure_user_prompt_ollama(company_name, url):
    """Build the brochure request for Ollama from the scraped site content.

    Args:
        company_name: Name used to introduce the company to the model.
        url: Landing page whose content (plus selected sub-pages) is included.

    Returns:
        The prompt text, cut off after 5,000 characters.
    """
    sections = [
        f"You are looking at a company called: {company_name}\n",
        "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n",
        get_details_ollama(url),
    ]
    # Truncate if more than 5,000 characters
    return "".join(sections)[:5_000]

In [None]:
# Preview the brochure prompt. The prompt builders are backend-specific
# (`_gemini` / `_ollama`); there is no un-suffixed variant to call.
get_brochure_user_prompt_gemini("Anthropic", "https://anthropic.com")

In [None]:
# get brochure from the prompt data

def generate_brochure(company_name, url):
    """Generate a brochure with Gemini and render it as Markdown in the notebook.

    Args:
        company_name: Name of the company the brochure is about.
        url: Landing page of the company website.
    """
    model = genai.GenerativeModel(
        "gemini-1.5-flash",
        system_instruction=system_prompt,
    )
    response = model.generate_content(
        # This function talks to Gemini, so it uses the Gemini prompt
        # builder; an un-suffixed builder does not exist.
        get_brochure_user_prompt_gemini(company_name, url),
    )
    display(Markdown(response.text))

In [None]:
# Render a complete (non-streamed) brochure for a sample company.
generate_brochure("Anthropic", "https://anthropic.com")

In [None]:
# generate stream of content

def generate_brochure_stream_gemini(company_name, url):
    """Stream a Gemini-written brochure into the notebook, updating it live.

    Args:
        company_name: Name of the company the brochure is about.
        url: Landing page of the company website.
    """
    model = genai.GenerativeModel(
        "gemini-1.5-flash",
        system_instruction=system_prompt,
    )
    response = model.generate_content(
        get_brochure_user_prompt_gemini(company_name, url),
        stream=True,
    )
    raw = ""
    display_handler = display(Markdown(""), display_id=True)
    for chunk in response:
        raw += chunk.text
        # Strings are immutable, so the cleaned text must be assigned.
        # Cleaning the whole accumulated reply (rather than each chunk)
        # also removes a Markdown code fence that arrives split across
        # chunks; leaving the fence in would render the brochure as code.
        result = raw.replace("```markdown", "").replace("```", "")
        update_display(Markdown(result), display_id=display_handler.display_id)


def generate_brochure_stream_ollama(company_name, url):
    """Stream an Ollama-written brochure into the notebook, updating it live.

    Args:
        company_name: Name of the company the brochure is about.
        url: Landing page of the company website.
    """
    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': get_brochure_user_prompt_ollama(company_name, url)},
    ]
    # stream=True is required for the loop below: it makes the call return
    # an iterator of partial replies instead of one finished response.
    response = ollama.chat(model='llama3.2', messages=messages, stream=True)

    raw = ""
    display_handler = display(Markdown(""), display_id=True)
    for chunk in response:
        raw += chunk['message']['content']
        # Strings are immutable, so the cleaned text must be assigned.
        # Cleaning the whole accumulated reply (rather than each chunk)
        # also removes a Markdown code fence that arrives split across
        # chunks; leaving the fence in would render the brochure as code.
        result = raw.replace("```markdown", "").replace("```", "")
        update_display(Markdown(result), display_id=display_handler.display_id)


In [None]:
# Stream a brochure for a sample company. The streaming helpers are
# backend-specific (`_gemini` / `_ollama`); there is no un-suffixed variant.
generate_brochure_stream_gemini("Anthropic", "https://anthropic.com")

In [16]:
# generate stream of content

def generate_brochure_stream_gemini_gradio(company_name, url):
    """Yield the Gemini-written brochure as it grows, for a streaming UI.

    Args:
        company_name: Name of the company the brochure is about.
        url: Landing page of the company website.

    Yields:
        The full brochure text received so far, with Markdown code fences
        removed, once per streamed chunk.
    """
    model = genai.GenerativeModel(
        "gemini-1.5-flash",
        system_instruction=system_prompt,
    )
    response = model.generate_content(
        get_brochure_user_prompt_gemini(company_name, url),
        stream=True,
    )
    raw = ""
    for chunk in response:
        raw += chunk.text
        # Strings are immutable, so the cleaned text must be assigned.
        # Cleaning the whole accumulated reply (rather than each chunk)
        # also removes a Markdown code fence that arrives split across
        # chunks; leaving the fence in would render the brochure as code.
        result = raw.replace("```markdown", "").replace("```", "")
        yield result

def generate_brochure_stream_ollama_gradio(company_name, url):
    """Yield the Ollama-written brochure as it grows, for a streaming UI.

    Args:
        company_name: Name of the company the brochure is about.
        url: Landing page of the company website.

    Yields:
        The full brochure text received so far, with Markdown code fences
        removed, once per streamed chunk.
    """
    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': get_brochure_user_prompt_ollama(company_name, url)},
    ]
    response = ollama.chat(model='llama3.2', messages=messages, stream=True)

    raw = ""
    for chunk in response:
        raw += chunk['message']['content']
        # Strings are immutable, so the cleaned text must be assigned.
        # Cleaning the whole accumulated reply (rather than each chunk)
        # also removes a Markdown code fence that arrives split across
        # chunks; leaving the fence in would render the brochure as code.
        result = raw.replace("```markdown", "").replace("```", "")
        yield result

def stream_model(model, company_name, url):
    """Stream a brochure from the backend named by ``model``.

    Args:
        model: Backend label, compared case-insensitively; "Gemini" and
            "Lama" are recognised.
        company_name: Name of the company the brochure is about.
        url: Landing page of the company website.

    Yields:
        Whatever the selected backend generator yields.

    Raises:
        ValueError: On first iteration, when ``model`` is not recognised.
    """
    backend = model.upper()
    if backend not in ("GEMINI", "LAMA"):
        raise ValueError("unknown model!")

    if backend == "GEMINI":
        chunks = generate_brochure_stream_gemini_gradio(company_name, url)
    else:
        chunks = generate_brochure_stream_ollama_gradio(company_name, url)
    yield from chunks

In [24]:
# Gradio front end: a model picker plus two text fields feed stream_model,
# and the streamed result is rendered as Markdown.
view = gr.Interface(
    fn=stream_model,
    inputs=[
        gr.Dropdown(["Gemini", "Lama"], label="Select model", value="Gemini"),
        gr.Textbox(label="Company Name:"),
        gr.Textbox(label="Website Link:"),
    ],
    outputs=[gr.Markdown(label="Response:")],
    flagging_mode="never",
)
view.launch()
# view.launch(share=True)

* Running on local URL:  http://127.0.0.1:7871

To create a public link, set `share=True` in `launch()`.




{'type': 'About page', 'url': 'https://www.anthropic.com/company'}
{'type': 'Research page', 'url': 'https://www.anthropic.com/research#entry:8@1:url'}
{'type': 'Claude AI website', 'url': 'https://claude.ai/'}
{'type': 'Jobs/Careers page', 'url': 'https://www.anthropic.com/jobs'}
{'type': 'News article', 'url': 'https://www.anthropic.com/news/core-views-on-ai-safety'}
{'type': 'Customer section', 'url': 'https://www.anthropic.com/customers'}
{'type': 'Support website', 'url': 'https://support.anthropic.com/'}
{'type': 'Status page', 'url': 'https://status.anthropic.com/'}
{'type': 'Country support', 'url': 'https://www.anthropic.com/supported-countries'}
{'type': 'Company social media profiles', 'url': 'https://www.linkedin.com/company/anthropicresearch'}
{'type': 'Company Twitter profile', 'url': 'https://twitter.com/AnthropicAI'}
