# Web Scraping to Generate a Brochure with Language Model Selection (GPT, Claude, Gemini, Ollama) and Visualization in Gradio

In [1]:
from bs4 import BeautifulSoup
import requests
from openai import OpenAI
import google.generativeai
import anthropic
import ollama
from dotenv import load_dotenv
import os
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


## Configure API connections

In [2]:
# Load variables from a local .env file (if present) into the process
# environment so the API keys below can be read with os.getenv().
load_dotenv()

# The keys get their own names: binding them to `openai` / `claude` / `gemini`
# would be lost as soon as the client objects are assigned to those names.
openai_api_key = os.getenv("OPENAI_API_KEY")
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
google_api_key = os.getenv("GOOGLE_API_KEY")

# Model identifiers, one per provider.
openai_model = "gpt-4o-mini"
claude_model = "claude-3-haiku-20240307"
gemini_model = "gemini-1.5-flash"
ollama_model = "llama3.2"

# Client objects, built with the keys read above.
# NOTE(review): a missing key is passed as None, which is expected to leave
# each SDK's own environment-variable lookup in effect - confirm against the
# installed SDK versions.
openai = OpenAI(api_key=openai_api_key)
claude = anthropic.Anthropic(api_key=anthropic_api_key)

# configure() is called for its side effect and its result is not a client;
# the original bound that result (None) to `gemini`, so the name is kept
# as None for backward compatibility.
google.generativeai.configure(api_key=google_api_key)
gemini = None

## System prompt

In [3]:
# System instructions shared by the stream_* functions. The text is kept
# line-for-line, including the newline at each break and the single leading
# space on every continuation line.
system_prompt = (
    "You are an assistant that analyzes\n"
    " the content of a company's website and creates a brief \n"
    " brochure about the company for potential clients, \n"
    " investors, and new employees. Respond in Markdown \n"
    " format. Include details about the company's culture, \n"
    " customers, careers/jobs, and courses/packages for future \n"
    " jobs if available. Add emojis."
)


## Multi-model response (streaming mode)

In [4]:
def stream_openai(user_prompt):
    """
    Streams a response from OpenAI's chat completions API.

    Args:
        user_prompt (str): The user message to send along with the system prompt.

    Yields:
        str: The response text accumulated so far, growing with each chunk.
    """
    stream = openai.chat.completions.create(
        model=openai_model,
        messages=[
            {"role": "system", "content": system_prompt},
            # The request content must go in the "user" turn; sending it as a
            # second "system" message leaves the conversation without any user input.
            {"role": "user", "content": user_prompt},
        ],
        stream=True,
    )

    ans = ""
    for chunk in stream:
        # Skip chunks that carry no choices instead of failing on choices[0].
        if not chunk.choices:
            continue
        # delta.content may be None, hence the "or" fallback.
        ans += chunk.choices[0].delta.content or ""
        yield ans

def stream_claude(user_prompt):
    """
    Streams a response from the Claude messages API.

    Args:
        user_prompt (str): The user message to send; the system prompt is passed separately.

    Yields:
        str: The response text accumulated so far, growing with each text fragment.
    """
    partial = ""
    # The object returned by messages.stream() is used as a context manager;
    # the response is capped at 500 tokens.
    with claude.messages.stream(
        model=claude_model,
        system=system_prompt,
        messages=[{"role": "user", "content": user_prompt}],
        max_tokens=500,
    ) as response:
        for piece in response.text_stream:
            partial += piece or ""
            yield partial

def stream_gemini(user_prompt):
    """
    Streams a response from the Gemini model.

    Args:
        user_prompt (str): The user message to send to the model.

    Yields:
        str: The response text accumulated so far, growing with each chunk.
    """
    # Pass the system prompt as a system instruction rather than gluing it
    # onto the user text, so Gemini receives the same system/user separation
    # that the other stream_* functions give their models.
    model = google.generativeai.GenerativeModel(
        model_name=gemini_model,
        system_instruction=system_prompt,
    )

    stream = model.generate_content(
        user_prompt,
        stream=True,
    )

    ans = ""
    for chunk in stream:
        ans += chunk.text or ""
        yield ans

def stream_ollama(user_prompt):
    """
    Streams a response from the Ollama model.

    Args:
        user_prompt (str): The user message to send along with the system prompt.

    Yields:
        str: The response text accumulated so far, growing with each chunk.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    partial = ""
    for part in ollama.chat(model=ollama_model, messages=conversation, stream=True):
        # Each streamed part holds its text under ["message"]["content"].
        partial += part["message"]["content"] or ""
        yield partial


## Webpage Content Processor

In [5]:
class Webpage:
    """
    A class to represent a webpage and extract its content.

    Attributes:
        url (str): The URL of the webpage.
        title (str): The title of the webpage, or "Sin titulo" if not available.
        text (str): The body text of the webpage with script, style, img and
            input elements removed; empty if the page has no body.
    """

    def __init__(self, url, timeout=10):
        """
        Initializes the Webpage class by fetching and parsing the webpage content.

        Args:
            url (str): The URL of the webpage to be fetched and parsed.
            timeout (float): Seconds to wait for the server before giving up.
        """
        self.url = url
        # Without a timeout the request can block indefinitely on an
        # unresponsive host, which would freeze the calling UI.
        response = requests.get(self.url, timeout=timeout)
        soup = BeautifulSoup(response.content, "html.parser")

        # title.string can be None even when a <title> tag exists (e.g. an
        # empty title), so fall back in that case too, not only when the tag
        # is missing.
        raw_title = soup.title.string if soup.title else None
        self.title = raw_title or "Sin titulo"

        if soup.body:
            # Drop elements that carry no readable content before extracting text.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

def get_user_prompt(nombre, webpage):
    """
    Generates a user prompt by incorporating the company name and webpage content.

    Args:
        nombre (str): The name of the company.
        webpage (str): The URL of the webpage to be processed.

    Returns:
        str: A formatted string containing the company name, the site URL and
        the text extracted from the page.
    """
    # Keep the URL parameter and the fetched page under separate names so the
    # prompt can show the URL; interpolating the Webpage object itself would
    # insert its default object representation instead.
    page = Webpage(webpage)
    return f"""
    El nombre de la empresa es {nombre} y este es el sitio web {page.url} que contiene esta
    informacion {page.text}
    """

def get_brochure(nombre, webpage, model):
    """
    Streams a brochure for a company using the selected model.

    Args:
        nombre (str): The name of the company.
        webpage (str): The URL of the webpage to be processed.
        model (str): Which backend to use: "gpt", "claude", "gemini" or "ollama".

    Yields:
        str: The partial responses produced by the selected model's stream function.

    Raises:
        ValueError: If `model` is not one of the supported options.
    """
    # Maps each selectable model name to the function that streams from it.
    streamers = {
        "gpt": stream_openai,
        "claude": stream_claude,
        "gemini": stream_gemini,
        "ollama": stream_ollama,
    }

    # The prompt is built first, before the model name is checked.
    user_prompt = get_user_prompt(nombre, webpage)

    streamer = streamers.get(model)
    if streamer is None:
        raise ValueError("Modelo Desconocido")
    yield from streamer(user_prompt)

## Chat interface

In [6]:
# Input widgets, in the order get_brochure expects its arguments:
# company name, website URL, model choice.
name_box = gr.Textbox(label="Your company's name:")
url_box = gr.Textbox(label="The link with http...:")
model_picker = gr.Dropdown(
    ["gpt", "claude", "gemini", "ollama"],
    label="Select a model:",
    value="gpt",
)

# The brochure is displayed in a Markdown component.
brochure_panel = gr.Markdown(label="Brochure:")

view = gr.Interface(
    fn=get_brochure,
    inputs=[name_box, url_box, model_picker],
    outputs=[brochure_panel],
)
view.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


--------


