In [None]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from openai import OpenAI
from urllib.parse import urlparse

# Load variables from a .env file into the process environment.
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)
# NOTE(review): api_key is read here but is not referenced again in the
# visible code; OpenAI() below is constructed without it, so the client
# presumably picks OPENAI_API_KEY up from the environment itself. Confirm
# before removing this line.
api_key = os.getenv('OPENAI_API_KEY')

# Chat model used by every completion request in this file.
MODEL = 'gpt-4o-mini'
# Shared client; rebinding the name `openai` here means the functions below
# call this instance, not the `openai` package.
openai = OpenAI()

# Browser-like User-Agent sent with every page fetch in Website.__init__;
# a standard browser header helps prevent sites from blocking the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

# Scrapes and parses website data and links
class Website:
    """Fetch one web page and expose its title, visible text and links.

    Attributes set by ``__init__``:
        url:   the URL that was requested.
        body:  raw response bytes.
        title: text of the ``<title>`` element, or "No title found".
        text:  visible text of ``<body>`` with script/style/img/input
               elements removed, one fragment per line ("" if no body).
        links: every non-empty ``href`` found on an ``<a>`` tag, exactly as
               written in the page (may be relative).
    """

    def __init__(self, url, timeout=30):
        """Download and parse ``url``.

        Args:
            url: Absolute URL to fetch.
            timeout: Seconds to wait for the server, passed to
                ``requests.get``.

        Raises:
            requests.RequestException: on connection failure, invalid URL or
                timeout. HTTP error statuses are not raised; the error page
                is parsed like any other page.
        """
        self.url = url
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # soup.title.string is None for an empty <title> or one with nested
        # tags, so read the text content and fall back when it is blank.
        title_tag = soup.title
        title_text = title_tag.get_text(strip=True) if title_tag else ""
        self.title = title_text or "No title found"

        if soup.body:
            # Drop elements that carry no readable content.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Keep every non-empty href; filtering for relevance happens later.
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and visible text formatted for inclusion in a prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

# System prompt for the link-selection request: asks the model to pick the
# links worth including in a brochure (About, Company, Careers/Jobs) and shows
# the JSON shape the reply should follow.
link_system_prompt = "\n".join([
    (
        "You are provided with a list of links found on a webpage. "
        "Decide which of the links are most relevant for a company brochure, "
        "such as links to an About page, Company page, or Careers/Jobs pages."
    ),
    "You should respond in JSON as in this example:",
    "{",
    '    "links": [',
    '        {"type": "about page", "url": "https://full.url/goes/here/about"},',
    '        {"type": "careers page", "url": "https://another.full.url/careers"}',
    "    ]",
    "}",
    "",  # keeps the newline that follows the closing brace
])

def get_links_user_prompt(website):
    """Build the user message that asks the model to choose brochure links.

    Args:
        website: object exposing ``url`` (str) and ``links`` (iterable of
            str), such as a ``Website`` instance.

    Returns:
        The instructions followed by the page's links, one per line.
    """
    preamble = (
        f"Here is the list of links on the website of {website.url} - "
        "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
        "Links (some might be relative links):\n"
    )
    link_lines = "\n".join(website.links)
    return preamble + link_lines

def get_links(url):
    """Scrape ``url`` and ask the model which of its links suit a brochure.

    Args:
        url: Absolute URL of the page whose links should be classified.

    Returns:
        A dict that always has a ``"links"`` key holding a list of dicts,
        each with string ``"type"`` and non-empty ``"url"`` entries. The
        list is empty when the reply is missing, is not valid JSON, or does
        not follow the requested shape. Any other top-level keys the model
        returned are kept.

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    website = Website(url)
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
        ],
        response_format={"type": "json_object"}
    )
    raw = response.choices[0].message.content

    # The reply can be None or truncated JSON; treat either as "no links"
    # so the caller can still build a brochure from the landing page.
    try:
        parsed = json.loads(raw)
    except (TypeError, json.JSONDecodeError):
        parsed = {}
    if not isinstance(parsed, dict):
        parsed = {}

    candidates = parsed.get("links")
    if not isinstance(candidates, list):
        candidates = []

    # Callers index entry["type"] and entry["url"], so guarantee both exist.
    parsed["links"] = [
        {**entry, "type": str(entry.get("type") or "page")}
        for entry in candidates
        if isinstance(entry, dict)
        and isinstance(entry.get("url"), str)
        and entry["url"]
    ]
    return parsed

def get_all_details(url):
    """Collect the landing page plus every model-selected page as one text.

    Args:
        url: Absolute URL of the company's landing page.

    Returns:
        "Landing page:" followed by the landing page contents, then one
        section per selected link (its type on its own line followed by
        that page's contents). Links that cannot be fetched are skipped
        with a printed notice.

    Raises:
        requests.RequestException: if the landing page itself cannot be
            fetched.
    """
    # Local import: the file only imports urlparse from urllib.parse.
    from urllib.parse import urljoin

    sections = ["Landing page:\n", Website(url).get_contents()]

    selected = get_links(url)
    entries = selected.get("links") if isinstance(selected, dict) else None
    for link in entries or []:
        if not isinstance(link, dict) or not link.get("url"):
            continue
        # The model may return relative hrefs; resolve against the landing
        # page. Absolute URLs pass through urljoin unchanged.
        page_url = urljoin(url, link["url"])
        try:
            page = Website(page_url)
        except requests.RequestException as exc:
            # One dead or invalid link must not abort the whole brochure.
            print(f"Skipping {page_url}: {exc}")
            continue
        sections.append(f"\n\n{link.get('type', 'page')}\n")
        sections.append(page.get_contents())

    return "".join(sections)

# System prompt for the brochure-writing request: sets the audience, asks for
# plain text without markdown, and lists the topics to cover.
system_prompt = " ".join((
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors, and recruits.",
    "Respond in clean, organized plain text only.",
    "Include clickable social/media links (as full URLs, not just platform names).",
    "Do NOT use asterisks or any special symbols for formatting or bold text—just plain text.",
    "Never use markdown formatting.",
    "Cover company culture, customers, careers/jobs if possible.",
))

def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """Build the user message for the brochure request.

    Args:
        company_name: Name to present to the model.
        url: Landing page URL; it and the selected linked pages are scraped
            via ``get_all_details``.
        max_chars: Maximum length of the returned prompt in characters
            (non-negative). This is a local cap to keep the request small,
            not a limit imposed by the API. Pass ``None`` to disable
            truncation.

    Returns:
        The prompt text, cut to at most ``max_chars`` characters.
    """
    user_prompt = (
        f"You are looking at a company called: {company_name}\n"
        "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in plain text.\n"
    )
    user_prompt += get_all_details(url)
    # Slicing with None returns the whole string, so None means "no cap".
    return user_prompt[:max_chars]

def extract_company_name(url):
    """Derive a display name for a company from its URL.

    The name is the registrable label of the host, capitalised:
    "https://www.facebook.com" gives "Facebook". Port and credentials are
    ignored, and a URL without a scheme ("www.example.com") is accepted.

    For hosts under a two-letter country code with a generic second-level
    label ("bbc.co.uk", "example.com.au") the label before that suffix is
    used. This is a heuristic, not a full public-suffix lookup.

    Args:
        url: URL or bare host name.

    Returns:
        The capitalised name, or "" when no host can be found.
    """
    # urlparse only recognises a host after "//"; add it for bare hosts.
    has_authority = "://" in url or url.startswith("//")
    target = url if has_authority else "//" + url

    # .hostname (unlike .netloc) drops "user:pass@" and ":port".
    host = (urlparse(target).hostname or "").strip(".")
    labels = [label for label in host.split(".") if label]
    if not labels:
        return ""
    if len(labels) == 1:
        return labels[0].capitalize()

    generic_second_level = {"co", "com", "org", "net", "gov", "edu", "ac"}
    name = labels[-2]
    if len(labels) >= 3 and len(labels[-1]) == 2 and name in generic_second_level:
        name = labels[-3]
    return name.capitalize()

def create_brochure_plain(url):
    """Generate a plain-text brochure for the company at ``url``.

    Scrapes the site, sends the collected text to the chat model together
    with the brochure system prompt, and strips any HTML tags from the reply.

    Args:
        url: Absolute URL of the company's landing page.

    Returns:
        The brochure text.
    """
    name = extract_company_name(url)
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(name, url)},
    ]
    completion = openai.chat.completions.create(model=MODEL, messages=conversation)
    reply = completion.choices[0].message.content
    # Parsing as HTML and reading the text back drops any tags in the reply.
    return BeautifulSoup(reply, "html.parser").get_text()




def run_brochure_flow(url):
    """Generate a brochure for ``url``, print it, and return it.

    Args:
        url: Company site address; "https://" is prepended when the value
            does not already start with "http://" or "https://".

    Returns:
        The brochure text that was printed.
    """
    has_scheme = url.startswith(("http://", "https://"))
    target = url if has_scheme else "https://" + url
    print("\nGenerating company brochure. Please wait...\n")
    brochure = create_brochure_plain(target)
    print("=== Company Brochure ===\n")
    print(brochure)
    return brochure

# Example run. The address has no scheme, so run_brochure_flow prepends
# "https://". This line fetches pages and calls the chat model when executed.
run_brochure_flow("www.linkedin.com")



Generating company brochure. Please wait...

=== Company Brochure ===

LinkedIn Company Brochure

Welcome to LinkedIn - Your Professional Community

LinkedIn is a leading online platform that connects professionals from various industries, allowing individuals to build and engage with their professional networks. With a focus on career development, educational resources, and job opportunities, LinkedIn empowers users to navigate their professional lives effectively.

Company Culture:
At LinkedIn, we foster a culture of connection, growth, and continual learning. Our mission is to create economic opportunity for every member of the global workforce. We believe in promoting diversity, equity, and inclusion in the workplace and encourage collaboration and open conversations among our community members.

Customers:
LinkedIn serves millions of members worldwide, from job seekers looking for new opportunities to companies seeking top talent. Users can discover relevant job openings, connect

'LinkedIn Company Brochure\n\nWelcome to LinkedIn - Your Professional Community\n\nLinkedIn is a leading online platform that connects professionals from various industries, allowing individuals to build and engage with their professional networks. With a focus on career development, educational resources, and job opportunities, LinkedIn empowers users to navigate their professional lives effectively.\n\nCompany Culture:\nAt LinkedIn, we foster a culture of connection, growth, and continual learning. Our mission is to create economic opportunity for every member of the global workforce. We believe in promoting diversity, equity, and inclusion in the workplace and encourage collaboration and open conversations among our community members.\n\nCustomers:\nLinkedIn serves millions of members worldwide, from job seekers looking for new opportunities to companies seeking top talent. Users can discover relevant job openings, connect with industry experts, and access a wealth of curated conten