In [30]:
# imports

import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
import google.generativeai as genai


In [31]:
# Read settings from the local .env file; override=True lets values from
# .env take precedence over ones already present in the environment.
load_dotenv(override=True)
api_key = os.environ.get('GEMINI_API_KEY')

# Hand the key to the Gemini client library.
genai.configure(api_key=api_key)

## Why use headers?
The User-Agent header identifies the client (usually a browser) making the request. Some websites block requests that don’t have a valid User-Agent, assuming they come from bots or scrapers. Sending a browser-like User-Agent makes the request look like it comes from a person using a browser, which helps avoid being blocked or restricted by the website.

In [32]:
# use beautiful soup for web-scraping text from a website

# Browser-like request headers; some websites refuse requests without them.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class Website:
    """A web page fetched over HTTP and reduced to its title and visible text.

    Attributes:
        url: Address the page is requested from.
        title: Text of the page's <title> tag, or "No title found" when the
            page has no usable title or could not be retrieved.
        text: Newline-separated text of the page body with script, style,
            img and input tags removed; "" when the page could not be
            retrieved or has no <body>.
        timeout: Seconds to wait for the server before giving up.
    """

    url: str
    title: str
    text: str
    timeout: float = 10

    def __init__(self, url, timeout=10):
        """Store ``url`` and immediately scrape it.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the HTTP response.
        """
        self.url = url
        self.timeout = timeout
        self.scrape()

    def scrape(self):
        '''Scrape the website at ``self.url`` into ``self.title`` and ``self.text``.

        A non-200 response is reported with a printed message and leaves the
        fallback title and empty text in place, so callers can always read
        both attributes. Exceptions raised by ``requests`` (connection
        errors, timeouts) are not caught here.
        '''
        # Start from safe defaults so the attributes exist on every path.
        self.title = "No title found"
        self.text = ""

        # Without a timeout a stalled server would block the notebook forever.
        response = requests.get(self.url, headers=headers, timeout=self.timeout)
        if response.status_code != 200:
            print(f"Failed to retrieve the URL. Status code: {response.status_code}")
            return

        soup = BeautifulSoup(response.text, 'html.parser')

        # .string is None for an empty <title> or one with nested markup.
        if soup.title and soup.title.string:
            self.title = soup.title.string

        body = soup.body
        if body is None:
            # Nothing to extract (e.g. a fragment or non-HTML payload).
            return

        # remove irrelevant tags as we need only texts
        for irrelevant in body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        self.text = body.get_text(separator='\n', strip=True)

 

        

In [None]:
# Fetch the page once, then show the scraped title followed by the body text.
page = Website("https://edwarddonner.com")
print(page.title, page.text, sep="\n")

In [54]:
# Instruction that frames every summarisation request sent to a model.
system_prompt = (
    'You are an assistant that analyzes the contents of a website '
    'and provides a short summary, ignoring text that might be navigation related. '
    'Respond in markdown.'
)

In [55]:
# A function that writes a User Prompt that asks for summaries of websites:

def user_prompt_for(website):
    """Build the user prompt asking for a short markdown summary of ``website``.

    The prompt names the page title, states the summarisation request and
    ends with the page text, read from ``website.title`` and ``website.text``.
    """
    intro = f"You are looking at a website titled {website.title}"
    instructions = (
        '\nThe contents of this website is as follows; '
        'please provide a short summary of this website in markdown. '
        'If it includes news or announcements, then summarize these too.\n\n'
    )
    return intro + instructions + website.text

In [36]:
def get_message(system_prompt, website):
    """Combine the system prompt and the user prompt for ``website`` into one string.

    The result has the form "context: <system prompt> \\n prompt: <user prompt>",
    where the user prompt comes from ``user_prompt_for(website)``.
    """
    prompt_body = user_prompt_for(website)
    return f"context: {system_prompt} \n prompt: {prompt_body}"




In [39]:
def summarize(url, model_name="gemini-1.5-flash"):
    """Fetch ``url``, ask a Gemini model to summarise it and render the reply.

    Args:
        url: Address of the page to summarise.
        model_name: Model name passed to ``genai.GenerativeModel``. Defaults
            to "gemini-1.5-flash", the value that used to be hard-coded, so
            existing ``summarize(url)`` calls behave as before.

    Returns:
        None. The model's reply is displayed as Markdown.
    """
    website = Website(url)
    model = genai.GenerativeModel(model_name)
    # The system prompt and the user prompt are merged into a single text
    # prompt by get_message before being sent to the model.
    response = model.generate_content(get_message(system_prompt, website))
    display(Markdown(response.text))

In [46]:
# Summarise the site with Gemini and render the reply as Markdown.
summarize("https://edwarddonner.com")

## Edward Donner Website Summary

This website is a personal portfolio for Ed Donner, co-founder and CTO of Nebula.io, an AI company focused on talent discovery.  Ed describes his interests as coding, LLMs, DJing, and electronic music production. He's also the founder and former CEO of untapt, an AI startup acquired in 2021.  His website features "Outsmart," an arena where LLMs compete in diplomacy and deviousness.

**Recent Posts:**

* **December 21, 2024:** Welcomes SuperDataScientists.
* **November 13, 2024:**  Provides resources on mastering AI and LLM engineering.
* **October 16, 2024:** Offers resources for transitioning from software engineer to AI data scientist.
* **August 6, 2024:** Announces the Outsmart LLM arena.


## Running models locally (Ollama)

### Recap on installation of Ollama

Simply visit [ollama.com](https://ollama.com) and install!

Once installation is complete, the Ollama server should already be running locally.  
If you visit:  
[http://localhost:11434/](http://localhost:11434/)

You should see the message `Ollama is running`. 


In [64]:
import ollama
def summarizeLocally(url, model_name="llama3.2"):
    """Fetch ``url``, summarise it with a model served by Ollama and render the reply.

    Args:
        url: Address of the page to summarise.
        model_name: Name of the model passed to ``ollama.chat``. Defaults to
            "llama3.2", the value that used to be hard-coded, so existing
            ``summarizeLocally(url)`` calls behave as before.

    Returns:
        None. The model's reply is displayed as Markdown.
    """
    website = Website(url)
    # The system prompt and the user prompt are sent as separate chat messages.
    messages = [
        {'role': "system", 'content': system_prompt},
        {'role': 'user', "content": user_prompt_for(website)},
    ]
    response = ollama.chat(model=model_name, messages=messages)
    display(Markdown(response['message']['content']))

In [65]:
# Summarise the same site with the local Ollama model and render the reply as Markdown.
summarizeLocally("https://edwarddonner.com")

# Website Summary
### Home - Edward Donner
This website is a personal blog of Edward Donner, the co-founder and CTO of Nebula.io. The site features:

*   **About Me**: A brief introduction to Edward's interests, experience, and work in AI.
*   **Personal Projects**: Links to his other ventures, including untapt (acquired in 2021) and Nebula.io.
*   **News/Announcements**:
    *   **December 21, 2024**: Welcome message for SuperDataScientists, indicating a community or newsletter signup process.
    *   **November 13, 2024**: Mastering AI and LLM Engineering – Resources, suggesting a series of articles or resources on the topic.
    *   **October 16, 2024**: From Software Engineer to AI Data Scientist – resources, another resource-focused announcement.
    *   **August 6, 2024**: Outsmart LLM Arena – a battle of diplomacy and deviousness, describing a unique arena where LLMs compete in diplomatic challenges.

The site also features links to Edward's professional profiles on LinkedIn, Twitter, Facebook, and a newsletter subscription process.