In [None]:
# Project Overview

This project explores the capabilities of different AI models, including **OpenAI’s GPT** and **Ollama**, to summarize online training content. It extracts material from Microsoft’s **Quantum Computing Fundamentals** learning path, cleans it, and generates concise summaries per lesson as well as an overall course summary, allowing a comparison of model outputs.

## Key Features

- Fetches and parses webpages using **requests** and **BeautifulSoup**  
- Produces summaries in multiple languages (e.g., English, Spanish, or any other language the model supports) and at varying levels of detail (short, medium, detailed)  
- Compares outputs from different AI models to evaluate coverage, clarity, and accuracy  
- Presents results as clean, structured **Markdown** directly in the notebook  

## Tech Stack

- **Models**: GPT-4o-mini (OpenAI API), Llama 3.2 (served locally via Ollama)  
- **Language**: Python  
- **Libraries**: requests, BeautifulSoup, OpenAI, python-dotenv  

## Purpose

This project demonstrates how AI can streamline understanding of technical documentation and online courses, highlighting differences in performance and output quality between models.



In [None]:
# imports

import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

# If you get an error running this cell, then please head over to the troubleshooting notebook!

In [None]:
# Load environment variables from .env file (not included)

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Check the key.
# The whitespace test runs before the prefix test: a key with a leading space or tab
# would otherwise fail `startswith` first and be reported as a wrong-prefix key,
# hiding the real problem.

if not api_key:
    print("No API key was found")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key")
else:
    print("API key found and looks good so far!")


In [None]:
# Client used for every GPT call in this notebook. No arguments are passed here, so the
# client presumably picks up OPENAI_API_KEY from the environment loaded earlier — confirm.
openai = OpenAI()

# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.


In [None]:
# ollama api
# Settings for an Ollama server addressed on localhost port 11434.
# NOTE(review): OLLAMA_API and HEADERS are not referenced by any other cell visible in this
# notebook; the visible Ollama calls go through the `ollama_via_openai` client instead —
# confirm whether these two constants are still needed.
OLLAMA_API = "http://localhost:11434/api/chat"
HEADERS = {"Content-Type": "application/json"}
MODEL_LLAMA = "llama3.2"  # model name passed to the Ollama-backed chat completion calls

In [None]:
!ollama pull llama3.2

In [None]:
# Second OpenAI-SDK client whose base URL points at localhost:11434/v1, so the same
# chat.completions calls can be sent to the local server.
# NOTE(review): the api_key value 'ollama' looks like a placeholder — confirm the server ignores it.
ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')

In [None]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A fetched web page reduced to the pieces the prompts need.

    Attributes:
        url:   the URL that was requested.
        title: text of the <title> tag, or "No title found" when it is missing or empty.
        text:  visible text of <body> with script/style/img/input elements removed,
               or "" when the document has no <body>.
        links: stripped, non-empty href values of every <a href=...> in the document.
    """

    def __init__(self, url, timeout=30):
        """
        Create this Website object from the given url using the BeautifulSoup library

        Args:
            url: address of the page to download.
            timeout: seconds to wait for the server before requests gives up; without it
                a stalled connection would block the notebook indefinitely.

        Raises:
            requests.RequestException: if the page cannot be fetched (bad URL, network
                failure, timeout).
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        # soup.title.string is None when <title> is empty or contains nested tags,
        # so test the string itself rather than only the tag.
        title_text = soup.title.string if soup.title else None
        self.title = title_text if title_text else "No title found"

        # Documents without a <body> (fragments, some error responses) yield empty text
        # instead of raising AttributeError on None.
        body = soup.body
        if body is None:
            self.text = ""
        else:
            for irrelevant in body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = body.get_text(separator="\n", strip=True)

        hrefs = (a.get("href") for a in soup.find_all("a", href=True))
        self.links = [href.strip() for href in hrefs if href and href.strip()]

In [None]:
# Create a system prompt function that can use different language and length 

def build_system_prompt(language="Spanish", length="short"):
    """
    Build the system prompt for the summarization calls.

    Args:
        language: language the model is told to respond in.
        length: requested level of detail. "short", "medium" and "detailed" map to a
            word budget; any other value is passed through as a description only,
            with no word budget.

    Returns:
        The system prompt text. With the default arguments the text is identical to
        the previous hard-coded prompt (20 words or less).
    """
    # The 20-word cap used to apply to every length, contradicting "medium"/"detailed".
    # The medium and detailed budgets are values chosen here; tune as needed.
    word_budgets = {
        "short": "20 words or less ",
        "medium": "100 words or less ",
        "detailed": "300 words or less ",
    }
    budget = word_budgets.get(str(length).strip().lower(), "")
    return f"""You are an assistant that analyzes the contents of a website and provides a {length} summary, ignoring text that might be navigation related.
    Respond in {budget}markdown, and respond in {language}.
    """
    
    
                        

In [None]:
# Create a function that writes a User Prompt that asks for summaries of websites:

def user_prompt_for(website, language=None, length=None):
    """
    Build the user message that carries the page title and text to be summarized.

    Args:
        website: object exposing `title` and `text` attributes.
        language: optional language to name in the request. When omitted, the request
            names no language and leaves that choice to the system prompt.
        length: optional level of detail to name in the request (e.g. "short"). When
            omitted, the request names no length, so it cannot contradict the system prompt.

    Returns:
        The user prompt as a single string ending with the website text.
    """
    # The old string was not an f-string, so the literal text "{language}" reached the model.
    request = "please provide a "
    if length:
        request += f"{length} "
    request += "summary "
    if language:
        request += f"in {language} "
    request += "of this website in markdown."

    user_prompt = f"You are looking at a website titled {website.title}"
    user_prompt += (
        f"\nThe contents of this website is as follows; {request} "
        "If it includes news or announcements, then summarize these too.\n\n"
    )
    user_prompt += website.text
    return user_prompt

In [None]:

def messages_for(website, language="Spanish", length="short"):
    """
    Assemble the chat payload for summarizing `website`.

    Returns a two-item list: the system message built from `language` and `length`,
    followed by the user message built from the website.
    """
    system_message = {"role": "system", "content": build_system_prompt(language, length)}
    user_message = {"role": "user", "content": user_prompt_for(website)}
    return [system_message, user_message]

In [None]:
# System prompt for the link-filtering step. get_links and get_links_ollama send it as the
# system message; it asks the model to keep only lesson/module links and to answer with a
# JSON object containing a "links" array.
link_system_prompt = """
You are provided with a list of links found on a Microsoft Learn training page.

Decide which of the links are lessons within the Quantum Computing Fundamentals training path
(https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/).

- Only include links that are actual lesson/module pages within this training path.
- Replace relative links (like /training/modules/...) with full https://learn.microsoft.com/... URLs.
- Ignore links to navigation, terms of service, privacy, blogs, or anything not part of this course.

You should respond in JSON in this format:

{
    "links": [
        {"type": "lesson", "url": "https://learn.microsoft.com/en-us/training/modules/intro-to-azure-quantum/1-introduction"},
        {"type": "lesson", "url": "https://learn.microsoft.com/en-us/training/modules/intro-to-azure-quantum/2-what-is-quantum-compute"}
    ]
}
"""


In [None]:
def get_links_user_prompt(website):
    """
    Build the user message for the link-filtering call.

    The message names the page the links came from (`website.url`), repeats the
    filtering rules, and then lists every entry of `website.links`, one per line.
    """
    instruction_lines = [
        f"Here are the raw links scraped from {website.url}.",
        "Only include links that are lesson/module pages for the Quantum Computing Fundamentals path.",
        "A valid module URL will contain '/training/modules/'.",
        "Return a JSON object with an array 'links' where each item has type:'lesson' and url: full https URL.",
        "Do not include Terms of Service, Privacy, navigation, blog, external marketing, or mailto links.",
        "",  # blank line between the rules and the link list
        "Links (some may be relative):",
    ]
    preamble = "\n".join(instruction_lines) + "\n"
    link_listing = "\n".join(website.links)
    return preamble + link_listing


In [None]:
import json

def get_links(url):
    """
    Fetch `url`, send its raw links to gpt-4o-mini for filtering, and return the decoded reply.

    The request uses the JSON-object response format, and the message content is
    parsed with json.loads before being returned.
    """
    page = Website(url)  # page.links holds the raw href values
    chat_messages = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        response_format={"type": "json_object"},
    )
    # The message content is JSON text; decode it for the caller
    return json.loads(completion.choices[0].message.content)


In [None]:
import json

def get_links_ollama(url):
    """
    Fetch `url`, send its raw links to the MODEL_LLAMA model for filtering, and return the decoded reply.

    Same prompts and JSON-object response format as get_links, but sent through the
    `ollama_via_openai` client.
    """
    page = Website(url)  # page.links holds the raw href values
    chat_messages = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    completion = ollama_via_openai.chat.completions.create(
        model=MODEL_LLAMA,
        messages=chat_messages,
        response_format={"type": "json_object"},
    )
    # The message content is JSON text; decode it for the caller
    return json.loads(completion.choices[0].message.content)


In [None]:
# Try the link-filtering step with the OpenAI client; the decoded JSON is shown as the cell output.
get_links("https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/")

In [None]:
# Same link-filtering step through the Ollama-backed client, for comparison with the cell above.
get_links_ollama("https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/")

In [None]:
# Summarize a single page with the OpenAI API.

def summarize(url, language="Spanish", length="short"):
    """
    Fetch `url` and return gpt-4o-mini's summary of it.

    `language` and `length` are forwarded to messages_for; the return value is the
    text content of the first choice in the completion.
    """
    page = Website(url)
    chat_messages = messages_for(page, language, length)
    completion = openai.chat.completions.create(model="gpt-4o-mini", messages=chat_messages)
    return completion.choices[0].message.content
    

In [None]:
# Summarize a single page with the Ollama-backed client.

def summarize_ollama(url, language="Spanish", length="short"):
    """
    Fetch `url` and return the MODEL_LLAMA model's summary of it.

    `language` and `length` are forwarded to messages_for; the return value is the
    text content of the first choice in the completion.
    """
    page = Website(url)
    chat_messages = messages_for(page, language, length)
    completion = ollama_via_openai.chat.completions.create(model=MODEL_LLAMA, messages=chat_messages)
    return completion.choices[0].message.content
    

In [None]:
#Summarize all the lessons in microsoft quantum computer training

def summarize_training(path_url, language="Spanish", length="short"):
    """
    Summarize every lesson of a training path with the OpenAI client, then the path as a whole.

    Args:
        path_url: URL of the training path page whose links are filtered by get_links.
        language: language for all summaries.
        length: level of detail for all summaries.

    Returns:
        Markdown text: one "### <url>" section per lesson followed by a
        "## General Course Summary" section, or a one-line notice when no lesson
        could be summarized.
    """
    data = get_links(path_url)
    # The link list comes from a model, so its shape is validated before it is trusted.
    links = data.get("links") if isinstance(data, dict) else None
    if not isinstance(links, list):
        links = []
    print(f"Found {len(links)} lessons")

    all_summaries = []

    for link in links:
        url = link.get("url") if isinstance(link, dict) else None
        if not url:
            print(f"Skipping malformed link entry: {link!r}")
            continue
        print(f"Summarizing {url}...")
        try:
            summary = summarize(url, language, length)
        except requests.RequestException as error:
            # A relative or unreachable URL should not abort the remaining lessons.
            print(f"Skipping {url}: {error}")
            continue
        all_summaries.append(f"### {url}\n{summary}\n")

    if not all_summaries:
        # Without lesson summaries the model would have nothing to base a course summary on.
        return f"No lessons could be summarized for {path_url}."

    lesson_sections = "\n".join(all_summaries)
    combined_prompt = "Here are summaries of each lesson:\n\n" + lesson_sections
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": build_system_prompt(language, length)},
            {"role": "user", "content": "Please summarize the entire training path based on these lesson summaries:\n\n" + combined_prompt}
        ]
    )

    return lesson_sections + "\n\n## General Course Summary\n" + response.choices[0].message.content
    

In [None]:
#Summarize all the lessons in microsoft quantum computer training

def summarize_training_ollama(path_url, language="Spanish", length="short"):
    """
    Summarize every lesson of a training path with the Ollama-backed client, then the path as a whole.

    Args:
        path_url: URL of the training path page whose links are filtered by get_links_ollama.
        language: language for all summaries.
        length: level of detail for all summaries.

    Returns:
        Markdown text: one "### <url>" section per lesson followed by a
        "## General Course Summary" section, or a one-line notice when no lesson
        could be summarized.
    """
    data = get_links_ollama(path_url)  # decoded JSON reply from the model
    # The link list comes from a model, so its shape is validated before it is trusted.
    links = data.get("links") if isinstance(data, dict) else None
    if not isinstance(links, list):
        links = []
    print(f"Found {len(links)} lessons")

    all_summaries = []

    for link in links:
        url = link.get("url") if isinstance(link, dict) else None
        if not url:
            print(f"Skipping malformed link entry: {link!r}")
            continue
        print(f"Summarizing {url}...")
        try:
            summary = summarize_ollama(url, language, length)
        except requests.RequestException as error:
            # A relative or unreachable URL should not abort the remaining lessons.
            print(f"Skipping {url}: {error}")
            continue
        all_summaries.append(f"### {url}\n{summary}\n")

    if not all_summaries:
        # Without lesson summaries the model would have nothing to base a course summary on.
        return f"No lessons could be summarized for {path_url}."

    lesson_sections = "\n".join(all_summaries)
    combined_prompt = "Here are summaries of each lesson:\n\n" + lesson_sections
    response = ollama_via_openai.chat.completions.create(
        model= MODEL_LLAMA,
        messages=[
            {"role": "system", "content": build_system_prompt(language, length)},
            {"role": "user", "content": "Please summarize the entire training path based on these lesson summaries:\n\n" + combined_prompt}
        ]
    )

    return lesson_sections + "\n\n## General Course Summary\n" + response.choices[0].message.content
    

In [None]:
# Summarize the learning-path landing page itself with the default arguments (Spanish, short).
summarize("https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/")

In [None]:
# Run the full per-lesson pipeline with the OpenAI client; the returned markdown text is shown as the raw cell output.
summarize_training("https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/")

In [None]:
# Run the full per-lesson pipeline with the Ollama-backed client; the returned markdown text is shown as the raw cell output.
summarize_training_ollama("https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/")

In [None]:
# A function to display this nicely in the Jupyter output, using markdown

def display_summary(url, language="Spanish", length="short"):
    """
    Render the OpenAI-based training summary for `url` as Markdown in the notebook.

    Args:
        url: training path URL passed to summarize_training.
        language: language of the summaries; forwarded to summarize_training.
        length: level of detail; forwarded to summarize_training.
    """
    summary = summarize_training(url, language, length)
    display(Markdown(summary))

In [None]:
# A function to display this nicely in the Jupyter output, using markdown

def display_summary_ollama(url, language="Spanish", length="short"):
    """
    Render the Ollama-based training summary for `url` as Markdown in the notebook.

    Args:
        url: training path URL passed to summarize_training_ollama.
        language: language of the summaries; forwarded to summarize_training_ollama.
        length: level of detail; forwarded to summarize_training_ollama.
    """
    summary = summarize_training_ollama(url, language, length)
    display(Markdown(summary))

In [None]:
# Render the OpenAI-based summary as Markdown. This calls summarize_training again, so every lesson is re-fetched and re-summarized.
display_summary("https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/")

In [None]:
# Render the Ollama-based summary as Markdown. This calls summarize_training_ollama again, so every lesson is re-fetched and re-summarized.
display_summary_ollama("https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/")