# OpenAI vs Ollama Streaming

This notebook demonstrates how to stream responses from large language models using both **OpenAI’s API** and **Ollama (local models)**.  
It highlights the differences between cloud-based and local inference, showing how streaming delivers tokens in real time instead of waiting for a full response.  


In [None]:
# imports
import os
import requests
from IPython.display import Markdown, display, update_display
from dotenv import load_dotenv
from openai import OpenAI

In [None]:
# Ollama native REST endpoint and the headers used when calling it.
# The model names (MODEL_GPT / MODEL_LLAMA) are defined once, in the
# constants cell that follows, so they cannot drift apart.
OLLAMA_API = "http://localhost:11434/api/chat"
HEADERS = {"Content-Type": "application/json"}


In [None]:
# Model identifiers used by the two streaming demos in this notebook.
MODEL_GPT, MODEL_LLAMA = "gpt-4o-mini", "llama3.2"

In [None]:
# load ollama
!ollama pull llama3.2

In [None]:
# Check that the local Ollama server answers before trying to stream from it.
# (The previous `response = request` referenced an undefined name and raised
# NameError on a fresh kernel.)
# /api/tags is Ollama's "list local models" endpoint; it is derived from
# OLLAMA_API so the host and port stay defined in a single place.
try:
    response = requests.get(OLLAMA_API.replace("/api/chat", "/api/tags"), timeout=5)
    response.raise_for_status()
    print("Ollama server is reachable")
except requests.RequestException as exc:
    # Keep `response` defined so later cells can test it, and report the
    # problem instead of aborting the whole notebook run.
    response = None
    print(f"Could not reach the Ollama server: {exc}")

In [None]:
# Load the environment and read the OpenAI key from it
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Light sanity check only: key is set, has the expected prefix, and is longer than 10 characters
key_looks_valid = bool(api_key) and api_key.startswith('sk-proj-') and len(api_key) > 10
print("API key looks good so far" if key_looks_valid else "There might be a problem with your API key?")

# Client used by the GPT streaming cell
openai = OpenAI()

In [None]:
# Second OpenAI client, pointed at the local Ollama server's /v1 endpoint
ollama_via_openai = OpenAI(
    api_key='ollama',
    base_url='http://localhost:11434/v1',
)



In [None]:
# System prompt shared by both models.
# Built from adjacent string literals with explicit trailing spaces: the
# earlier backslash-continued literal glued "step by step," to "using"
# with no space in between.
system_prompt = (
    "You are an expert in quantum computing, you explain topics step by step, "
    "using simple language, analogies and concise explanations that even a beginner "
    "can understand."
)

In [None]:
# The user question sent to both models; edit the text to ask something else.
# The leading and trailing newline are kept so the prompt text is unchanged.
question = "\nPlease explain quantum entanglement in simple terms\n"

In [None]:
# Chat history sent to both models: the system instructions, then the user question
messages = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=question),
]
    

In [None]:
# Get the GPT model to answer, with streaming
def stream_answer():
    """Stream the MODEL_GPT reply to `messages` and render it live as Markdown.

    Reads the module-level `openai` client, `MODEL_GPT` and `messages`.
    The output is shown through an IPython display handle that is updated
    as each chunk arrives; nothing is returned.
    """
    stream = openai.chat.completions.create(
        model=MODEL_GPT,
        messages=messages,
        stream=True
    )
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        # A streamed chunk can carry an empty `choices` list (for example a
        # usage-only chunk); skip it instead of failing with IndexError.
        if not chunk.choices:
            continue
        # `delta.content` may be None (e.g. on the final chunk), hence `or ''`.
        response += chunk.choices[0].delta.content or ''
        # Clean the whole accumulated text rather than just the new piece, so
        # a fence or word split across two chunks is still removed.
        # NOTE(review): this deletes every literal "markdown" and every code
        # fence in the answer, not only a wrapping one - confirm that is intended.
        response = response.replace("```","").replace("markdown", "")
        update_display(Markdown(response), display_id=display_handle.display_id)

In [None]:
# Run the GPT streaming demo defined in the previous cell
stream_answer()

In [None]:
# Get the local Llama model to answer, with streaming
def stream_answer_ollama():
    """Stream the MODEL_LLAMA reply to `messages` and render it live as Markdown.

    Reads the module-level `ollama_via_openai` client, `MODEL_LLAMA` and
    `messages`. The output is shown through an IPython display handle that
    is updated as each chunk arrives; nothing is returned.
    """
    stream = ollama_via_openai.chat.completions.create(
        model=MODEL_LLAMA,
        messages=messages,
        stream=True
    )
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        # A streamed chunk can carry an empty `choices` list; skip it
        # instead of failing with IndexError.
        if not chunk.choices:
            continue
        # `delta.content` may be None, hence the `or ''` fallback.
        response += chunk.choices[0].delta.content or ''
        # Clean the whole accumulated text rather than just the new piece, so
        # a fence or word split across two chunks is still removed.
        # NOTE(review): this deletes every literal "markdown" and every code
        # fence in the answer, not only a wrapping one - confirm that is intended.
        response = response.replace("```","").replace("markdown", "")
        update_display(Markdown(response), display_id=display_handle.display_id)

In [None]:
# Run the local Ollama streaming demo defined in the previous cell
stream_answer_ollama()