# Ollama Testing Notebook

This notebook demonstrates various methods of interacting with Ollama, including:
- Different invocation methods (native Python, LangChain, raw REST)
- Streaming vs non-streaming responses
- Metrics collection (token counts, response times, durations)
- Model management and listing
- Performance comparisons

**Prerequisites:**
- Ollama running locally on `http://localhost:11434`
- Model `gemma3n:e4b` pulled and available
- Python packages: `ollama`, `langchain-community`, `langchain-core`, `pandas`


In [4]:
# Check Python environment and version
import sys
import os
from pathlib import Path

print("🐍 Python Environment Information:")
print(f"   Python Version: {sys.version}")
print(f"   Python Executable: {sys.executable}")
print(f"   Python Path: {sys.path[0]}")

# Check if we're in a virtual environment
if hasattr(sys, 'real_prefix') or (hasattr(sys, 'base_prefix') and sys.base_prefix != sys.prefix):
    print(f"   Virtual Environment: {sys.prefix}")
    venv_name = Path(sys.prefix).name
    print(f"   Environment Name: {venv_name}")
else:
    print("   Virtual Environment: Not detected (using system Python)")

# Check current working directory
print(f"   Working Directory: {os.getcwd()}")

# Check if common packages are available
packages_to_check = ['ollama', 'langchain', 'langchain_community', 'pandas', 'requests']
print(f"\n📦 Package Availability Check:")
for package in packages_to_check:
    try:
        __import__(package)
        print(f"   ✅ {package}: Available")
    except ImportError:
        print(f"   ❌ {package}: Not installed")


🐍 Python Environment Information:
   Python Version: 3.13.3 (tags/v3.13.3:6280bb5, Apr  8 2025, 14:47:33) [MSC v.1943 64 bit (AMD64)]
   Python Executable: d:\Dev\studio\studio-backend\.venv\Scripts\python.exe
   Python Path: C:\Python313\python313.zip
   Virtual Environment: d:\Dev\studio\studio-backend\.venv
   Environment Name: .venv
   Working Directory: d:\Dev\studio\studio-backend\inference

📦 Package Availability Check:
   ✅ ollama: Available
   ✅ langchain: Available
   ✅ langchain_community: Available
   ✅ pandas: Available
   ✅ requests: Available


In [5]:
# Configuration and Environment Setup
import time
import json
import os
import sys
import requests
from pprint import pprint
import pandas as pd

# Configuration
BASE_URL = os.getenv("OLLAMA_HOST", "http://localhost:11434")
MODEL_NAME = os.getenv("OLLAMA_MODEL", "gemma3n:e4b")

print(f"🔧 Configuration:")
print(f"   Ollama Host: {BASE_URL}")
print(f"   Model: {MODEL_NAME}")

# Helper function to display metrics in a readable format
def show_metrics(response_data: dict, title: str = "Metrics"):
    """Extract and display timing/token metrics from Ollama response"""
    metrics = {}
    
    # Extract timing metrics (convert from nanoseconds to milliseconds)
    for key in ["total_duration", "load_duration", "prompt_eval_duration", "eval_duration"]:
        if key in response_data:
            metrics[key.replace("_duration", "_ms")] = round(response_data[key] / 1e6, 2)
    
    # Extract token counts
    for key in ["prompt_eval_count", "eval_count"]:
        if key in response_data:
            metrics[key] = response_data[key]
    
    # Calculate tokens per second if we have the data
    if "eval_count" in response_data and "eval_duration" in response_data and response_data["eval_duration"] > 0:
        tokens_per_sec = response_data["eval_count"] / (response_data["eval_duration"] / 1e9)
        metrics["tokens_per_second"] = round(tokens_per_sec, 2)
    
    print(f"\n⏱️  {title}")
    for key, value in metrics.items():
        print(f"   {key}: {value}")
    
    return metrics

# Test connection to Ollama
try:
    response = requests.get(f"{BASE_URL}/api/version", timeout=5)
    print(f"\n✅ Ollama connection successful")
    print(f"   Version: {response.json().get('version', 'unknown')}")
except Exception as e:
    print(f"\n❌ Failed to connect to Ollama: {e}")
    print("   Make sure Ollama is running on http://localhost:11434")


🔧 Configuration:
   Ollama Host: http://localhost:11434
   Model: gemma3n:e4b

✅ Ollama connection successful
   Version: 0.9.3


### 1. Model Management and Listing


In [6]:
# List all available models using ollama Python client
import ollama

print("📋 Available Models (using ollama client):")
try:
    models = ollama.list()
    #print(models)
    if models and hasattr(models, 'models'):
        for i, model in enumerate(models.models, 1):
            print(f"   {i}. {model.model}")  # Use .model instead of ['name']
            print(f"      Size: {model.size} bytes")  # Use .size instead of .get('size')
            print(f"      Modified: {model.modified_at}")  # Use .modified_at instead of .get()
            if hasattr(model, 'details'):
                details = model.details
                print(f"      Family: {details.family}")
                print(f"      Format: {details.format}")
                print(f"      Parameter Size: {details.parameter_size}")
                print(f"      Quantization: {details.quantization_level}")
            print()
    # else:
    #     print("   No models found")
except Exception as e:
    print(f"   Error listing models: {e}")

📋 Available Models (using ollama client):
   1. gemma3:4b
      Size: 3338801804 bytes
      Modified: 2025-07-02 21:33:20.686480+05:00
      Family: gemma3
      Format: gguf
      Parameter Size: 4.3B
      Quantization: Q4_K_M

   2. deepscaler:latest
      Size: 3560419491 bytes
      Modified: 2025-07-02 21:30:07.716100+05:00
      Family: qwen2
      Format: gguf
      Parameter Size: 1.8B
      Quantization: F16

   3. gemma3n:e4b
      Size: 7547589116 bytes
      Modified: 2025-07-02 17:12:43.919932+05:00
      Family: gemma3n
      Format: gguf
      Parameter Size: 6.9B
      Quantization: Q4_K_M



In [7]:
MODEL_NAME = "deepscaler"

# Get detailed information about our specific model
print(f"🔍 Model Details for {MODEL_NAME}:")
try:
    model_info = ollama.show(MODEL_NAME)
    
    print(f"   Model: {model_info.get('modelfile', 'N/A')}")
    print(f"   License: {model_info.get('license', 'N/A')}")
    
    if 'details' in model_info:
        details = model_info['details']
        print(f"   Family: {details.get('family', 'N/A')}")
        print(f"   Format: {details.get('format', 'N/A')}")
        print(f"   Parameters: {details.get('parameter_size', 'N/A')}")
        print(f"   Quantization: {details.get('quantization_level', 'N/A')}")
    
    # Show model file contents (truncated)
    if 'modelfile' in model_info:
        modelfile = model_info['modelfile']
        lines = modelfile.split('\n')[:10]  # First 10 lines
        print(f"\n   Modelfile (first 10 lines):")
        for line in lines:
            if line.strip():
                print(f"      {line}")
        if len(modelfile.split('\n')) > 10:
            print(f"      ... ({len(modelfile.split('\n')) - 10} more lines)")
            
except Exception as e:
    print(f"   Error getting model info: {e}")


🔍 Model Details for deepscaler:
   Model: # Modelfile generated by "ollama show"
# To build a new Modelfile based on this, replace FROM with:
# FROM deepscaler:latest

FROM C:\Users\Jawad\.ollama\models\blobs\sha256-95ff0bccfe6096c58d176bcbe8d0c87ccc4b517c0eade8acaa0797a9e441122e
TEMPLATE """{{- if .System }}{{ .System }}{{ end }}
{{- range $i, $_ := .Messages }}
{{- $last := eq (len (slice $.Messages $i)) 1}}
{{- if eq .Role "user" }}<｜User｜>{{ .Content }}
{{- else if eq .Role "assistant" }}<｜Assistant｜>{{ .Content }}{{- if not $last }}<｜end▁of▁sentence｜>{{- end }}
{{- end }}
{{- if and $last (ne .Role "assistant") }}<｜Assistant｜>{{- end }}
{{- end }}"""
PARAMETER stop <｜begin▁of▁sentence｜>
PARAMETER stop <｜end▁of▁sentence｜>
PARAMETER stop <｜User｜>
PARAMETER stop <｜Assistant｜>
PARAMETER temperature 0.6
PARAMETER top_p 0.95
LICENSE """MIT License

Copyright (c) 2025 Agentica

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated do

### 2. Method 1: Native Ollama Python Client (Non-Streaming)


In [8]:
# Non-streaming generation using ollama.generate()
prompt = "Explain quantum entanglement in exactly 2 sentences."

print("🔄 Non-Streaming Generation (ollama.generate)")
print(f"Prompt: {prompt}")
print("\nResponse:")

start_time = time.time()
try:
    response = ollama.generate(model=MODEL_NAME, prompt=prompt)
    wall_time = time.time() - start_time
    
    print(f"{response['response']}")
    print(f"\n🕒 Wall time: {wall_time:.2f}s")
    
    # Show detailed metrics
    show_metrics(response, "Non-Streaming Metrics")
    
    # Store response for later comparison
    non_streaming_response = response
    
except Exception as e:
    print(f"Error: {e}")


🔄 Non-Streaming Generation (ollama.generate)
Prompt: Explain quantum entanglement in exactly 2 sentences.

Response:
<think>
Okay, so I need to explain quantum entanglement in exactly two sentences. Let me start by recalling what I know about it. Quantum entanglement is a phenomenon where particles become interconnected such that the state of one particle instantly influences the state of another, no matter how far apart they are.

First sentence: Maybe talk about pairs of particles being connected and how their states affect each other immediately. Use terms like "spooky action at a distance" to illustrate the mystery or the non-local nature.

Second sentence: Need to elaborate on what happens when you measure one particle's state, it affects the other instantly. Emphasize that this is not classical physics but instead relies on quantum mechanics principles like superposition and entanglement.

I should make sure both sentences are concise and cover the key aspects without being too t

### 3. Method 2: Native Ollama Python Client (Streaming)


In [9]:
# Streaming generation using ollama.generate()
prompt = "List the first 8 prime numbers and explain why 1 is not considered prime."

print("🌊 Streaming Generation (ollama.generate)")
print(f"Prompt: {prompt}")
print("\nResponse:")

start_time = time.time()
token_times = []
first_token_time = None
final_response = None

try:
    stream = ollama.generate(model=MODEL_NAME, prompt=prompt, stream=True)
    
    for chunk in stream:
        current_time = time.time()
        
        # Track first token latency
        if first_token_time is None:
            first_token_time = current_time - start_time
        
        # Track inter-token latency
        if len(token_times) > 0:
            inter_token_latency = current_time - token_times[-1]
            token_times.append(current_time)
        else:
            token_times.append(current_time)
        
        # Print the token
        print(chunk['response'], end='', flush=True)
        
        # Keep reference to final chunk for metrics
        final_response = chunk
    
    wall_time = time.time() - start_time
    
    print(f"\n\n🕒 Wall time: {wall_time:.2f}s")
    print(f"🚀 First token latency: {first_token_time:.3f}s")
    
    # Calculate average inter-token latency
    if len(token_times) > 1:
        inter_token_latencies = [token_times[i] - token_times[i-1] for i in range(1, len(token_times))]
        avg_inter_token = sum(inter_token_latencies) / len(inter_token_latencies)
        print(f"⚡ Average inter-token latency: {avg_inter_token:.3f}s")
    
    # Show detailed metrics
    if final_response:
        show_metrics(final_response, "Streaming Metrics")
        streaming_response = final_response
        
except Exception as e:
    print(f"Error: {e}")


🌊 Streaming Generation (ollama.generate)
Prompt: List the first 8 prime numbers and explain why 1 is not considered prime.

Response:
<think>
Okay, so I need to list the first 8 prime numbers and explain why 1 isn't considered a prime number. Hmm, let's start by recalling what prime numbers are. Prime numbers are natural numbers greater than 1 that have no positive divisors other than 1 and themselves. So they can't be divided evenly by any other numbers except 1 and themselves.

Alright, so the first step is to list out the primes starting from the smallest. Let's begin with the number 2 because it's the smallest prime number. Is 2 a prime? Yes, because its only divisors are 1 and 2. So that's our first prime.

Next would be 3. Checking if it's divisible by anything other than 1 and 3. The numbers to check are 2 (since we've already considered 1). Wait, no—actually, since the next number after 2 is 3, let me think again. After 2 comes 3. To check if 3 is prime, we need to see if it ha

### 4. Method 3: Chat API (Streaming)

In [10]:
# Chat API with streaming
messages = [
    {"role": "system", "content": "You are a helpful assistant that provides concise, accurate answers."},
    {"role": "user", "content": "What are the key differences between Python lists and tuples? Give 3 main points."}
]

print("💬 Chat API Streaming (ollama.chat)")
print("Messages:")
for msg in messages:
    print(f"  {msg['role']}: {msg['content']}")
print("\nResponse:")

start_time = time.time()
final_chunk = None

try:
    stream = ollama.chat(model=MODEL_NAME, messages=messages, stream=True)
    
    for chunk in stream:
        # Print the content from the message
        content = chunk['message']['content']
        print(content, end='', flush=True)
        final_chunk = chunk
    
    wall_time = time.time() - start_time
    
    print(f"\n\n🕒 Wall time: {wall_time:.2f}s")
    
    # Show detailed metrics
    if final_chunk:
        show_metrics(final_chunk, "Chat API Streaming Metrics")
        chat_response = final_chunk
        
except Exception as e:
    print(f"Error: {e}")


💬 Chat API Streaming (ollama.chat)
Messages:
  system: You are a helpful assistant that provides concise, accurate answers.
  user: What are the key differences between Python lists and tuples? Give 3 main points.

Response:
<think>
Okay, I need to explain the key differences between Python lists and tuples. The user wants three main points.

First, maybe list is ordered but can have duplicates because lists allow item insertion anywhere. Wait, no, actually lists don't support duplicate values unless you add them again, which isn't possible directly. Hmm, perhaps that's not a good point.

Wait, the user asked for differences between lists and tuples. Tuples are immutable. So maybe the first point is about mutability. That makes sense because tuples can't be changed after creation.

Second, another difference might be elements in a list have pointers to their positions, but since they're mutable, you can change values without issue. Wait, no, that's not right. Actually, lists are dynami

### 5. Method 4: Raw REST API


In [11]:
# Raw REST API streaming
payload = {
    "model": MODEL_NAME,
    "prompt": "Explain the concept of recursion in programming with a simple example.",
    "stream": True
}

print("🔗 Raw REST API Streaming")
print(f"Endpoint: {BASE_URL}/api/generate")
print(f"Payload: {json.dumps(payload, indent=2)}")
print("\nResponse:")

start_time = time.time()
final_data = None

try:
    with requests.post(f"{BASE_URL}/api/generate", json=payload, stream=True) as response:
        response.raise_for_status()
        
        for line in response.iter_lines():
            if not line:  # Skip empty lines (keep-alive)
                continue
                
            try:
                data = json.loads(line)
                token = data.get('response', '')
                print(token, end='', flush=True)
                
                if data.get('done', False):
                    final_data = data
                    break
                    
            except json.JSONDecodeError:
                continue
    
    wall_time = time.time() - start_time
    
    print(f"\n\n🕒 Wall time: {wall_time:.2f}s")
    
    # Show detailed metrics
    if final_data:
        show_metrics(final_data, "Raw REST API Metrics")
        rest_response = final_data
        
except Exception as e:
    print(f"Error: {e}")


🔗 Raw REST API Streaming
Endpoint: http://localhost:11434/api/generate
Payload: {
  "model": "deepscaler",
  "prompt": "Explain the concept of recursion in programming with a simple example.",
  "stream": true
}

Response:
<think>
Okay, so I need to explain recursion in programming with a simple example. Hmm, let's see. Recursion is when a function calls itself repeatedly until a base case is reached. That makes sense.

I remember that with math problems like calculating factorials or Fibonacci numbers, recursion is often used because it breaks down the problem into smaller parts which are easier to solve.

Let me think of an example. Maybe the factorial function. The factorial of a number n is n multiplied by (n-1) factorial until we reach 1. So for example, 5! = 5*4!, and so on.

I should write down how this works step by step. Start with a function that takes n as an argument. If n is 0 or 1, return 1 because that's the base case. Otherwise, multiply n by factorial(n-1). That way, e

In [None]:
# LangChain integration with streaming
from langchain_community.llms import Ollama
from langchain_core.callbacks import StreamingStdOutCallbackHandler

print("🦜 LangChain Integration (Synchronous Streaming)")

# Create LangChain Ollama instance
llm = Ollama(
    model=MODEL_NAME,
    base_url=BASE_URL,
    temperature=0.7
)

prompt = "Describe the water cycle in nature using exactly 4 steps."
print(f"Prompt: {prompt}")
print("\nResponse:")

start_time = time.time()

try:
    # Streaming with LangChain
    response_chunks = []
    for chunk in llm.stream(prompt):
        print(chunk, end='', flush=True)
        response_chunks.append(chunk)
    
    wall_time = time.time() - start_time
    full_response = ''.join(response_chunks)
    
    print(f"\n\n🕒 Wall time: {wall_time:.2f}s")
    print(f"📝 Response length: {len(full_response)} characters")
    print(f"🧮 Estimated tokens: ~{len(full_response.split())} words")
    
except Exception as e:
    print(f"Error: {e}")


In [None]:
# LangChain Async streaming
import asyncio

async def async_langchain_streaming():
    print("🦜🚀 LangChain Integration (Asynchronous Streaming)")
    
    prompt = "What are the main components of a computer? List 5 key parts."
    print(f"Prompt: {prompt}")
    print("\nResponse:")
    
    start_time = time.time()
    
    try:
        response_chunks = []
        async for chunk in llm.astream(prompt):
            print(chunk, end='', flush=True)
            response_chunks.append(chunk)
        
        wall_time = time.time() - start_time
        full_response = ''.join(response_chunks)
        
        print(f"\n\n🕒 Wall time: {wall_time:.2f}s")
        print(f"📝 Response length: {len(full_response)} characters")
        print(f"🧮 Estimated tokens: ~{len(full_response.split())} words")
        
    except Exception as e:
        print(f"Error: {e}")

# Run the async function
await async_langchain_streaming()


In [None]:
# Test embeddings (if the model supports them)
print("🔢 Testing Embeddings")

# Note: Not all models support embeddings. We'll try with the current model first,
# then fall back to a dedicated embedding model if available

test_texts = [
    "The quick brown fox jumps over the lazy dog.",
    "Machine learning is a subset of artificial intelligence.",
    "Python is a popular programming language."
]

print("Test texts:")
for i, text in enumerate(test_texts, 1):
    print(f"  {i}. {text}")

try:
    print(f"\nTrying embeddings with {MODEL_NAME}...")
    start_time = time.time()
    
    # Try single embedding first
    embedding_result = ollama.embed(model=MODEL_NAME, input=test_texts[0])
    
    if 'embeddings' in embedding_result:
        embedding = embedding_result['embeddings'][0]
        print(f"✅ Single embedding successful!")
        print(f"   Embedding dimension: {len(embedding)}")
        print(f"   First 10 values: {embedding[:10]}")
        
        # Try batch embeddings
        batch_result = ollama.embed(model=MODEL_NAME, input=test_texts)
        batch_time = time.time() - start_time
        
        print(f"\n✅ Batch embeddings successful!")
        print(f"   Number of embeddings: {len(batch_result['embeddings'])}")
        print(f"   Time taken: {batch_time:.3f}s")
        
        # Calculate similarities (dot product)
        import numpy as np
        embeddings = np.array(batch_result['embeddings'])
        
        print(f"\n📊 Similarity matrix (dot product):")
        for i in range(len(embeddings)):
            for j in range(len(embeddings)):
                similarity = np.dot(embeddings[i], embeddings[j])
                print(f"  Text {i+1} ↔ Text {j+1}: {similarity:.4f}")
    else:
        print(f"❌ No embeddings in response")
        
except Exception as e:
    print(f"❌ Embeddings failed with {MODEL_NAME}: {e}")
    
    # Try with a common embedding model if available
    embedding_models = ["nomic-embed-text", "all-minilm", "mxbai-embed-large"]
    
    for embed_model in embedding_models:
        try:
            print(f"\nTrying with {embed_model}...")
            test_result = ollama.embed(model=embed_model, input=test_texts[0])
            print(f"✅ {embed_model} works! Use this model for embeddings.")
            break
        except Exception as embed_e:
            print(f"❌ {embed_model} not available: {embed_e}")
    else:
        print("\n💡 To test embeddings, pull an embedding model like:")
        print("   ollama pull nomic-embed-text")


In [None]:
# Performance benchmarking across different methods
import time
import statistics

def benchmark_method(method_name, method_func, iterations=3):
    """Benchmark a specific method multiple times"""
    print(f"\n🏁 Benchmarking {method_name} ({iterations} iterations)")
    
    times = []
    token_counts = []
    
    for i in range(iterations):
        print(f"  Run {i+1}/{iterations}...", end=" ")
        
        start_time = time.time()
        try:
            result = method_func()
            end_time = time.time()
            
            duration = end_time - start_time
            times.append(duration)
            
            # Extract token count if available
            if isinstance(result, dict):
                tokens = result.get('eval_count', result.get('completion_tokens', 0))
                token_counts.append(tokens)
            
            print(f"{duration:.2f}s ({tokens if 'tokens' in locals() else '?'} tokens)")
            
        except Exception as e:
            print(f"Error: {e}")
            continue
    
    if times:
        avg_time = statistics.mean(times)
        std_time = statistics.stdev(times) if len(times) > 1 else 0
        min_time = min(times)
        max_time = max(times)
        
        print(f"  📊 Results:")
        print(f"     Average: {avg_time:.3f}s ± {std_time:.3f}s")
        print(f"     Range: {min_time:.3f}s - {max_time:.3f}s")
        
        if token_counts:
            avg_tokens = statistics.mean(token_counts)
            avg_tokens_per_sec = avg_tokens / avg_time if avg_time > 0 else 0
            print(f"     Avg tokens: {avg_tokens:.1f}")
            print(f"     Tokens/sec: {avg_tokens_per_sec:.1f}")
        
        return {
            'method': method_name,
            'avg_time': avg_time,
            'std_time': std_time,
            'min_time': min_time,
            'max_time': max_time,
            'avg_tokens': statistics.mean(token_counts) if token_counts else None,
            'tokens_per_sec': avg_tokens_per_sec if token_counts else None
        }
    
    return None

# Define test prompt
test_prompt = "Explain the difference between RAM and storage in computers. Be concise."

# Define benchmark methods
def bench_generate_nonstreaming():
    return ollama.generate(model=MODEL_NAME, prompt=test_prompt)

def bench_generate_streaming():
    final_chunk = None
    for chunk in ollama.generate(model=MODEL_NAME, prompt=test_prompt, stream=True):
        final_chunk = chunk
    return final_chunk

def bench_chat_streaming():
    messages = [{"role": "user", "content": test_prompt}]
    final_chunk = None
    for chunk in ollama.chat(model=MODEL_NAME, messages=messages, stream=True):
        final_chunk = chunk
    return final_chunk

def bench_rest_api():
    payload = {"model": MODEL_NAME, "prompt": test_prompt, "stream": False}
    response = requests.post(f"{BASE_URL}/api/generate", json=payload)
    return response.json()

# Run benchmarks
print(f"🚀 Performance Benchmarking")
print(f"Model: {MODEL_NAME}")
print(f"Test prompt: {test_prompt}")

benchmarks = []

# Benchmark each method
methods = [
    ("Generate (Non-streaming)", bench_generate_nonstreaming),
    ("Generate (Streaming)", bench_generate_streaming), 
    ("Chat (Streaming)", bench_chat_streaming),
    ("REST API (Non-streaming)", bench_rest_api)
]

for method_name, method_func in methods:
    result = benchmark_method(method_name, method_func, iterations=3)
    if result:
        benchmarks.append(result)

# Create comparison DataFrame
if benchmarks:
    print(f"\n📈 Performance Comparison Summary")
    df = pd.DataFrame(benchmarks)
    df = df.round(3)
    print(df.to_string(index=False))
    
    # Find fastest method
    fastest = df.loc[df['avg_time'].idxmin()]
    print(f"\n🏆 Fastest method: {fastest['method']} ({fastest['avg_time']:.3f}s avg)")
    
    if 'tokens_per_sec' in df.columns and df['tokens_per_sec'].notna().any():
        highest_throughput = df.loc[df['tokens_per_sec'].idxmax()]
        print(f"⚡ Highest throughput: {highest_throughput['method']} ({highest_throughput['tokens_per_sec']:.1f} tokens/sec)")


In [None]:
# Test advanced features like custom options, context, etc.
print("🧪 Advanced Features Testing")

# Test 1: Custom generation options
print("\n1️⃣ Custom Generation Options")
custom_options = {
    "temperature": 0.1,  # More deterministic
    "top_p": 0.9,
    "top_k": 40,
    "repeat_penalty": 1.1,
    "num_predict": 50  # Limit response length
}

prompt = "Write a creative story opening."
print(f"Prompt: {prompt}")
print(f"Options: {custom_options}")
print("\nResponse:")

try:
    response = ollama.generate(
        model=MODEL_NAME, 
        prompt=prompt,
        options=custom_options
    )
    print(response['response'])
    show_metrics(response, "Custom Options Metrics")
    
except Exception as e:
    print(f"Error: {e}")

# Test 2: Context/conversation memory
print("\n\n2️⃣ Context and Conversation Memory")
print("Testing conversation continuity...")

# First message
context = None
messages = []

try:
    # First turn
    print("\nTurn 1:")
    user_msg1 = "My name is Alice and I love astronomy."
    print(f"User: {user_msg1}")
    
    response1 = ollama.generate(model=MODEL_NAME, prompt=user_msg1)
    context = response1.get('context')  # Save context for next turn
    
    print(f"Assistant: {response1['response']}")
    
    # Second turn using context
    print("\nTurn 2:")
    user_msg2 = "What do you remember about me?"
    print(f"User: {user_msg2}")
    
    response2 = ollama.generate(
        model=MODEL_NAME, 
        prompt=user_msg2,
        context=context  # Use saved context
    )
    
    print(f"Assistant: {response2['response']}")
    
    print(f"\n💾 Context size: {len(context) if context else 0} tokens")
    
except Exception as e:
    print(f"Error in context test: {e}")

# Test 3: System prompt
print("\n\n3️⃣ System Prompt Testing")
system_prompt = "You are a helpful assistant who always responds in exactly one sentence and ends with an emoji."
user_prompt = "Explain photosynthesis."

print(f"System: {system_prompt}")
print(f"User: {user_prompt}")
print("\nResponse:")

try:
    # Using chat API with system message
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
    
    response = ollama.chat(model=MODEL_NAME, messages=messages)
    print(response['message']['content'])
    show_metrics(response, "System Prompt Metrics")
    
except Exception as e:
    print(f"Error: {e}")

# Test 4: Raw mode (if supported)
print("\n\n4️⃣ Raw Mode Testing")
print("Testing raw mode (bypassing template)...")

try:
    # Raw mode bypasses the model's chat template
    raw_response = ollama.generate(
        model=MODEL_NAME,
        prompt="<|user|>What is 2+2?<|assistant|>",
        raw=True
    )
    print(f"Raw response: {raw_response['response']}")
    
except Exception as e:
    print(f"Raw mode not supported or error: {e}")

print("\n✅ Advanced features testing complete!")


In [None]:
# Summary of all testing results and recommendations
print("📋 OLLAMA TESTING SUMMARY")
print("=" * 50)

print(f"\n🔧 Configuration Used:")
print(f"   Model: {MODEL_NAME}")
print(f"   Ollama Host: {BASE_URL}")

print(f"\n📊 Methods Tested:")
methods_tested = [
    "✅ Native Ollama Python Client (Non-streaming)",
    "✅ Native Ollama Python Client (Streaming)", 
    "✅ Chat API (Streaming)",
    "✅ Raw REST API (Streaming)",
    "✅ LangChain Integration (Sync & Async)",
    "✅ Embeddings (if supported)",
    "✅ Performance Benchmarking",
    "✅ Advanced Features (Custom options, Context, System prompts)"
]

for method in methods_tested:
    print(f"   {method}")

print(f"\n💡 Key Findings:")
print(f"   • Streaming provides real-time token delivery")
print(f"   • All methods provide detailed metrics (token counts, timing)")
print(f"   • LangChain offers higher-level abstractions")
print(f"   • Raw REST API provides maximum control")
print(f"   • Context/memory enables multi-turn conversations")
print(f"   • Custom options allow fine-tuning model behavior")

print(f"\n🚀 Recommendations for Your App:")
print(f"   1. Use STREAMING for better user experience")
print(f"   2. Native ollama client for simplicity")
print(f"   3. LangChain for complex pipelines")
print(f"   4. Monitor tokens/sec for performance")
print(f"   5. Implement proper error handling")
print(f"   6. Cache context for conversations")

print(f"\n📈 Metrics to Track:")
metrics_to_track = [
    "• prompt_eval_count (input tokens)",
    "• eval_count (output tokens)", 
    "• total_duration (total time)",
    "• eval_duration (generation time)",
    "• tokens_per_second (throughput)",
    "• first_token_latency (responsiveness)",
    "• inter_token_latency (smoothness)"
]

for metric in metrics_to_track:
    print(f"   {metric}")

print(f"\n🛠️ Next Steps:")
next_steps = [
    "1. Integrate streaming into your studio app",
    "2. Add token counting for billing/limits",
    "3. Implement conversation memory",
    "4. Add performance monitoring",
    "5. Test with different models",
    "6. Add error handling and retries"
]

for step in next_steps:
    print(f"   {step}")

print(f"\n✨ All tests completed successfully!")
print("=" * 50)
