# Ollama Model Performance - Sentiment Analysis

## Objective
Test and compare the performance and accuracy of configurable Ollama models for sentiment analysis using the mental health dataset.

## Checklist
- Load and validate the mental health dataset with proper handling of missing values
- Configure Ollama models, sample sizes, and test parameters
- Query Ollama models for sentiment predictions on sampled data
- Calculate accuracy, precision, recall, and F1 scores for each model
- Record results in an append-only JSON structure with timestamps
- Display server specifications and comparative performance metrics

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import json
import requests
import time
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')

print("✓ Libraries imported successfully")

✓ Libraries imported successfully


## 2. Configuration Parameters

In [None]:
# Notebook-wide settings: the later cells read their parameters from CONFIG.
CONFIG = dict(
    ollama_url="http://localhost:11434/api/chat",
    models_to_test=["gemma3:4b"],  # List of Ollama models
    dataset_path="./datasets/mental-health-dataset.csv",
    sample_size=100,  # Number of records to sample for testing
    random_seed=42,  # For reproducibility
    batch_size=10,  # Batch size for predictions
    test_entire_dataset=False,  # Set to True to test entire dataset
    results_file="results.json",  # File to store test results
    timeout_seconds=30,  # Timeout for API calls
    store_sample_predictions=True,  # Store sample predictions for debugging
    sample_predictions_count=5,  # Number of sample predictions to store
)

# Echo the active configuration, one indented "key: value" line per setting.
print("Configuration:")
print("\n".join(f"  {name}: {setting}" for name, setting in CONFIG.items()))

Configuration:
  ollama_url: http://localhost:11434/api/chat
  models_to_test: ['gemma3:4b']
  dataset_path: ../datasets/mental-health-dataset.csv
  sample_size: 100
  random_seed: 42
  batch_size: 10
  test_entire_dataset: False
  results_file: results.json
  timeout_seconds: 30
  store_sample_predictions: True
  sample_predictions_count: 5


## 3. Server Specifications

Record the hardware and system specifications used for testing.

In [3]:
import platform
import subprocess

def _run_command(args: List[str]) -> Optional[str]:
    """Run *args* without a shell and return its stripped stdout, or None if it could not be run."""
    try:
        raw = subprocess.check_output(args, stderr=subprocess.DEVNULL, timeout=10)
    except (OSError, subprocess.SubprocessError):
        # Missing binary, non-zero exit status or timeout: the value is simply unknown.
        return None
    return raw.decode(errors="replace").strip()


def get_server_specs() -> Dict[str, Any]:
    """
    Gather server specifications. Set unknown values to None.

    Detection is best-effort: every probe that fails (tool not installed,
    unexpected output, unsupported platform) leaves its field as None instead
    of raising.

    Returns:
        Dict with the keys "cpu", "ram_gb", "gpu_count", "gpu_type",
        "gpu_bus_speed_gbps" (never probed, always None) and "os".
    """
    specs = {
        "cpu": None,
        "ram_gb": None,
        "gpu_count": None,
        "gpu_type": None,
        "gpu_bus_speed_gbps": None,
        "os": platform.platform()
    }

    system = platform.system()

    if system == "Linux":
        # Match the key exactly so lines such as "BIOS Model name" cannot
        # leak into the reported CPU string.
        for line in (_run_command(["lscpu"]) or "").splitlines():
            key, _, value = line.partition(":")
            if key.strip() == "Model name" and value.strip():
                specs["cpu"] = value.strip()
                break

        # `free -g` prints total memory (GiB) as the second field of the "Mem" row.
        for line in (_run_command(["free", "-g"]) or "").splitlines():
            fields = line.split()
            if len(fields) > 1 and fields[0].startswith("Mem"):
                try:
                    specs["ram_gb"] = int(fields[1])
                except ValueError:
                    pass  # Unparseable value: leave RAM unknown.
                break
    elif system == "Darwin":  # macOS
        specs["cpu"] = _run_command(["sysctl", "-n", "machdep.cpu.brand_string"]) or None

        mem_bytes = _run_command(["sysctl", "-n", "hw.memsize"])
        if mem_bytes:
            try:
                specs["ram_gb"] = int(mem_bytes) // (1024 ** 3)
            except ValueError:
                pass  # Unparseable value: leave RAM unknown.

    # GPU info (NVIDIA only): one GPU name per output line.
    gpu_output = _run_command(
        ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"]
    )
    gpu_names = [name.strip() for name in (gpu_output or "").splitlines() if name.strip()]
    if gpu_names:
        # Only report a count when at least one name was listed; empty output
        # must not be counted as a single unnamed GPU.
        specs["gpu_count"] = len(gpu_names)
        specs["gpu_type"] = gpu_names[0]

    return specs

# Probe the host once and keep the result in a module-level constant.
SERVER_SPECS = get_server_specs()
print("Server Specifications:", json.dumps(SERVER_SPECS, indent=2), sep="\n")

Server Specifications:
{
  "cpu": "13th Gen Intel(R) Core(TM) i9-13900K",
  "ram_gb": 62,
  "gpu_count": 2,
  "gpu_type": "NVIDIA GeForce RTX 3090",
  "gpu_bus_speed_gbps": null,
  "os": "Linux-5.15.0-157-generic-x86_64-with-glibc2.35"
}


## 4. Data Loading and Validation

In [4]:
def load_and_validate_data(dataset_path: str) -> Tuple[pd.DataFrame, int]:
    """
    Read the CSV at *dataset_path* and drop rows that cannot be evaluated.

    A row is dropped when its 'posts' value is missing or whitespace-only, or
    when 'predicted' or 'intensity' is missing. The original row index is kept
    (not reset) on the surviving rows.

    Returns: (cleaned_dataframe, skipped_rows_count)
    """
    raw = pd.read_csv(dataset_path)
    rows_loaded = len(raw)
    print(f"Loaded dataset with {rows_loaded} rows")

    # Step 1: require a non-blank post text.
    with_posts = raw.dropna(subset=['posts'])
    has_text = with_posts['posts'].str.strip() != ''
    with_posts = with_posts[has_text]

    # Step 2: require both label columns.
    cleaned = with_posts.dropna(subset=['predicted', 'intensity'])
    rows_kept = len(cleaned)
    rows_dropped = rows_loaded - rows_kept

    label_counts = cleaned['predicted'].value_counts().to_dict()
    print(f"Cleaned dataset: {rows_kept} rows (skipped {rows_dropped} rows)")
    print(f"Label distribution: {label_counts}")

    return cleaned, rows_dropped

# Load and clean the dataset named in CONFIG, keeping the skipped-row count
# alongside the cleaned frame.
df_full, skipped_rows = load_and_validate_data(CONFIG["dataset_path"])
print("\n✓ Data loaded and validated")

FileNotFoundError: [Errno 2] No such file or directory: '../datasets/mental-health-dataset.csv'

## 5. Data Sampling

In [None]:
def sample_data(df: pd.DataFrame, sample_size: int, random_seed: int, test_entire: bool = False) -> Tuple[pd.DataFrame, List[int]]:
    """
    Pick the rows to evaluate.

    With test_entire=True a copy of the whole frame is returned. Otherwise a
    random sample of at most sample_size rows is drawn, seeded with
    random_seed so repeated calls select the same rows.

    Returns: (sampled_dataframe, list_of_indices), where the indices are the
    index labels of the returned frame.
    """
    if test_entire:
        chosen = df.copy()
    else:
        # Never ask for more rows than the frame holds.
        row_count = min(sample_size, len(df))
        chosen = df.sample(n=row_count, random_state=random_seed)

    chosen_labels = chosen.index.tolist()
    print(f"Sampled {len(chosen)} records for testing")
    return chosen, chosen_labels

# Draw the evaluation subset using the sampling settings from CONFIG.
df_sample, sample_indices = sample_data(
    df=df_full,
    sample_size=CONFIG["sample_size"],
    random_seed=CONFIG["random_seed"],
    test_entire=CONFIG["test_entire_dataset"],
)

# Show only the first ten index labels to keep the output short.
print(f"Sample indices: {sample_indices[:10]}...")
print("\n✓ Data sampled")

## 6. Ollama API Integration

In [None]:
# Labels the model is asked to choose from.
_SENTIMENT_LABELS = ("negative", "neutral", "positive")


def _extract_sentiment_label(reply: str) -> Optional[str]:
    """Return the sentiment label that occurs earliest in *reply* (case-insensitive), or None."""
    lowered = reply.lower()
    hits = [
        (position, label)
        for label in _SENTIMENT_LABELS
        if (position := lowered.find(label)) != -1
    ]
    # The earliest mention wins: the prompt asks for the answer as the first
    # word, so a label mentioned later ("positive, nothing negative here")
    # must not override it.
    return min(hits)[1] if hits else None


def query_ollama_sentiment(text: str, model: str, ollama_url: str, timeout: int = 30) -> Optional[str]:
    """
    Query Ollama model for sentiment analysis.

    Sends a single non-streaming chat request asking the model to classify
    *text*, then maps the reply onto one of the three labels.

    Args:
        text: The text to classify.
        model: Name of the Ollama model to query.
        ollama_url: URL of the chat endpoint the request is POSTed to.
        timeout: Request timeout in seconds.

    Returns: sentiment label (negative/neutral/positive), or None when the
    request fails or the reply contains none of the labels.
    """
    prompt = f"""Analyze the sentiment of the following text and respond with ONLY one word: 'negative', 'neutral', or 'positive'.

Text: {text}

Sentiment:"""

    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "stream": False
    }

    try:
        response = requests.post(ollama_url, json=payload, timeout=timeout)
        response.raise_for_status()
        # `or` guards against an explicit null "message"/"content" in the reply.
        message = response.json().get("message") or {}
        content = message.get("content") or ""
    except Exception as e:
        # Deliberately broad: one failed request must not abort the whole run;
        # the caller counts a None result as an error.
        print(f"Error querying model: {e}")
        return None

    if not isinstance(content, str):
        return None
    return _extract_sentiment_label(content)

print("✓ Ollama API integration ready")

## 7. Batch Prediction

In [None]:
def predict_sentiments(df: pd.DataFrame, model: str, config: Dict) -> Tuple[List[Optional[str]], int]:
    """
    Classify the 'posts' text of every row in *df* with the given model.

    Rows are queried one at a time, in frame order, using the "ollama_url"
    and "timeout_seconds" entries of *config*. A None prediction is kept in
    the output (so positions line up with the rows) and counted as an error.

    Returns: (predictions_list, error_count)
    """
    labels = []
    failures = 0

    for done, (_, record) in enumerate(df.iterrows(), start=1):
        label = query_ollama_sentiment(
            record['posts'],
            model,
            config["ollama_url"],
            config["timeout_seconds"]
        )
        labels.append(label)
        if label is None:
            failures += 1

        # Report progress after every tenth prediction.
        if done % 10 == 0:
            print(f"Progress: {done}/{len(df)} predictions completed (errors: {failures})")

    return labels, failures

print("✓ Batch prediction function ready")

## 8. Evaluation Metrics

In [None]:
def calculate_metrics(y_true: List[str], y_pred: List[Optional[str]]) -> Dict[str, float]:
    """
    Score predictions against the true labels.

    Pairs whose prediction is None are left out before scoring, so the
    metrics describe only the rows that received a prediction. Precision,
    recall and F1 are weighted averages across labels. All values are
    rounded to four decimals; if no pair has a prediction, every metric is 0.0.
    """
    kept_true = []
    kept_pred = []
    for truth, guess in zip(y_true, y_pred):
        if guess is not None:
            kept_true.append(truth)
            kept_pred.append(guess)

    if not kept_pred:
        return dict.fromkeys(("accuracy", "precision", "recall", "f1"), 0.0)

    prec, rec, f_score, _ = precision_recall_fscore_support(
        kept_true,
        kept_pred,
        average='weighted',
        zero_division=0
    )
    scores = {
        "accuracy": accuracy_score(kept_true, kept_pred),
        "precision": prec,
        "recall": rec,
        "f1": f_score,
    }
    return {metric: round(value, 4) for metric, value in scores.items()}

print("✓ Evaluation metrics function ready")

## 9. Results Storage

In [None]:
def load_results(results_file: str) -> Dict:
    """
    Return the stored results, or a fresh structure if the file is absent.

    An existing file is parsed as JSON and returned as-is. When the file does
    not exist, the returned dict has an empty "runs" list and "server_specs"
    taken from the module-level SERVER_SPECS.
    """
    location = Path(results_file)

    if not location.exists():
        return {"runs": [], "server_specs": SERVER_SPECS}

    with open(location, 'r') as handle:
        return json.load(handle)

def save_results(results: Dict, results_file: str):
    """
    Save results to JSON file.

    The structure is serialized to a string before the file is opened, so a
    serialization error (e.g. a value json cannot encode) is raised while the
    existing file is still intact instead of leaving it truncated.

    Raises:
        TypeError / ValueError: if *results* cannot be encoded as JSON.
        OSError: if the file cannot be written.
    """
    payload = json.dumps(results, indent=2)
    with open(results_file, 'w') as f:
        f.write(payload)
    print(f"Results saved to {results_file}")

def add_test_run(results: Dict, run_data: Dict):
    """
    Record one more run in *results* without touching earlier entries.

    The run is appended to results["runs"], which is then replaced by a copy
    sorted by each run's "timestamp" value, oldest first.
    """
    history = results["runs"]
    history.append(run_data)

    def recorded_at(entry):
        return entry["timestamp"]

    results["runs"] = sorted(history, key=recorded_at)

print("✓ Results storage functions ready")

## 10. Main Test Execution

In [None]:
def run_model_test(model: str, df: pd.DataFrame, indices: List[int], config: Dict) -> Dict:
    """
    Run a complete test for a single model.

    Predicts a sentiment for every row of *df*, scores the predictions against
    the 'predicted' column, and packages everything into one run record. Any
    exception raised while predicting or scoring is caught: the run is then
    recorded with zeroed metrics, every row counted as an error, and the
    exception text in "error_message".

    Args:
        model: Name of the Ollama model to test.
        df: Rows to evaluate; must provide 'posts' and 'predicted' columns.
        indices: Dataset index labels of the rows in *df*, in the same order.
        config: Settings dict; reads "random_seed", "batch_size",
            "timeout_seconds" and, optionally, "store_sample_predictions" and
            "sample_predictions_count".

    Returns: test run data dictionary

    Note: the "skipped_rows" field is read from the module-level variable
    `skipped_rows`, which must be defined before this function is called.
    """
    # Local import: `timezone` is not part of the file's top-level imports.
    from datetime import timezone

    print(f"\n{'='*60}")
    print(f"Testing model: {model}")
    print(f"{'='*60}")

    start_time = time.time()
    error_message = None

    try:
        # Get predictions
        predictions, error_count = predict_sentiments(df, model, config)

        # Calculate metrics
        y_true = df['predicted'].tolist()
        stats = calculate_metrics(y_true, predictions)

        # Keep the first few predictions in the record for debugging.
        sample_predictions = []
        if config.get("store_sample_predictions", False):
            sample_count = min(config.get("sample_predictions_count", 5), len(df))
            for position in range(sample_count):
                row = df.iloc[position]
                post = row['posts']
                # Long posts are cut to a 100-character preview.
                preview = post[:100] + "..." if len(post) > 100 else post
                sample_predictions.append({
                    "index": int(indices[position]),
                    "input": preview,
                    "true_label": row['predicted'],
                    "pred": predictions[position]
                })

    except Exception as e:
        # Record the failure as a run instead of aborting the remaining models.
        error_message = str(e)
        predictions = []
        error_count = len(df)
        stats = {"accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}
        sample_predictions = []
        print(f"ERROR: {error_message}")

    runtime = round(time.time() - start_time, 2)

    # Timezone-aware replacement for the deprecated datetime.utcnow(); the
    # "+00:00" suffix is rewritten to "Z" so the stored format is unchanged.
    timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    # Create test run data
    run_data = {
        "timestamp": timestamp,
        "model": model,
        "sample_size": len(df),
        "dataset_indices": indices,
        "config": {
            "random_seed": config["random_seed"],
            "batch_size": config["batch_size"],
            "timeout_seconds": config["timeout_seconds"]
        },
        "stats": stats,
        "runtime_sec": runtime,
        "skipped_rows": skipped_rows,
        "error_count": error_count,
        "error_message": error_message,
        "sample_predictions": sample_predictions
    }

    # Print summary
    print(f"\nTest completed in {runtime} seconds")
    print(f"Accuracy: {stats['accuracy']:.4f}")
    print(f"Precision: {stats['precision']:.4f}")
    print(f"Recall: {stats['recall']:.4f}")
    print(f"F1 Score: {stats['f1']:.4f}")
    print(f"Errors: {error_count}")

    return run_data

print("✓ Main test execution function ready")

## 11. Execute Tests for All Models

Run the tests for every configured model and append each run to the results JSON file, saving after each model.

In [None]:
# Start from the history already on disk (or a fresh structure if none exists).
results = load_results(CONFIG["results_file"])

# Save after every model so finished runs are on disk before the next one starts.
for model in CONFIG["models_to_test"]:
    run_data = run_model_test(model, df_sample, sample_indices, CONFIG)
    add_test_run(results, run_data)
    save_results(results, CONFIG["results_file"])
    print(f"\n✓ Results for {model} saved to {CONFIG['results_file']}")

print("\n" + "=" * 60, "All tests completed!", "=" * 60, sep="\n")

## 12. View Results Summary

In [None]:
# Re-read the results file so the summary reflects what is actually stored.
results = load_results(CONFIG["results_file"])

print("\nTest Results Summary")
print("=" * 80)

if not results["runs"]:
    print("No test runs found.")
else:
    # One table row per recorded run, in stored order.
    summary_data = [
        {
            "Timestamp": run["timestamp"],
            "Model": run["model"],
            "Sample Size": run["sample_size"],
            "Accuracy": run["stats"]["accuracy"],
            "Precision": run["stats"]["precision"],
            "Recall": run["stats"]["recall"],
            "F1": run["stats"]["f1"],
            "Runtime (s)": run["runtime_sec"],
            "Errors": run["error_count"],
        }
        for run in results["runs"]
    ]

    df_summary = pd.DataFrame(summary_data)
    print(df_summary.to_string(index=False))

    # Server specs as stored in the results file.
    print("\n" + "=" * 80)
    print("Server Specifications:")
    print(json.dumps(results["server_specs"], indent=2))

## 13. View Sample Predictions

Display sample predictions from the most recent test run for debugging.

In [None]:
if not results["runs"]:
    print("No test runs available.")
else:
    # Runs are stored oldest first, so the last entry is the most recent one.
    latest_run = results["runs"][-1]
    print(f"Sample predictions from: {latest_run['model']}")
    print("=" * 80)

    field_labels = (
        ("Index", "index"),
        ("Input", "input"),
        ("True Label", "true_label"),
        ("Predicted", "pred"),
    )
    for i, pred in enumerate(latest_run.get("sample_predictions", []), 1):
        print(f"\nSample {i}:")
        for caption, field in field_labels:
            print(f"  {caption}: {pred[field]}")
        verdict = '✓' if pred['true_label'] == pred['pred'] else '✗'
        print(f"  Match: {verdict}")