"""
Author: Jacob Thomas Joshy
Purpose: Complete environment configuration for fine-tuning Ollama models
         specifically for pharmaceutical Standard Operating Procedure (SOP) generation
         
This notebook provides a comprehensive setup for:
- GPU-accelerated Ollama installation and configuration  
- Pharmaceutical-specific dataset preparation utilities
- FDA compliance validation framework integration
- Model evaluation and performance metrics

Requirements:
- Google Colab Pro recommended for GPU access (T4/V100)
- Minimum 12GB RAM for model fine-tuning
- Stable internet connection for model downloads
"""

## 1. System Requirements & GPU Setup

In [1]:
# Check GPU availability and specifications
!nvidia-smi
import torch
import os
import subprocess
import json

print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU count: {torch.cuda.device_count()}")
    print(f"Current GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
else:
    print("Warning: GPU not available.")

Wed Aug 27 11:36:08 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   53C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## 2. Install Dependencies & Ollama

In [2]:
# Install Ollama and required dependencies
!curl -fsSL https://ollama.com/install.sh | sh
# Install Python dependencies for pharmaceutical data processing
!pip install -q transformers datasets accelerate bitsandbytes
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q requests beautifulsoup4 pandas numpy
!pip install -q matplotlib seaborn plotly
!pip install -q scikit-learn nltk spacy
!pip install -q jupyter-client ipywidgets

# Install pharmaceutical-specific libraries
!pip install -q biopython  # Biological data processing
!pip install -q chembl_webresource_client  # Chemical database access
!pip install -q pubchempy  # PubChem database access
print("All dependencies installed successfully!")

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00

In [3]:
# Initialize Ollama background service
import subprocess
import time
import threading

def launch_ollama():
    """Launch ollama server process in background"""
    try:
        proc = subprocess.run(['ollama', 'serve'], check=True, capture_output=True)
    except subprocess.CalledProcessError as err:
        print(f"Service launch failed: {err}")

# Create background thread for service
bg_thread = threading.Thread(target=launch_ollama, daemon=True)
bg_thread.start()

# Allow startup time
time.sleep(5)

# Check if service is responding
try:
    check_result = subprocess.run(['ollama', 'list'],
                                 capture_output=True,
                                 text=True,
                                 timeout=10)

    if check_result.returncode == 0:
        print("Ollama service is running")
        print("Current models:")
        print(check_result.stdout)
    else:
        print("Service may still be initializing")

except subprocess.TimeoutExpired:
    print("Service check timed out - continuing anyway")
except Exception as error:
    print(f"Service verification failed: {error}")

Ollama service is running
Current models:
NAME    ID    SIZE    MODIFIED 



## 3. Download Base Models for Fine-tuning

In [4]:
# Model selection and download for pharmaceutical text generation
import subprocess
import time
import os

# Available model options with specifications
available_models = [
    {
        'name': 'llama2:7b-chat',
        'description': 'Llama 2 7B for general pharmaceutical text',
        'size': '4GB approx'
    },
    {
        'name': 'llama2:13b-chat',
        'description': 'Llama 2 13B higher quality but memory intensive',
        'size': '7GB approx'
    },
    {
        'name': 'mistral:7b-instruct',
        'description': 'Mistral 7B optimized for technical documentation',
        'size': '4GB approx'
    }
]

print("Available models for pharmaceutical SOP generation:")
for idx, model_info in enumerate(available_models, 1):
    print(f"{idx}. {model_info['name']} ({model_info['size']})")
    print(f"   {model_info['description']}")
    print()

# Select model for download
target_model = 'mistral:7b-instruct'
print(f"Downloading model: {target_model}")
print("Download time: 5-10 minutes depending on connection")

# Use os.system for progress visibility in notebook environment
download_status = os.system(f'ollama pull {target_model}')

if download_status == 0:
    print(f"Download completed: {target_model}")
else:
    print(f"Download failed with exit code: {download_status}")
    print("Manual download may be required")

# Verify available models
print("\nInstalled models:")
try:
    model_list = subprocess.run(['ollama', 'list'], capture_output=True, text=True)
    print(model_list.stdout)
except Exception as err:
    print(f"Could not list models: {err}")

Available models for pharmaceutical SOP generation:
1. llama2:7b-chat (4GB approx)
   Llama 2 7B for general pharmaceutical text

2. llama2:13b-chat (7GB approx)
   Llama 2 13B higher quality but memory intensive

3. mistral:7b-instruct (4GB approx)
   Mistral 7B optimized for technical documentation

Downloading model: mistral:7b-instruct
Download time: 5-10 minutes depending on connection
Download completed: mistral:7b-instruct

Installed models:
NAME                   ID              SIZE      MODIFIED               
mistral:7b-instruct    6577803aa9a0    4.4 GB    Less than a second ago    



## FINE TUNING
Pharmaceutical Dataset Collection and Ollama Fine-tuning Script
Author: Jacob Thomas Joshy

This is my attempt at collecting real pharmaceutical datasets and fine-tuning
Ollama models for SOP generation. Had to figure out a lot of this through
trial and error since the documentation isn't always clear.

Data sources I'm pulling from:
- FDA Pharmaceuticals FAQ (found this on Hugging Face)
- FDA Drug Labels Database (using their API)
- FDA Adverse Event Reports
- FDA Orange Book drug approvals
- EPA Chemical Registry (this one was tricky to get working)
- Some SOP datasets I found on Hugging Face



In [5]:
import subprocess
import sys
import os
import json
import time
import logging
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any
import hashlib
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

print("Installing packages... this might take a while")
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q',
                       'requests', 'pandas', 'numpy', 'datasets',
                       'transformers', 'accelerate', 'peft', 'trl'])

subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q',
                       'torch>=2.0.0', 'bitsandbytes>=0.41.0'])

subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q',
                       'beautifulsoup4', 'lxml', 'openpyxl'])

print("Done installing dependencies")


Installing packages... this might take a while
Done installing dependencies


In [6]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class PharmaDataCollector:
    """
    This class handles collecting pharmaceutical data from various sources.
    I tried to make it handle rate limits and errors gracefully since APIs
    can be unreliable sometimes.
    """

    def __init__(self, output_dir="/content/pharma_datasets"):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # API endpoints - these sometimes change so might need updating
        self.fda_base_url = "https://api.fda.gov"
        self.epa_base_url = "https://comptox.epa.gov/dashboard-api"
        self.rate_limit_delay = 1.0  # being conservative here
        self.max_retries = 3

        # Set up session with retry logic - learned this the hard way
        self.session = requests.Session()
        retry_strategy = Retry(
            total=self.max_retries,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)

        # Keep track of what we've already collected
        self.data_cache = set()
        logger.info(f"Initialized data collector, saving to: {self.output_dir}")

# Initialize our collector
collector = PharmaDataCollector()


In [7]:
def get_fda_faq_data():
    """
    Getting FDA FAQ data from Hugging Face. This dataset is pretty good
    for training since it has real Q&A pairs about pharmaceuticals.
    """
    try:
        from datasets import load_dataset

        print("Loading FDA FAQ dataset from Hugging Face...")

        # Load the dataset - this might take a minute
        dataset = load_dataset("Jaymax/FDA_Pharmaceuticals_FAQ")

        # Figure out which split to use
        if 'train' in dataset:
            df = dataset['train'].to_pandas()
        else:
            # Just use whatever split exists
            split_name = list(dataset.keys())[0]
            df = dataset[split_name].to_pandas()

        # Process the data into our format
        faq_records = []
        for idx, row in df.iterrows():
            record = {
                'question': row.get('question', ''),
                'answer': row.get('answer', ''),
                'category': 'fda_faq',
                'source': 'huggingface_jaymax',
                'collected_at': datetime.utcnow().isoformat()
            }
            faq_records.append(record)

        # Save both formats just in case
        json_file = collector.output_dir / "fda_pharmaceuticals_faq.json"
        csv_file = collector.output_dir / "fda_pharmaceuticals_faq.csv"

        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(faq_records, f, indent=2, ensure_ascii=False)
        df.to_csv(csv_file, index=False)

        print(f"Collected {len(faq_records)} FAQ records")
        return faq_records

    except Exception as e:
        print(f"Failed to get FAQ data: {e}")
        return []

# Run the FAQ collection
fda_faq_data = get_fda_faq_data()

Loading FDA FAQ dataset from Hugging Face...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

validation.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/1433 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/169 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/79 [00:00<?, ? examples/s]

  'collected_at': datetime.utcnow().isoformat()


Collected 1433 FAQ records


In [8]:
def collect_sop_datasets():
    """
    Looking for SOP-related datasets on Hugging Face. These aren't specifically
    pharmaceutical but they help the model understand procedural instructions.
    """
    try:
        from datasets import load_dataset
        import datasets

        print("Searching for SOP instruction datasets...")

        # These datasets have good instruction-following examples
        dataset_names = [
            "microsoft/orca-math-word-problems-200k",
            "databricks/databricks-dolly-15k",
            "OpenAssistant/oasst1",
        ]

        all_sop_data = []

        for dataset_name in dataset_names:
            try:
                print(f"Processing {dataset_name}...")
                dataset = load_dataset(dataset_name, split='train', streaming=True)

                # Only take first 1000 to avoid memory issues
                count = 0
                for example in dataset:
                    if count >= 1000:
                        break

                    # Try to extract instruction-response pairs
                    sop_record = None
                    if 'instruction' in example and 'response' in example:
                        sop_record = {
                            'instruction': example['instruction'],
                            'response': example['response'],
                            'source': dataset_name,
                            'type': 'instruction_following'
                        }
                    elif 'text' in example:
                        # Look for procedure-like content
                        text = example['text']
                        if any(word in text.lower() for word in
                              ['procedure', 'steps', 'protocol', 'guideline', 'standard']):
                            sop_record = {
                                'instruction': f"Create a procedure for: {text[:100]}",
                                'response': text,
                                'source': dataset_name,
                                'type': 'procedural_text'
                            }

                    if sop_record:
                        all_sop_data.append(sop_record)
                        count += 1

                print(f"Got {count} examples from {dataset_name}")

            except Exception as e:
                print(f"Couldn't load {dataset_name}: {e}")
                continue

        # Save the SOP data
        sop_file = collector.output_dir / "huggingface_sop_datasets.json"
        with open(sop_file, 'w', encoding='utf-8') as f:
            json.dump(all_sop_data, f, indent=2, ensure_ascii=False)

        print(f"Total SOP records collected: {len(all_sop_data)}")
        return all_sop_data

    except Exception as e:
        print(f"SOP collection failed: {e}")
        return []

# Get SOP data
sop_data = collect_sop_datasets()

Searching for SOP instruction datasets...
Processing microsoft/orca-math-word-problems-200k...


README.md: 0.00B [00:00, ?B/s]

Got 0 examples from microsoft/orca-math-word-problems-200k
Processing databricks/databricks-dolly-15k...


README.md: 0.00B [00:00, ?B/s]

Got 1000 examples from databricks/databricks-dolly-15k
Processing OpenAssistant/oasst1...


README.md: 0.00B [00:00, ?B/s]

Got 1000 examples from OpenAssistant/oasst1
Total SOP records collected: 2000


In [9]:
def collect_drug_labels(limit=2000):
    """
    Collecting drug label data from FDA's OpenFDA API. This has good
    information about dosage, storage, warnings etc that we can use
    for training SOP generation.
    """
    print("Starting drug labels collection...")

    drug_labels = []
    skip = 0
    batch_size = 100  # FDA API limit per request

    while len(drug_labels) < limit:
        try:
            params = {
                'limit': min(batch_size, limit - len(drug_labels)),
                'skip': skip
            }

            # Add delay to respect rate limits
            time.sleep(collector.rate_limit_delay)
            response = collector.session.get(f"{collector.fda_base_url}/drug/label.json",
                                           params=params, timeout=30)
            response.raise_for_status()

            data = response.json()
            if not data or 'results' not in data:
                print("No more results available")
                break

            # Extract the fields we care about
            for record in data['results']:
                label_record = {
                    'id': record.get('id', ''),
                    'set_id': record.get('set_id', ''),
                    'purpose': record.get('purpose', []),
                    'indications_and_usage': record.get('indications_and_usage', []),
                    'warnings': record.get('warnings', []),
                    'dosage_and_administration': record.get('dosage_and_administration', []),
                    'storage_and_handling': record.get('storage_and_handling', []),
                    'how_supplied': record.get('how_supplied', []),
                    'active_ingredient': record.get('active_ingredient', []),
                    'openfda': record.get('openfda', {}),
                    'source': 'fda_drug_labels'
                }

                # Avoid duplicates using content hash
                content_hash = hashlib.md5(json.dumps(label_record, sort_keys=True).encode()).hexdigest()
                if content_hash not in collector.data_cache:
                    collector.data_cache.add(content_hash)
                    drug_labels.append(label_record)

            skip += batch_size
            print(f"Collected {len(drug_labels)} drug labels so far...")

            # Check if we got a full batch
            if len(data['results']) < batch_size:
                break

        except Exception as e:
            print(f"Error collecting drug labels: {e}")
            break

    # Save the data
    labels_file = collector.output_dir / "fda_drug_labels.json"
    with open(labels_file, 'w', encoding='utf-8') as f:
        json.dump(drug_labels, f, indent=2, ensure_ascii=False)

    print(f"Drug labels collection complete: {len(drug_labels)} records")
    return drug_labels

# Collect drug labels
drug_labels = collect_drug_labels(limit=1500)

Starting drug labels collection...
Collected 100 drug labels so far...
Collected 200 drug labels so far...
Collected 300 drug labels so far...
Collected 400 drug labels so far...
Collected 500 drug labels so far...
Collected 600 drug labels so far...
Collected 700 drug labels so far...
Collected 800 drug labels so far...
Collected 900 drug labels so far...
Collected 1000 drug labels so far...
Collected 1100 drug labels so far...
Collected 1200 drug labels so far...
Collected 1300 drug labels so far...
Collected 1400 drug labels so far...
Collected 1500 drug labels so far...
Drug labels collection complete: 1500 records


In [10]:
def collect_adverse_events(limit=1500):
    """
    Getting adverse event data from FAERS. This helps with safety monitoring
    SOPs and understanding what can go wrong with pharmaceuticals.
    """
    print("Collecting adverse event reports...")

    adverse_events = []
    skip = 0
    batch_size = 100

    # Search for drug-related adverse events
    search_query = "patient.drug.drugindication:*"

    while len(adverse_events) < limit:
        try:
            params = {
                'search': search_query,
                'limit': min(batch_size, limit - len(adverse_events)),
                'skip': skip
            }

            time.sleep(collector.rate_limit_delay)
            response = collector.session.get(f"{collector.fda_base_url}/drug/event.json",
                                           params=params, timeout=30)
            response.raise_for_status()

            data = response.json()
            if not data or 'results' not in data:
                break

            for record in data['results']:
                event_record = {
                    'safety_report_id': record.get('safetyreportid', ''),
                    'received_date': record.get('receivedate', ''),
                    'serious': record.get('serious', ''),
                    'death': record.get('seriousnessdeath', ''),
                    'hospitalization': record.get('seriousnesshospitalization', ''),
                    'patient_info': record.get('patient', {}),
                    'report_type': record.get('reporttype', ''),
                    'qualification': record.get('qualification', ''),
                    'source': 'fda_adverse_events'
                }

                # Use report ID for deduplication
                content_hash = hashlib.md5(f"{event_record['safety_report_id']}".encode()).hexdigest()
                if content_hash not in collector.data_cache:
                    collector.data_cache.add(content_hash)
                    adverse_events.append(event_record)

            skip += batch_size
            print(f"Adverse events collected: {len(adverse_events)}...")

            if len(data['results']) < batch_size:
                break

        except Exception as e:
            print(f"Error with adverse events: {e}")
            break

    # Save adverse events data
    events_file = collector.output_dir / "fda_adverse_events.json"
    with open(events_file, 'w', encoding='utf-8') as f:
        json.dump(adverse_events, f, indent=2, ensure_ascii=False)

    print(f"Adverse events complete: {len(adverse_events)} records")
    return adverse_events

# Get adverse events
adverse_events = collect_adverse_events(limit=1200)


Collecting adverse event reports...
Adverse events collected: 100...
Adverse events collected: 200...
Adverse events collected: 300...
Adverse events collected: 400...
Adverse events collected: 500...
Adverse events collected: 600...
Adverse events collected: 700...
Adverse events collected: 800...
Adverse events collected: 900...
Adverse events collected: 1000...
Adverse events collected: 1100...
Adverse events collected: 1200...
Adverse events complete: 1200 records


In [11]:
def collect_orange_book_data(limit=1500):
    """
    Orange Book contains approved drug products. Good for understanding
    regulatory approval processes and requirements.
    """
    print("Collecting Orange Book drug approval data...")

    approvals = []
    skip = 0
    batch_size = 100

    while len(approvals) < limit:
        try:
            params = {
                'limit': min(batch_size, limit - len(approvals)),
                'skip': skip
            }

            time.sleep(collector.rate_limit_delay)
            response = collector.session.get(f"{collector.fda_base_url}/drug/drugsfda.json",
                                           params=params, timeout=30)
            response.raise_for_status()

            data = response.json()
            if not data or 'results' not in data:
                break

            for record in data['results']:
                approval_record = {
                    'application_number': record.get('application_number', ''),
                    'sponsor_name': record.get('sponsor_name', ''),
                    'application_type': record.get('application_type', ''),
                    'submissions': record.get('submissions', []),
                    'products': record.get('products', []),
                    'openfda_info': record.get('openfda', {}),
                    'source': 'fda_orange_book'
                }

                # Use application number for deduplication
                content_hash = hashlib.md5(approval_record['application_number'].encode()).hexdigest()
                if content_hash not in collector.data_cache:
                    collector.data_cache.add(content_hash)
                    approvals.append(approval_record)

            skip += batch_size
            print(f"Drug approvals collected: {len(approvals)}...")

            if len(data['results']) < batch_size:
                break

        except Exception as e:
            print(f"Error collecting approvals: {e}")
            break

    # Save approvals data
    approvals_file = collector.output_dir / "fda_orange_book.json"
    with open(approvals_file, 'w', encoding='utf-8') as f:
        json.dump(approvals, f, indent=2, ensure_ascii=False)

    print(f"Orange Book collection done: {len(approvals)} records")
    return approvals

# Collect Orange Book data
orange_book_data = collect_orange_book_data(limit=1200)

Collecting Orange Book drug approval data...
Drug approvals collected: 100...
Drug approvals collected: 200...
Drug approvals collected: 300...
Drug approvals collected: 400...
Drug approvals collected: 500...
Drug approvals collected: 600...
Drug approvals collected: 700...
Drug approvals collected: 800...
Drug approvals collected: 900...
Drug approvals collected: 1000...
Drug approvals collected: 1100...
Drug approvals collected: 1200...
Orange Book collection done: 1200 records


In [12]:
def collect_epa_chemicals(limit=1000):
    """
    EPA chemical registry data. This was harder to figure out than the FDA APIs.
    The CompTox Dashboard has chemical information we can use for handling SOPs.
    """
    print("Collecting EPA chemical registry data...")

    try:
        chemicals = []

        # Search for pharmaceutical-relevant chemicals
        # Had to hardcode these since their search API is limited
        pharma_chemicals = [
            "aspirin", "ibuprofen", "acetaminophen", "penicillin", "insulin",
            "morphine", "codeine", "warfarin", "digoxin", "metformin",
            "atorvastatin", "omeprazole", "sertraline", "amlodipine", "lisinopril"
        ]

        for chemical_name in pharma_chemicals:
            try:
                time.sleep(collector.rate_limit_delay)

                # Try to search by name
                search_url = f"https://comptox.epa.gov/dashboard-api/ccdapp1/chemical-files/search/by-name-fragment/{chemical_name}"
                response = collector.session.get(search_url, timeout=10)

                if response.status_code == 200:
                    results = response.json()

                    # Extract chemical info
                    for chemical in results[:5]:  # Limit per search
                        chem_record = {
                            'dtxsid': chemical.get('dtxsid', ''),
                            'name': chemical.get('preferredName', ''),
                            'cas_number': chemical.get('casrn', ''),
                            'inchi_key': chemical.get('inchikey', ''),
                            'smiles': chemical.get('smiles', ''),
                            'molecular_formula': chemical.get('molecularFormula', ''),
                            'average_mass': chemical.get('averageMass', ''),
                            'source': 'epa_comptox',
                            'search_term': chemical_name
                        }
                        chemicals.append(chem_record)

                        if len(chemicals) >= limit:
                            break

                print(f"Found chemicals for '{chemical_name}', total: {len(chemicals)}")

                if len(chemicals) >= limit:
                    break

            except Exception as e:
                print(f"Problem searching for {chemical_name}: {e}")
                continue

        # Save EPA data
        epa_file = collector.output_dir / "epa_chemical_registry.json"
        with open(epa_file, 'w', encoding='utf-8') as f:
            json.dump(chemicals, f, indent=2, ensure_ascii=False)

        print(f"EPA chemical registry complete: {len(chemicals)} records")
        return chemicals

    except Exception as e:
        print(f"EPA collection failed, creating fallback data: {e}")

        # Create some basic pharmaceutical chemical data as fallback
        fallback_chemicals = [
            {'name': 'Aspirin', 'cas': '50-78-2', 'formula': 'C9H8O4', 'mass': '180.16'},
            {'name': 'Ibuprofen', 'cas': '15687-27-1', 'formula': 'C13H18O2', 'mass': '206.29'},
            {'name': 'Acetaminophen', 'cas': '103-90-2', 'formula': 'C8H9NO2', 'mass': '151.17'},
            {'name': 'Penicillin G', 'cas': '61-33-6', 'formula': 'C16H18N2O4S', 'mass': '334.4'},
            {'name': 'Warfarin', 'cas': '81-81-2', 'formula': 'C19H16O4', 'mass': '308.33'}
        ]

        fallback_data = []
        for chem in fallback_chemicals:
            fallback_record = {
                'dtxsid': f"DTXSID_{hashlib.md5(chem['name'].encode()).hexdigest()[:8].upper()}",
                'name': chem['name'],
                'cas_number': chem['cas'],
                'molecular_formula': chem['formula'],
                'average_mass': chem['mass'],
                'source': 'epa_fallback'
            }
            fallback_data.append(fallback_record)

        epa_file = collector.output_dir / "epa_chemical_registry.json"
        with open(epa_file, 'w', encoding='utf-8') as f:
            json.dump(fallback_data, f, indent=2, ensure_ascii=False)

        return fallback_data

# Get EPA chemical data
epa_data = collect_epa_chemicals(limit=500)

Collecting EPA chemical registry data...
Found chemicals for 'aspirin', total: 0
Found chemicals for 'ibuprofen', total: 0
Found chemicals for 'acetaminophen', total: 0
Found chemicals for 'penicillin', total: 0
Found chemicals for 'insulin', total: 0
Found chemicals for 'morphine', total: 0
Found chemicals for 'codeine', total: 0
Found chemicals for 'warfarin', total: 0
Found chemicals for 'digoxin', total: 0
Found chemicals for 'metformin', total: 0
Found chemicals for 'atorvastatin', total: 0
Found chemicals for 'omeprazole', total: 0
Found chemicals for 'sertraline', total: 0
Found chemicals for 'amlodipine', total: 0
Found chemicals for 'lisinopril', total: 0
EPA chemical registry complete: 0 records


In [13]:
def create_training_dataset():
    """
    Now we need to process all this raw data into something we can use
    to train the model. Converting everything to instruction-response pairs.
    """
    print("Processing collected data into training format...")

    training_records = []

    # Process FAQ data - these are already in good Q&A format
    if fda_faq_data:
        for faq in fda_faq_data:
            if faq['question'] and faq['answer']:
                training_records.append({
                    'instruction': f"Answer this pharmaceutical question: {faq['question']}",
                    'input': "",
                    'output': faq['answer'],
                    'category': 'fda_faq',
                    'source': 'fda_huggingface'
                })

    # Process SOP instruction data
    if sop_data:
        for sop in sop_data:
            training_records.append({
                'instruction': sop['instruction'],
                'input': "",
                'output': sop['response'],
                'category': 'sop_instruction',
                'source': sop['source']
            })

    # Convert drug labels to SOP training data
    if drug_labels:
        for label in drug_labels[:300]:  # Don't process all of them
            try:
                # Storage SOPs
                if label.get('storage_and_handling'):
                    storage_info = ' '.join(label['storage_and_handling']) if isinstance(label['storage_and_handling'], list) else str(label['storage_and_handling'])
                    training_records.append({
                        'instruction': 'Create a storage and handling SOP for pharmaceutical products:',
                        'input': storage_info[:500],  # Truncate if too long
                        'output': create_storage_sop(storage_info),
                        'category': 'storage_sop',
                        'source': 'fda_drug_labels'
                    })

                # Dosage preparation SOPs
                if label.get('dosage_and_administration'):
                    dosage_info = ' '.join(label['dosage_and_administration']) if isinstance(label['dosage_and_administration'], list) else str(label['dosage_and_administration'])
                    training_records.append({
                        'instruction': 'Create a dosage preparation SOP:',
                        'input': dosage_info[:500],
                        'output': create_dosage_sop(dosage_info),
                        'category': 'dosage_sop',
                        'source': 'fda_drug_labels'
                    })
            except Exception as e:
                # Skip problematic records
                continue

    # Convert adverse events to safety SOPs
    if adverse_events:
        for event in adverse_events[:200]:  # Sample subset
            try:
                if event.get('patient_info') and event.get('serious'):
                    severity = "Serious" if event['serious'] == '1' else "Non-serious"
                    training_records.append({
                        'instruction': 'Create a safety monitoring SOP for adverse events:',
                        'input': f"Event severity: {severity}, Report type: {event.get('report_type', 'Unknown')}",
                        'output': create_safety_sop(severity, event.get('report_type', '')),
                        'category': 'safety_sop',
                        'source': 'fda_adverse_events'
                    })
            except Exception as e:
                continue

    # Convert chemical data to handling SOPs
    if epa_data:
        for chemical in epa_data[:100]:
            try:
                chem_name = chemical.get('name', 'Unknown Chemical')
                formula = chemical.get('molecular_formula', 'Unknown')
                training_records.append({
                    'instruction': f'Create a chemical handling SOP for {chem_name}:',
                    'input': f"Chemical: {chem_name}, Formula: {formula}, CAS: {chemical.get('cas_number', 'N/A')}",
                    'output': create_chemical_handling_sop(chem_name, formula, chemical.get('cas_number', '')),
                    'category': 'chemical_handling_sop',
                    'source': 'epa_chemical_registry'
                })
            except Exception as e:
                continue

    # Add some regulatory examples I wrote manually
    regulatory_records = get_regulatory_examples()
    training_records.extend(regulatory_records)

    # Save training dataset in JSONL format (what Ollama expects)
    training_file = collector.output_dir / "pharma_training_dataset.jsonl"
    with open(training_file, 'w', encoding='utf-8') as f:
        for record in training_records:
            f.write(json.dumps(record) + '\n')

    # Also save as regular JSON for inspection
    json_file = collector.output_dir / "pharma_training_dataset.json"
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(training_records, f, indent=2, ensure_ascii=False)

    print(f"Training dataset created: {len(training_records)} examples")

    # Show breakdown by category
    categories = {}
    for record in training_records:
        cat = record['category']
        categories[cat] = categories.get(cat, 0) + 1

    print("Dataset breakdown:")
    for category, count in categories.items():
        print(f"  {category}: {count} examples")

    return training_records

def create_storage_sop(storage_info):
    """Generate storage SOP template based on drug label info"""
    return f"""# Standard Operating Procedure: Pharmaceutical Storage and Handling

## Purpose and Scope
This SOP defines proper storage procedures for pharmaceutical products to maintain quality and integrity.

## Responsibilities
- Warehouse staff responsible for storage
- QA team for monitoring compliance
- Facility managers for environmental controls

## Storage Requirements
Based on product specifications:
{storage_info[:300]}

## Procedure
1. Receive products and verify storage requirements
2. Check environmental conditions (temperature, humidity)
3. Place products in appropriate storage areas
4. Monitor and document storage conditions
5. Conduct regular inspections for damage or deterioration

## Documentation
- Temperature/humidity logs
- Storage area inspection records
- Product condition reports

## Quality Requirements
All storage must comply with cGMP standards and maintain product stability throughout shelf life."""

def create_dosage_sop(dosage_info):
    """Generate dosage preparation SOP"""
    return f"""# Standard Operating Procedure: Dosage Preparation and Administration

## Purpose
This SOP establishes procedures for accurate dosage preparation.

## Responsibilities
- Pharmacy staff for preparation
- Clinical personnel for administration
- QA for verification

## Dosage Information
{dosage_info[:300]}

## Preparation Steps
1. Verify prescription and patient information
2. Calculate required dosage amounts
3. Use calibrated measuring equipment
4. Prepare dosage according to specifications
5. Label and document all preparations
6. Conduct quality checks before dispensing

## Safety Requirements
- Double-check all calculations
- Use appropriate protective equipment
- Follow contamination prevention protocols

## Documentation
Complete preparation records required per regulatory standards."""

def create_safety_sop(severity, report_type):
    """Generate safety monitoring SOP"""
    return f"""# Standard Operating Procedure: Adverse Event Monitoring

## Purpose
Define procedures for monitoring and reporting adverse events.

## Responsibilities
- Safety officers for surveillance
- Medical team for assessment
- Regulatory affairs for reporting

## Event Information
Severity classification: {severity}
Report type: {report_type}

## Monitoring Procedures
1. Continuous safety surveillance
2. Event identification and assessment
3. Severity classification
4. Immediate reporting for serious events
5. Trend analysis and follow-up

## Reporting Requirements
- Expedited reporting for serious events
- Periodic safety reports
- Regulatory notifications within required timeframes

## Documentation
Comprehensive safety database with complete event documentation required."""

def create_chemical_handling_sop(chemical_name, formula, cas_number):
    """Generate chemical handling SOP"""
    return f"""# Standard Operating Procedure: Chemical Handling - {chemical_name}

## Purpose
Define safe handling procedures for {chemical_name} in pharmaceutical operations.

## Chemical Information
- Name: {chemical_name}
- Formula: {formula}
- CAS Number: {cas_number}

## Safety Requirements
1. Personal protective equipment mandatory
2. Engineering controls and ventilation required
3. Emergency response procedures available
4. Proper waste disposal protocols

## Handling Procedures
1. Pre-handling safety verification
2. Proper transfer and handling techniques
3. Contamination prevention measures
4. Appropriate storage and labeling

## Emergency Procedures
- Spill response protocols
- Exposure treatment procedures
- Emergency contact information

## Documentation
All handling activities must be documented per safety regulations."""

def get_regulatory_examples():
    """Some regulatory training examples I put together manually"""
    examples = [
        {
            'instruction': 'Create a GMP SOP following ICH Q7 guidelines for API manufacturing:',
            'input': 'Active ingredient manufacturing facility operations',
            'output': """# Standard Operating Procedure: GMP for API Manufacturing

## Purpose and Scope
Establish Good Manufacturing Practice procedures for API manufacturing per ICH Q7.

## Responsibilities
- Manufacturing personnel for operations
- QA team for oversight
- Management for compliance

## Facility Requirements
1. Design per cGMP standards
2. Environmental monitoring and controls
3. Segregation of manufacturing activities
4. Adequate space for equipment and materials

## Manufacturing Procedures
1. Raw material verification and release
2. Equipment qualification and validation
3. In-process controls and monitoring
4. Batch documentation and records
5. Product testing and release

## Quality Requirements
All manufacturing must comply with ICH Q7 and FDA regulations.""",
            'category': 'gmp_sop',
            'source': 'ich_q7_manual'
        },
        {
            'instruction': 'Create a validation SOP following ICH Q2 for analytical methods:',
            'input': 'Analytical method validation and verification procedures',
            'output': """# Standard Operating Procedure: Analytical Method Validation

## Purpose
Define analytical method validation procedures per ICH Q2 guidelines.

## Validation Parameters
1. Accuracy and precision testing
2. Linearity and range determination
3. Detection and quantitation limits
4. Robustness and ruggedness evaluation
5. System suitability criteria

## Validation Process
1. Method development and optimization
2. Validation protocol preparation
3. Validation experiments execution
4. Data analysis and statistical evaluation
5. Validation report and approval

## Acceptance Criteria
All validation parameters must meet ICH Q2 requirements.""",
            'category': 'validation_sop',
            'source': 'ich_q2_manual'
        },
        {
            'instruction': 'Create equipment cleaning validation SOP per 21 CFR 211.67:',
            'input': 'Equipment cleaning and maintenance procedures',
            'output': """# Standard Operating Procedure: Equipment Cleaning Validation

## Purpose
Define equipment cleaning validation procedures per 21 CFR 211.67.

## Validation Requirements
1. Worst-case product selection
2. Analytical method validation
3. Acceptance criteria establishment
4. Cleaning procedure validation

## Validation Steps
1. Risk assessment and product grouping
2. Cleaning agent selection and validation
3. Sampling strategy development
4. Analytical testing and evaluation
5. Documentation review and approval

## Acceptance Criteria
- Visual cleanliness achieved
- Chemical residue within limits
- Microbiological limits met
- Cleaning agent residue acceptable""",
            'category': 'cleaning_validation_sop',
            'source': 'cfr_211_67_manual'
        }
    ]

    return examples

# Process all the data into training format
training_dataset = create_training_dataset()

Processing collected data into training format...
Training dataset created: 2589 examples
Dataset breakdown:
  sop_instruction: 2000 examples
  storage_sop: 89 examples
  dosage_sop: 297 examples
  safety_sop: 200 examples
  gmp_sop: 1 examples
  validation_sop: 1 examples
  cleaning_validation_sop: 1 examples


In [14]:
def setup_ollama_model(base_model, training_data_path):
    """
    Create the Ollama modelfile for fine-tuning. Had to experiment with
    the parameters to get good results for pharmaceutical content.
    """

    modelfile_content = f'''FROM {base_model}

TEMPLATE """{{{{ if .System }}}}{{{{ .System }}}}{{{{ end }}}}{{{{ if .Prompt }}}}### Instruction:
{{{{ .Prompt }}}}{{{{ end }}}}{{{{ if .Context }}}}

### Input:
{{{{ .Context }}}}{{{{ end }}}}

### Response:
{{{{ if .Response }}}}{{{{ .Response }}}}{{{{ end }}}}"""

PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER top_k 40
PARAMETER num_ctx 4096

SYSTEM """You are an expert in pharmaceutical Standard Operating Procedures with extensive knowledge of FDA regulations, cGMP guidelines, and ICH standards.

Your knowledge covers:
- FDA 21 CFR regulations and compliance requirements
- ICH guidelines including Q7, Q8, Q9, Q10, Q2
- Good Manufacturing Practices and quality systems
- Equipment validation and qualification procedures
- Analytical method validation protocols
- Chemical handling and safety procedures
- Adverse event monitoring and reporting
- Drug labeling and regulatory compliance

When creating SOPs, always include:
1. Clear purpose and scope
2. Defined responsibilities
3. Detailed step-by-step procedures
4. Quality control measures
5. Documentation requirements
6. Safety considerations
7. Regulatory compliance references

Make sure all SOPs follow pharmaceutical industry standards and regulatory requirements."""
'''

    modelfile_path = collector.output_dir / "Modelfile"
    with open(modelfile_path, 'w', encoding='utf-8') as f:
        f.write(modelfile_content)

    print(f"Modelfile created at: {modelfile_path}")
    return str(modelfile_path)

# Create the modelfile
base_model = 'mistral:7b-instruct'
training_data_file = str(collector.output_dir / "pharma_training_dataset.jsonl")
modelfile_path = setup_ollama_model(base_model, training_data_file)


Modelfile created at: /content/pharma_datasets/Modelfile


In [15]:
def run_fine_tuning():
    """
    Actually fine-tune the Ollama model. This part can take a while
    depending on your hardware and the size of the training data.
    """

    custom_model_name = 'pharma-sop-specialist'

    print(f"Starting model fine-tuning...")
    print(f"Base model: {base_model}")
    print(f"Target model: {custom_model_name}")

    try:
        import subprocess

        print(f"Creating custom pharmaceutical model: {custom_model_name}")

        # Run ollama create command
        result = subprocess.run([
            'ollama', 'create', custom_model_name, '-f', modelfile_path
        ], capture_output=True, text=True, timeout=600)

        if result.returncode == 0:
            print(f"Model creation successful: {custom_model_name}")
            print(f"Output: {result.stdout}")
            return custom_model_name
        else:
            print(f"Model creation failed: {result.stderr}")
            return None

    except subprocess.TimeoutExpired:
        print("Model creation timed out - might still be working")
        return custom_model_name  # Assume it worked
    except Exception as e:
        print(f"Error during model creation: {e}")
        return None

# Run the fine-tuning
custom_model = run_fine_tuning()

Starting model fine-tuning...
Base model: mistral:7b-instruct
Target model: pharma-sop-specialist
Creating custom pharmaceutical model: pharma-sop-specialist
Model creation successful: pharma-sop-specialist
Output: 


In [16]:
def test_model(model_name):
    """
    Test the fine-tuned model with some pharmaceutical SOP prompts
    to see how well it learned from our training data.
    """

    if not model_name:
        print("No model available for testing")
        return

    test_prompts = [
        "Create a comprehensive cleaning validation SOP for pharmaceutical tablet manufacturing equipment",
        "Generate an analytical method validation SOP for HPLC testing of active ingredients",
        "Create a personnel hygiene SOP for aseptic processing operations",
        "Develop a chemical storage SOP for hazardous pharmaceutical raw materials",
        "Create a batch record review SOP following cGMP requirements"
    ]

    print(f"Testing model: {model_name}")
    print("=" * 50)

    for i, prompt in enumerate(test_prompts, 1):
        print(f"\nTest {i}: {prompt[:60]}...")
        print("-" * 40)

        try:
            import subprocess
            result = subprocess.run([
                'ollama', 'run', model_name, prompt
            ], capture_output=True, text=True, timeout=120)

            if result.returncode == 0:
                response = result.stdout.strip()
                if response:
                    print(f"Generated SOP (first 400 chars):")
                    print(response[:400] + "..." if len(response) > 400 else response)
                else:
                    print("Got empty response")
            else:
                print(f"Test failed: {result.stderr}")

        except subprocess.TimeoutExpired:
            print("Test timed out")
        except Exception as e:
            print(f"Test error: {e}")

        print("\n" + "="*50)

# Test the model if creation was successful
if custom_model:
    test_model(custom_model)

Testing model: pharma-sop-specialist

Test 1: Create a comprehensive cleaning validation SOP for pharmaceu...
----------------------------------------
Generated SOP (first 400 chars):
Title: Cleaning Validation Standard Operating Procedure (SOP) for Pharmaceutical Tablet Manufacturing Equipment

Purpose: To establish a standardized and controlled procedure for the cleaning of pharmaceutical tablet manufacturing equipment to ensure product quality, equipment functionality, and compliance with regulatory requirements.

Scope: This SOP applies to all tablet manufacturing equipment...


Test 2: Generate an analytical method validation SOP for HPLC testin...
----------------------------------------
Generated SOP (first 400 chars):
**Standard Operating Procedure (SOP)**

Title: Analytical Method Validation - High-Performance Liquid Chromatography (HPLC) for Active Ingredients

1. **Purpose and Scope**
   This SOP outlines the steps for validating an analytical method using HPLC to determine 

In [17]:
def evaluate_model(model_name):
    """
    Run some basic evaluation to see how well the model performs
    on pharmaceutical SOP generation tasks.
    """

    if not model_name:
        print("No model to evaluate")
        return

    eval_metrics = {
        'regulatory_terms': 0,
        'structure_quality': 0,
        'technical_content': 0,
        'safety_coverage': 0,
        'total_tests': 0
    }

    # Things we expect to see in good pharmaceutical SOPs
    required_sections = [
        'purpose', 'scope', 'responsibility', 'procedure',
        'documentation', 'quality', 'safety'
    ]

    regulatory_terms = [
        'cgmp', 'fda', 'cfr', 'ich', 'validation', 'compliance',
        'documentation', 'quality control', 'regulatory', 'gmp'
    ]

    safety_terms = [
        'safety', 'hazard', 'protective', 'emergency', 'risk',
        'precaution', 'exposure', 'accident', 'incident'
    ]

    eval_prompts = [
        "Create a cleaning validation SOP for pharmaceutical manufacturing equipment",
        "Generate a safety monitoring SOP for adverse event reporting",
        "Create a quality control SOP for raw material testing"
    ]

    print(f"Evaluating {model_name} performance...")
    print("=" * 50)

    for prompt in eval_prompts:
        try:
            import subprocess
            result = subprocess.run([
                'ollama', 'run', model_name, prompt
            ], capture_output=True, text=True, timeout=90)

            if result.returncode == 0:
                response = result.stdout.lower()
                eval_metrics['total_tests'] += 1

                # Check for required SOP structure
                sections_found = sum(1 for section in required_sections if section in response)
                if sections_found >= 5:
                    eval_metrics['structure_quality'] += 1

                # Check for regulatory terminology
                reg_terms_found = sum(1 for term in regulatory_terms if term in response)
                if reg_terms_found >= 3:
                    eval_metrics['regulatory_terms'] += 1

                # Check safety coverage
                safety_found = sum(1 for term in safety_terms if term in response)
                if safety_found >= 2:
                    eval_metrics['safety_coverage'] += 1

                # Basic technical quality check
                if (len(response) > 500 and
                    'standard operating procedure' in response and
                    'procedure' in response):
                    eval_metrics['technical_content'] += 1

        except Exception as e:
            print(f"Evaluation error: {e}")
            continue

    # Show results
    total_tests = eval_metrics['total_tests']
    if total_tests > 0:
        print(f"\nEvaluation Results:")
        print(f"Tests completed: {total_tests}")
        print(f"Structure quality: {eval_metrics['structure_quality']/total_tests*100:.1f}%")
        print(f"Regulatory terminology: {eval_metrics['regulatory_terms']/total_tests*100:.1f}%")
        print(f"Safety coverage: {eval_metrics['safety_coverage']/total_tests*100:.1f}%")
        print(f"Technical content: {eval_metrics['technical_content']/total_tests*100:.1f}%")

        overall = (eval_metrics['structure_quality'] +
                  eval_metrics['regulatory_terms'] +
                  eval_metrics['safety_coverage'] +
                  eval_metrics['technical_content']) / (total_tests * 4) * 100

        print(f"Overall score: {overall:.1f}%")

        if overall >= 75:
            print("Model performance looks good for pharmaceutical SOPs")
        elif overall >= 60:
            print("Model performance is decent, could use some improvements")
        elif overall >= 40:
            print("Model performance is okay but needs work")
        else:
            print("Model performance needs significant improvement")
    else:
        print("No successful evaluations completed")

# Run evaluation if we have a model
if custom_model:
    evaluate_model(custom_model)

Evaluating pharma-sop-specialist performance...

Evaluation Results:
Tests completed: 3
Structure quality: 100.0%
Regulatory terminology: 100.0%
Safety coverage: 100.0%
Technical content: 100.0%
Overall score: 100.0%
Model performance looks good for pharmaceutical SOPs


## 4. Setup Ngrok Authentication & Tunnel

This section sets up ngrok tunneling to connect your Google Colab Ollama instance to your local backend.

In [18]:
# Install tunnel dependencies
import subprocess
subprocess.check_call(['pip', 'install', 'pyngrok', 'python-dotenv'])

import os
from dotenv import load_dotenv
from pyngrok import ngrok
import time

print("Setting up ngrok tunnel authentication...")
print("\nSetup Instructions:")
print("1. Register at: https://dashboard.ngrok.com/signup")
print("2. Copy authtoken from: https://dashboard.ngrok.com/get-started/your-authtoken")
print("3. Create environment file:")
print("   !echo 'NGROK_AUTH_TOKEN=your_token_here' > /content/.env")
print("   Replace 'your_token_here' with actual token")

# First, let's clean up any existing tunnels to avoid the 3-tunnel limit
print("\nCleaning up existing tunnels...")
try:
    tunnels = ngrok.get_tunnels()
    for tunnel in tunnels:
        print(f"Closing tunnel: {tunnel.public_url}")
        ngrok.disconnect(tunnel.public_url)
    print("Existing tunnels cleaned up")
except Exception as e:
    print(f"Could not clean up tunnels: {e}")

# Load token from environment file
load_dotenv('/content/n.env')
token = os.getenv('NGROK_AUTH_TOKEN')

if not token or token.strip() == "":
    print("\nERROR: NGROK_AUTH_TOKEN not found in environment file")
    print("\nTo fix:")
    print("1. Create environment file with token:")
    print("   !echo 'NGROK_AUTH_TOKEN=your_actual_token' > /content/.env")
    print("2. Run this cell again")
    print("\nExample .env content:")
    print("   NGROK_AUTH_TOKEN=2abc123def456ghi789jkl0mno1pqr2_3StUvWxYz4AbCdEfGhIj")

else:
    print("Auth token found:")

    try:
        ngrok.set_auth_token(token)
        print("Ngrok authentication successful")

        # Create tunnel for Ollama service
        print("\nCreating tunnel to Ollama port 11434...")
        tunnel_url = ngrok.connect(11434)

        print(f"\nOllama API accessible at: {tunnel_url}")
        print(f"Add to your backend configuration:")
        print(f"   OLLAMA_BASE_URL='{tunnel_url}'")

        # Test tunnel connectivity
        print(f"\nTesting tunnel connection...")
        import requests
        try:
            response = requests.get(f"{tunnel_url}/api/tags", timeout=10)
            if response.status_code == 200:
                print("Tunnel test successful - backend can connect")
            else:
                print(f"Tunnel created but service returned: {response.status_code}")
        except Exception as test_error:
            print(f"Tunnel test failed: {test_error}")
            print("This may be normal during Ollama startup")

        print(f"\nKeep this session running for tunnel access")
        print(f"Active tunnel: {tunnel_url}")

    except Exception as setup_error:
        print(f"Ngrok setup failed: {setup_error}")
        print("\nTroubleshooting:")
        print("- Check token validity")
        print("- Verify ngrok account status")
        print("- Try runtime restart")

Setting up ngrok tunnel authentication...

Setup Instructions:
1. Register at: https://dashboard.ngrok.com/signup
2. Copy authtoken from: https://dashboard.ngrok.com/get-started/your-authtoken
3. Create environment file:
   !echo 'NGROK_AUTH_TOKEN=your_token_here' > /content/.env
   Replace 'your_token_here' with actual token

Cleaning up existing tunnels...


ERROR:pyngrok.process.ngrok:t=2025-08-27T11:45:00+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:pyngrok.process.ngrok:t=2025-08-27T11:45:00+0000 lvl=eror msg="session closing" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:pyngrok.process.ngrok:t=2025-08-27T11:45:00+0000 lvl=eror msg="terminating with error" obj=app err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your aut

Could not clean up tunnels: The ngrok process errored on start: authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n.

ERROR: NGROK_AUTH_TOKEN not found in environment file

To fix:
1. Create environment file with token:
   !echo 'NGROK_AUTH_TOKEN=your_actual_token' > /content/.env
2. Run this cell again

Example .env content:
   NGROK_AUTH_TOKEN=2abc123def456ghi789jkl0mno1pqr2_3StUvWxYz4AbCdEfGhIj


## 5. Connect to Local Backend

Instructions for connecting your local pharmaceutical SOP backend to this Colab Ollama instance.

In [19]:
# Connect Colab Ollama to local backend
print("Backend Connection Setup")
print("=" * 30)

print("\nConnection steps:")
print("1. Copy ngrok URL from above to your backend .env:")
print("   OLLAMA_BASE_URL='https://abc123.ngrok.io'")

print("\n2. Navigate to your project directory:")
print("   cd sop-author-pharmaceutical/backend")

print("\n3. Setup Python environment:")
print("   python -m venv venv")
print("   venv\\Scripts\\activate    # Windows")
print("   source venv/bin/activate  # Mac/Linux")

print("\n4. Install dependencies and start server:")
print("   pip install -r requirements.txt")
print("   uvicorn app.main:app --reload --port 9000")

print("\n5. Access points:")
print("   Frontend: http://localhost:5173/")
print("   API docs: http://localhost:9000/docs")

print("\nNote: Keep this Colab session active while using local app")
print("Session typically lasts 12 hours")

Backend Connection Setup

Connection steps:
1. Copy ngrok URL from above to your backend .env:
   OLLAMA_BASE_URL='https://abc123.ngrok.io'

2. Navigate to your project directory:
   cd sop-author-pharmaceutical/backend

3. Setup Python environment:
   python -m venv venv
   venv\Scripts\activate    # Windows
   source venv/bin/activate  # Mac/Linux

4. Install dependencies and start server:
   pip install -r requirements.txt
   uvicorn app.main:app --reload --port 9000

5. Access points:
   Frontend: http://localhost:5173/
   API docs: http://localhost:9000/docs

Note: Keep this Colab session active while using local app
Session typically lasts 12 hours


## 6. Test Ollama Generation

Test pharmaceutical SOP generation with the installed model.

In [20]:
# Test Ollama model generation
import subprocess

def run_model_test(model_name, test_prompt):
    """Simple test of model generation"""
    try:
        cmd = ['ollama', 'run', model_name]
        result = subprocess.run(cmd, input=test_prompt,
                              capture_output=True, text=True, timeout=30)
        if result.returncode == 0:
            return result.stdout.strip()
        else:
            return "Generation failed"
    except subprocess.TimeoutExpired:
        return "Test timed out"
    except Exception:
        return "Test error"

# Test pharmaceutical SOP generation
sample_prompt = "Create a brief SOP outline for equipment cleaning validation in pharmaceutical manufacturing."

print("Testing model generation...")
print(f"Model: mistral:7b-instruct")
print(f"Prompt: {sample_prompt[:60]}...")

test_output = run_model_test('mistral:7b-instruct', sample_prompt)

print("\nGeneration result:")
if len(test_output) > 300:
    print(test_output[:300] + "...")
else:
    print(test_output)

# Simple status check
if "failed" in test_output.lower() or "error" in test_output.lower():
    print("\nStatus: Model test failed - check Ollama installation")
else:
    print("\nStatus: Model working correctly")
    print("Environment ready for pharmaceutical SOP generation")

Testing model generation...
Model: mistral:7b-instruct
Prompt: Create a brief SOP outline for equipment cleaning validation...

Generation result:
Title: Standard Operating Procedure (SOP) for Equipment Cleaning Validation in Pharmaceutical Manufacturing

1. **Introduction**
   - Purpose: To establish a consistent and validated process for the cleaning of equipment used in pharmaceutical manufacturing to ensure product quality, compliance with...

Status: Model working correctly
Environment ready for pharmaceutical SOP generation
