# Minimal Fixed HTML Slicing Pipeline

This notebook provides a robust slicing pipeline that:
- Calls Ollama directly with a selected model
- Always returns a `SliceSet` (empty on failure) to avoid `NoneType` errors
- Prints raw LLM output for debugging



In [None]:
import os, json, time
from typing import List
from pydantic import BaseModel, Field

# Pydantic models
class Slice(BaseModel):
    first_line: int = Field(..., ge=0)
    last_line: int = Field(..., ge=0)

class SliceSet(BaseModel):
    slices: List[Slice] = Field(default_factory=list)

# Utilities

def load_prompt(prompt_name: str) -> str:
    with open(f"../prompts/{prompt_name}.md", "r", encoding="utf-8") as f:
        return f.read()


def _get_ollama_base_url() -> str:
    return os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")


def _select_model(preferred: list[str] | None = None) -> str:
    preferred = preferred or ["gpt-oss:20b"]
    try:
        import ollama
        client = ollama.Client(host=_get_ollama_base_url())
        # Prefer running models
        try:
            ps = client.ps()
            running = [m.get('name') or m.get('model') for m in ps.get('models', []) if isinstance(m, dict)]
        except Exception:
            running = []
        # Fallback to listed
        try:
            listed = client.list()
            listed_models = [m.get('name') or m.get('model') for m in listed.get('models', []) if isinstance(m, dict)]
        except Exception:
            listed_models = []
        names = [n for n in running + listed_models if isinstance(n, str) and n]
        # prefer desired
        for cand in (preferred or []):
            if cand in names:
                return cand
        if names:
            return names[0]
    except Exception:
        pass
    return preferred[0]


# LLM call with direct Ollama and safe wrapper
class _ContentWrapper:
    def __init__(self, content: str, raw_response=None, model: str = "", duration_s: float = 0.0):
        self.content = content
        self.raw_response = raw_response
        self.model = model
        self.duration_s = duration_s
        self.response_metadata = {"source": "ollama.direct"}
        self.additional_kwargs = {}


def call_llm_local(prompt_content: str, model: str | None = None, temperature: float = 0, schema: dict | None = None) -> _ContentWrapper:
    if not isinstance(prompt_content, str) or not prompt_content.strip():
        return _ContentWrapper("")
    model = model or _select_model()
    base_url = _get_ollama_base_url()
    try:
        import ollama
        client = ollama.Client(host=base_url)
        t0 = time.time()
        messages = [{"role": "user", "content": prompt_content}]
        tools = None
        if schema:
            tools = [{
                "type": "function",
                "function": {
                    "name": "structured_response",
                    "description": "Provide a structured response according to the schema",
                    "parameters": schema
                }
            }]
            # Strong instruction to use the tool
            messages.insert(0, {"role": "system", "content": "You must respond using the structured_response function with the exact schema. Do not include any other text."})
        resp = client.chat(model=model, messages=messages, tools=tools, options={"temperature": temperature})
        dt = time.time() - t0
        # Extract content and tool_calls if present
        content = resp.get('message', {}).get('content', '') if isinstance(resp, dict) else ''
        tool_calls = resp.get('message', {}).get('tool_calls', []) if isinstance(resp, dict) else []
        print(f"LLM chat took {dt:.2f}s | model={model}")
        print("RAW ollama.chat response:")
        try:
            print(json.dumps(resp, ensure_ascii=False, indent=2))
        except Exception:
            print(str(resp))
        # If content empty but tool_calls present, pack the tool args as content for downstream parsing
        if (not content.strip()) and tool_calls:
            try:
                args = tool_calls[0].get('function', {}).get('arguments')
                if isinstance(args, str):
                    content = args
                elif isinstance(args, dict):
                    content = json.dumps(args)
            except Exception:
                pass
        return _ContentWrapper((content or '').strip(), raw_response=resp, model=model, duration_s=dt)
    except Exception as e:
        print(f"LLM call error: {e}")
        return _ContentWrapper("", raw_response=None, model=model or "", duration_s=0.0)


def generate_html_slices_fixed(html_content: str) -> SliceSet:
    prompt = load_prompt("generate_slices")
    json_schema = SliceSet.model_json_schema()
    prompt = prompt.format(html_content=html_content, output_schema=json_schema)

    response = call_llm_local(prompt, schema=json_schema)
    raw = (getattr(response, "content", None) or "").strip()
    print("Raw LLM response (full):\n", raw)
    # Also print raw response object for complete debugging
    try:
        print("\nRaw response object:")
        print(json.dumps(response.raw_response, ensure_ascii=False, indent=2))
    except Exception:
        print(response.raw_response)
    if not raw:
        return SliceSet(slices=[])

    content = raw
    if content.startswith('```json'):
        start_idx = content.find('\n', 7) + 1
        end_idx = content.rfind('```')
        if start_idx > 7 and end_idx > start_idx:
            content = content[start_idx:end_idx].strip()
    elif content.startswith('```'):
        start_idx = content.find('\n', 3) + 1
        end_idx = content.rfind('```')
        if start_idx > 3 and end_idx > start_idx:
            content = content[start_idx:end_idx].strip()

    try:
        data = json.loads(content)
    except Exception as e:
        # Try to extract JSON substring
        start = content.find('{')
        end = content.rfind('}')
        if start != -1 and end != -1 and end > start:
            try:
                data = json.loads(content[start:end+1])
            except Exception:
                data = {}
        else:
            data = {}

    html_lines = html_content.split('\n')
    valid: List[Slice] = []
    for sl in data.get('slices', []) if isinstance(data, dict) else []:
        try:
            a = int(sl.get('first_line', 0))
            b = int(sl.get('last_line', 0))
        except Exception:
            continue
        a = max(0, min(a, len(html_lines) - 1))
        b = max(a, min(b, len(html_lines) - 1))
        valid.append(Slice(first_line=a, last_line=b))

    return SliceSet(slices=valid)



In [2]:
# Test HTML
html_sample = """<!DOCTYPE html>
<html>
<head><title>Example Co - Services</title></head>
<body>
<h1>Example Co</h1>
<p>We provide consulting, cloud migration, and AI services.</p>
<section id="services">
  <h2>Services</h2>
  <div><h3>Cloud Migration</h3><p>Move to AWS, Azure, or GCP.</p></div>
  <div><h3>AI</h3><p>Custom ML solutions for automation.</p></div>
</section>
<section id="contact">
  <h2>Contact</h2>
  <p>Email: info@example.com</p>
</section>
</body>
</html>"""

print(f"Input HTML: {len(html_sample)} chars, {len(html_sample.split(chr(10)))} lines")

# Run slicing
slice_result = generate_html_slices_fixed(html_sample)
print("\n✅ SLICING SUCCESSFUL!")
print(f"Generated {len(slice_result.slices)} slices:")

# Extract and show
def get_slices(html_content: str, slice_set: SliceSet) -> str:
    if not slice_set.slices:
        return ""
    lines = html_content.split('\n')
    chunks = []
    for sl in slice_set.slices:
        a = max(0, sl.first_line)
        b = min(len(lines) - 1, sl.last_line)
        if a <= b:
            chunks.append('\n'.join(lines[a:b+1]))
            chunks.append(f"\n<!-- SLICE {sl.first_line}-{sl.last_line} -->\n")
    return '\n'.join(chunks)

extracted = get_slices(html_sample, slice_result)
print("\nExtracted (first 800 chars):\n", extracted[:800])


Input HTML: 459 chars, 17 lines
LLM chat took 4.01s | model=gpt-oss:20b
RAW ollama.chat response:
model='gpt-oss:20b' created_at='2025-09-09T08:35:56.431697371Z' done=True done_reason='stop' total_duration=4004240803 load_duration=69008023 prompt_eval_count=751 prompt_eval_duration=591615039 eval_count=362 eval_duration=3341029679 message=Message(role='assistant', content='', thinking='We need to output JSON with slices. Input lines: Let\'s enumerate lines. The HTML snippet:\n\nLine 0: <!DOCTYPE html>\n1: <html>\n2: <head><title>Example Co - Services</title></head>\n3: <body>\n4: <h1>Example Co</h1>\n5: <p>We provide consulting, cloud migration, and AI services.</p>\n6: <section id="services">\n7:   <h2>Services</h2>\n8:   <div><h3>Cloud Migration</h3><p>Move to AWS, Azure, or GCP.</p></div>\n9:   <div><h3>AI</h3><p>Custom ML solutions for automation.</p></div>\n10: </section>\n11: <section id="contact">\n12:   <h2>Contact</h2>\n13:   <p>Email: info@example.com</p>\n14: </section>\n15: