## Final HTML Slicing Pipeline (Function Calling + Logging)

This notebook is a clean, working version:
- Uses Ollama function calling with a `get_slices` tool
- Passes the exact Pydantic JSON schema to the tool
- Parses tool calls from dict- or object-style responses; each slice entry may be a dict or a two-item `[first_line, last_line]` list
- Always returns a `SliceSet` and executes `get_slices` locally
- Logs: initial HTML, full LLM prompt, raw LLM output, final sliced HTML
- Slices are separated by a single blank line (no markers)



In [1]:
import os, json, time
from typing import List, Dict
from pydantic import BaseModel, Field

# Models
class Slice(BaseModel):
    first_line: int = Field(..., ge=0)
    last_line: int = Field(..., ge=0)

class SliceSet(BaseModel):
    slices: List[Slice] = Field(default_factory=list)

# Paths and logging

def _resolve_base_dir() -> str:
    candidates = [os.getcwd(), os.path.dirname(os.getcwd())]
    for c in candidates:
        if os.path.isdir(os.path.join(c, "crawler_v2", "prompts")):
            return os.path.join(c, "crawler_v2")
        if os.path.isdir(os.path.join(c, "prompts")):
            return c
    return os.getcwd()

BASE_DIR = _resolve_base_dir()
LOG_DIR = os.path.join(BASE_DIR, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

def _write_log(filename: str, content: str) -> str:
    # Overwrite same filenames each run (no RUN_ID prefix)
    path = os.path.join(LOG_DIR, filename)
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
    return path

# Prompt loader

def load_prompt(prompt_name: str) -> str:
    path = os.path.join(BASE_DIR, "prompts", f"{prompt_name}.md")
    with open(path, "r", encoding="utf-8") as f:
        return f.read()

# Slice rendering

def get_slices(html_content: str, slice_set: SliceSet) -> str:
    if not slice_set.slices:
        return ""
    lines = html_content.split('\n')
    chunks: List[str] = []
    for sl in slice_set.slices:
        a = max(0, sl.first_line)
        b = min(len(lines) - 1, sl.last_line)
        if a <= b:
            chunks.append('\n'.join(lines[a:b+1]))
    return '\n\n'.join(chunks)

# Ollama helpers

def _get_ollama_base_url() -> str:
    return os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")


def _select_model(preferred: List[str] | None = None) -> str:
    preferred = preferred or ["gpt-oss:20b", "llama3.2:3b-instruct"]
    try:
        import ollama
        client = ollama.Client(host=_get_ollama_base_url())
        try:
            ps = client.ps()
            running = [m.get('name') or m.get('model') for m in ps.get('models', []) if isinstance(m, dict)]
        except Exception:
            running = []
        try:
            listed = client.list()
            listed_models = [m.get('name') or m.get('model') for m in listed.get('models', []) if isinstance(m, dict)]
        except Exception:
            listed_models = []
        candidates = [n for n in running + listed_models if isinstance(n, str) and n]
        for cand in (preferred or []):
            if cand in candidates:
                return cand
        if candidates:
            return candidates[0]
    except Exception:
        pass
    return (preferred or ["gpt-oss:20b"])[0]

# Core pipeline

def generate_slices_via_tools(html_content: str) -> SliceSet:
    _write_log("initial_html.txt", html_content)

    prompt_tmpl = load_prompt("generate_slices")
    schema = SliceSet.model_json_schema()
    schema_json = json.dumps(schema, ensure_ascii=False)
    prompt = prompt_tmpl.format(html_content=html_content, output_schema=schema_json)
    _write_log("llm_prompt.txt", prompt)

    tool_schema = schema

    messages = [
        {"role": "system", "content": "Respond ONLY by calling the get_slices function with arguments that EXACTLY match the provided JSON Schema. Do not output any other text."},
        {"role": "user", "content": prompt}
    ]
    tools = [{
        "type": "function",
        "function": {
            "name": "get_slices",
            "description": "Return slice ranges to extract relevant HTML lines.",
            "parameters": tool_schema
        }
    }]

    import ollama
    client = ollama.Client(host=_get_ollama_base_url())
    t0 = time.time()
    resp = client.chat(model=_select_model(), messages=messages, tools=tools, options={"temperature": 0})
    dt = time.time() - t0

    try:
        _write_log("llm_output.txt", json.dumps(resp, ensure_ascii=False, indent=2))
    except Exception:
        _write_log("llm_output.txt", str(resp))

    # Extract args from tool calls (support dict and object-style responses)
    args: Dict = {}
    if isinstance(resp, dict):
        tcs = resp.get('message', {}).get('tool_calls', [])
        if tcs:
            fn = tcs[0].get('function', {})
            raw = fn.get('arguments')
            if isinstance(raw, str):
                try:
                    args = json.loads(raw)
                except Exception:
                    args = {}
            elif isinstance(raw, dict):
                args = raw
    else:
        # Object-style response (e.g., with .message and .tool_calls attributes)
        message = getattr(resp, 'message', None)
        if message is not None:
            tc_list = getattr(message, 'tool_calls', []) or []
            if tc_list:
                fn_obj = getattr(tc_list[0], 'function', None)
                raw = getattr(fn_obj, 'arguments', None) if fn_obj is not None else None
                if isinstance(raw, str):
                    try:
                        args = json.loads(raw)
                    except Exception:
                        args = {}
                elif isinstance(raw, dict):
                    args = raw

    # Build SliceSet (handle dict or [start, end])
    parsed_preview = []
    slices: List[Slice] = []
    for sl in (args.get('slices') or []):
        a = None
        b = None
        if isinstance(sl, dict):
            a = sl.get('first_line', None)
            b = sl.get('last_line', None)
        elif isinstance(sl, list) and len(sl) >= 2:
            a, b = sl[0], sl[1]
        try:
            a = int(a)
            b = int(b)
        except Exception:
            continue
        a = max(0, a)
        b = max(a, b)
        slices.append(Slice(first_line=a, last_line=b))
        parsed_preview.append([a, b])

    _write_log("parsed_tool_args.txt", json.dumps({"slices": parsed_preview}, ensure_ascii=False))

    result = SliceSet(slices=slices)
    final_html = get_slices(html_content, result)
    _write_log("final_sliced_html.txt", final_html)

    print(f"LLM + tools took {dt:.2f}s. Produced {len(result.slices)} slices.")
    return result



In [2]:
# Test cell
html_sample = """<!DOCTYPE html>
<html>
<head><title>Example Co - Services</title></head>
<body>
<h1>Example Co</h1>
<p>We provide consulting, cloud migration, and AI services.</p>
<section id="services">
  <h2>Services</h2>
  <div><h3>Cloud Migration</h3><p>Move to AWS, Azure, or GCP.</p></div>
  <div><h3>AI</h3><p>Custom ML solutions for automation.</p></div>
</section>
<section id="contact">
  <h2>Contact</h2>
  <p>Email: info@example.com</p>
</section>
</body>
</html>"""

print(f"Input HTML: {len(html_sample)} chars, {len(html_sample.split(chr(10)))} lines")
res = generate_slices_via_tools(html_sample)
print("Slices:", [s.model_dump() for s in res.slices])
print(f"Logs written to: {LOG_DIR}")


Input HTML: 459 chars, 17 lines
LLM + tools took 13.66s. Produced 3 slices.
Slices: [{'first_line': 4, 'last_line': 5}, {'first_line': 6, 'last_line': 9}, {'first_line': 11, 'last_line': 13}]
Logs written to: /home/mohammed/Desktop/tech_projects/growbal/crawler_v2/logs
