In [None]:
import os
import json
import time
import re
from typing import Dict, Any, List, Tuple
from pathlib import Path

# Debug helpers (updated for tests/ subdirectory)
BASE_DIR = Path(__file__).parent.parent if "__file__" in globals() else Path("..").resolve()
LOG_DIR = BASE_DIR / "logs"
LOG_DIR.mkdir(parents=True, exist_ok=True)

try:
    import ollama
except Exception as e:
    raise RuntimeError("The 'ollama' Python package is required. Install it and ensure Ollama is running.")


def read_text_file(path: str | Path) -> str:
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"File not found: {p}")
    return p.read_text(encoding="utf-8", errors="ignore")


def approx_token_count(text: str) -> int:
    # Rough token proxy for debugging
    return len(re.findall(r"\S+", text))


def build_tool_schema() -> Dict[str, Any]:
    return {
        "type": "function",
        "function": {
            "name": "get_slices",
            "description": "Return slice ranges to extract relevant HTML lines.",
            "parameters": {
                "type": "object",
                "properties": {
                    "slices": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "first_line": {"type": "integer"},
                                "last_line": {"type": "integer"}
                            },
                            "required": ["first_line", "last_line"]
                        }
                    }
                },
                "required": ["slices"]
            }
        }
    }


def extract_tool_args(resp: Any) -> Dict[str, Any]:
    # Supports dict or object-shaped response
    if isinstance(resp, dict):
        tcs = (resp.get("message", {}) or {}).get("tool_calls", [])
        if tcs:
            fn = tcs[0].get("function", {})
            raw = fn.get("arguments")
            if isinstance(raw, str):
                try:
                    return json.loads(raw)
                except Exception:
                    return {}
            if isinstance(raw, dict):
                return raw
        return {}
    # object style
    message = getattr(resp, "message", None)
    if message is None:
        return {}
    tcs = getattr(message, "tool_calls", []) or []
    if not tcs:
        return {}
    fn = getattr(tcs[0], "function", None)
    if fn is None:
        return {}
    raw = getattr(fn, "arguments", None)
    if isinstance(raw, str):
        try:
            return json.loads(raw)
        except Exception:
            return {}
    if isinstance(raw, dict):
        return raw
    return {}


def get_slices_from_args(args: Dict[str, Any]) -> List[Tuple[int, int]]:
    out: List[Tuple[int, int]] = []
    for sl in (args.get("slices") or []):
        a = None
        b = None
        if isinstance(sl, dict):
            # Accept multiple key variants
            a = (
                sl.get("first_line")
                if sl.get("first_line") is not None else sl.get("start")
                if sl.get("start") is not None else sl.get("from")
            )
            b = (
                sl.get("last_line")
                if sl.get("last_line") is not None else sl.get("end")
                if sl.get("end") is not None else sl.get("to")
            )
        elif isinstance(sl, list) and len(sl) >= 2:
            a, b = sl[0], sl[1]
        try:
            a = int(a)
            b = int(b)
        except Exception:
            continue
        a = max(0, a)
        b = max(a, b)
        out.append((a, b))
    return out


def apply_slices(html_text: str, slices: List[Tuple[int, int]]) -> str:
    if not slices:
        return ""
    lines = html_text.splitlines()
    chunks: List[str] = []
    for a, b in slices:
        a = max(0, a)
        b = min(len(lines) - 1, b)
        if a <= b:
            chunks.append("\n".join(lines[a:b+1]))
    return "\n\n".join(chunks)


def focus_html(html_text: str, keyword_window: int = 3) -> str:
    # Extract lines around likely-interesting keywords
    key_re = re.compile(r"about|service|contact|team|clients|email|phone|address|location|office|company|who\s+we|vision", re.I)
    lines = html_text.splitlines()
    keep = set()
    for idx, ln in enumerate(lines):
        if key_re.search(ln):
            for j in range(max(0, idx - keyword_window), min(len(lines), idx + keyword_window + 1)):
                keep.add(j)
    # Always keep header/footer windows
    for j in range(min(200, len(lines))):
        keep.add(j)
    for j in range(max(0, len(lines) - 200), len(lines)):
        keep.add(j)
    focused = "\n".join(lines[i] for i in sorted(keep))
    return focused


def _normalize_ollama_resp(resp: Any) -> Dict[str, Any]:
    """Return a minimal JSON-serializable dict with content and tool_calls."""
    out: Dict[str, Any] = {"content": None, "thinking": None, "tool_calls": []}
    if isinstance(resp, dict):
        msg = resp.get("message") or {}
        out["content"] = msg.get("content")
        out["thinking"] = msg.get("thinking")
        for tc in (msg.get("tool_calls") or []):
            fn = tc.get("function") or {}
            out["tool_calls"].append({
                "name": fn.get("name"),
                "arguments": fn.get("arguments"),
            })
        return out
    # object style
    message = getattr(resp, "message", None)
    if message is not None:
        out["content"] = getattr(message, "content", None)
        out["thinking"] = getattr(message, "thinking", None)
        tcs = getattr(message, "tool_calls", []) or []
        norm_tcs: List[Dict[str, Any]] = []
        for tc in tcs:
            fn = getattr(tc, "function", None)
            name = getattr(fn, "name", None) if fn is not None else None
            args = getattr(fn, "arguments", None) if fn is not None else None
            norm_tcs.append({"name": name, "arguments": args})
        out["tool_calls"] = norm_tcs
    return out


def run_ollama_slicing(html_text: str, model: str = "gpt-oss:20b", num_ctx: int = 8192, temperature: float = 0.0,
                        save_prefix: str = "debug") -> Dict[str, Any]:
    client = ollama.Client(host=os.getenv("OLLAMA_BASE_URL", "http://localhost:11434"))

    # Try to fetch model metadata
    try:
        model_info = client.show(model)
    except Exception:
        model_info = None

    tools = [build_tool_schema()]
    prompt = html_text
    messages = [
        {"role": "system", "content": (
            "Respond ONLY by calling the get_slices function with arguments that EXACTLY match the provided JSON Schema. "
            "Return 3-12 concise, non-overlapping ranges that capture ABOUT/WHO WE ARE, SERVICES, CONTACT (emails/phones/addresses), "
            "and any TEAM/CLIENTS sections. Use line indices of the provided HTML text."
        )},
        {"role": "user", "content": prompt}
    ]

    # Save prompt for inspection
    (LOG_DIR / f"{save_prefix}_prompt.txt").write_text(prompt, encoding="utf-8")

    t0 = time.time()
    resp = client.chat(model=model, messages=messages, tools=tools, options={"temperature": temperature, "num_ctx": num_ctx})
    dt = time.time() - t0

    # Save raw response (string repr)
    try:
        raw = json.dumps(resp, ensure_ascii=False, indent=2)
    except Exception:
        raw = str(resp)
    (LOG_DIR / f"{save_prefix}_response.json").write_text(raw, encoding="utf-8")

    # Save normalized JSON for easy inspection
    norm = _normalize_ollama_resp(resp)
    (LOG_DIR / f"{save_prefix}_normalized.json").write_text(json.dumps(norm, ensure_ascii=False, indent=2), encoding="utf-8")

    args = extract_tool_args(resp)
    slices = get_slices_from_args(args)
    sliced_html = apply_slices(html_text, slices)
    (LOG_DIR / f"{save_prefix}_sliced_html.txt").write_text(sliced_html, encoding="utf-8")

    return {
        "model": model,
        "model_info": model_info,
        "elapsed_sec": dt,
        "num_ctx": num_ctx,
        "temperature": temperature,
        "token_est_input": approx_token_count(html_text),
        "num_lines_input": len(html_text.splitlines()),
        "num_slices": len(slices),
        "slices": slices,
        "sliced_len": len(sliced_html)
    }


def debug_slicing(html_path: str | Path,
                  model: str = "gpt-oss:20b",
                  num_ctx: int = 8192,
                  temperature: float = 0.0,
                  fallback_focus: bool = True,
                  save_prefix: str = "debug") -> Dict[str, Any]:
    html_text = read_text_file(html_path)

    print(f"Loaded {html_path}")
    print(f"Chars: {len(html_text):,}  |  Lines: {len(html_text.splitlines()):,}  |  ~Tokens: {approx_token_count(html_text):,}")

    result = run_ollama_slicing(html_text, model=model, num_ctx=num_ctx, temperature=temperature, save_prefix=save_prefix)
    print(json.dumps({k: v for k, v in result.items() if k not in ("model_info", "slices")}, indent=2))

    if result["num_slices"] == 0 and fallback_focus:
        print("No slices from full input. Retrying with focused HTML window...")
        focused = focus_html(html_text)
        (LOG_DIR / f"{save_prefix}_focused_input.txt").write_text(focused, encoding="utf-8")
        focus_res = run_ollama_slicing(focused, model=model, num_ctx=num_ctx, temperature=temperature, save_prefix=f"{save_prefix}_focus")
        print(json.dumps({k: v for k, v in focus_res.items() if k not in ("model_info", "slices")}, indent=2))
        result["focus_attempt"] = focus_res

    return result


In [2]:
from dotenv import load_dotenv
load_dotenv("/home/mohammed/Desktop/tech_projects/growbal/envs/1.env", override=True)

html_file = "/home/mohammed/Desktop/tech_projects/growbal/crawler_v2/logs/clean_html_before_slicing_additional.txt"
if not os.path.exists(html_file) or os.path.getsize(html_file) == 0:
    html_file = "/home/mohammed/Desktop/tech_projects/growbal/crawler_v2/logs/clean_html_before_slicing.txt"

res = debug_slicing(html_file, model="gpt-oss:20b", num_ctx=16384, temperature=0.0, fallback_focus=True, save_prefix="slicing_debug")
print("\nSummary:")
print(json.dumps({k: v for k, v in res.items() if k not in ("model_info", "slices", "focus_attempt")}, indent=2))
print("Slices:", res.get("slices"))
if res.get("focus_attempt"):
    print("\nFocus attempt summary:")
    print(json.dumps({k: v for k, v in res["focus_attempt"].items() if k not in ("model_info", "slices")}, indent=2))


Loaded /home/mohammed/Desktop/tech_projects/growbal/crawler_v2/logs/clean_html_before_slicing_additional.txt
Chars: 32,577  |  Lines: 406  |  ~Tokens: 2,038
{
  "model": "gpt-oss:20b",
  "elapsed_sec": 46.80624198913574,
  "num_ctx": 16384,
  "temperature": 0.0,
  "token_est_input": 2038,
  "num_lines_input": 406,
  "num_slices": 4,
  "sliced_len": 32563
}

Summary:
{
  "model": "gpt-oss:20b",
  "elapsed_sec": 46.80624198913574,
  "num_ctx": 16384,
  "temperature": 0.0,
  "token_est_input": 2038,
  "num_lines_input": 406,
  "num_slices": 4,
  "sliced_len": 32563
}
Slices: [(1, 200), (201, 400), (401, 600), (601, 800)]


In [3]:
def inspect_last_response(prefix: str = "slicing_debug") -> None:
    path = LOG_DIR / f"{prefix}_response.json"
    if not path.exists():
        print(f"No response file at {path}")
        return
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        print("Response file is not valid JSON; raw content below:\n")
        print(path.read_text(encoding="utf-8"))
        return
    msg = (data.get("message") or {})
    tc = msg.get("tool_calls")
    print("Has tool_calls:", bool(tc))
    if tc:
        print(json.dumps(tc, ensure_ascii=False, indent=2)[:2000])
    else:
        print("Assistant content:\n")
        print((msg.get("content") or "")[:2000])


def run_json_slicing(html_text: str, model: str = "gpt-oss:20b", num_ctx: int = 16384,
                     temperature: float = 0.0, save_prefix: str = "slicing_debug_json") -> Dict[str, Any]:
    """Ask the model to output ONLY JSON with slices (no tools)."""
    client = ollama.Client(host=os.getenv("OLLAMA_BASE_URL", "http://localhost:11434"))

    system_prompt = (
        "You will receive an HTML text split by lines. "
        "Return ONLY a JSON object of the form: {\"slices\":[{\"first_line\":int,\"last_line\":int}, ...]}. "
        "Choose 3-12 non-overlapping ranges that capture ABOUT/WHO WE ARE, SERVICES, CONTACT (emails/phones/addresses), and TEAM/CLIENTS sections. "
        "Use 0-based line indices relative to the provided text. Do not include any other keys or text."
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": html_text}
    ]

    (LOG_DIR / f"{save_prefix}_prompt.txt").write_text(html_text, encoding="utf-8")

    t0 = time.time()
    # Force JSON output format
    resp = client.chat(model=model, messages=messages, options={"temperature": temperature, "num_ctx": num_ctx, "format": "json"})
    dt = time.time() - t0

    try:
        raw = json.dumps(resp, ensure_ascii=False, indent=2)
    except Exception:
        raw = str(resp)
    (LOG_DIR / f"{save_prefix}_response.json").write_text(raw, encoding="utf-8")

    # Parse JSON content
    content = None
    if isinstance(resp, dict):
        content = (resp.get("message") or {}).get("content")
    if not content:
        content = raw

    try:
        parsed = json.loads(content)
    except Exception:
        parsed = {}

    slices = get_slices_from_args(parsed)
    sliced_html = apply_slices(html_text, slices)
    (LOG_DIR / f"{save_prefix}_sliced_html.txt").write_text(sliced_html, encoding="utf-8")

    return {
        "model": model,
        "elapsed_sec": dt,
        "num_ctx": num_ctx,
        "temperature": temperature,
        "token_est_input": approx_token_count(html_text),
        "num_lines_input": len(html_text.splitlines()),
        "num_slices": len(slices),
        "slices": slices,
        "sliced_len": len(sliced_html)
    }


def rule_based_slicing(html_text: str, window: int = 8) -> Dict[str, Any]:
    """Heuristic slicer when the model doesn't produce slices."""
    lines = html_text.splitlines()
    n = len(lines)
    keys = {
        "ABOUT": re.compile(r"about\b|who\s+we\s+are|company\s+profile|our\s+story", re.I),
        "SERVICES": re.compile(r"services|what\s+we\s+do|solutions", re.I),
        "CONTACT": re.compile(r"contact|email|phone|address|location|office", re.I),
        "TEAM": re.compile(r"team|leadership|partners|directors|staff|people", re.I),
        "CLIENTS": re.compile(r"clients|testimonials|case\s+studies|our\s+clients", re.I),
    }
    hits: List[Tuple[int, int]] = []
    for name, rx in keys.items():
        for i, ln in enumerate(lines):
            if rx.search(ln):
                a = max(0, i - window)
                b = min(n - 1, i + window)
                hits.append((a, b))
    # Merge overlaps and keep up to ~12 windows
    hits.sort()
    merged: List[Tuple[int, int]] = []
    for a, b in hits:
        if not merged or a > merged[-1][1] + 1:
            merged.append((a, b))
        else:
            merged[-1] = (merged[-1][0], max(merged[-1][1], b))
    merged = merged[:12]
    sliced_html = apply_slices(html_text, merged)
    return {
        "num_slices": len(merged),
        "slices": merged,
        "sliced_len": len(sliced_html),
        "sliced_html": sliced_html,
    }


In [4]:
# Inspect tool_calls presence in last tool run
inspect_last_response(prefix="slicing_debug")

# Now try JSON-only path
html_file = "/home/mohammed/Desktop/tech_projects/growbal/crawler_v2/logs/clean_html_before_slicing_additional.txt"
if not os.path.exists(html_file) or os.path.getsize(html_file) == 0:
    html_file = "/home/mohammed/Desktop/tech_projects/growbal/crawler_v2/logs/clean_html_before_slicing.txt"

html_text = read_text_file(html_file)
json_res = run_json_slicing(html_text, model="gpt-oss:20b", num_ctx=16384, temperature=0.0, save_prefix="slicing_debug_json")
print("\nJSON-only attempt summary:")
print(json.dumps(json_res, indent=2))

# If still no slices, use rule-based fallback
if json_res.get("num_slices", 0) == 0:
    rb = rule_based_slicing(html_text, window=10)
    (LOG_DIR / "slicing_debug_rule_based.txt").write_text(rb["sliced_html"], encoding="utf-8")
    print("\nRule-based fallback produced:")
    print(json.dumps({k: v for k, v in rb.items() if k != "sliced_html"}, indent=2))
    print(f"Saved: {(LOG_DIR / 'slicing_debug_rule_based.txt')}\n")


Response file is not valid JSON; raw content below:

model='gpt-oss:20b' created_at='2025-09-10T05:23:17.054515985Z' done=True done_reason='stop' total_duration=46804275803 load_duration=64648593 prompt_eval_count=8861 prompt_eval_duration=7945876099 eval_count=3285 eval_duration=38780753206 message=Message(role='assistant', content='', thinking='We need to call get_slices with ranges capturing ABOUT/WHO WE ARE, SERVICES, CONTACT (emails/phones/addresses), and any TEAM/CLIENTS sections. We need line indices of provided HTML text. We need to count lines. Let\'s approximate. The HTML is long. We need to provide slices as array of objects? The schema: get_slices expects { slices: any[] }. Each slice likely an object with start and end? The schema not defined. But typical: slice: { start: number, end: number }. We need to guess. The instructions: "Return 3-12 concise, non-overlapping ranges that capture ABOUT/WHO WE ARE, SERVICES, CONTACT (emails/phones/addresses), and any TEAM/CLIENTS sec

KeyboardInterrupt: 