In [1]:
from langchain_community.llms import Ollama
from datetime import datetime, timezone

# Shared LLM client used by every cell below: the phi4 model served by an
# Ollama instance on this machine's port 11434, with temperature pinned to 0.
# NOTE(review): the captured stderr of this cell echoes the `llm = Ollama(`
# line, which suggests a warning is emitted on construction — confirm whether
# this import path is deprecated in the installed langchain version.
llm = Ollama(
    base_url="http://localhost:11434",
    model="phi4:latest",
    temperature=0,
)


def call_phi4_single_comment(
    comment_id: str,
    comment_text: str,
    sentiment_group: str,
    created_at: str,
    retries: int,
) -> str:
    """Render the single-comment prompt and ask the shared ``llm`` for a reply.

    The prompt is built once from ``PHI4_SINGLE_COMMENT_PROMPT`` and re-sent
    unchanged whenever the model answers with an empty or whitespace-only
    string.

    Args:
        comment_id: Identifier substituted into the prompt's JSON skeleton.
        comment_text: Comment body; surrounding whitespace is stripped before
            it is placed in the prompt.
        sentiment_group: Value substituted into the prompt's JSON skeleton.
        created_at: Value substituted into the prompt's JSON skeleton.
        retries: Number of *additional* attempts after the first one.
            Negative values are treated as 0, so the model is always asked at
            least once.

    Returns:
        The first non-blank reply, exactly as returned by ``llm.invoke``.

    Raises:
        RuntimeError: If every attempt produced an empty reply.
    """
    prompt = PHI4_SINGLE_COMMENT_PROMPT.format(
        comment_id=comment_id,
        created_at=created_at,
        sentiment_group=sentiment_group,
        comment_text=comment_text.strip()
    )

    # Clamp so that a negative budget cannot skip the LLM call entirely and
    # then report "empty output" for a request that was never made.
    extra_attempts = max(retries, 0)

    for attempt in range(extra_attempts + 1):
        raw = llm.invoke(prompt)

        if raw and raw.strip():
            return raw

        # Only announce a retry when one is actually going to happen; after
        # the final attempt we fall through to the RuntimeError instead.
        if attempt < extra_attempts:
            print(f"⚠️ Empty output, retry {attempt+1}")

    raise RuntimeError("Phi4 returned empty output after retries")


    


  from .autonotebook import tqdm as notebook_tqdm
  llm = Ollama(


In [None]:
# Prompt template for classifying a single comment. It is rendered with
# str.format(), which fills the {comment_id}, {created_at}, {sentiment_group}
# and {comment_text} placeholders; the doubled braces {{ }} around the JSON
# skeleton are format-escapes that render as literal { }.
# NOTE(review): the rule below says short_title is "max 10 words" while
# validate_output accepts up to 12 — confirm the slack is intentional.
# NOTE(review): "ALL fields must be in Persian" sits next to English-only
# allowed values for type/category/severity/priority — presumably the rule is
# meant for the free-text fields only; confirm.
PHI4_SINGLE_COMMENT_PROMPT = """
Analyze ONE Persian user comment and return ONE JSON object.

Rules:
- Output ONLY JSON. No markdown. No explanation.
- Use ONLY the comment text.
- evidence MUST be an exact quote from the comment.
- short_title: max 10 words.
- If type != issue → severity = null
- If type != suggestion → priority = null
- ALL fields must be in Persian (fa).
- normalized_title MUST be Persian.
- keywords MUST be Persian.

Allowed values:

type: issue | suggestion | question | praise | other
category: transfer | auth | card | bill | loan | login | ui | performance | AI assistant | other
severity / priority: high | medium | low | null

JSON format:

{{
  "comment_id": "{comment_id}",
  "created_at": "{created_at}",
  "sentiment_group": "{sentiment_group}",

  "type": "",
  "category": "",

  "short_title": "",
  "normalized_title": "",

  "keywords": [],

  "severity": null,
  "priority": null,

  "evidence": "",

  "model": "phi4",
  "processed_at": ""
}}

Comment:
{comment_text}
"""


In [None]:
import json
import re

# Closed vocabularies the model may emit; they mirror the "Allowed values"
# section of the prompt template.
ALLOWED_TYPES = {"issue", "suggestion", "question", "praise", "other"}
ALLOWED_CATEGORIES = {
    "transfer", "auth", "card", "bill", "loan", "login", "ui", "performance",
    "AI assistant", "other",
}
# None is a legal level because JSON null decodes to None.
LEVELS = {"high", "medium", "low", None}

# Upper bound on whitespace-separated words in short_title.
# NOTE(review): the prompt asks the model for at most 10 words; 12 is the
# limit this validator has always enforced — confirm the slack is intended.
MAX_SHORT_TITLE_WORDS = 12

# Fields that validate_output reads and therefore requires to be present.
_REQUIRED_FIELDS = (
    "type", "category", "severity", "priority",
    "evidence", "short_title", "normalized_title", "keywords",
)

# Any ASCII letter counts as "English" leaking into a Persian-only field.
_LATIN_LETTER = re.compile(r"[A-Za-z]")


def _require(condition, message: str) -> None:
    """Raise AssertionError(message) unless condition is truthy.

    An explicit raise is used instead of the ``assert`` statement because
    ``assert`` is compiled away under ``python -O``, which would silently
    disable every check. The exception type is kept as AssertionError so
    existing callers that catch it keep working.
    """
    if not condition:
        raise AssertionError(message)


def _is_allowed(value, allowed) -> bool:
    """True when value is a string contained in the allowed vocabulary."""
    return isinstance(value, str) and value in allowed


def validate_output(obj: dict, original_text: str):
    """Check one parsed model reply against the output contract.

    Args:
        obj: Decoded JSON object produced by the model.
        original_text: The comment text the model was asked to analyse;
            ``evidence`` must be quoted from it verbatim.

    Returns:
        None. The function only signals failure by raising.

    Raises:
        AssertionError: With a descriptive message when a required field is
            missing, has the wrong type, holds a value outside its allowed
            vocabulary, contains ASCII letters where Persian text is
            expected, quotes evidence that is empty or not found in
            ``original_text``, or when ``short_title`` is longer than
            ``MAX_SHORT_TITLE_WORDS`` words.
    """
    # Report absent keys as a validation failure rather than a bare KeyError.
    missing = [name for name in _REQUIRED_FIELDS if name not in obj]
    _require(not missing, f"Missing field(s): {', '.join(missing)}")

    _require(_is_allowed(obj["type"], ALLOWED_TYPES),
             f"Invalid type: {obj['type']!r}")
    _require(_is_allowed(obj["category"], ALLOWED_CATEGORIES),
             f"Invalid category: {obj['category']!r}")
    for level_field in ("severity", "priority"):
        level = obj[level_field]
        _require(level is None or _is_allowed(level, LEVELS),
                 f"Invalid {level_field}: {level!r}")

    # evidence must be an exact, non-empty substring of the comment. The
    # emptiness check matters: "" is trivially a substring of any text, so an
    # echoed template value would otherwise pass as a "quote".
    evidence = obj["evidence"]
    _require(isinstance(evidence, str) and evidence.strip(),
             "evidence must be a non-empty string")
    _require(evidence in original_text,
             "evidence is not an exact quote from the comment")

    # no English hallucination
    _require(not _LATIN_LETTER.search(evidence), "English in evidence")

    for field in ("short_title", "normalized_title"):
        value = obj[field]
        _require(isinstance(value, str), f"{field} must be a string")
        _require(not _LATIN_LETTER.search(value), f"English in {field}")

    keywords = obj["keywords"]
    _require(isinstance(keywords, (list, tuple)), "keywords must be a list")
    for kw in keywords:
        _require(isinstance(kw, str), f"keyword must be a string: {kw!r}")
        _require(not _LATIN_LETTER.search(kw), "English keyword detected")

    # title length
    word_count = len(obj["short_title"].split())
    _require(
        word_count <= MAX_SHORT_TITLE_WORDS,
        f"short_title has {word_count} words (max {MAX_SHORT_TITLE_WORDS})",
    )


In [4]:
def append_jsonl(path: str, obj: dict):
    """Append ``obj`` to the file at ``path`` as one JSON line.

    The file is opened in append mode with UTF-8 encoding, and non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(path, mode="a", encoding="utf-8") as sink:
        serialized = json.dumps(obj, ensure_ascii=False)
        sink.write(serialized + "\n")


In [5]:
import re
import json

def extract_json(raw: str) -> dict:
    """Return the first JSON object embedded in an LLM reply.

    The reply is scanned for ``{`` characters and a real JSON decoder is tried
    at each one, so the result is the first span that actually parses as an
    object. Markdown fences, leading chatter, and trailing text (including
    text that itself contains braces or a second object) are ignored, and
    nothing inside the JSON string values is altered.

    Args:
        raw: Raw text returned by the model.

    Returns:
        The decoded object as a dict.

    Raises:
        ValueError: If ``raw`` is empty/blank or contains no ``{`` at all.
        json.JSONDecodeError: (a ValueError subclass) if braces are present
            but no position decodes as valid JSON; the error from the first
            candidate position is raised.
    """
    if not raw or not raw.strip():
        raise ValueError("Empty LLM output")

    decoder = json.JSONDecoder()
    first_error = None

    start = raw.find("{")
    while start != -1:
        try:
            # raw_decode parses exactly one JSON value beginning at `start`
            # and tolerates whatever follows it.
            parsed, _end = decoder.raw_decode(raw, start)
        except json.JSONDecodeError as exc:
            if first_error is None:
                first_error = exc
        else:
            return parsed
        start = raw.find("{", start + 1)

    if first_error is not None:
        raise first_error
    raise ValueError(f"No JSON object found:\n{raw}")

In [11]:
from datetime import datetime, timezone
import json

def _comment_label(comment, index):
    """Best-effort identifier for log lines: the record's id, else its position."""
    try:
        return comment["id"]
    except (KeyError, TypeError, IndexError):
        return f"<index {index}>"


def process_comments_batch(comments, output_path):
    """Classify each comment with the LLM and append valid results to a JSONL file.

    For every comment the model is called (with 2 retries on empty output),
    the reply is parsed, stamped with the current UTC time in
    ``processed_at``, validated, and only then appended to ``output_path``.
    A failure at any step is printed and the loop moves on to the next
    comment, so one bad record never stops the batch.

    Args:
        comments: list of dicts with keys:
            id, description, sentiment_group, created_at
        output_path: Path of the JSONL file that receives one line per
            successfully validated comment.

    Returns:
        None.
    """
    for index, c in enumerate(comments):
        try:
            raw = call_phi4_single_comment(
                comment_id=str(c["id"]),
                comment_text=c["description"],
                sentiment_group=c["sentiment_group"],
                created_at=c["created_at"], retries=2
            )

            obj = extract_json(raw)
            obj["processed_at"] = datetime.now(timezone.utc).isoformat()

            validate_output(obj, c["description"])

            append_jsonl(output_path, obj)

        except Exception as e:
            # Deliberately broad: this is the per-comment isolation boundary.
            # The label lookup must not raise itself (e.g. a record with no
            # "id"), otherwise the handler would abort the whole batch.
            # The exception type is included because some errors (KeyError,
            # message-less assertions) are unreadable from str(e) alone.
            print(
                f"❌ Failed comment {_comment_label(c, index)}: "
                f"{type(e).__name__}: {e}"
            )







In [14]:
# Two hand-written sample comments used to exercise the pipeline end to end.
# Each record carries the four keys process_comments_batch reads:
# id, description, sentiment_group, created_at.
comments = [
    {
        "id": 1,
        "description": "انتقال وجه خطا میده و حساب مبدا نمایش داده نمیشود.",
        "sentiment_group": "negative",
        "created_at": "2025-01-02T10:12:00Z"
    },
    {
        "id": 2,
        "description": "احراز هویت انجام نمیشه و پیامک نمیاد.",
        "sentiment_group": "negative",
        "created_at": "2025-01-02T11:05:00Z"
    }
]


In [15]:
# Results file; it is opened in append mode downstream, so re-running this
# cell adds new lines to the existing file instead of replacing it.
output_path = "processed_comments.jsonl"

process_comments_batch(comments, output_path)


In [9]:
# Manual smoke test: render the template for one hard-coded comment, send it
# straight to the model (no retry, parsing or validation), and print the raw
# reply for inspection.
# Note: this rebinds the notebook-level names `prompt` and `raw`.
prompt = PHI4_SINGLE_COMMENT_PROMPT.format(
    comment_id="test_1",
    created_at="2025-01-01T00:00:00Z",
    sentiment_group="negative",
    comment_text="انتقال وجه خطا میده و حساب مبدا نمایش داده نمی‌شود."
)

raw = llm.invoke(prompt)
print(raw)


```json
{
  "comment_id": "test_1",
  "created_at": "2025-01-01T00:00:00Z",
  "sentiment_group": "negative",

  "type": "issue",
  "category": "transfer",

  "short_title": "انتقال وجه خطا میده",
  "normalized_title": "خطای انتقال پول",

  "keywords": ["انتقال", "وجه", "خطا", "حساب", "مبدا"],

  "severity": null,
  "priority": null,

  "evidence": "انتقال وجه خطا میده و حساب مبدا نمایش داده نمی‌شود.",
  
  "model": "phi4",
  "processed_at": ""
}
```


In [10]:
# Preview the first three records of the results file. Each line still ends
# with its own newline, so print() shows a blank line between records.
with open("processed_comments.jsonl", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i < 3:
            print(line)
        else:
            break


{"comment_id": "1", "created_at": "2025-01-02T10:12:00Z", "sentiment_group": "negative", "type": "issue", "category": "transfer", "short_title": "خطای انتقال وجه و نمایش حساب مبدا", "normalized_title": "خطای انتقال وجه و نمایش حساب مبدا", "keywords": ["انتقال", "وجه", "خطا", "حساب", "مبدا", "نمایش"], "severity": "high", "priority": null, "evidence": "انتقال وجه خطا میده و حساب مبدا نمایش داده نمیشود.", "model": "phi4", "processed_at": "2026-01-06T10:09:49.993091+00:00"}

{"comment_id": "2", "created_at": "2025-01-02T11:05:00Z", "sentiment_group": "negative", "type": "issue", "category": "auth", "short_title": "احراز هویت ناموفق است", "normalized_title": "مشکل در احراز هویت و دریافت پیامک", "keywords": ["احراز هویت", "پیامک", "ناموفق"], "severity": "high", "priority": null, "evidence": "احراز هویت انجام نمیشه و پیامک نمیاد.", "model": "phi4", "processed_at": "2026-01-06T10:10:05.060867+00:00"}

