# NewsBiasDetector — main.ipynb

Clean final notebook for MAT496 capstone. This notebook includes:
- Nodes: summarizer, claim extractor, web-search, fact-check, language-bias, bias-scoring
- Human-in-the-loop (interactive & file-driven)
- Export to JSON/CSV and LangSmith logging helper
- BeautifulSoup-only URL extractor and URL-input breakpoint
- Final pipeline run that prints the explicit output parameters required for submission


In [51]:
# Cell 1 — load environment (do not print secrets)
from dotenv import load_dotenv
import os, sys
load_dotenv()
print("Python:", sys.executable)
print("CWD:", os.getcwd())
print("OPENAI_API_KEY present:", bool(os.getenv("OPENAI_API_KEY")))
print("SERPAPI_API_KEY present:", bool(os.getenv("SERPAPI_API_KEY")))
print("LANGSMITH_API_KEY present:", bool(os.getenv("LANGSMITH_API_KEY")))


Python: d:\MAT 496\NewsBiasDetector\venv\Scripts\python.exe
CWD: d:\MAT 496\NewsBiasDetector
OPENAI_API_KEY present: True
SERPAPI_API_KEY present: False
LANGSMITH_API_KEY present: True


In [52]:
# Optional: enable LangSmith tracer for full tracing of LLM calls (if you want full traces)
try:
    # try to import LangSmith tracer from LangChain callbacks (API may vary with versions)
    from langchain.callbacks import LangSmithTracer  # typical import in many LangChain versions
    tracer = LangSmithTracer(project_name = os.getenv("LANGSMITH_PROJECT_NAME") or os.getenv("LANGSMITH_PROJECT") or "NewsBiasDetector")
    # Attach to your llm or to callbacks for chains/agents:
    # If llm constructor accepts callbacks, you can pass callbacks=[tracer] when creating llm.
    # Example (if using ChatOpenAI):
    # llm = ChatOpenAI(model_name=MODEL_NAME, temperature=0, callbacks=[tracer])
    print("LangSmith tracer created; to enable tracing attach `callbacks=[tracer]` to your llm or chain.")
except Exception as e:
    print("LangSmith tracer not available or LangChain version mismatch:", e)
    print("Install and use `from langchain.callbacks import LangSmithTracer` or check LangChain version.")


LangSmith tracer not available or LangChain version mismatch: No module named 'langchain.callbacks'
Install and use `from langchain.callbacks import LangSmithTracer` or check LangChain version.


In [53]:
# Tracer autodetect + attach (paste & run after your LLM init cell)
import os, importlib
def try_make_tracer_and_attach(llm_obj):
    candidates = [
        ("langchain.callbacks", "LangSmithTracer"),
        ("langchain.callbacks", "LangChainTracer"),
        ("langchain_core.callbacks", "LangChainTracer"),
        ("langgraph.tracing", "LangSmithTracer"),
        ("langsmith", "LangSmithTracer"),
    ]
    tracer = None
    msg = ""
    for mod_name, cls_name in candidates:
        try:
            mod = importlib.import_module(mod_name)
            TracerClass = getattr(mod, cls_name, None) or getattr(mod, "LangSmithTracer", None) or getattr(mod, "LangChainTracer", None)
            if TracerClass:
                try:
                    project = os.getenv("LANGSMITH_PROJECT_NAME") or os.getenv("LANGSMITH_PROJECT") or "newsbiasdetector"
                    tracer = TracerClass(project_name=project)
                    msg = f"Tracer created via {mod_name}.{TracerClass.__name__} (project={project})"
                    break
                except Exception as e:
                    msg = f"Found {mod_name}.{cls_name} but failed to instantiate: {e}"
        except Exception:
            continue

    if tracer is None:
        if os.getenv("LANGCHAIN_TRACING") or os.getenv("LANGSMITH_TRACING") or os.getenv("LANGCHAIN_TRACING_V2"):
            return llm_obj, None, "No tracer class found; env-vars for tracing are set (may need restart)."
        return llm_obj, None, "No tracer class found; install compatible langchain/langsmith versions."

    # Try to attach tracer to ChatOpenAI (recreate llm with callbacks)
    try:
        ChatCls = None
        for test_mod in ("langchain.chat_models", "langchain.chat_models.openai", "langchain.chat_models.base"):
            try:
                m = importlib.import_module(test_mod)
                ChatCls = getattr(m, "ChatOpenAI", None) or getattr(m, "ChatModel", None)
                if ChatCls:
                    break
            except Exception:
                continue
        if ChatCls:
            try:
                new_llm = ChatCls(model_name=os.getenv("OPENAI_MODEL_NAME","gpt-4o"), temperature=0, callbacks=[tracer])
                return new_llm, tracer, msg + " — attached to LLM."
            except Exception as e:
                return llm_obj, tracer, msg + f" — tracer created but failed to attach to LLM: {e}"
        else:
            return llm_obj, tracer, msg + " — tracer created but ChatOpenAI class not found to attach."
    except Exception as e:
        return llm_obj, tracer, f"Tracer created but reattach failed: {e}"

# Run attempt
current_llm = globals().get("llm", None)
llm, tracer, message = try_make_tracer_and_attach(current_llm)
if tracer:
    globals()["llm"] = llm
print("Tracer attempt result:", message)


Tracer attempt result: No tracer class found; env-vars for tracing are set (may need restart).


In [54]:
# Cell 2 — imports and safe LLM initialization (may be None if libs not installed)
import json, re, requests
from typing import TypedDict, List, Dict, Any, Optional
import os
try:
    from langchain_core.messages import SystemMessage, HumanMessage
except Exception:
    try:
        from langchain.schema import SystemMessage, HumanMessage
    except Exception:
        SystemMessage = None
        HumanMessage = None

try:
    from langchain.chat_models import ChatOpenAI
    from langchain.embeddings import OpenAIEmbeddings
except Exception:
    ChatOpenAI = None
    OpenAIEmbeddings = None

try:
    from langgraph.graph import StateGraph, END
except Exception:
    StateGraph = None
    END = None

MODEL_NAME = os.getenv("OPENAI_MODEL_NAME", "gpt-4o")
EMBED_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small")
llm = None
embeddings = None
try:
    if ChatOpenAI is None:
        raise RuntimeError("Chat model not available")
    llm = ChatOpenAI(model_name=MODEL_NAME, temperature=0)
    embeddings = OpenAIEmbeddings(model=EMBED_MODEL)
    print("LLM initialized:", MODEL_NAME)
except Exception as e:
    llm = None
    embeddings = None
    print("LLM not initialized — continuing. Error:", e)


LLM not initialized — continuing. Error: Chat model not available


In [55]:
# Cell 3 — helper to extract text from LLM responses
def _resp_to_text(resp) -> str:
    if resp is None:
        return ""
    content = getattr(resp, "content", None)
    if isinstance(content, str):
        return content.strip()
    try:
        if hasattr(resp, "generations"):
            gens = resp.generations
            if isinstance(gens, list) and len(gens):
                g0 = gens[0]
                if isinstance(g0, dict) and "text" in g0:
                    return str(g0["text"]).strip()
                if hasattr(g0, "text"):
                    return str(g0.text).strip()
        if hasattr(resp, "choices"):
            c0 = resp.choices[0]
            t = getattr(c0, "text", None)
            if isinstance(t, str):
                return t.strip()
            m = getattr(c0, "message", None)
            if isinstance(m, str):
                return m.strip()
            if isinstance(m, dict):
                return json.dumps(m)
        if isinstance(resp, str):
            return resp.strip()
    except Exception:
        pass
    return repr(resp)


In [56]:
# Cell 4 — summarizer node
SUMMARY_PROMPT = """
You are a neutral assistant. Read the article text delimited by <<<ARTICLE>>> and produce:
- A short, factual SUMMARY (2-3 sentences).
- An estimated confidence score (0.0 - 1.0).
Return JSON only: {"summary": "...", "confidence": 0.87}
<<<ARTICLE>>>
{article}
"""
def summarize_article(article_text: str) -> Dict[str, Any]:
    if llm is None:
        return {"summary": (article_text or "")[:400], "confidence": 0.0}
    prompt = SUMMARY_PROMPT.format(article=article_text.strip()[:6000])
    messages = [SystemMessage(content="You are a factual summariser. Return only JSON."), HumanMessage(content=prompt)]
    try:
        resp = llm.invoke(messages)
        raw = _resp_to_text(resp)
        parsed = json.loads(raw)
        return {"summary": parsed.get("summary", ""), "confidence": float(parsed.get("confidence", 0.0))}
    except Exception:
        return {"summary": article_text[:400], "confidence": 0.0}

def summarize_node(state: Dict[str, Any]) -> Dict[str, Any]:
    art = state.get("raw_text","") or ""
    res = summarize_article(art)
    state["summary"] = res.get("summary")
    state["summary_confidence"] = float(res.get("confidence", 0.0))
    return state


In [57]:
# Cell 5 — claim extractor node
CLAIM_PROMPT = """
Extract factual claims and opinion statements from the article. Return JSON exactly:
{"claims": [{"id":1,"speaker":"Unknown","text":"...","type":"factual","topic":"..."}, ...]}
Article between <<<ARTICLE>>> and <<<END>>>.
<<<ARTICLE>>>
{article}
<<<END>>>
"""
def extract_claims(article_text: str) -> List[Dict[str, Any]]:
    if llm is None:
        return []
    prompt = CLAIM_PROMPT.format(article=article_text.strip()[:6000])
    messages = [SystemMessage(content="Return valid JSON only."), HumanMessage(content=prompt)]
    try:
        resp = llm.invoke(messages)
        raw = _resp_to_text(resp)
        j = json.loads(raw)
        return j.get("claims", [])
    except Exception:
        return []

def extract_claims_node(state: Dict[str, Any]) -> Dict[str, Any]:
    art = state.get("raw_text","") or ""
    state["claims"] = extract_claims(art)
    return state


In [58]:
# Cell 6 — web_search helper (SerpAPI/Tavily fallback)
import requests
TAVILY_KEY = os.getenv("TAVILY_API_KEY")
SERPAPI_KEY = os.getenv("SERPAPI_API_KEY")
def web_search(query: str, num_results: int = 3) -> List[Dict[str, Any]]:
    q = (query or "").strip()
    if not q:
        return []
    if TAVILY_KEY:
        try:
            from tavily import TavilyClient
            client = TavilyClient(api_key=TAVILY_KEY)
            resp = client.search(query=q, max_results=num_results)
            return [{"title": r.get("title",""), "snippet": r.get("snippet",""), "url": r.get("url","")} for r in resp.get("results", [])[:num_results]]
        except Exception:
            pass
    if SERPAPI_KEY:
        try:
            params = {"q": q, "api_key": SERPAPI_KEY, "engine": "google", "num": num_results}
            r = requests.get("https://serpapi.com/search.json", params=params, timeout=15.0)
            r.raise_for_status()
            data = r.json()
            items = data.get("organic_results") or data.get("organic") or []
            return [{"title": it.get("title",""), "snippet": it.get("snippet",""), "url": it.get("link") or it.get("url") or ""} for it in items[:num_results]]
        except Exception:
            pass
    raise RuntimeError("No web search provider configured. Set TAVILY_API_KEY or SERPAPI_API_KEY.")


In [59]:
# Cell 7 — fact check helper and node (uses web_search + llm)
FACTCHECK_PROMPT = """
You are a verifier. Given a claim and search evidence, decide SUPPORTED/CONTRADICTED/UNCERTAIN and return JSON:
{"verdict": "SUPPORTED"|"CONTRADICTED"|"UNCERTAIN", "confidence": 0.0, "extracted_evidence": [{"url":"..","snippet":"..","note":".."}]}
"""
def fact_check_claim(claim_text: str, top_k: int = 3) -> Dict[str, Any]:
    try:
        results = web_search(claim_text, num_results=top_k)
    except Exception:
        results = []
    evidence_blocks = [f"URL: {r.get('url','')}\nTITLE: {r.get('title','')}\nSNIPPET: {r.get('snippet','')}\n---" for r in results]
    evidence_text = "\n".join(evidence_blocks) if evidence_blocks else "No results found."
    if llm is None:
        fallback = {"verdict": "UNCERTAIN", "confidence": 0.0, "extracted_evidence": [{"url": r.get('url',''), "snippet": r.get('snippet','')} for r in results], "raw_search": results}
        return fallback
    prompt = FACTCHECK_PROMPT
    full_input = "Claim:\n" + claim_text + "\n\nSearch evidence:\n" + evidence_text + "\n\nReturn JSON only."
    messages = [SystemMessage(content="You are a precise verifier. Return JSON only."), HumanMessage(content=prompt + "\n\n" + full_input)]
    try:
        resp = llm.invoke(messages)
        raw = _resp_to_text(resp)
        parsed = json.loads(raw)
        parsed["raw_search"] = results
        return parsed
    except Exception:
        return {"verdict": "UNCERTAIN", "confidence": 0.0, "extracted_evidence": [{"url": r.get('url',''), "snippet": r.get('snippet','')} for r in results], "raw_search": results}

def fact_check_node(state: Dict[str, Any]) -> Dict[str, Any]:
    claims = state.get("claims", []) or []
    res = []
    for c in claims:
        text = c.get("text") if isinstance(c, dict) else str(c)
        cid = c.get("id") if isinstance(c, dict) else None
        fc = fact_check_claim(text, top_k=3)
        res.append({"id": cid, "claim": text, "verdict": fc.get("verdict"), "confidence": float(fc.get("confidence", 0.0) or 0.0), "evidence": fc.get("extracted_evidence", []), "raw_search": fc.get("raw_search", [])})
    state["fact_results"] = res
    return state


In [60]:
# Cell 8 — language bias analysis node (LLM + lexical fallback)
LANGUAGE_BIAS_PROMPT = """
Inspect article language for bias. Return JSON:
{"tone":"neutral|positive|negative|mixed","sentiment_score":float,"emotion_words":[],"hedging_phrases":[],"subjectivity":float,"examples":[]}
Analyze article between <<<ARTICLE>>> and <<<END>>>.
<<<ARTICLE>>>
{article}
<<<END>>>
"""
_LOADED_WORDS = {"positive":["welcomed","benefit","celebrate"], "negative":["shameful","outrage","scandal"], "hedges":["may","might","could","appears","suggests","likely","reportedly"]}
def analyze_language(article_text: str, claims: List[Dict[str, Any]] = None) -> Dict[str, Any]:
    if llm is None:
        text = article_text.lower()
        found = [w for w in _LOADED_WORDS['positive']+_LOADED_WORDS['negative'] if w in text]
        hedges = [h for h in _LOADED_WORDS['hedges'] if re.search(r"\b"+re.escape(h)+r"\b", text)]
        pos = sum(text.count(w) for w in _LOADED_WORDS['positive'])
        neg = sum(text.count(w) for w in _LOADED_WORDS['negative'])
        sentiment = 0.0
        if pos+neg>0:
            sentiment = (pos-neg)/(pos+neg)
        subjectivity = min(1.0, (len(found)+len(hedges))/10.0)
        tone = 'neutral'
        if sentiment>0.2: tone='positive'
        elif sentiment<-0.2: tone='negative'
        examples=[]
        for s in re.split(r'(?<=[.!?])\s+', article_text):
            sl=s.lower()
            if any(w in sl for w in found) or any(h in sl for h in hedges):
                examples.append(s.strip()[:120])
                if len(examples)>=3: break
        return {"tone":tone,"sentiment_score":float(sentiment),"emotion_words":found[:8],"hedging_phrases":hedges[:8],"subjectivity":float(subjectivity),"examples":examples}
    prompt = LANGUAGE_BIAS_PROMPT.format(article=article_text.strip()[:7000])
    messages = [SystemMessage(content="Return JSON only."), HumanMessage(content=prompt)]
    try:
        resp = llm.invoke(messages)
        raw = _resp_to_text(resp)
        parsed = json.loads(raw)
        parsed['sentiment_score'] = float(parsed.get('sentiment_score',0.0))
        parsed['subjectivity'] = float(parsed.get('subjectivity',0.0))
        return parsed
    except Exception:
        # fallback
        return analyze_language(article_text, claims=None)

def language_bias_node(state: Dict[str, Any]) -> Dict[str, Any]:
    art = state.get('raw_text','') or ''
    analysis = analyze_language(art)
    state['language_bias'] = analysis
    return state


In [61]:
# Cell 9 — bias scoring helpers and node
def _clamp(x,a=-1.0,b=1.0): return max(a,min(b,x))
def compute_fact_component(fact_results):
    if not fact_results:
        return {"fact_component":0.0,"mean_fact_conf":0.0,"n_claims":0}
    weighted_sum=0.0; weight_total=0.0; confs=[]
    for r in fact_results:
        verdict=(r.get('verdict') or '').upper(); conf=float(r.get('confidence',0.0) or 0.0); confs.append(conf)
        score = 1.0 if verdict=='SUPPORTED' else (-1.0 if verdict=='CONTRADICTED' else 0.0)
        weighted_sum += score * conf; weight_total += conf
    mean_conf = (sum(confs)/len(confs)) if confs else 0.0
    if weight_total>0: fact_component = weighted_sum / weight_total
    else:
        n_sup = sum(1 for r in fact_results if (r.get('verdict') or '').upper()=='SUPPORTED')
        n_con = sum(1 for r in fact_results if (r.get('verdict') or '').upper()=='CONTRADICTED')
        total = max(1, len(fact_results))
        fact_component = (n_sup - n_con)/ total
    return {"fact_component": _clamp(fact_component), "mean_fact_conf": mean_conf, "n_claims": len(fact_results)}
def compute_language_component(lang):
    if not lang: return {"lang_component":0.0,"subjectivity":0.0,"sentiment":0.0}
    sent = float(lang.get('sentiment_score',0.0) or 0.0); subj = float(lang.get('subjectivity',0.0) or 0.0)
    return {"lang_component": _clamp(sent*subj), "subjectivity": subj, "sentiment": sent}
def compute_bias_score(state, w_fact=0.7, w_lang=0.3):
    fr = state.get('fact_results',[]) or []
    lang = state.get('language_bias',{}) or {}
    fi = compute_fact_component(fr); li = compute_language_component(lang)
    combined = _clamp(fi['fact_component']*w_fact + li['lang_component']*w_lang)
    mean_fact_conf = fi.get('mean_fact_conf',0.0); subj = li.get('subjectivity',0.0); n_claims = fi.get('n_claims',0)
    if n_claims>0: conf = 0.7*mean_fact_conf + 0.3*(1.0 - subj)
    else: conf = 0.3*mean_fact_conf + 0.7*(1.0 - subj)
    conf = max(0.0, min(1.0, conf))
    if combined > 0.2: stance='pro-article'
    elif combined < -0.2: stance='against-article'
    else: stance='neutral'
    breakdown = {"fact_component":fi['fact_component'], "lang_component":li['lang_component'], "weights":{"w_fact":w_fact,"w_lang":w_lang}, "mean_fact_confidence": fi.get('mean_fact_conf',0.0), "language_subjectivity": li.get('subjectivity',0.0), "n_claims": fi.get('n_claims',0)}
    return {"bias_score": combined, "stance": stance, "confidence": conf, "breakdown": breakdown}
def bias_scoring_node(state: Dict[str, Any]) -> Dict[str, Any]:
    r = compute_bias_score(state)
    state['bias_score'] = float(r['bias_score']); state['leaning'] = r['stance']; state['bias_breakdown'] = r['breakdown']; state['bias_confidence'] = float(r['confidence'])
    return state


In [62]:
# Cell 10 — graph wiring (optional) + wrappers
class Claim(TypedDict):
    id: int
    speaker: str
    text: str
    type: str
    topic: str
class NewsState(TypedDict, total=False):
    raw_text: str
    summary: str
    summary_confidence: float
    claims: List[Claim]
    fact_results: List[Dict[str, Any]]
    language_bias: Dict[str, Any]
    bias_score: float
    leaning: str
    bias_breakdown: Dict[str, Any]
    bias_confidence: float
graph = None
if StateGraph is not None:
    try:
        graph = StateGraph(NewsState)
        graph.add_node('summarize', summarize_node)
        graph.add_node('extract_claims', extract_claims_node)
        graph.add_node('fact_check', fact_check_node)
        graph.add_node('language_bias', language_bias_node)
        graph.add_node('bias_scoring', bias_scoring_node)
        graph.add_edge('summarize','extract_claims'); graph.add_edge('extract_claims','fact_check'); graph.add_edge('fact_check','language_bias'); graph.add_edge('language_bias','bias_scoring'); graph.add_edge('bias_scoring', END)
        graph.set_entry_point('summarize')
        print('Graph created. Nodes:', list(graph.nodes.keys()))
    except Exception as e:
        print('Graph wiring skipped/warning:', e)
else:
    print('LangGraph not available — skip graph wiring.')


Graph created. Nodes: ['summarize', 'extract_claims', 'fact_check', 'language_bias', 'bias_scoring']


In [63]:
# Cell 11 — human-review interactive and file nodes + maybe_human_review wrapper + export/logging helpers
import json, csv, time
from pathlib import Path
REVIEW_QUEUE = Path('human_review_queue.json')
REVIEW_DECISIONS = Path('human_review_decisions.json')
def human_review_node_blocking(state: Dict[str, Any]) -> Dict[str, Any]:
    fr = state.get('fact_results',[]) or []
    for r in fr:
        verdict = (r.get('verdict') or 'UNCERTAIN').upper(); conf = float(r.get('confidence',0.0) or 0.0)
        if verdict=='UNCERTAIN' or conf < 0.5:
            print('\n--- HUMAN REVIEW REQUIRED ---')
            print('Claim id:', r.get('id'))
            print('Claim text:', r.get('claim'))
            print('Current verdict:', verdict, 'confidence:', conf)
            print('Evidence (top 2):')
            for ev in (r.get('evidence') or [])[:2]:
                print('-', (ev.get('snippet') or '')[:200], ev.get('url',''))
            ans = input('Decision [y=ACCEPT / n=REJECT / s=SKIP]: ').strip().lower()
            if ans in ('y','yes'):
                r.setdefault('human_review',{})['decision']='ACCEPT'
            elif ans in ('n','no'):
                r.setdefault('human_review',{})['decision']='REJECT'; r['verdict']='CONTRADICTED'; r.setdefault('human_review',{})['note']='Human rejected'
            else:
                r.setdefault('human_review',{})['decision']='SKIPPED'
        else:
            r.setdefault('human_review',{})['decision']='NOT_NEEDED'
    state['fact_results']=fr; return state
def human_review_node_file(state: Dict[str, Any], auto_wait: bool=False, timeout: int=60) -> Dict[str, Any]:
    fr = state.get('fact_results',[]) or []
    to_review=[]
    for r in fr:
        verdict=(r.get('verdict') or 'UNCERTAIN').upper(); conf=float(r.get('confidence',0.0) or 0.0)
        if verdict=='UNCERTAIN' or conf<0.5:
            to_review.append({'id':r.get('id'),'claim':r.get('claim'),'verdict':verdict,'confidence':conf,'evidence':r.get('evidence',[])[:3]})
    if not to_review: return state
    REVIEW_QUEUE.write_text(json.dumps({'items':to_review}, indent=2), encoding='utf-8')
    print(f'Wrote {len(to_review)} items to {REVIEW_QUEUE}. Fill {REVIEW_DECISIONS} with decisions.')
    if auto_wait:
        waited=0; interval=3
        while waited<timeout:
            if REVIEW_DECISIONS.exists():
                try:
                    dec=json.loads(REVIEW_DECISIONS.read_text(encoding='utf-8'))
                    decisions={d['id']:d for d in dec.get('decisions',[])}
                    for r in fr:
                        rid=r.get('id')
                        if rid in decisions:
                            d=decisions[rid]; newv=d.get('decision','').upper()
                            if newv in ('SUPPORTED','CONTRADICTED','UNCERTAIN'):
                                r['verdict']=newv; r.setdefault('human_review',{})['decision']='APPLIED'; r.setdefault('human_review',{})['note']=d.get('note','')
                    state['fact_results']=fr; print('Applied file decisions.'); return state
                except Exception as e:
                    print('Failed to parse decisions file:', e)
            time.sleep(interval); waited+=interval
        print('Timed out waiting for decisions file; continuing.')
    return state
def maybe_human_review(state: Dict[str, Any], method: str = 'interactive', threshold_confidence: float = 0.5) -> Dict[str, Any]:
    blocking_ok = 'human_review_node_blocking' in globals()
    file_ok = 'human_review_node_file' in globals()
    if not blocking_ok and not file_ok:
        print('No human-review nodes defined — skipping review.'); return state
    fr = state.get('fact_results',[]) or []
    try:
        low_conf = any(float(r.get('confidence',0.0) or 0.0) < float(threshold_confidence) for r in fr)
    except Exception:
        low_conf = False
    try:
        verdicts = {(r.get('verdict') or 'UNCERTAIN').upper() for r in fr}; mixed = ('SUPPORTED' in verdicts and 'CONTRADICTED' in verdicts)
    except Exception:
        mixed = False
    try:
        subj = float(state.get('language_bias',{}).get('subjectivity',0.0) or 0.0)
    except Exception:
        subj = 0.0
    trigger = low_conf or mixed or (subj>0.5)
    print(f'maybe_human_review: low_conf={low_conf}, mixed={mixed}, subj={subj:.2f} -> trigger={trigger}')
    if not trigger: return state
    method = (method or 'interactive').lower()
    if method=='file' and file_ok:
        return human_review_node_file(state)
    if method in ('interactive','blocking','input') and blocking_ok:
        return human_review_node_blocking(state)
    if blocking_ok: return human_review_node_blocking(state)
    if file_ok: return human_review_node_file(state)
    return state
EXPORT_JSON = Path('last_run.json')
EXPORT_CSV = Path('last_run.csv')
def _claim_to_row(claim, fact_result, language_bias, state_meta):
    return {
        'claim_id': claim.get('id'), 'claim_text': claim.get('text'), 'claim_type': claim.get('type'),
        'verdict': (fact_result or {}).get('verdict'), 'fact_confidence': (fact_result or {}).get('confidence'),
        'human_review_decision': (fact_result or {}).get('human_review',{}).get('decision') if isinstance(fact_result,dict) else None,
        'evidence_snippets': ' || '.join([ (e.get('snippet') or '')[:200] for e in (fact_result or {}).get('evidence', []) ]),
        'language_tone': language_bias.get('tone'), 'language_subjectivity': language_bias.get('subjectivity'),
        'bias_score': state_meta.get('bias_score'), 'leaning': state_meta.get('leaning'), 'bias_confidence': state_meta.get('bias_confidence'),
        'summary': (state_meta.get('summary') or '')[:400]
    }
def export_run(state: Dict[str, Any], filename_json: str = None, filename_csv: str = None) -> Dict[str, str]:
    if filename_json is None: filename_json = str(EXPORT_JSON)
    if filename_csv is None: filename_csv = str(EXPORT_CSV)
    with open(filename_json, 'w', encoding='utf-8') as f: json.dump(state, f, ensure_ascii=False, indent=2)
    claims = state.get('claims',[]) or []; fact_results = state.get('fact_results',[]) or []; language_bias = state.get('language_bias',{}) or {}
    by_id = {fr.get('id'):fr for fr in fact_results if isinstance(fr,dict)}
    rows = []
    if claims:
        for c in claims:
            fr = by_id.get(c.get('id'), {})
            rows.append(_claim_to_row(c, fr, language_bias, state))
    else:
        rows.append({'claim_id':None,'claim_text':None,'claim_type':None,'verdict':None,'fact_confidence':None,'human_review_decision':None,'evidence_snippets':None,'language_tone':language_bias.get('tone'),'language_subjectivity':language_bias.get('subjectivity'),'bias_score':state.get('bias_score'),'leaning':state.get('leaning'),'bias_confidence':state.get('bias_confidence'),'summary':(state.get('summary') or '')[:400]})
    with open(filename_csv, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = list(rows[0].keys()); writer = csv.DictWriter(csvfile, fieldnames=fieldnames); writer.writeheader();
        for r in rows: writer.writerow(r)
    print(f'Exported JSON -> {filename_json}'); print(f'Exported CSV -> {filename_csv}'); return {'json': filename_json, 'csv': filename_csv}
def log_run_to_langsmith(state: Dict[str, Any], project: str = None):
    try:
        import langsmith
    except Exception:
        print('langsmith not installed — skipping LangSmith logging.'); return None
    if not os.getenv('LANGSMITH_API_KEY'):
        print('LANGSMITH_API_KEY not set — skipping LangSmith logging.'); return None
    try:
        client = getattr(langsmith, 'Client', langsmith)()
    except Exception:
        client = langsmith
    try:
        meta = {'summary': (state.get('summary') or '')[:500], 'bias_score': state.get('bias_score'), 'leaning': state.get('leaning')}
        if hasattr(client, 'create_run'):
            client.create_run(project_name=project or os.getenv('LANGSMITH_PROJECT','NewsBiasDetector'), metadata=meta)
            print('Logged run to LangSmith (create_run called).')
        else:
            print('LangSmith client found but create_run missing; skipped upload.')
    except Exception as e:
        print('LangSmith logging failed:', e)


In [64]:
# Cell 12 — BeautifulSoup-only extractor (fetch_article_text_bs)
import requests
from bs4 import BeautifulSoup
import re
from typing import Optional
def _clean_whitespace(text: str) -> str:
    text = re.sub(r'\r\n|\r', '\n', text)
    text = re.sub(r'\n{2,}', '\n\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    return text.strip()
def fetch_article_text_bs(url: str, timeout: float = 15.0, user_agent: Optional[str] = None) -> str:
    headers = {'User-Agent': user_agent or 'NewsBiasDetector/1.0 (+https://example.com)'}
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status(); html = resp.text
    except Exception as e:
        print('fetch_article_text_bs: download failed:', e); return ''
    try:
        soup = BeautifulSoup(html, 'lxml')
    except Exception:
        soup = BeautifulSoup(html, 'html.parser')
    article_tag = soup.find('article')
    if article_tag:
        text = article_tag.get_text(separator='\n'); text = _clean_whitespace(text)
        if len(text) > 80:
            print(f'fetch_article_text_bs: used <article> tag, extracted {len(text)} chars.'); return text
    candidates = soup.find_all(['div','main','section'], recursive=True)
    best_text = ''
    for c in candidates:
        ps = c.find_all('p')
        if not ps: continue
        joined = '\n\n'.join(p.get_text().strip() for p in ps if p.get_text().strip())
        joined = _clean_whitespace(joined)
        if len(joined) > len(best_text): best_text = joined
    if best_text and len(best_text) > 80:
        print(f'fetch_article_text_bs: used best container heuristic, extracted {len(best_text)} chars.'); return best_text
    paragraphs = soup.find_all('p')
    if paragraphs:
        longest=''; current=[]; last_parent=None
        for p in paragraphs:
            parent = p.parent
            if parent == last_parent or last_parent is None:
                current.append(p.get_text().strip())
            else:
                if current:
                    candidate = '\n\n'.join([x for x in current if x])
                    if len(candidate) > len(longest): longest = candidate
                current = [p.get_text().strip()]
            last_parent = parent
        if current:
            candidate = '\n\n'.join([x for x in current if x])
            if len(candidate) > len(longest): longest = candidate
        longest = _clean_whitespace(longest)
        if len(longest) > 80:
            print(f'fetch_article_text_bs: used longest paragraph sequence fallback, extracted {len(longest)} chars.'); return longest
    full = _clean_whitespace(soup.get_text(separator='\n'))
    if len(full) > 80:
        print(f'fetch_article_text_bs: fallback to full page text, extracted {len(full)} chars (may include noise).'); return full
    print('fetch_article_text_bs: extraction returned short/empty text — page may be JS-heavy or blocked.'); return ''


In [65]:
# Cell 13 — URL input breakpoint (asks user for URL and loads it into state)
def ask_for_url_breakpoint(state: Dict[str, Any]) -> Dict[str, Any]:
    print('=== URL INPUT BREAKPOINT ===')
    print('Paste a news article URL to process, or press Enter to skip and use existing raw_text.')
    url = input('Paste URL (or press Enter to skip): ').strip()
    if url:
        print('Fetching article from:', url)
        text = fetch_article_text_bs(url)
        if text:
            state['raw_text'] = text
            print(f'Loaded article ({len(text)} chars).')
        else:
            print('⚠️ Could not extract article text — keeping previous raw_text.')
    else:
        print('Skipping URL input — using existing raw_text.')
    return state


In [66]:
# Cell 14 — runner using BeautifulSoup extractor
def run_pipeline_from_url_bs(url: str = None, human_method: str = 'interactive') -> Optional[Dict[str, Any]]:
    # initial empty state
    state = {"raw_text": ""}
    # If URL provided, fetch immediately; otherwise use breakpoint to ask URL
    if url:
        print('Fetching:', url)
        state['raw_text'] = fetch_article_text_bs(url)
        if not state['raw_text']:
            print('No text extracted from URL; aborting.'); return None
    else:
        state = ask_for_url_breakpoint(state)
    # run pipeline
    try:
        state = summarize_node(state)
        print('SUMMARY (short):', (state.get('summary') or '')[:300])
    except Exception as e:
        print('summarize_node error:', e)
    try:
        state = extract_claims_node(state)
        print('Extracted claims:', len(state.get('claims',[]) or []))
    except Exception as e:
        print('extract_claims_node error:', e)
    try:
        state = fact_check_node(state)
        print('Fact-check done. Claims checked:', len(state.get('fact_results',[]) or []))
    except Exception as e:
        print('fact_check_node error:', e)
    try:
        state = maybe_human_review(state, method=human_method)
    except Exception as e:
        print('maybe_human_review error:', e)
    try:
        state = language_bias_node(state)
    except Exception as e:
        print('language_bias_node error:', e)
    try:
        state = bias_scoring_node(state)
    except Exception as e:
        print('bias_scoring_node error:', e)
    try:
        exported = export_run(state)
    except Exception as e:
        print('export_run error:', e); exported = {}
    try:
        log_run_to_langsmith(state)
    except Exception as e:
        print('log_run_to_langsmith error:', e)
    print('\n--- FINAL KEY PARAMETERS (explicit) ---')
    print('Bias Score:', state.get('bias_score'))
    print('Bias Confidence:', state.get('bias_confidence'))
    print('Stance/Leaning:', state.get('leaning'))
    print('\nSummary:')
    print(state.get('summary'))
    print('\nBias Breakdown:')
    import pprint
    pprint.pprint(state.get('bias_breakdown', {}))
    print('\nClaims (list):')
    pprint.pprint(state.get('claims', []))
    print('\nFact Results (list):')
    pprint.pprint(state.get('fact_results', []))
    print('\nLanguage Bias Analysis:')
    pprint.pprint(state.get('language_bias', {}))
    print('\nExport files:' , exported)
    return state

# Example usage (uncomment and replace URL to run):
# final_state = run_pipeline_from_url_bs('https://www.example.com/news/article', human_method='interactive')


In [67]:
import os
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage
from langchain.callbacks.tracers.langchain import LangChainTracer

# Set environment variables (can also be loaded from .env)
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = "newsbiasdetector"
os.environ["LANGSMITH_API_KEY"] = "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"
os.environ["OPENAI_API_KEY"] = "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"

# Initialize tracer
tracer = LangChainTracer()

# Create a simple LLM chain
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.7)

# Run a traced call
response = llm.invoke([HumanMessage(content="What is media bias?")], config={"callbacks": [tracer]})
print(response.content)

ModuleNotFoundError: No module named 'langchain.schema'

## How to run
1. Ensure your `venv` kernel is selected in VS Code/Jupyter.
2. Run cells top-to-bottom.
3. To use URL breakpoint: run **Cell 14** without providing a URL — it will pause and ask for a URL input.
4. To run with a direct URL: call `run_pipeline_from_url_bs('<your-url>')` in a new cell.
5. After the run, `last_run.json` and `last_run.csv` will be created in project root.
