In [38]:
from langchain_mistralai import ChatMistralAI
from langchain_core.prompts import PromptTemplate
from dotenv import load_dotenv
from langchain_mistralai import MistralAIEmbeddings
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough,RunnableLambda,RunnableBranch,RunnableParallel
from langchain_mistralai import ChatMistralAI
from langchain_chroma import Chroma
from pydantic import BaseModel, Field
import math
from langchain_core.output_parsers import PydanticOutputParser
from typing import Literal,Optional
load_dotenv()

True

In [19]:
#define vector_store embedding model
embed_model=MistralAIEmbeddings()
json_model = ChatMistralAI(model="mistral-small-latest",temperature=0.1)
jsonmodel = ChatMistralAI(model="mistral-small-latest",temperature=0.1)
vector_store=Chroma(embedding_function=embed_model,persist_directory="rumor_detection",collection_name="facts")
retriever = vector_store.as_retriever(search_kwargs={"k": 2})

In [20]:
class Claim(BaseModel):
    # deterministic index so map() results can be recombined later
    claim_id: int = Field(
        description="Unique index of this claim inside the rumor"
    )

    claim: str = Field(
        description=(
            "Atomic factual proposition containing exactly ONE subject and ONE object. "
            "Do not copy the full rumor sentence. Split conjunctions into separate claims."
        )
    )

    claim_type: Literal[
        "health",
        "death",
        "policy",
        "event",
        "statistic",
        "relationship",
        "other"
    ] = Field(
        description="Strict category label"
    )

    # enforce subject + object only (prevents bad clustering embeddings)
    entities: list[str] = Field(
        min_length=2,
        max_length=2,
        description="Exactly two entities: subject and object"
    )

    # optional context — None instead of 'NAN'
    time: Optional[str] = Field(
        default=None,
        description="Explicit time reference if present"
    )

    location: Optional[str] = Field(
        default=None,
        description="Explicit location reference if present"
    )

    canonical_text: str = Field(
        description=(
            "Controlled identity sentence: <subject> <relation> <object> [context]. "
            "Must represent only this claim."
        )
    )
class RumorSchema(BaseModel):
    claims: list[Claim]

In [21]:
structured_llm = json_model.with_structured_output(RumorSchema)

In [22]:
json_extract_prompt = ChatPromptTemplate.from_template("""
You are a structured fact extraction system.

Return ONLY valid JSON matching the schema exactly.
Do not add commentary.

--------------------------------------------------
TASK
From a rumor, extract atomic factual claims suitable for verification and clustering.

Each claim must represent ONE real-world relationship:

    subject → relation → object

--------------------------------------------------
ATOMICITY RULE (CRITICAL)

Split conjunctions:

"X and Y cause Z"
→ X causes Z
→ Y causes Z

"X causes Y and Z"
→ X causes Y
→ X causes Z

Never keep conjunctions inside a claim.
Each claim must stand independently.

--------------------------------------------------
CLAIM RULES

The claim field:
- minimal factual statement
- no "and/or/with"
- no explanation
- no context phrases

--------------------------------------------------
ENTITY RULES

entities MUST contain exactly two items:
[subject, object]

Do NOT include time/location/conditions.

--------------------------------------------------
CLAIM TYPE (STRICT ENUM)

Choose one:
health | death | policy | event | statistic | relationship | other

--------------------------------------------------
TIME & LOCATION RULE

If missing → return null (NOT "NAN")

--------------------------------------------------
CANONICAL TEXT CONTRACT

Format:
<subject> <relation> <object> [context]

Allowed relations:
prevents | causes | cures | treats | increases | decreases | kills |
contains | leads_to | results_in | died_from | implemented | occurred_in | affects

Rules:
- lowercase except proper nouns
- remove modal words
- ≤ 12 words
- health claims end with "in humans"
- must match the claim meaning exactly

--------------------------------------------------
OUTPUT FORMAT (STRICT)

claims: [
    {{
        claim_id: integer (start at 0 and increment)
        claim: string
        claim_type: one of enum
        entities: [subject, object]
        time: string or null
        location: string or null
        canonical_text: string
    }}
]

Return STRICT JSON only.

--------------------------------------------------
Rumor: {rumor}
""")

In [23]:
def convert(result):
    result=result.model_dump()
    return result
dict_convert_runnable=RunnableLambda(convert)

In [24]:
def attach_embeddings(data: dict):
    claims = data.get("claims", [])
    texts = [c.get("canonical_text") for c in claims]
    embeddings = embed_model.embed_documents(texts)
    for idx, emb in enumerate(embeddings):
        claims[idx]["embedding"] = emb
    return data
embedding_chain=RunnableLambda(attach_embeddings)

In [25]:
def convertdic_list(dic: dict):
    lis=dic["claims"]
    return lis
lis_runnable=RunnableLambda(convertdic_list)

In [26]:
def similarity_check(claim: dict, cluster: list = None):
    # temporary stub — always low similarity
    return {
        "score": 0.1,
        "claim": claim
    }

similaritycheck_runnable = RunnableLambda(similarity_check)

In [27]:
def format_tostr(x: dict):
    return x["claim"]["canonical_text"]

format_tostrrunnable = RunnableLambda(format_tostr)

In [28]:
rag=format_tostrrunnable | retriever

In [29]:
rag_chain=RunnableParallel({
    "pass_rest":RunnablePassthrough(),
    "rag_doc":rag
})

In [30]:
similarity_branch=RunnableBranch(
    (lambda x:x["score"]<0.5,rag_chain),
    RunnablePassthrough()
)

In [31]:
single_claim_chain= similaritycheck_runnable | similarity_branch 
multi_batch_chain=single_claim_chain.map()

In [32]:
def extract_texts(results):
    canonical_texts = []
    page_contents = []

    for item in results:
        # ---- canonical text ----
        canon = item["pass_rest"]["claim"]["canonical_text"]
        canonical_texts.append(canon)

        # ---- retrieved docs ----
        docs = item.get("rag_doc", [])
        doc_texts = []

        for d in docs:
            if isinstance(d, Document):
                doc_texts.append(d.page_content)
            else:  # fallback safety (sometimes retrievers return dicts)
                doc_texts.append(str(d))

        page_contents.append(doc_texts)

    return canonical_texts, page_contents
print(extract_texts(result))
extract_llm=RunnableLambda(extract_texts)

(['drinking cold water after meals causes stomach cancer in humans', 'smoking after meals causes stomach cancer in humans'], [['Smoking is a major risk factor for developing several types of cancer, including lung, throat, and mouth cancer.', 'Lung cancer is the leading cause of cancer-related deaths worldwide.'], ['Smoking is a major risk factor for developing several types of cancer, including lung, throat, and mouth cancer.', 'Lung cancer is the leading cause of cancer-related deaths worldwide.']])


In [33]:
def tuple_to_inputs(data):
    claims, docs = data
    return {
        "claims": claims,
        "documents": docs
    }

prepare_inputs = RunnableLambda(tuple_to_inputs)

In [35]:
class ClaimLabel(BaseModel):
    verdict: Literal[
        "supported",
        "contradicted",
        "conflicting",
        "insufficient"
    ] = Field(description="Label for the claim at the same index")

class ValidationOutput(BaseModel):
    results: list[ClaimLabel] = Field(
        description="List index corresponds exactly to claims list index"
    )

In [39]:
parser = PydanticOutputParser(pydantic_object=ValidationOutput)

validation_prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        """
You are a strict factual verification classifier.

You will receive:

1) A list of CLAIMS
2) A list of DOCUMENT GROUPS

Each claim at index i must ONLY be evaluated using the
documents at index i.

Never mix indices.

Labels:

supported:
documents clearly confirm the claim

contradicted:
documents clearly deny the claim

conflicting:
documents contain both support and contradiction

insufficient:
documents related but no proof

Rules:
- No outside knowledge
- No guessing
- Output labels must match claim count exactly
- Order must be preserved

Return JSON only:

{format_instructions}
"""
    ),
    (
        "human",
        """
CLAIMS:
{claims}

DOCUMENTS:
{documents}
"""
    )
]).partial(format_instructions=parser.get_format_instructions())

In [40]:
validation_chain=extract_llm | prepare_inputs| validation_prompt | jsonmodel | parser

In [44]:
validation_branch=RunnableParallel({
    "plain_json":RunnablePassthrough(),
    "validation":validation_chain
})

In [45]:
chain=json_extract_prompt | structured_llm| dict_convert_runnable | embedding_chain | lis_runnable | multi_batch_chain | validation_branch

In [46]:
result = chain.invoke({
    "rumor": "From whatsapp doctors say drinking cold water after and smoking meals causes stomach cancer in India"
})
print(result)
print(type(result))


{'plain_json': [{'pass_rest': {'score': 0.1, 'claim': {'claim_id': 0, 'claim': 'drinking cold water after meals causes stomach cancer', 'claim_type': 'health', 'entities': ['drinking cold water after meals', 'stomach cancer'], 'time': None, 'location': 'India', 'canonical_text': 'drinking cold water after meals causes stomach cancer in humans', 'embedding': [-0.0164947509765625, 0.00536346435546875, 0.053466796875, -0.029693603515625, 0.042083740234375, 0.00783538818359375, 0.01122283935546875, 0.020050048828125, -0.009857177734375, -0.02276611328125, -0.0384521484375, 0.0034847259521484375, -0.0277252197265625, 0.00853729248046875, -0.023590087890625, 0.065673828125, 0.01303863525390625, 0.0109710693359375, -0.00460052490234375, 0.01270294189453125, -0.028045654296875, -0.0282135009765625, -0.027557373046875, -0.0004897117614746094, 0.00788116455078125, 0.004291534423828125, -0.006435394287109375, -0.0262298583984375, -0.047515869140625, -0.00606536865234375, 0.0094451904296875, -0.05