In [1]:
import os
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
from langchain_mistralai import ChatMistralAI

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Read variables from a .env file into the process environment before the
# client is built (presumably this is where the Mistral API key lives -- confirm).
load_dotenv()

# Shared chat model for the notebook; sampling temperature is kept low (0.1).
model = ChatMistralAI(temperature=0.1, model="mistral-small-latest")

In [8]:
class Claim(BaseModel):
    # One atomic claim extracted from a rumor.
    #
    # Documented with comments instead of a docstring on purpose: pydantic
    # surfaces a model docstring in the generated JSON schema, which would
    # change the schema handed to the LLM. The `description` strings below are
    # part of that schema, so their text must stay exactly as is.

    # Minimal rewritten statement of the proposition (required).
    claim: str = Field(
        ...,
        description=(
            "A minimal atomic factual proposition with exactly ONE subject "
            "and ONE object. Do not copy the full rumor sentence. "
            "Split conjunctions into separate claims. "
            "No 'and', 'or', commas joining multiple causes."
        ),
    )

    # Free-form string; the allowed labels are only communicated through the
    # description and are not validated here.
    claim_type: str = Field(
        ...,
        description="One of: health | death | policy | event | statistic | relationship | other",
    )

    # Subject and object of this claim only.
    entities: list[str] = Field(
        ...,
        description=(
            "Only the primary subject and object entities involved in "
            "this specific claim. Do not include extra phrases or conditions."
        ),
    )

    # The literal string "NAN" is the sentinel requested for a missing value.
    time: str = Field(..., description="Explicit time reference or NAN")
    location: str = Field(..., description="Explicit location reference or NAN")

    # Normalized sentence form of the claim.
    canonical_text: str = Field(
        ...,
        description=(
            "Controlled identity sentence: "
            "<subject> <relation> <object> [context]. "
            "Must match this claim exactly and not include other causes."
        ),
    )


In [9]:
# Top-level structured-output schema.
# `RumorSchema` was referenced by this cell without being defined in any
# visible cell, so running the notebook on a fresh kernel would stop here with
# a NameError. Its shape -- a single `claims` list of `Claim` objects -- follows
# the schema spelled out in the prompt template and the recorded output of the
# invoke cell (`claims=[Claim(...), Claim(...)]`).
class RumorSchema(BaseModel):
    claims: list[Claim]  # every atomic claim extracted from one rumor


# Wrap the chat model so each response is parsed into a RumorSchema instance.
structured_llm = model.with_structured_output(RumorSchema)

In [10]:
# Instruction template for rumor -> atomic-claim extraction.
# `{rumor}` is the only template variable; the doubled braces in the Schema
# section are escapes that render as literal `{` / `}`.
# NOTE(review): the "5g towers" example uses the verb "cause", which is not in
# the allowed relation-verb list ("causes") -- confirm which one is intended.
RUMOR_EXTRACTION_TEMPLATE = """
You are a structured fact extraction system.

Your task:
From a rumor, produce atomic factual claims suitable for verification and clustering.

You are NOT summarizing.
You are converting language into a set of independent causal propositions.

--------------------------------------------------
CORE PRINCIPLE
Each claim represents ONE real-world relationship:

    subject → relation → object

A single claim MUST contain exactly ONE subject and ONE object.

--------------------------------------------------
ATOMICITY RULE (CRITICAL)

If a sentence contains multiple causes, split them:

"X and Y cause Z"
→ Claim 1: X causes Z
→ Claim 2: Y causes Z

If a sentence contains multiple effects, split them:

"X causes Y and Z"
→ Claim 1: X causes Y
→ Claim 2: X causes Z

Never keep conjunctions inside a claim.
Never reuse the full original sentence.
Each produced claim must stand independently.

--------------------------------------------------
CLAIM FIELD RULES

The claim field must:
- be rewritten into a minimal factual statement
- contain no "and/or/with"
- contain no extra context
- contain no explanation
- represent only that atomic proposition

Bad:
"drinking cold water and sitting long causes cancer"

Good:
"drinking cold water causes stomach cancer"
"sitting long after meals causes stomach cancer"

--------------------------------------------------
ENTITY RULES

Entities must include ONLY:
- the acting subject
- the target object

Do NOT include:
- time phrases
- quantities
- locations
- descriptive conditions
- supporting phrases

--------------------------------------------------
CANONICAL TEXT CONTRACT

canonical_text is a strict identity sentence used for semantic matching.

Format:
<subject> <relation_verb> <object> [context]

Allowed relation verbs:
prevents | causes | cures | treats | increases | decreases | kills | contains |
leads_to | results_in | died_from | implemented | occurred_in | affects

Constraints:
- lowercase except proper nouns
- remove modal words (may, might, can, possibly, reportedly)
- no adjectives or emotional wording
- ≤ 12 words
- health claims must end with "in humans"
- must correspond EXACTLY to the claim field

--------------------------------------------------
GENERAL RULES

- Extract all implied factual claims
- Each claim must be verifiable true or false
- Do NOT merge multiple facts
- Do NOT explain
- If time missing return NAN
- If location missing return NAN
- Return STRICT JSON only

--------------------------------------------------
EXAMPLES

Rumor:
"haldi cures corona instantly"

Output:
claim: "turmeric cures COVID-19 infection"
canonical_text: "turmeric cures COVID-19 infection in humans"


Rumor:
"5g towers spread covid"

Output:
claim: "5G towers cause COVID-19 infection"
canonical_text: "5G towers cause COVID-19 infection in humans"


Rumor:
"drinking cold water and sitting long after meals causes stomach cancer"

Output:
claim: "drinking cold water causes stomach cancer"
canonical_text: "cold water causes stomach cancer in humans"

claim: "sitting long after meals causes stomach cancer"
canonical_text: "sitting long after meals causes stomach cancer in humans"

--------------------------------------------------
Schema:
claims: [
    {{
        claim: string
        claim_type: string
        entities: list[string]
        time: string
        location: string
        canonical_text: string
    }}
]

Rumor: {rumor}
"""

prompt = ChatPromptTemplate.from_template(RUMOR_EXTRACTION_TEMPLATE)


In [11]:
# Compose the pipeline: the rendered prompt is fed to the structured-output
# model. `.pipe()` is the method form of the `|` operator.
chain = prompt.pipe(structured_llm)

In [12]:
# Smoke-test the chain on one sample rumor and inspect both representations.
sample_input = {
    "rumor": "From whatsapp doctors say drinking cold water and sitting for long after meals causes stomach cancer in India"
}
result = chain.invoke(sample_input)
result_dict = result.model_dump()

# Emits the same four lines as printing each value separately: the parsed
# model, its dict form, then the type of each.
print(result, result_dict, type(result), type(result_dict), sep="\n")

# NOTE: downstream code should return result.model_dump() (a plain dict).
# To serialize it to a JSON string use json.dumps(result_dict) or
# result.model_dump_json(); json.loads goes the other way (JSON text -> Python).

claims=[Claim(claim='drinking cold water causes stomach cancer', claim_type='health', entities=['cold water', 'stomach cancer'], time='NAN', location='India', canonical_text='cold water causes stomach cancer in humans'), Claim(claim='sitting for long after meals causes stomach cancer', claim_type='health', entities=['sitting for long after meals', 'stomach cancer'], time='NAN', location='India', canonical_text='sitting for long after meals causes stomach cancer in humans')]
{'claims': [{'claim': 'drinking cold water causes stomach cancer', 'claim_type': 'health', 'entities': ['cold water', 'stomach cancer'], 'time': 'NAN', 'location': 'India', 'canonical_text': 'cold water causes stomach cancer in humans'}, {'claim': 'sitting for long after meals causes stomach cancer', 'claim_type': 'health', 'entities': ['sitting for long after meals', 'stomach cancer'], 'time': 'NAN', 'location': 'India', 'canonical_text': 'sitting for long after meals causes stomach cancer in humans'}]}
<class '__m