In [1]:
import os
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
from langchain_mistralai import ChatMistralAI

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Read a local .env file into the process environment before the client is built
# (presumably this is where the Mistral API key comes from -- confirm).
load_dotenv()

# Chat client shared by the cells below.
model = ChatMistralAI(temperature=0.1, model="mistral-small-latest")

In [None]:
class Claim(BaseModel):
    # One factual claim extracted from a rumor, plus its normalized form.
    # Documented with `#` comments on purpose: pydantic copies a class docstring
    # into the generated JSON schema, which would change what the model is sent.

    # Claim wording as extracted from the rumor.
    claim: str = Field(
        description="Original extracted factual statement exactly as inferred from rumor",
    )

    # Coarse category; kept as a free string, the allowed values live in the description.
    claim_type: str = Field(
        description="One of: health | death | policy | event | statistic | relationship | other",
    )

    # Named entities the claim is about.
    entities: list[str] = Field(
        description="Primary real-world named entities only (people, substances, diseases, organizations, places)",
    )

    # "NAN" is the sentinel for a missing time / location.
    time: str = Field(
        description="Explicit time reference or NAN",
    )
    location: str = Field(
        description="Explicit location reference or NAN",
    )

    # Controlled-grammar sentence; the description spells out the grammar rules.
    canonical_text: str = Field(
        description=(
            "Normalized factual sentence following strict grammar rules:\n"
            "Format: <subject> <relation> <object> [context]\n\n"
            "Allowed relation verbs:\n"
            "prevents | causes | cures | treats | increases | decreases | kills | contains | "
            "leads_to | results_in | died_from | implemented | occurred_in | affects\n\n"
            "Rules:\n"
            "- lower case except proper nouns\n"
            "- no adjectives or emotional words\n"
            "- no modal verbs: may, might, can, possibly, reportedly, allegedly\n"
            "- no explanations or extra sentences\n"
            "- maximum 12 words\n"
            "- must describe real-world effect, not belief\n"
            "- medical claims must end with 'in humans' when about health\n"
        ),
    )
# Top-level structured-output schema: the list of claims extracted from a rumor.
class RumorSchema(BaseModel):
    claims: list[Claim]

In [4]:
# Wrap the chat model so its replies are parsed into RumorSchema instances
# instead of being returned as raw chat messages.
structured_llm = model.with_structured_output(schema=RumorSchema)

In [None]:
# Extraction prompt. `{rumor}` is the only template variable; the doubled braces
# in the schema sketch are escapes that render as literal `{` / `}`.
# The word-limit constraint is written in plain ASCII ("maximum 12 words") so it
# cannot be corrupted by an encoding round-trip and matches the wording used in
# the Claim.canonical_text field description.
prompt = ChatPromptTemplate.from_template("""
You are an information extraction system.

Given a rumor, extract independent factual claims and produce a normalized canonical statement.

IMPORTANT:
The canonical_text is NOT a paraphrase.
It is a controlled factual identity sentence used for semantic matching.

GENERAL RULES:
- Each claim must be independently verifiable true or false
- Extract implied claims
- Do NOT merge multiple facts
- No explanations
- Return STRICT JSON only
- If time/location missing return NAN

CANONICAL TEXT GRAMMAR:
Write a single sentence using:

<subject> <relation_verb> <object> [context]

Allowed relation verbs:
prevents | causes | cures | treats | increases | decreases | kills | contains |
leads_to | results_in | died_from | implemented | occurred_in | affects

CONSTRAINTS:
- lowercase except proper nouns
- remove words like: may, might, can, possibly, reportedly, secret, shocking
- no adjectives
- maximum 12 words
- health claims must end with "in humans"

EXAMPLES:

Rumor: "haldi cures corona instantly"
Output canonical_text: "turmeric cures COVID-19 infection in humans"

Rumor: "5g towers spread covid"
Output canonical_text: "5G towers cause COVID-19 infection in humans"

Rumor: "government secretly added microchips in vaccines"
Output canonical_text: "vaccines contain microchips"

Schema:
claims: [
    {{
        claim: string
        claim_type: string
        entities: list[string]
        time: string
        location: string
        canonical_text: string
    }}
]

Rumor: {rumor}
""")


In [10]:
# Compose prompt formatting and the structured-output model into one runnable:
# chain.invoke({"rumor": ...}) fills the template and returns a RumorSchema.
chain = prompt | structured_llm

In [None]:
# Smoke-test the chain on one sample rumor and inspect the result in both forms.
# NOTE(review): the output saved with this cell shows claims without
# `canonical_text`, so it appears to predate the current schema -- re-run to refresh.
result = chain.invoke({
    "rumor": "From whatsapp doctors say drinking cold water after meals causes stomach cancer in India"
})
result_dict = result.model_dump()  # plain dict; dumped once and reused below

print(result)
print(result_dict)
print(type(result))
print(type(result_dict))
# Callers should return the dict from result.model_dump(). To get a JSON string,
# use json.dumps(result_dict) or result.model_dump_json() -- json.loads goes the
# other way (JSON text -> Python object) and raises TypeError when given a dict.

claims=[Claim(claim='doctors say drinking cold water after meals causes stomach cancer', claim_type='health', entities=['doctors'], time='NAN', location='NAN'), Claim(claim='the claim is spread from whatsapp', claim_type='other', entities=['whatsapp'], time='NAN', location='NAN'), Claim(claim='the claim is about India', claim_type='other', entities=['India'], time='NAN', location='India')]
{'claims': [{'claim': 'doctors say drinking cold water after meals causes stomach cancer', 'claim_type': 'health', 'entities': ['doctors'], 'time': 'NAN', 'location': 'NAN'}, {'claim': 'the claim is spread from whatsapp', 'claim_type': 'other', 'entities': ['whatsapp'], 'time': 'NAN', 'location': 'NAN'}, {'claim': 'the claim is about India', 'claim_type': 'other', 'entities': ['India'], 'time': 'NAN', 'location': 'India'}]}
<class '__main__.RumorSchema'>
<class 'dict'>
