In [3]:
!pip install -qU langchain langchain-openai langchain-google-genai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import getpass
import os

# Prompt only when the key is not already set, so re-running this cell (or
# running in a pre-configured environment) neither blocks on interactive input
# nor overwrites an existing key. getpass keeps the key out of the cell output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

Enter your OpenAI API key:  ········


In [None]:
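# Optional: to use Gemini instead, uncomment the line below and set LLM_PROVIDER = "gemini" (with a Gemini MODEL_NAME) in Cell 1.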
# os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your GOOGLE API key: ")

In [16]:
# Cell 1: Imports & global config (single place for model + temperature)

import os
import json
from typing import Any, Dict

import pandas as pd
from tqdm import tqdm

from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# ------------------------------------------------------------------
# INPUT / OUTPUT LOCATIONS
# ------------------------------------------------------------------
IN_PATH = "/home/jupyter/WSM/data/holifood_weak_signals.csv"
# NOTE(review): "singals" in the output filename looks like a typo for
# "signals"; left unchanged because renaming it moves the output file.
OUT_PATH = "/home/jupyter/WSM/data/holifood_weak_singals_scored.csv"

# ------------------------------------------------------------------
# LLM SETTINGS – the single place to switch provider, model or temperature
# ------------------------------------------------------------------
LLM_PROVIDER = "openai"       # options: "openai", "gemini"
MODEL_NAME = "gpt-5-mini"
# MODEL_NAME = "gemini-2.5-flash"  # for Gemini
MODEL_TEMPERATURE = 0.4

# Echo the active configuration so the saved notebook records what was run.
print(
    f"Input path : {IN_PATH}",
    f"Output path: {OUT_PATH}",
    f"LLM provider: {LLM_PROVIDER}",
    f"Model name : {MODEL_NAME}",
    f"Temperature: {MODEL_TEMPERATURE}",
    sep="\n",
)

# Report only whether each credential is present; the key values are never printed.
print(
    f"Has OPENAI_API_KEY: {'OPENAI_API_KEY' in os.environ}",
    f"Has GOOGLE_API_KEY: {'GOOGLE_API_KEY' in os.environ}",
    sep="\n",
)

Input path : /home/jupyter/WSM/data/holifood_weak_signals.csv
Output path: /home/jupyter/WSM/data/holifood_weak_singals_scored.csv
LLM provider: openai
Model name : gpt-5-mini
Temperature: 0.4
Has OPENAI_API_KEY: True
Has GOOGLE_API_KEY: False


In [17]:
# Cell 2: Define the LLM prompt template (with topic label)
#
# Prompt text sent to the model once per topic. It defines the 1–5 novelty and
# severity scales, asks for a short topic label, and requests a JSON reply with
# exactly the keys: novelty, severity, novelty_reason, severity_reason,
# topic_label.
#
# The placeholders in braces are filled per topic by the interpreter:
#   {topic_id}, {topic_name}, {count}, {weak_signal_score}, {representation}
#
# NOTE(review): the text is rendered through ChatPromptTemplate.from_template,
# so any literal braces added here would presumably need escaping — confirm
# before adding a JSON example to the prompt.

LLM_PROMPT_TEMPLATE = """
You are assisting in regulatory emerging risk assessment (food, feed, chemicals, environment).

Each data point is a *topic* detected by text mining. Topics are represented by a short list of keywords
(“topic representation”) and some simple metadata.

Your task: for **each topic**, estimate:

1) **Novelty** of the topic (how new or weakly institutionalized it is in the scientific/regulatory discourse)
   - Scale: 1–5 (integer)
     - 1 = Very established, mature topic; widely studied, well known.
     - 2 = Established but still evolving; plenty of literature and prior regulatory attention.
     - 3 = Moderately new; clearly present in the literature but not yet mainstream.
     - 4 = New/weakly institutionalized; limited literature, niche or emerging.
     - 5 = Very novel; highly niche, very sparse evidence, or clearly “upcoming” frontier.

2) **Severity** of potential adverse outcomes if this topic turns into a manifest risk
   (think about scale of harm to health, environment, economy, or society).
   - Scale: 1–5 (integer)
     - 1 = Negligible or very localized harm.
     - 2 = Limited harm; mostly reversible, small populations or local ecosystems.
     - 3 = Moderate harm; notable morbidity/mortality or environmental/economic impact.
     - 4 = Severe harm; large-scale, long-lasting, or difficult to reverse.
     - 5 = Catastrophic or systemic harm; wide populations or ecosystems, long-term damage.

3) Propose a short, human-readable **topic label** (3–7 words) that an expert would find intuitive.
   - Avoid generic labels like "novel hazard topic".
   - Use concrete hazard/exposure/outcome language where possible.

Important:
- You do NOT have access to the underlying documents. You must infer from the topic representation + metadata.
- If the topic is clearly about a well-known hazard (e.g. “salmonella outbreak, aflatoxin, PFOS, dioxin, asbestos”),
  novelty is probably LOW (1–2), even if severity can be high.
- If the topic looks niche, newly combined, or unclear, novelty can be HIGH (4–5).
- Severity can be high even if novelty is low (e.g. classic but dangerous hazard).
- When uncertain, choose the middle of the scale, but still commit to clear integers.

Return your answer as valid compact JSON with the following keys:
- "novelty" (int 1–5)
- "severity" (int 1–5)
- "novelty_reason" (string; 1–3 sentences)
- "severity_reason" (string; 1–3 sentences)
- "topic_label" (string; 3–7 words, intuitive label)

DO NOT include any other top-level keys.
DO NOT wrap the JSON in backticks.

-------------------------
TOPIC INFORMATION
-------------------------
Topic ID: {topic_id}
Topic name: {topic_name}
Count (approx. number of documents in topic): {count}
Weak-signal score (if available): {weak_signal_score}

Topic representation (keywords):
{representation}

(End of topic information)
"""

In [18]:
# Cell 3 — Interpreter with OpenAI + Gemini backend switch

class LLMWeakSignalInterpreter:
    """
    Score weak-signal topics with an LLM (OpenAI or Gemini backend).

    The backend, model name and temperature are read from the module-level
    ``LLM_PROVIDER``, ``MODEL_NAME`` and ``MODEL_TEMPERATURE`` settings when
    the interpreter is constructed. Each topic row is rendered into
    ``LLM_PROMPT_TEMPLATE``, sent to the model, and the JSON reply is turned
    into five ``llm_*`` values (novelty, severity, label and the two reasons).
    """

    # Column order of the per-topic result. Also used to build a correctly
    # shaped (empty) result when there are no rows to score.
    _RESULT_COLUMNS = (
        "llm_novelty",
        "llm_severity",
        "llm_topic_label",
        "llm_novelty_reason",
        "llm_severity_reason",
    )

    # Mid-scale value used whenever a score cannot be read from the reply.
    _DEFAULT_SCORE = 3

    def __init__(self):
        """
        Build the prompt -> model -> string-parser chain.

        Raises:
            ValueError: if ``LLM_PROVIDER`` is neither "openai" nor "gemini"
                (compared case-insensitively).
        """
        provider = LLM_PROVIDER.lower()
        if provider == "openai":
            self.llm = ChatOpenAI(
                model=MODEL_NAME,
                temperature=MODEL_TEMPERATURE,
            )
        elif provider == "gemini":
            self.llm = ChatGoogleGenerativeAI(
                model=MODEL_NAME,
                temperature=MODEL_TEMPERATURE,
            )
        else:
            raise ValueError(f"Unknown LLM provider: {LLM_PROVIDER}")

        # Prompt + output parser
        self.prompt = ChatPromptTemplate.from_template(LLM_PROMPT_TEMPLATE)
        self.chain = self.prompt | self.llm | StrOutputParser()

    @staticmethod
    def _normalize_representation(rep: Any) -> str:
        """
        Turn a topic representation into a plain comma-separated keyword string.

        Accepts a list/tuple of keywords, or the string form of a list such as
        "['a', 'b']" (as read back from a CSV). Anything else is passed through
        ``str()`` unchanged.
        """
        if isinstance(rep, (list, tuple)):
            return ", ".join(map(str, rep))
        if isinstance(rep, str) and rep.startswith("[") and rep.endswith("]"):
            inner = rep[1:-1]
            tokens = [
                tok.strip().strip("'\"")
                for tok in inner.split(",")
                if tok.strip()
            ]
            return ", ".join(tokens)
        return str(rep)

    @staticmethod
    def _safe_get(row: pd.Series, key: str, default: Any = "N/A") -> Any:
        """Return ``row[key]`` when the key exists and is not NA, else ``default``."""
        return row[key] if key in row and pd.notna(row[key]) else default

    @staticmethod
    def _parse_json_object(raw: Any) -> Dict[str, Any]:
        """
        Extract the JSON object from a model reply.

        Models sometimes ignore the "no backticks" instruction or add a short
        preamble. When the reply as a whole is not valid JSON, decoding is
        retried from the first "{" and any trailing text (such as a closing
        code fence) is ignored.

        Raises:
            ValueError: if no JSON object can be decoded, or the decoded value
                is not a JSON object (``json.JSONDecodeError`` is a subclass).
        """
        text = str(raw).strip()
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            start = text.find("{")
            if start == -1:
                raise
            # raw_decode stops at the end of the first complete JSON value.
            parsed, _ = json.JSONDecoder().raw_decode(text[start:])
        if not isinstance(parsed, dict):
            raise ValueError("LLM reply is not a JSON object")
        return parsed

    @classmethod
    def _coerce_score(cls, value: Any) -> int:
        """
        Convert a reply value to an integer score clamped to 1–5.

        Accepts ints, floats (truncated) and numeric strings such as "4" or
        "4.0". Missing or unusable values fall back to the mid-scale default.
        """
        try:
            score = int(float(value))
        except (TypeError, ValueError, OverflowError):
            score = cls._DEFAULT_SCORE
        return max(1, min(5, score))

    @staticmethod
    def _as_text(value: Any) -> str:
        """Return a stripped string; a missing or null value becomes ""."""
        return "" if value is None else str(value).strip()

    def _call_llm_for_row(self, row: pd.Series, topic_id: int) -> Dict[str, Any]:
        """
        Score one topic row with the LLM.

        Args:
            row: topic row; must contain "Representation". "Name", "Count" and
                one of "weak_signal_score" / "score" / "recency_jump_score"
                are used when present.
            topic_id: identifier shown to the model and used for the fallback
                topic name ``Topic_<topic_id>``.

        Returns:
            Dict with the keys listed in ``_RESULT_COLUMNS``. If the reply
            contains no JSON object, both scores are 3, the label is empty and
            ``llm_novelty_reason`` carries the first 200 characters of the reply.
        """
        representation = self._normalize_representation(row["Representation"])
        topic_name = self._safe_get(row, "Name", f"Topic_{topic_id}")
        count = self._safe_get(row, "Count", "N/A")

        # First available score column wins, in this order of preference.
        weak_signal_score = None
        for candidate in ["weak_signal_score", "score", "recency_jump_score"]:
            if candidate in row and pd.notna(row[candidate]):
                weak_signal_score = row[candidate]
                break

        if isinstance(weak_signal_score, (int, float)):
            weak_signal_score_str = f"{weak_signal_score:.4f}"
        elif weak_signal_score is not None:
            weak_signal_score_str = str(weak_signal_score)
        else:
            weak_signal_score_str = "N/A"

        raw = self.chain.invoke(
            {
                "topic_id": topic_id,
                "topic_name": topic_name,
                "count": count,
                "weak_signal_score": weak_signal_score_str,
                "representation": representation,
            }
        )

        try:
            data = self._parse_json_object(raw)
        except ValueError:
            # Keep the run going, but leave a visible trace of the bad reply.
            return {
                "llm_novelty": self._DEFAULT_SCORE,
                "llm_severity": self._DEFAULT_SCORE,
                "llm_topic_label": "",
                "llm_novelty_reason": f"Failed to parse JSON: {str(raw)[:200]}",
                "llm_severity_reason": "Failed to parse JSON.",
            }

        # Fields are coerced independently so one odd value (e.g. "4.0" or
        # null) does not throw away an otherwise usable reply.
        return {
            "llm_novelty": self._coerce_score(data.get("novelty")),
            "llm_severity": self._coerce_score(data.get("severity")),
            "llm_topic_label": self._as_text(data.get("topic_label")),
            "llm_novelty_reason": self._as_text(data.get("novelty_reason")),
            "llm_severity_reason": self._as_text(data.get("severity_reason")),
        }

    def score_dataframe(self, df_weak: pd.DataFrame, show_progress=True):
        """
        Score every row of ``df_weak`` and return a copy with ``llm_*`` columns.

        Args:
            df_weak: topic table; left unmodified. Its index is reset in the
                returned copy, and the row position is passed as ``topic_id``.
            show_progress: wrap the loop in a tqdm progress bar.

        Returns:
            A new DataFrame: the input columns followed by the columns in
            ``_RESULT_COLUMNS`` (present even when the input has no rows).
        """
        df = df_weak.copy().reset_index(drop=True)

        positions = range(len(df))
        if show_progress:
            positions = tqdm(positions, desc="Scoring weak signals…")

        scores = [
            self._call_llm_for_row(df.iloc[i], topic_id=i)
            for i in positions
        ]

        # Explicit columns/index keep the schema stable for an empty input and
        # guarantee row alignment with `df`.
        scores_df = pd.DataFrame(
            scores,
            columns=list(self._RESULT_COLUMNS),
            index=df.index,
        )
        return pd.concat([df, scores_df], axis=1)

In [19]:
# Cell 4: Load weak-signal CSV

# Read the topic table from the location configured as IN_PATH.
df_weak = pd.read_csv(IN_PATH)

print(f"Loaded weak-signal dataframe with shape: {df_weak.shape}")
display(df_weak.head())

# The scoring step reads row["Representation"], so stop here if it is absent.
if "Representation" not in df_weak:
    raise ValueError("The CSV must contain a 'Representation' column.")

Loaded weak-signal dataframe with shape: (10, 5)


Unnamed: 0,Name,Count,score,weak_signal_score,Representation
0,213_citrus_juice_marinated_sensory,11,0.401407,0.722222,"['citrus', 'juice', 'marinated', 'sensory', 'm..."
1,181_sers_imprinted_qcm_spr,13,0.40135,0.610969,"['sers', 'imprinted', 'qcm', 'spr', 'ecl', 'ca..."
2,291_hets_alsub2subosub3sub_cuonps_nanosilver,8,0.273842,0.571281,"['hets', 'alsub2subosub3sub', 'cuonps', 'nanos..."
3,382_haploid_dh_colchicine_hmf,6,0.200589,0.500009,"['haploid', 'dh', 'colchicine', 'hmf', 'doubli..."
4,347_dandelion_broth_uvbinduced_fermentation,7,0.217578,0.486752,"['dandelion', 'broth', 'uvbinduced', 'fermenta..."


In [20]:
# Cell 5: Run LLM scoring & save results, then print

# The interpreter reads provider, model and temperature from the Cell 1 settings.
interpreter = LLMWeakSignalInterpreter()

# One LLM call per topic row, with a progress bar.
df_scored = interpreter.score_dataframe(df_weak, show_progress=True)
print(f"Scored dataframe shape: {df_scored.shape}")

# Preview the first rows, including the new llm_* columns.
display(df_scored.head())

# Persist the scored table (without the positional index) to OUT_PATH.
df_scored.to_csv(OUT_PATH, index=False)
print(f"Saved LLM-scored weak signals to: {OUT_PATH}")

Scoring weak signals…: 100%|██████████| 10/10 [01:55<00:00, 11.56s/it]

Scored dataframe shape: (10, 10)





Unnamed: 0,Name,Count,score,weak_signal_score,Representation,llm_novelty,llm_severity,llm_topic_label,llm_novelty_reason,llm_severity_reason
0,213_citrus_juice_marinated_sensory,11,0.401407,0.722222,"['citrus', 'juice', 'marinated', 'sensory', 'm...",2,1,Citrus juice meat marinade sensory,Research on food marination and sensory effect...,Primary impacts are on sensory quality and con...
1,181_sers_imprinted_qcm_spr,13,0.40135,0.610969,"['sers', 'imprinted', 'qcm', 'spr', 'ecl', 'ca...",3,2,Surface and aptamer-based biosensors,The keywords describe well-established analyti...,This is primarily a detection/monitoring techn...
2,291_hets_alsub2subosub3sub_cuonps_nanosilver,8,0.273842,0.571281,"['hets', 'alsub2subosub3sub', 'cuonps', 'nanos...",4,3,Nanoparticle-induced NETs formation,The topic combines specific engineered nanopar...,If nanoparticle-induced neutrophil extracellul...
3,382_haploid_dh_colchicine_hmf,6,0.200589,0.500009,"['haploid', 'dh', 'colchicine', 'hmf', 'doubli...",2,2,Colchicine-induced doubled haploids,Doubled-haploid production and colchicine-indu...,Colchicine is a toxic mitotic inhibitor so occ...
4,347_dandelion_broth_uvbinduced_fermentation,7,0.217578,0.486752,"['dandelion', 'broth', 'uvbinduced', 'fermenta...",4,2,UVB-induced dandelion skin fermentation,"The combination of dandelion extracts, UVB-ind...",Potential harms are primarily topical (irritat...


Saved LLM-scored weak signals to: /home/jupyter/WSM/data/holifood_weak_singals_scored.csv
