In [7]:
# Natural-language instructions for the crawl: passed verbatim as `prompt=`
# to `app.agent(...)` together with the `LeadsSchema` output schema.
# NOTE(review): the table below names the location column `person_location`,
# while the `Lead` model (and the CSV export) spell the field `person_locaton`
# - confirm which spelling downstream consumers expect before aligning them.
prompt = """
## ROLE

You are a **web crawling and information extraction agent**.

Your task is to **search public web sources and extract explicitly available information** about professionals relevant to toxicology and safety assessment domains.

Do NOT enrich, infer, score, rank logically, or make assumptions.
Only return **raw, verifiable data** exactly as found.

---

## PEOPLE TO SEARCH

Search for professionals with job titles including (but not limited to):

- Director of Toxicology
- Head of Preclinical Safety
- VP / Director – Preclinical Safety
- Head / Director of Safety Assessment
- Director of Investigative Toxicology
- Hepatic Toxicology roles
- Investigative Toxicology roles
- Senior Toxicologist
- Principal Scientist (Safety / Toxicology)

---

## SOURCES TO SEARCH

### 1. Professional Profiles
- LinkedIn profiles (primary source)
- Xing (secondary, especially Europe-based professionals)

---

### 2. Scientific Publication Databases
- PubMed
- Google Scholar
- bioRxiv

---

### 3. Conference & Event Websites
- Society of Toxicology (SOT)
- AACR
- ISSX
- ACT (College of Toxicology)

---

## KEYWORDS TO USE DURING SEARCH

### Role / Domain Keywords
- Toxicology
- Safety Assessment
- Preclinical Safety
- Hepatic Toxicology
- Investigative Toxicology

### Scientific Keywords
- Drug-Induced Liver Injury
- DILI
- Liver toxicity
- Hepatic safety
- 3D cell culture
- 3D in-vitro models
- Organ-on-chip
- In-vitro models

### Conference / Activity Keywords
- Abstract
- Poster
- Presenter
- Speaker
- Attendee

---

## DATA TO COLLECT (RAW ONLY)

For each individual found, collect **only fields that are explicitly available** from the source.

Do NOT infer missing values.

---

## OUTPUT SCHEMA (MANDATORY)

Return the data in a **structured tabular format** with the following fields:

| Field Name | Description |
|----------|-------------|
| `name` | Full name of the person |
| `title` | Current job title |
| `company` | Company or organization name |
| `person_location` | Person’s location (city / region as stated) |
| `company_hq` | Company headquarters location |
| `work_mode` | Work mode if explicitly mentioned (Remote / Onsite / Hybrid) |
| `email` | Business email address (if publicly available) |
| `publications` | Published on Drug-Induced Liver Injury / liver toxicity in last 12–24 months |

---

## OUTPUT RULES

- One row per individual
- Leave fields empty or null if data is not available
- Do not merge or deduplicate records
- Do not normalize titles or locations
- Preserve original wording from the source
- Output must be compatible with:
  - CSV
  - Excel
  - Streamlit DataFrame

---

## CONSTRAINTS
- Only return source-backed information

---

## FINAL GOAL

Produce a **raw, structured dataset** of professionals relevant to toxicology and safety assessment, suitable for downstream processing and dashboard visualization.

"""

In [8]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment;
# override=True lets .env values replace variables that are already set.
load_dotenv(override=True)

True

In [None]:
import json
import os
from typing import List, Optional, Literal

from firecrawl import FirecrawlApp
from pydantic import BaseModel, Field

# Take the Firecrawl key from the environment (populated by load_dotenv from
# .env) rather than embedding it in the notebook, where it would be saved,
# committed and shared with every copy of the file. A missing variable raises
# KeyError immediately instead of failing later inside the crawl.
app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])

class Lead(BaseModel):
    # One extracted professional. Only `name` is mandatory: the crawl prompt
    # tells the agent to leave anything it cannot find empty/null, so every
    # other field accepts None instead of forcing the agent to make up a value.
    # (`#` comments are used rather than a docstring so the JSON schema sent
    # to the agent does not gain an extra description.)
    name: str = Field(description="Full name of the person")
    title: Optional[str] = Field(default=None, description="Post of the person")
    company: Optional[str] = Field(default=None, description="Where the person works")
    # NOTE: the field name keeps its historical spelling ("locaton") because
    # the CSV export looks the value up under exactly this key.
    person_locaton: Optional[str] = Field(default=None, description="where the person lives")
    company_hq: Optional[str] = Field(default=None, description="Location of company head_quarter in which the person work")
    # The prompt forbids inference and lists three modes, so the value is only
    # filled when a source states it, and "Hybrid" is a valid answer.
    work_mode: Optional[Literal["Onsite", "Remote", "Hybrid"]] = Field(
        default=None,
        description="Work mode only if explicitly stated by the source (Onsite, Remote or Hybrid); otherwise null",
    )
    email: Optional[str] = Field(default=None, description="email of the person")
    publications: List[str] = Field(
        default_factory=list,
        description="Publications by the person on Drug-Induced Liver Injury / liver toxicity in last 12–24 months",
    )

class LeadsSchema(BaseModel):
    # Top-level container handed to the agent as `schema=`: the whole result
    # is a single object holding the list of extracted leads.
    leads: List[Lead] = Field(
        description="List of leads",
    )

# Run the extraction agent with the crawl instructions and the structured
# output schema.
result = app.agent(
    prompt=prompt,
    schema=LeadsSchema
)

# Persist the raw agent output. The scoring cell loads its input from
# "leads.json" (its INPUT_JSON constant) and no other cell writes that file,
# so without this step the notebook only runs end-to-end if the file was
# created by hand. Default ASCII escaping keeps the file readable regardless
# of the encoding the reader uses.
with open("leads.json", "w", encoding="utf-8") as f:
    json.dump(result.data, f, indent=2)

print(result.data)

{'leads': [{'name': 'Jonathan Jackson', 'title': 'Research and Development LifeSciences Director', 'company': 'LifeNet Health', 'person_locaton': 'Groton, Connecticut', 'company_hq': 'Virginia Beach, Virginia', 'work_mode': 'Remote', 'email': 'jonathan_jackson@lifenethealth.org', 'publications': ['Moving beyond Binary Predictions of Human Drug-Induced Liver Injury (2019 - but relevant to his work as DILI SME at Pfizer)', 'Early Drug-Induced Liver Injury Risk Screening (2022 - as DILI SME at Pfizer)']}, {'name': 'Samantha Wilcoxson', 'title': 'Principal Scientist', 'company': 'Amgen', 'person_locaton': 'San Francisco, California', 'company_hq': 'Thousand Oaks, California', 'work_mode': 'Remote', 'email': 'swilcoxs@amgen.com', 'publications': []}, {'name': 'Maria Ellis', 'title': 'Executive Director, Head of Safety Science', 'company': 'Daiichi Sankyo', 'person_locaton': 'Philadelphia, Pennsylvania', 'company_hq': 'Basking Ridge, New Jersey', 'work_mode': 'Remote', 'email': 'maria.ellis@

In [13]:
# Sanity check: number of leads in the agent result.
len(result.data['leads'])

7

In [14]:
import json
import csv
from openai import OpenAI

# -----------------------------
# CONFIG
# -----------------------------
# Chat model name passed as `model=` when scoring each lead.
MODEL_NAME = "gpt-4o-mini"
# File the leads are loaded from; expected to hold an object with a "leads" list.
INPUT_JSON = "leads.json"
# Destination of the ranked, scored table.
OUTPUT_CSV = "scored_leads.csv"

# Created without arguments, so credentials are resolved by the SDK itself.
# NOTE(review): presumably OPENAI_API_KEY from the environment/.env - confirm.
client = OpenAI()

# -----------------------------
# SCORING PROMPT TEMPLATE
# -----------------------------
# Rubric sent to the model ahead of each lead's JSON. The rule weights total
# 30 + 20 + 15 + 10 + 10 + 40 = 125, so the prompt states explicitly how to
# combine them and caps the result to stay inside the promised 0-100 range.
SCORING_PROMPT = """
You are an evaluation engine.

Given a single lead profile, calculate a probability score between 0 and 100
using ONLY the rules below. Do not invent information.

Scoring rules:

1. Role relevance (+30)
   - Toxicology, Safety, Hepatic, Drug Safety, DILI → +30
   - Partial relevance → +15
   - Otherwise → +0

2. Company funding stage (+20)
   - Pharma / Biotech company → +20
   - Academic / Hospital → +0

3. Existing use of similar technologies (+15)
   - Publications or role related to DILI, liver toxicity, biomarkers, drug safety → +15
   - Otherwise → +0

4. Openness to New Approach Methodologies (NAMs) (+10)
   - Translational safety, biomarkers, toxicogenomics, modeling → +10
   - Otherwise → +0

5. Biotech hub presence (+10)
   - Boston/Cambridge, Bay Area, Basel, UK Golden Triangle → +10
   - Otherwise → +0

6. Recent publications (+40)
   - Any liver/DILI publication from 2023–2024 → +40
   - Otherwise → +0

Add up the points from all six rules. The maximum raw total is 125, so the
final score must be capped at 100.

Return ONLY a JSON object:
{
  "probability_score": number
}

Lead data:
"""

# -----------------------------
# HELPER: CALL LLM FOR SCORING
# -----------------------------
def _parse_score(content):
    """Extract `probability_score` from the raw model reply, bounded to 0-100."""
    if not content:
        raise ValueError("Scoring model returned an empty response")

    # Slice from the first "{" to the last "}" so a reply wrapped in Markdown
    # code fences or surrounded by stray prose still parses.
    start = content.find("{")
    end = content.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"No JSON object found in scoring response: {content!r}")

    score = json.loads(content[start:end + 1])["probability_score"]

    # A numeric string such as "85" is accepted; anything else non-numeric is
    # rejected so the later sort never compares mixed types.
    if isinstance(score, str):
        score = float(score)
    if isinstance(score, bool) or not isinstance(score, (int, float)):
        raise ValueError(f"probability_score is not a number: {score!r}")

    # The rubric promises a 0-100 score; clamp in case the model overshoots.
    return max(0, min(100, score))


def score_lead(lead):
    """Score one lead with the LLM rubric.

    Args:
        lead: JSON-serialisable dict describing the lead; it is appended to
            SCORING_PROMPT as indented JSON.

    Returns:
        The numeric `probability_score` reported by the model, clamped to
        the 0-100 range.

    Raises:
        ValueError: if the reply is empty, contains no JSON object, is not
            valid JSON, or the score is not numeric.
        KeyError: if the JSON reply has no "probability_score" key.
    """
    response = client.chat.completions.create(
        model=MODEL_NAME,
        temperature=0,
        # JSON mode makes the model emit a bare JSON object. The API requires
        # the word "JSON" to appear in the messages, which SCORING_PROMPT does.
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": "You are a strict scoring engine."},
            {"role": "user", "content": SCORING_PROMPT + json.dumps(lead, indent=2)}
        ]
    )

    return _parse_score(response.choices[0].message.content)

# -----------------------------
# LOAD DATA
# -----------------------------
# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, which can mis-decode or reject non-ASCII names on some systems.
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    data = json.load(f)

# The file holds a single object whose "leads" key is the list of lead dicts.
leads = data["leads"]

# -----------------------------
# SCORE LEADS
# -----------------------------
# Attach the model's score to each lead dict in place (one LLM call per lead).
for lead in leads:
    lead.update(probability_score=score_lead(lead))

# -----------------------------
# RANK LEADS
# -----------------------------
# Copy the list, then order it from highest to lowest score; the sort is
# stable, so leads with equal scores keep their original relative order.
leads_sorted = list(leads)
leads_sorted.sort(key=lambda x: x["probability_score"], reverse=True)

# Ranks are 1-based positions in the sorted list.
for idx, lead in enumerate(leads_sorted, start=1):
    lead.update(rank=idx)

# -----------------------------
# SAVE TO CSV
# -----------------------------
# Columns written to the CSV, in order; any other lead key is left out.
csv_fields = [
    "rank", "probability_score",
    "name", "title", "company",
    "person_locaton", "company_hq", "work_mode",
    "email",
]

with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
    # restval fills columns a lead lacks with "", and extrasaction="ignore"
    # drops keys that are not listed in csv_fields.
    writer = csv.DictWriter(
        f,
        fieldnames=csv_fields,
        restval="",
        extrasaction="ignore",
    )
    writer.writeheader()
    writer.writerows(leads_sorted)

print(f"Saved ranked leads to {OUTPUT_CSV}")


Saved ranked leads to scored_leads.csv
