# General Synthesis Judge

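This notebook demonstrates the LLM judge for `GeneralSynthesisOntology` extractions: it builds the judge, scores a hand-written example extraction against its source paragraph, and prints the resulting scores and feedback.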


In [None]:
import json

from llm_synthesis.metrics.judge.general_synthesis_judge import (
    DspyGeneralSynthesisJudge,
    make_general_synthesis_judge_signature,
)
from llm_synthesis.utils.dspy_utils import get_llm_from_name

# Task instructions for the judge. This string is passed as `instructions=`
# to make_general_synthesis_judge_signature() when the judge is built below.
# It lists the assessment focus areas and the seven evaluation criteria the
# judge is asked to score.
# NOTE: this is a plain triple-quoted literal (no dedent is applied here), so
# the leading newline, the six-space indentation and the whitespace-only
# separator lines are all part of the string value.
instructions = """
You are an expert materials scientist and data extraction specialist with extensive experience in:
      - Synthesis procedure analysis and documentation
      - Structured data extraction from scientific text
      - Materials science ontology and terminology
      - Quality assessment of automated extraction systems
      
      Evaluate how well the GeneralSynthesisOntology extraction captures synthesis information:
      
      ASSESSMENT FOCUS:
      - Completeness: All synthesis components extracted
      - Accuracy: Correct values, units, and classifications
      - Structure: Proper organization and relationships
      - Semantic Preservation: Scientific meaning maintained
      - Schema Compliance: Adherence to ontology format
      
      EVALUATION CRITERIA:
      1. Structural Completeness - Coverage of materials, steps, equipment, conditions
      2. Material Extraction - Accuracy of names, quantities, units, purities
      3. Process Steps - Correct sequencing and action classification
      4. Equipment Extraction - Complete identification and specifications
      5. Conditions Extraction - Accurate synthesis parameters
      6. Semantic Accuracy - Preservation of scientific meaning
      7. Format Compliance - Schema adherence and data types
      
      Provide detailed technical reasoning for each score and specific improvement recommendations.
"""

# Persona prompt for the language model. This string is passed as
# `system_prompt=` to get_llm_from_name() when the judge's LM is created below.
# NOTE: as a plain triple-quoted literal, its leading newline and six-space
# indentation are part of the string value.
system_prompt = """
      You are a senior materials scientist and data extraction expert with deep knowledge of:
      - Inorganic and organic synthesis procedures
      - Laboratory equipment and instrumentation
      - Chemical nomenclature and units
      - Synthesis condition optimization
      - Data structure and ontology design
      - Quality assessment methodologies
      
      Your evaluations should reflect best practices in materials science documentation
      and the highest standards for structured data extraction accuracy.
"""


# Assemble the judge from its two ingredients: a signature carrying the task
# instructions, and a language-model handle carrying the system prompt.
judge_signature = make_general_synthesis_judge_signature(
    signature_name="GeneralSynthesisJudgeSignature",
    instructions=instructions,
)
judge_lm = get_llm_from_name(
    llm_name="gpt-4o",
    model_kwargs=dict(temperature=0.1, max_tokens=4096),
    system_prompt=system_prompt,
)

judge = DspyGeneralSynthesisJudge(
    signature=judge_signature,
    lm=judge_lm,
    enable_reasoning_traces=True,
    confidence_threshold=0.7,
)

In [None]:
# Simple test case: a short synthesis paragraph for lithium iron phosphate.
# It is the first element of the tuple handed to the judge further down, i.e.
# the source text the extracted ontology is evaluated against.
source_text = """
To synthesize lithium iron phosphate, 2.0 g of LiOH was mixed with 3.0 g 
of FeSO4 and 2.5 g of NH4H2PO4 in 100 mL of distilled water. The mixture 
was stirred for 30 minutes at room temperature using a magnetic stirrer. 
The solution was then transferred to a 150 mL autoclave and heated at 
180°C for 10 hours. After cooling, the product was filtered and dried 
at 80°C for 6 hours.
"""

In [None]:
def _build_extracted_ontology():
    """Assemble the hand-written example extraction for the sample paragraph.

    The result is a plain, JSON-serialisable dict with the keys
    ``target_compound``, ``synthesis_method``, ``starting_materials``,
    ``steps`` and ``equipment``. Every nested dict and list is created
    fresh, so no two entries of the result share an object.
    """
    precursor_rows = (
        ("LiOH", 2.0, "g"),
        ("FeSO4", 3.0, "g"),
        ("NH4H2PO4", 2.5, "g"),
        ("distilled water", 100, "mL"),
    )

    def materials():
        # One new dict per precursor, in the order listed above.
        return [
            {"name": name, "amount": amount, "unit": unit}
            for name, amount, unit in precursor_rows
        ]

    def stirrer():
        return {"name": "magnetic stirrer"}

    def autoclave():
        return {"name": "autoclave", "settings": "150 mL"}

    def heating(temperature, hours):
        # Conditions shared by the heat and dry steps (degrees C / hours).
        return {
            "temperature": temperature,
            "temp_unit": "C",
            "duration": hours,
            "time_unit": "h",
        }

    # Step payloads without their number; numbering is added below so the
    # sequence cannot drift out of sync with the list order.
    step_bodies = [
        {
            "action": "mix",
            "description": "Mix precursors in water",
            "materials": materials(),
            "equipment": [stirrer()],
            "conditions": {
                "duration": 30,
                "time_unit": "min",
                "temperature": 25,
                "temp_unit": "C",
                "stirring": True,
            },
        },
        {
            "action": "heat",
            "description": "Hydrothermal treatment",
            "equipment": [autoclave()],
            "conditions": heating(180, 10),
        },
        {
            "action": "filter",
            "description": "Filter the product",
        },
        {
            "action": "dry",
            "description": "Dry the product",
            "conditions": heating(80, 6),
        },
    ]

    return {
        "target_compound": "lithium iron phosphate",
        "synthesis_method": "hydrothermal",
        "starting_materials": materials(),
        "steps": [
            # "step_number" goes first so the key order matches the schema
            # layout used throughout the example.
            {"step_number": number, **body}
            for number, body in enumerate(step_bodies, start=1)
        ],
        "equipment": [stirrer(), autoclave()],
    }


extracted_ontology = _build_extracted_ontology()

In [None]:
# The judge input below takes the extraction as a JSON string plus the name
# of the material being synthesised.
target_material = "lithium iron phosphate"
extracted_ontology_json = json.dumps(obj=extracted_ontology, indent=2)

# Bare expression so the notebook displays the serialised extraction.
extracted_ontology_json

In [None]:
# Run the judge on the example.
# The input is a 3-tuple, in this order: source paragraph, extraction as a
# JSON string, target material name.
evaluation_input = source_text, extracted_ontology_json, target_material
result = judge.forward(evaluation_input)

# Bare expression so the notebook displays the raw evaluation object.
result

In [None]:
# Print results: overall score and confidence, the seven per-criterion
# scores, the three optional feedback lists, and the judge's reasoning.


def _print_findings(title, findings):
    """Print a titled, bulleted list of judge findings.

    Args:
        title: Section heading shown inside square brackets.
        findings: Iterable of printable items, or ``None``.

    Nothing is printed when ``findings`` is empty or ``None``. Keeping the
    loop under the same guard as the heading means a ``None`` field is
    skipped cleanly instead of raising ``TypeError`` when iterated.
    """
    if not findings:
        return
    print(f"\n[{title}] ({len(findings)}):")
    for entry in findings:
        print(f"  - {entry}")


scores = result.scores

print("\n--- EVALUATION RESULTS ---")
print(f"Overall Score: {scores.overall_score}/5.0")
print(f"Confidence Level: {result.confidence_level}")

print("\n[Individual Scores]:")
print(f"  Structural Completeness: {scores.structural_completeness_score}/5.0")
print(f"  Material Extraction: {scores.material_extraction_score}/5.0")
print(f"  Process Steps: {scores.process_steps_score}/5.0")
print(f"  Equipment Extraction: {scores.equipment_extraction_score}/5.0")
print(f"  Conditions Extraction: {scores.conditions_extraction_score}/5.0")
print(f"  Semantic Accuracy: {scores.semantic_accuracy_score}/5.0")
print(f"  Format Compliance: {scores.format_compliance_score}/5.0")

_print_findings("Missing Information", result.missing_information)
_print_findings("Extraction Errors", result.extraction_errors)
_print_findings("Improvement Suggestions", result.improvement_suggestions)

print("\n[High-level Reasoning]:")
print(result.reasoning)