# LLM vs Rule-Based Extraction: Differences Notebook

This notebook reproduces the key differences described in `docs/LLM_approach.md` by comparing:
- Rule-based output
- LLM output

It focuses on completeness, section-level field differences, duplicates, and approval data extraction.

In [1]:
from pathlib import Path
import json
from collections import Counter
import pandas as pd

# Show up to 120 characters per cell so long extracted strings stay readable.
pd.set_option('display.max_colwidth', 120)

# Repository root: the parent directory when the kernel runs from a folder
# named "notebooks", otherwise the current working directory itself.
_cwd = Path.cwd()
ROOT = _cwd.parent if _cwd.name == 'notebooks' else _cwd

# The two extractor outputs being compared.
RULE_PATH = ROOT / 'data/processed/schema_output/diler-07-07-2025-rerun-41-44_schema_output.json'
LLM_PATH = ROOT / 'data/processed/schema_output/llm_certificate_full.json'

# Echo the resolved paths and whether each file is actually on disk.
for _label, _value in (
    ('Rule-based path:', RULE_PATH),
    ('LLM path:', LLM_PATH),
    ('Rule exists:', RULE_PATH.exists()),
    ('LLM exists:', LLM_PATH.exists()),
):
    print(_label, _value)

Rule-based path: /workspaces/mtc-extraction-benchmark/data/processed/schema_output/diler-07-07-2025-rerun-41-44_schema_output.json
LLM path: /workspaces/mtc-extraction-benchmark/data/processed/schema_output/llm_certificate_full.json
Rule exists: True
LLM exists: True


In [2]:
# Read each extractor output as UTF-8 text and parse it as JSON.
rule_data = json.loads(RULE_PATH.read_text(encoding='utf-8'))
llm_data = json.loads(LLM_PATH.read_text(encoding='utf-8'))

print('Loaded rule-based and LLM JSON files successfully.')

Loaded rule-based and LLM JSON files successfully.


## 1) Top-Level Structure Comparison

In [3]:
# Top-level sections emitted by each extractor.
rule_keys = set(rule_data.keys())
llm_keys = set(llm_data.keys())

# Sorted union, so the presence table has one row per section in a stable order.
all_keys = sorted(rule_keys | llm_keys)

pd.DataFrame(
    {
        'present_in_rule_based': [key in rule_keys for key in all_keys],
        'present_in_llm': [key in llm_keys for key in all_keys],
    },
    index=all_keys,
)

Unnamed: 0,present_in_rule_based,present_in_llm
approval,True,True
chemical_composition,True,True
document,True,True
mechanical_properties,True,True
product,True,True
traceability,True,True


## 2) Document / Traceability / Product Field Differences

In [4]:
def compare_dict_fields(section_name, rule_obj, llm_obj):
    """Compare two flat section dicts field by field.

    Parameters
    ----------
    section_name : str
        Label copied into the ``section`` entry of every returned row.
    rule_obj, llm_obj : dict or None
        The same section as produced by the rule-based and the LLM
        extractor. ``None`` (for example a section stored as JSON ``null``)
        is treated as an empty dict instead of raising.

    Returns
    -------
    list of dict
        One record per field in the union of both key sets, ordered by field
        name, with keys ``section``, ``field``, ``rule_based``, ``llm`` and
        ``same``. A field absent on one side is reported as ``None`` for
        that side, so "absent" and "explicitly null" compare as equal.
    """
    # Normalise missing/null sections so .keys() and .get() are always available.
    rule_obj = rule_obj or {}
    llm_obj = llm_obj or {}

    rows = []
    for field in sorted(set(rule_obj.keys()) | set(llm_obj.keys())):
        rule_value = rule_obj.get(field)
        llm_value = llm_obj.get(field)
        rows.append({
            'section': section_name,
            'field': field,
            'rule_based': rule_value,
            'llm': llm_value,
            'same': rule_value == llm_value,
        })
    return rows

# Run the same field-level comparison over each flat metadata section and
# stack the resulting records into one table.
rows = [
    record
    for section in ('document', 'traceability', 'product')
    for record in compare_dict_fields(
        section,
        rule_data.get(section, {}),
        llm_data.get(section, {}),
    )
]

df_basic = pd.DataFrame(rows)
df_basic

Unnamed: 0,section,field,rule_based,llm,same
0,document,certificate_number,25-3133/01MNF/EXP,25-3133/01MNF/EXP,True
1,document,customer,MW STEEL TRADING LIMITED,MW STEEL TRADING LIMITED LONDON THAMESPORT LIBERTY,False
2,document,issuing_date,2025-07-07,2025-07-07,True
3,document,order_number,MK250508-001,MK250508-001,True
4,document,standard,EN 10204 3.1,EN 10204 3.1,True
5,traceability,consignment_number,2025-3133/01,2025-3133/01,True
6,traceability,heat_number,,,True
7,traceability,lot_number,1,2025-3133 LOT-1,False
8,traceability,vessel_name,MV WHITE IVY,MV WHITE IVY,True
9,product,production_process,QST,QST,True


## 3) Chemical Composition Coverage & Completeness

In [5]:
# Elements a heat row must report (non-null) to count as complete.
required_elements = ['C','Si','P','S','Mn','Ni','Cr','Mo','Cu','V','N','B','Ce']

def chem_stats(data, required=None):
    """Summarise chemical-composition coverage for one extractor output.

    Parameters
    ----------
    data : dict
        Parsed extractor output. Its ``chemical_composition`` entry is read
        as a list of per-heat dicts; a missing or ``None`` entry is treated
        as an empty list.
    required : iterable of str, optional
        Element symbols that must be non-null for a row to be complete.
        Defaults to the module-level ``required_elements``.

    Returns
    -------
    dict
        ``count_heat_rows``      -- number of rows in the section.
        ``unique_heat_numbers``  -- distinct non-null heat numbers (as strings).
        ``complete_heat_rows``   -- rows with no required element missing.
        ``details_df``           -- one row per heat, sorted by ``heat_number``,
        with columns ``heat_number``, ``is_complete`` and ``missing_elements``.
        The columns are present even when there are no rows, so callers can
        filter on ``is_complete`` unconditionally.
    """
    detail_columns = ['heat_number', 'is_complete', 'missing_elements']
    required = required_elements if required is None else list(required)

    # `or []` also covers a key that is present but null in the JSON.
    chems = data.get('chemical_composition') or []
    heats = {
        str(item.get('heat_number'))
        for item in chems
        if item.get('heat_number') is not None
    }

    details = []
    for item in chems:
        # Elements may be nested under 'elements' or stored flat on the row;
        # a null 'elements' value falls back to the row itself.
        elements_obj = item.get('elements')
        if elements_obj is None:
            elements_obj = item
        missing = [e for e in required if elements_obj.get(e) is None]
        details.append({
            'heat_number': str(item.get('heat_number')),
            'is_complete': not missing,
            'missing_elements': missing,
        })

    # Fixed columns and a bool dtype keep the frame usable as a boolean mask
    # source even when `details` is empty.
    details_df = (
        pd.DataFrame(details, columns=detail_columns)
        .astype({'is_complete': bool})
        .sort_values('heat_number')
    )

    return {
        'count_heat_rows': len(chems),
        'unique_heat_numbers': len(heats),
        'complete_heat_rows': sum(d['is_complete'] for d in details),
        'details_df': details_df
    }

# Compute coverage statistics once per extractor, then tabulate the headline counts.
rule_chem = chem_stats(rule_data)
llm_chem = chem_stats(llm_data)

pd.DataFrame([
    {
        'method': method,
        'heat_rows': stats['count_heat_rows'],
        'unique_heats': stats['unique_heat_numbers'],
        'complete_heats': stats['complete_heat_rows'],
    }
    for method, stats in (('rule_based', rule_chem), ('llm', llm_chem))
])

Unnamed: 0,method,heat_rows,unique_heats,complete_heats
0,rule_based,6,6,6
1,llm,20,20,11


In [6]:
print('LLM heat rows with missing chemical elements:')

# Keep only the heat rows flagged as incomplete; show at most the first 20.
llm_details = llm_chem['details_df']
llm_details.loc[~llm_details['is_complete']].head(20)

LLM heat rows with missing chemical elements:


Unnamed: 0,heat_number,is_complete,missing_elements
18,2500812,False,"[N, B, Ce]"
9,2504095,False,"[Mn, Ni, Cr, Mo, Cu, V, N, B, Ce]"
8,25990024,False,"[Cu, V, N, B, Ce]"
7,25990031,False,"[Ni, Cr, Mo, Cu, V, N, B, Ce]"
4,25990035,False,[B]
3,25990039,False,[B]
2,25990040,False,[B]
1,25990041,False,[B]
0,25990085,False,[B]


## 4) Mechanical Properties Coverage & Duplicate Check

In [7]:
def mech_stats(data):
    """Count total, unique and duplicate mechanical-property rows.

    Parameters
    ----------
    data : dict
        Parsed extractor output. Its ``mechanical_properties`` entry is read
        as a list of row dicts; a missing or ``None`` entry is treated as an
        empty list.

    Returns
    -------
    dict
        ``total_rows``     -- number of rows in the section.
        ``unique_rows``    -- distinct rows, compared on ``key_fields`` only.
        ``duplicate_rows`` -- ``total_rows - unique_rows``.
        ``rebend_rows``    -- rows whose ``rebend`` value is not ``None``.
    """
    # Two rows are the same measurement when all of these fields match.
    # 'rebend' is not part of the key, so it never distinguishes rows.
    key_fields = ['heat_number', 'test_sample', 'weight_kg_per_m', 'cross_sectional_area_mm2', 'yield_point_mpa', 'tensile_strength_mpa', 'rm_re_ratio', 'percentage_elongation', 'agt_percent']

    # `or []` also covers a key that is present but null in the JSON.
    mech = data.get('mechanical_properties') or []

    keys = [tuple(row.get(k) for k in key_fields) for row in mech]
    rebend_count = sum(row.get('rebend') is not None for row in mech)

    total = len(mech)
    unique = len(set(keys))

    return {'total_rows': total, 'unique_rows': unique, 'duplicate_rows': total - unique, 'rebend_rows': rebend_count}

# Row-level statistics per extractor, shown side by side.
rule_mech = mech_stats(rule_data)
llm_mech = mech_stats(llm_data)

pd.DataFrame([
    {'method': method, **stats}
    for method, stats in (('rule_based', rule_mech), ('llm', llm_mech))
])

Unnamed: 0,method,total_rows,unique_rows,duplicate_rows,rebend_rows
0,rule_based,58,29,29,0
1,llm,80,80,0,40


## 5) Approval Section Comparison

In [8]:
# Field-by-field comparison of the approval section from each extractor.
rule_approval = rule_data.get('approval', {})
llm_approval = llm_data.get('approval', {})

approval_rows = compare_dict_fields('approval', rule_approval, llm_approval)
pd.DataFrame(approval_rows)

Unnamed: 0,section,field,rule_based,llm,same
0,approval,cares_approved,True,True,True
1,approval,certificate_of_approval_number,,O11001,False
2,approval,form_number,,C8.03 2-4/R-0,False


## 6) Quick Summary (Data-Driven)

In [9]:
def _summary_row(metric, rule_value, llm_value):
    """Build one summary record, including the LLM-minus-rule delta."""
    return {
        'metric': metric,
        'rule_based': rule_value,
        'llm': llm_value,
        'delta_llm_minus_rule': llm_value - rule_value,
    }


def _count_present(section):
    """Count the values in a section dict that are not None."""
    return sum(value is not None for value in section.values())


# Headline metrics, each reported for both extractors plus their difference.
summary = pd.DataFrame([
    _summary_row(
        'chemical_unique_heat_numbers',
        rule_chem['unique_heat_numbers'],
        llm_chem['unique_heat_numbers'],
    ),
    _summary_row(
        'mechanical_total_rows',
        rule_mech['total_rows'],
        llm_mech['total_rows'],
    ),
    _summary_row(
        'mechanical_duplicate_rows',
        rule_mech['duplicate_rows'],
        llm_mech['duplicate_rows'],
    ),
    _summary_row(
        'approval_fields_present',
        _count_present(rule_data.get('approval', {})),
        _count_present(llm_data.get('approval', {})),
    ),
])
summary

Unnamed: 0,metric,rule_based,llm,delta_llm_minus_rule
0,chemical_unique_heat_numbers,6,20,14
1,mechanical_total_rows,58,80,22
2,mechanical_duplicate_rows,29,0,-29
3,approval_fields_present,1,3,2


## 7) Notes

- This notebook compares actual JSON outputs in `data/processed/schema_output`.
- If file names differ in your environment, update `RULE_PATH` and `LLM_PATH` in the first code cell (`In [1]`), where both paths are defined.
- The metrics here are computed directly from output files and can be reused for future documents.