In [1]:
from inference_utils import parse_addresses_from_csv, convert_to_entity_dataframe
from address_parsing_helper_functions import (load_and_prep_OCOD_data, parsing_and_expansion_process, post_process_expanded_data, load_postocde_district_lookup)
import os
import pandas as pd

model_path = "models/address_parser/checkpoint-750"
csv_path = "data/ocod_history/OCOD_FULL_2015_10.zip"

In [2]:
ocod_data = load_and_prep_OCOD_data("data/ocod_history/OCOD_FULL_2015_10.zip", csv_filename=None)

Loading model from models/address_parser/checkpoint-750


Model loaded on cuda. Labels: ['B-building_name', 'B-city', 'B-filter_type', 'B-postcode', 'B-street_name', 'B-street_number', 'B-unit_id', 'B-unit_type', 'I-building_name', 'I-city', 'I-filter_type', 'I-postcode', 'I-street_name', 'I-street_number', 'I-unit_id', 'I-unit_type', 'O']
Processing 99349 addresses in batches of 2048


Processing batches: 100%|██████████| 49/49 [09:43<00:00, 11.91s/batch]


In [13]:
if os.path.exists('data/test_results_df.parquet'):

    import time
    start_time = time.time()

    results = parse_addresses_from_csv(
    df = ocod_data,
    model_path=model_path,
    target_column="property_address",
    batch_size=2048
    )

    end_time = time.time()
    # Load existing file
    test = pd.read_parquet('data/test_results_df.parquet')
    print("Loaded existing test results from file")
else:
    # Create new dataframe and save it
    test = convert_to_entity_dataframe(results)
    test.to_parquet('data/test_results_df.parquet')
    print("Created new test results and saved to file")

Loading model from models/address_parser/checkpoint-750


Model loaded on cuda. Labels: ['B-building_name', 'B-city', 'B-filter_type', 'B-postcode', 'B-street_name', 'B-street_number', 'B-unit_id', 'B-unit_type', 'I-building_name', 'I-city', 'I-filter_type', 'I-postcode', 'I-street_name', 'I-street_number', 'I-unit_id', 'I-unit_type', 'O']
Processing 99349 addresses in batches of 2048


Processing batches:   0%|          | 0/49 [00:06<?, ?batch/s]


KeyboardInterrupt: 

Added missing columns: ['street_name', 'filter_type', 'city']


In [None]:

test = parsing_and_expansion_process(all_entities = test)
ocod_data2 = post_process_expanded_data(test, ocod_data)

In [None]:
import zipfile
import re
ONSPD_path = "data/ONSPD_FEB_2025.zip"
zip_file = zipfile.ZipFile(ONSPD_path)
target_zipped_file = [i for i in zip_file.namelist() if re.search(r'^Data/ONSPD.+csv$',i)][0]
postcode_district_lookup = load_postocde_district_lookup(ONSPD_path, target_zipped_file)

In [15]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
import json
from collections import Counter

def check_model_labels(model_path):
    """Check what labels your trained model actually contains"""
    
    # Load the model
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    print("=== MODEL LABEL CONFIGURATION ===")
    print(f"Number of labels: {model.config.num_labels}")
    print(f"id2label mapping: {model.config.id2label}")
    print(f"label2id mapping: {model.config.label2id}")
    
    expected_labels = [
        "building_name", "street_name", "street_number", "filter_type",
        "unit_id", "unit_type", "city", "postcode"
    ]
    
    print("\n=== LABEL COMPARISON ===")
    model_labels = set(model.config.label2id.keys())
    
    # Remove BIO prefixes and 'O' for comparison
    model_entity_types = set()
    for label in model_labels:
        if label.startswith('B-') or label.startswith('I-'):
            model_entity_types.add(label[2:])  # Remove B- or I- prefix
    
    print(f"Expected labels: {expected_labels}")
    print(f"Model entity types: {sorted(model_entity_types)}")
    print(f"Missing from model: {set(expected_labels) - model_entity_types}")
    print(f"Extra in model: {model_entity_types - set(expected_labels)}")
    
    return model, tokenizer

# Check your model
model, tokenizer = check_model_labels(model_path)

=== MODEL LABEL CONFIGURATION ===
Number of labels: 17
id2label mapping: {0: 'O', 1: 'B-building_name', 2: 'I-building_name', 3: 'B-street_name', 4: 'I-street_name', 5: 'B-street_number', 6: 'I-street_number', 7: 'B-filter_type', 8: 'I-filter_type', 9: 'B-unit_id', 10: 'I-unit_id', 11: 'B-unit_type', 12: 'I-unit_type', 13: 'B-city', 14: 'I-city', 15: 'B-postcode', 16: 'I-postcode'}
label2id mapping: {'B-building_name': 1, 'B-city': 13, 'B-filter_type': 7, 'B-postcode': 15, 'B-street_name': 3, 'B-street_number': 5, 'B-unit_id': 9, 'B-unit_type': 11, 'I-building_name': 2, 'I-city': 14, 'I-filter_type': 8, 'I-postcode': 16, 'I-street_name': 4, 'I-street_number': 6, 'I-unit_id': 10, 'I-unit_type': 12, 'O': 0}

=== LABEL COMPARISON ===
Expected labels: ['building_name', 'street_name', 'street_number', 'filter_type', 'unit_id', 'unit_type', 'city', 'postcode']
Model entity types: ['building_name', 'city', 'filter_type', 'postcode', 'street_name', 'street_number', 'unit_id', 'unit_type']
Miss

In [16]:
def analyze_training_data(json_path):
    """Analyze what labels are actually in your training data"""
    
    with open(json_path, 'r') as f:
        data = json.load(f)
    
    print("=== TRAINING DATA ANALYSIS ===")
    print(f"Total examples: {len(data)}")
    
    # Count all labels
    label_counts = Counter()
    total_spans = 0
    
    for example in data:
        for span in example.get('spans', []):
            label_counts[span['label']] += 1
            total_spans += 1
    
    print(f"Total spans: {total_spans}")
    print("\nLabel distribution:")
    for label, count in sorted(label_counts.items()):
        percentage = (count / total_spans) * 100
        print(f"  {label}: {count} ({percentage:.1f}%)")
    
    # Check for missing expected labels
    expected_labels = [
        "building_name", "street_name", "street_number", "number_filter",
        "unit_id", "unit_type", "city", "postcode"
    ]
    
    found_labels = set(label_counts.keys())
    missing_labels = set(expected_labels) - found_labels
    
    if missing_labels:
        print(f"\n⚠️  MISSING LABELS IN TRAINING DATA: {missing_labels}")
    else:
        print(f"\n✅ All expected labels found in training data")
    
    return label_counts

# Analyze your training data
label_counts = analyze_training_data("data/training_data/training_data_dev.json")

=== TRAINING DATA ANALYSIS ===
Total examples: 2000
Total spans: 11176

Label distribution:
  building_name: 797 (7.1%)
  city: 1937 (17.3%)
  number_filter: 50 (0.4%)
  postcode: 1618 (14.5%)
  street_name: 2133 (19.1%)
  street_number: 2939 (26.3%)
  unit_id: 743 (6.6%)
  unit_type: 959 (8.6%)

✅ All expected labels found in training data


In [17]:
def debug_single_prediction(model, tokenizer, address):
    """Debug what the model predicts for a single address"""
    
    model.eval()
    device = next(model.parameters()).device
    
    # Tokenize
    inputs = tokenizer(
        address,
        return_tensors="pt",
        return_offsets_mapping=True,test
        padding=True,
        truncation=True,
        max_length=128
    )
    
    # Move to device (excluding offset_mapping)
    model_inputs = {k: v.to(device) for k, v in inputs.items() if k != 'offset_mapping'}
    
    # Predict
    with torch.no_grad():
        outputs = model(**model_inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_ids = predictions.argmax(dim=-1)
    
    # Get tokens and labels
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    predicted_labels = [model.config.id2label[pred.item()] for pred in predicted_ids[0]]
    offset_mapping = inputs["offset_mapping"][0]
    
    print(f"=== DEBUGGING: {address} ===")
    print("Token -> Label -> Text_Span")
    print("-" * 50)
    
    for i, (token, label, offset) in enumerate(zip(tokens, predicted_labels, offset_mapping)):
        if token in ['[CLS]', '[SEP]', '[PAD]']:
            continue
        
        start_pos, end_pos = offset.tolist()
        if start_pos == 0 and end_pos == 0 and i > 0:
            text_span = "[SPECIAL]"
        else:
            text_span = address[start_pos:end_pos]
        
        print(f"{token:15} -> {label:15} -> '{text_span}'")

# Test with an address that should have street_name
test_address = "161, 163, 165, 167 and 169 uxbridge road, ealing"
debug_single_prediction(model, tokenizer, test_address)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (973226428.py, line 11)