In [3]:

import pandas as pd

In [2]:
# test_single_address.py

def test_single_address():
    # Initialize your parser
    model_path = "models/address_parser_dev/final_model"  # Replace with your actual model path
    
    try:
        print("Initializing parser...")
        parser = AddressParserInference(
            model_path=model_path,
            max_length=512,
            stride=50,
            use_fp16=True  # Set to False for debugging to avoid GPU issues
        )
        print("✓ Parser initialized successfully")
        
        # Test address
        test_address = "36 - 49, chapel street, London, se45 6pq"
        print(f"\nTesting address: '{test_address}'")
        
        # Make prediction
        result = parser.predict_single_address(test_address, row_index=0)
        
        # Print results
        print("\n" + "="*50)
        print("RESULTS:")
        print("="*50)
        
        if "error" in result:
            print(f"❌ ERROR: {result['error']}")
        else:
            print(f"✓ Original address: {result['original_address']}")
            print(f"✓ Number of entities found: {len(result['entities'])}")
            
            if result['entities']:
                print("\nEntities found:")
                for i, entity in enumerate(result['entities']):
                    print(f"  {i+1}. {entity['type']}: '{entity['text']}' (confidence: {entity['confidence']:.3f})")
                
                print("\nParsed components:")
                for key, value in result['parsed_components'].items():
                    print(f"  {key}: {value}")
            else:
                print("⚠️  No entities found")
        
        print("="*50)
        
    except Exception as e:
        print(f"❌ Failed to initialize or run parser: {str(e)}")
        import traceback
        traceback.print_exc()


test_single_address()

Initializing parser...
Loading model and tokenizer...
AddressParserInference initialized: cuda, FP16=True
✓ Parser initialized successfully

Testing address: '36 - 49, chapel street, London, se45 6pq'

RESULTS:
✓ Original address: 36 - 49, chapel street, London, se45 6pq
✓ Number of entities found: 3

Entities found:
  1. street_number: '36 - 49' (confidence: 1.000)
  2. street_name: 'chapel street' (confidence: 1.000)
  3. postcode: 'se45 6pq' (confidence: 1.000)

Parsed components:
  street_number: 36 - 49
  street_name: chapel street
  postcode: se45 6pq


In [5]:
test = pd.read_parquet("data/ocod_history_processed/OCOD_FULL_2015_10.parquet")

In [7]:
test['building_name'].unique()

array([None, 'great titchfield house', ' london house', ...,
       'royal pavilion', ' the business village', 'afe'], dtype=object)

In [1]:
from enhance_ocod.inference import parse_addresses_pipeline, convert_to_entity_dataframe
from enhance_ocod.address_parsing import (
    load_and_prep_OCOD_data, parsing_and_expansion_process, post_process_expanded_data
)
from enhance_ocod.locate_and_classify import (
    load_postcode_district_lookup, preprocess_expandaded_ocod_data, 
    add_missing_lads_ocod, load_voa_ratinglist, street_and_building_matching, substreet_matching,
    counts_of_businesses_per_oa_lsoa, voa_address_match_all_data, classification_type1, classification_type2,
    contract_ocod_after_classification
)
from enhance_ocod.price_paid_process import load_and_process_pricepaid_data
import os
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import time
import gc
import torch

torch.set_float32_matmul_precision('medium')

# ====== SET YOUR PATHS HERE ======
test_file_path = "data/simple_example.csv"  # UPDATE THIS PATH
model_path = "models/address_parser_original_fullset/final_model"
ONSPD_path = "data/ONSPD_FEB_2025.zip"
price_paid_path = "data/price_paid_data/price_paid_complete_may_2025.csv"
processed_price_paid_dir = "data/processed_price_paid"
voa_path = "data/2023_non_domestic_rating_list_entries.zip"

print("Loading common reference data...")
postcode_district_lookup = load_postcode_district_lookup(str(ONSPD_path))
print(f"Postcode district lookup shape: {postcode_district_lookup.shape}")

voa_businesses = load_voa_ratinglist(str(voa_path), postcode_district_lookup)
print(f"VOA businesses shape: {voa_businesses.shape}")

print("\n" + "="*50)
print("STEP 1: Load and prep OCOD data")
print("="*50)

# Load and process the OCOD data
ocod_data_1_initial = load_and_prep_OCOD_data(str(test_file_path))
print(f"Initial OCOD data shape: {ocod_data_1_initial.shape}")
print(f"Initial OCOD data columns: {list(ocod_data_1_initial.columns)}")
print(f"Sample property_address values:\n{ocod_data_1_initial['property_address'].head()}")

print("\n" + "="*50)
print("STEP 2: Parse addresses")
print("="*50)

start_time = time.time()
results = parse_addresses_pipeline(
    df=ocod_data_1_initial,
    model_path=str(model_path),
    target_column="property_address",
)
end_time = time.time()

print(f"Address parsing took {end_time - start_time:.2f} seconds")
print(f"Success rate: {results['summary']['success_rate']:.1%}")
print(f"Results keys: {results.keys()}")



ocod_data_2_entities = convert_to_entity_dataframe(results)


ocod_data_3_expanded = parsing_and_expansion_process(all_entities=ocod_data_2_entities)




Loading common reference data...
Postcode district lookup shape: (2406510, 5)


  voa_businesses = pd.read_csv(csv_file,


Initial rows: 2282385
VOA businesses shape: (2094735, 19)

STEP 1: Load and prep OCOD data
Initial OCOD data shape: (5, 7)
Initial OCOD data columns: ['property_address', 'title_number', 'tenure', 'district', 'county', 'region', 'price_paid']
Sample property_address values:
0       5 to 9(odds only) odd road, london  (pj10 8df)
1          5 to 9 (even) even road, london  (pj10 8df)
2           5 to 9 everything road, london  (pj10 8df)
3            6,7,10 selective road, london, (pj10 8df)
4    Flats 5 to 9, all apartments road, london  (pj...
Name: property_address, dtype: object

STEP 2: Parse addresses


Analyzing address lengths: 100%|██████████| 5/5 [00:00<00:00, 2349.22it/s]


Short addresses: 5 | Long addresses: 0
Processing 5 short addresses (batch_size=2048)...


Device set to use cuda:0


✓ Completed 5 short addresses
No long addresses to process
Address parsing took 5.39 seconds
Success rate: 100.0%
Results keys: dict_keys(['summary', 'results'])
Processing 21 entities into DataFrame...
Computing label counts...
✓ Named Entity Recognition processing complete
Total entities extracted: 21
Added missing columns: ['building_name', 'filter_type', 'unit_id', 'unit_type']


In [4]:
results

{'summary': {'total_addresses': 5,
  'successful_parses': 5,
  'failed_parses': 0,
  'success_rate': 1.0,
  'short_addresses': 5,
  'long_addresses': 0,
  'short_batch_size': 2048,
  'long_batch_size': 32,
  'token_threshold': 128},
 'results': [{'row_index': 0,
   'datapoint_id': 0,
   'original_address': '5 to 9(odds only) odd road, london  (pj10 8df)',
   'entities': [{'type': 'street_number',
     'text': '5 to 9',
     'start': 0,
     'end': 6,
     'confidence': 0.9999869},
    {'type': 'street_name',
     'text': ' odd road',
     'start': 17,
     'end': 26,
     'confidence': 0.9999984},
    {'type': 'city',
     'text': ' london',
     'start': 27,
     'end': 34,
     'confidence': 0.9994545},
    {'type': 'postcode',
     'text': 'pj10 8df',
     'start': 37,
     'end': 45,
     'confidence': 0.99998873}],
   'parsed_components': {'street_number': ['5 to 9'],
    'street_name': [' odd road'],
    'city': [' london'],
    'postcode': ['pj10 8df']}},
  {'row_index': 1,
   '

In [5]:
ocod_data_2_entities

Unnamed: 0,datapoint_id,label,start,end,text,label_text,label_id_count
0,0,street_number,0,6,"5 to 9(odds only) odd road, london (pj10 8df)",5 to 9,0
1,0,street_name,17,26,"5 to 9(odds only) odd road, london (pj10 8df)",odd road,0
2,0,city,27,34,"5 to 9(odds only) odd road, london (pj10 8df)",london,0
3,0,postcode,37,45,"5 to 9(odds only) odd road, london (pj10 8df)",pj10 8df,0
4,1,street_number,0,6,"5 to 9 (even) even road, london (pj10 8df)",5 to 9,0
5,1,street_name,13,23,"5 to 9 (even) even road, london (pj10 8df)",even road,0
6,1,city,24,31,"5 to 9 (even) even road, london (pj10 8df)",london,0
7,1,postcode,34,42,"5 to 9 (even) even road, london (pj10 8df)",pj10 8df,0
8,2,street_number,0,6,"5 to 9 everything road, london (pj10 8df)",5 to 9,0
9,2,street_name,6,22,"5 to 9 everything road, london (pj10 8df)",everything road,0


In [3]:
ocod_data_3_expanded

Unnamed: 0,index,datapoint_id,city,postcode,street_name,street_number,text,building_name,number_filter,unit_id,unit_type
0,0.0,0,london,pj10 8df,odd road,5 to 9,"5 to 9(odds only) odd road, london (pj10 8df)",,,,
1,4.0,1,london,pj10 8df,even road,5 to 9,"5 to 9 (even) even road, london (pj10 8df)",,,,
2,8.0,2,london,pj10 8df,everything road,5 to 9,"5 to 9 everything road, london (pj10 8df)",,,,
0,2.0,0,london,pj10 8df,,,"5 to 9(odds only) odd road, london (pj10 8df)",,,,
1,3.0,0,,pj10 8df,,,"5 to 9(odds only) odd road, london (pj10 8df)",,,,
2,6.0,1,london,pj10 8df,,,"5 to 9 (even) even road, london (pj10 8df)",,,,
3,7.0,1,,pj10 8df,,,"5 to 9 (even) even road, london (pj10 8df)",,,,
4,10.0,2,london,pj10 8df,,,"5 to 9 everything road, london (pj10 8df)",,,,
5,11.0,2,,pj10 8df,,,"5 to 9 everything road, london (pj10 8df)",,,,
6,12.0,3,,pj10 8df,selective road,6,"6,7,10 selective road, london, (pj10 8df)",,,,


In [7]:
from enhance_ocod.address_parsing import identify_multi_addresses
import numpy as np

multi_unit_id, multi_property, all_multi_ids = identify_multi_addresses(ocod_data_2_entities)

In [8]:
all_entities = ocod_data_2_entities

# Define regex pattern to match number ranges like "1-5", "10 to 20", "3-7" etc.
xx_to_yy_regex = r'^\d+\s?(?:-|to)\s?\d+$'

# Create working dataframe with unique address records only
multi_check_df = all_entities[['datapoint_id', 'text', ]].drop_duplicates()

# Count commas in address text (multiple commas may indicate complex addresses)
multi_check_df['comma_count'] = multi_check_df['text'].str.count(',')

# Flag addresses that start with land-related keywords (these are typically single properties)
multi_check_df['land'] = multi_check_df['text'].str.contains(r"^(?:land|plot|airspace|car|parking)", case = False)

# Flag addresses containing business-related keywords (these are typically single commercial properties)
multi_check_df['business'] = multi_check_df['text'].str.contains(r"cinema|hotel|office|centre|\bpub|holiday\s?inn|travel\s?lodge|business|cafe|^shop| shop|restaurant|home|^stores?\b|^storage\b|company|ltd|limited|plc|retail|leisure|industrial|hall of|trading|commercial|works", case = False)

# Create pivot table showing count of each entity label type per datapoint_id
# This gives us counts of building_name, unit_id, street_number etc. for each address
temp_df = all_entities[['datapoint_id', 'label']].groupby(['datapoint_id', 'label']).value_counts().to_frame(name = "counts").reset_index().pivot(index = 'datapoint_id', columns = 'label', values = 'counts').fillna(0)

# Count how many street numbers contain range patterns (e.g., "1-5 Main St")
# This indicates multiple properties at consecutive street numbers
xx_to_yy_street_counts = all_entities['datapoint_id'][all_entities['label_text'].str.contains(
    xx_to_yy_regex)& (all_entities['label']=="street_number")
                        ].to_frame(name = 'datapoint_id').groupby('datapoint_id').size().to_frame(name = 'xx_to_yy_street_counts')

# Count how many unit IDs contain range patterns (e.g., "Units 1-5")
# This indicates multiple units within a single building
xx_to_yy_unit_counts = all_entities['datapoint_id'][all_entities['label_text'].str.contains(
    xx_to_yy_regex)& (all_entities['label']=="unit_id")
                        ].to_frame(name = 'datapoint_id').groupby('datapoint_id').size().to_frame(name = 'xx_to_yy_unit_counts')

# Merge all the feature dataframes together
multi_check_df = multi_check_df.merge(temp_df, how = 'left', left_on = "datapoint_id", right_index = True).\
merge(xx_to_yy_street_counts, how = 'left', left_on = "datapoint_id", right_index = True).\
merge(xx_to_yy_unit_counts, how = 'left', left_on = "datapoint_id", right_index = True).fillna(0)

# Ensure the necessary columns are present (some addresses may not have these entity types)
required_columns = ['building_name', 'unit_id', 'street_number']
for col in required_columns:
    if col not in multi_check_df.columns:
        multi_check_df[col] = 0


# Classify addresses as single/multi/unknown using hierarchical logical rules
# Order matters here - more specific conditions should come first
multi_check_df['class'] = np.select(
    [
        multi_check_df['land'],  # Land/plot addresses are single properties
        multi_check_df['business'],  # Business addresses are typically single properties
        (multi_check_df['building_name']==1) & (multi_check_df['unit_id'] == 0),  # Single building name without units = single property
        (multi_check_df['xx_to_yy_unit_counts']>0) | (multi_check_df['xx_to_yy_street_counts']>0),  # Range patterns in unit IDs = multiple units
        multi_check_df['street_number']>1,  # Multiple street numbers = multiple properties
        multi_check_df['unit_id']>1,  # Multiple unit IDs = multiple units
        (multi_check_df['street_number']<=1) & (multi_check_df['xx_to_yy_street_counts']==0) & (multi_check_df['unit_id']<=1)  # Single street number, no ranges, single/no unit = single property
    ], 
    [
        'single',
        'single', 
        'single',
        'multi',
        'multi',
        'multi',
        'single',
    ], 
    default='unknown'  # Fallback for edge cases
)

# With the classification complete, extract the required ID lists

# Multi-unit addresses: multiple properties that are individual units (apartments, flats, etc.)
multi_unit_id = set(multi_check_df['datapoint_id'][(multi_check_df['class']=='multi') &( multi_check_df['unit_id']>0)].tolist())

# Multi-property addresses: multiple properties but not individual units (e.g., multiple houses)
multi_property = set(multi_check_df['datapoint_id'][(multi_check_df['class']=='multi') &( multi_check_df['unit_id']==0)].tolist())

# Combined list of all multi-address IDs
all_multi_ids = list(multi_unit_id) +list(multi_property)

In [17]:
all_entities['datapoint_id'][all_entities['label_text']]
                        

KeyError: "None of [Index(['5 to 9', ' odd road', ' london', 'pj10 8df', '5 to 9', ' even road',\n       ' london', 'pj10 8df', '5 to 9', ' everything road', ' london',\n       'pj10 8df', '6', '7', '10', ' selective road', 'pj10 8df', ' 5 to 9',\n       ' all apartments road', ' london', 'pj10 8df'],\n      dtype='object')] are in the [index]"

In [9]:
multi_check_df['xx_to_yy_unit_counts']

0     0.0
4     0.0
8     0.0
12    0.0
17    0.0
Name: xx_to_yy_unit_counts, dtype: float64

In [10]:
multi_check_df

Unnamed: 0,datapoint_id,text,comma_count,land,business,building_name,city,postcode,street_name,street_number,unit_id,xx_to_yy_street_counts,xx_to_yy_unit_counts,class
0,0,"5 to 9(odds only) odd road, london (pj10 8df)",1,False,False,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,multi
4,1,"5 to 9 (even) even road, london (pj10 8df)",1,False,False,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,multi
8,2,"5 to 9 everything road, london (pj10 8df)",1,False,False,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,multi
12,3,"6,7,10 selective road, london, (pj10 8df)",4,False,False,0.0,0.0,1.0,1.0,3.0,0.0,0.0,0.0,multi
17,4,"Flats 5 to 9, all apartments road, london (pj...",2,False,False,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,single


In [None]:
ocod_data_4_postprocessed = post_process_expanded_data(ocod_data_3_expanded, ocod_data_1_initial)

# Clean up intermediate results
del results, ocod_data_2_entities, ocod_data_3_expanded
gc.collect()


ocod_data_5_preprocessed = preprocess_expandaded_ocod_data(ocod_data_4_postprocessed, postcode_district_lookup)


price_paid_df = load_and_process_pricepaid_data(
    file_path=str(price_paid_path), 
    processed_dir=processed_price_paid_dir,
    postcode_district_lookup=postcode_district_lookup, 
    years_needed=[2017, 2018, 2019]
)


ocod_data_6_with_lads = add_missing_lads_ocod(ocod_data_5_preprocessed, price_paid_df)


ocod_data_7_street_matched = street_and_building_matching(ocod_data_6_with_lads, price_paid_df, voa_businesses)


ocod_data_8_substreet_matched = substreet_matching(ocod_data_7_street_matched, price_paid_df, voa_businesses)

# Clean up price paid data
del price_paid_df
gc.collect()


ocod_data_9_with_counts = counts_of_businesses_per_oa_lsoa(ocod_data_8_substreet_matched, voa_businesses)


ocod_data_10_voa_matched = voa_address_match_all_data(ocod_data_9_with_counts, voa_businesses)

ocod_data_11_class1 = classification_type1(ocod_data_10_voa_matched)

ocod_data_12_class2 = classification_type2(ocod_data_11_class1)

ocod_data_13_contracted = contract_ocod_after_classification(ocod_data_12_class2, class_type='class2', classes=['residential'])


columns = ['title_number', 'within_title_id', 'within_larger_title', 'unique_id', 
           'unit_id', 'unit_type', 'building_name', 'street_number', 'street_name', 
           'postcode', 'city', 'district', 'region', 'property_address', 'oa11cd', 
           'lsoa11cd', 'msoa11cd', 'lad11cd', 'class', 'class2']

ocod_data_14_final = ocod_data_13_contracted.loc[:, columns].rename(columns={
    'within_title_id': 'nested_id',
    'within_larger_title': 'nested_title'
})

print(f"Final data shape: {ocod_data_14_final.shape}")
print(f"Final data columns: {list(ocod_data_14_final.columns)}")

print("\n" + "="*50)
print("PROCESSING COMPLETE!")
print("="*50)

In [6]:
results

NameError: name 'results' is not defined