In [2]:
from enhance_ocod.address_parsing import (
    load_and_prep_OCOD_data,
    parsing_and_expansion_process,
    post_process_expanded_data,
    load_postcode_district_lookup,
)
from enhance_ocod.locate_and_classify import (
    preprocess_expanded_ocod_data,
    add_missing_lads_ocod,
    load_voa_ratinglist,
    street_and_building_matching,
    substreet_matching,
    counts_of_businesses_per_oa_lsoa,
    voa_address_match_all_data,
    classification_type1,
    classification_type2,
    contract_ocod_after_classification,
)
from enhance_ocod.inference import convert_to_entity_dataframe
from enhance_ocod.price_paid_process import load_and_process_pricepaid_data
from pathlib import Path
from tqdm import tqdm
import time
import gc  # Add for memory management
import pandas as pd
import pickle
# There is a warning related to bfill and ffill which is basically internal to pandas so silencing here
import warnings

warnings.filterwarnings("ignore", message=".*Downcasting object dtype arrays.*")

# NOTE(review): absolute, machine-specific location. Every other path in this
# cell is derived from it, so this is the one line to change on another machine.
SCRIPT_DIR = Path("/teamspace/studios/this_studio/enhance_ocod/notebooks")


def _list_data_files(directory):
    """Return the non-hidden entries of `directory`, sorted by name.

    `Path.glob` yields entries in arbitrary order, so sorting makes the
    "first file" choice below reproducible. Dot-files (e.g. `.gitkeep`) are
    skipped so they are never picked up as the data file.
    """
    return sorted(p for p in directory.glob("*") if not p.name.startswith("."))


# ====== CONSTANT PATHS AND SETTINGS ======
input_dir = SCRIPT_DIR.parent / "data" / "ocod_history"
output_dir = SCRIPT_DIR.parent / "data" / "ocod_history_processed2"
model_path = (
    SCRIPT_DIR.parent / "models" / "address_parser_original_fullset" / "final_model"
)

# Each reference-data folder is expected to hold one file; the first one found
# (by name) is used, or None when the folder is empty or missing.
ONSPD_dir = SCRIPT_DIR.parent / "data" / "onspd"
ONSPD_files = _list_data_files(ONSPD_dir)
ONSPD_path = ONSPD_files[0] if ONSPD_files else None

# price_paid_path is allowed to stay None here: it is only consumed later, by
# the price-paid loading cell.
price_paid_dir = SCRIPT_DIR.parent / "data" / "price_paid_data"
price_paid_files = _list_data_files(price_paid_dir)
price_paid_path = price_paid_files[0] if price_paid_files else None

voa_dir = SCRIPT_DIR.parent / "data" / "voa"
voa_files = _list_data_files(voa_dir)
voa_path = voa_files[0] if voa_files else None

processed_price_paid_dir = SCRIPT_DIR.parent / "data" / "processed_price_paid"
output_dir.mkdir(parents=True, exist_ok=True)

parsed_results_dir = SCRIPT_DIR.parent / "data" / "parsed_ocod_dicts2"
parsed_results_dir.mkdir(parents=True, exist_ok=True)

# All OCOD snapshot zips in input_dir, sorted by name (no test-subset slicing
# is applied here).
all_files = sorted(input_dir.glob("OCOD_FULL_*.zip"))

# Fail early with a clear message instead of handing the string "None" to the
# loaders below.
missing_reference_dirs = [
    str(directory)
    for directory, path in ((ONSPD_dir, ONSPD_path), (voa_dir, voa_path))
    if path is None
]
if missing_reference_dirs:
    raise FileNotFoundError(
        "No reference data file found in: " + ", ".join(missing_reference_dirs)
    )

print("Loading common reference data...")
postcode_district_lookup = load_postcode_district_lookup(str(ONSPD_path))
voa_businesses = load_voa_ratinglist(str(voa_path), postcode_district_lookup)


Loading common reference data...


  postcode_district_lookup = pd.read_csv(f)[


Initial rows: 2293839


In [3]:



# Snapshot under investigation. Built from input_dir (same location as the
# previous hard-coded absolute path) so the data root is defined in one place.
zip_file = input_dir / "OCOD_FULL_2022_02.zip"

In [4]:
# Output parquet for this snapshot, named after the zip file's stem.
out_name = zip_file.stem + ".parquet"
out_path = output_dir / out_name

# Define parsed results file path
parsed_results_file = parsed_results_dir / f"{zip_file.stem}_parsed_results.pkl"

print(f"Processing {zip_file.name}...")

# Load and process the OCOD data
ocod_data = load_and_prep_OCOD_data(str(zip_file))

###############
# Parse addresses
###############
# Parsing results are cached per snapshot as a pickle, so the model only runs
# when no cache file exists yet for this zip.
if parsed_results_file.exists():
    print(f"Loading cached parsing results for {zip_file.name}...")
    # SECURITY: pickle.load can execute arbitrary code from the file; only load
    # cache files that were written by the `else` branch of this cell.
    with open(parsed_results_file, "rb") as f:
        results = pickle.load(f)
    print(
        f"Loaded cached results with success rate: {results['summary']['success_rate']:.1%}"
    )
else:
    # These imports are local to this branch: they are only needed when the
    # model actually has to run.
    from enhance_ocod.inference import parse_addresses_pipeline
    import torch
    torch.set_float32_matmul_precision("medium")
    print(f"Parsing addresses for {zip_file.name}...")
    start_time = time.time()

    results = parse_addresses_pipeline(
        df=ocod_data,
        short_batch_size=128,  # The default seems really slow, might be to do with loading not sure
        model_path=str(model_path),
        target_column="property_address",
    )

    end_time = time.time()
    print(f"Address parsing took {end_time - start_time:.2f} seconds")
    print(f"Success rate: {results['summary']['success_rate']:.1%}")

    # Save parsing results
    print(f"Saving parsing results to {parsed_results_file}...")
    with open(parsed_results_file, "wb") as f:
        pickle.dump(results, f)


Processing OCOD_FULL_2022_02.zip...
Loading cached parsing results for OCOD_FULL_2022_02.zip...
Loaded cached results with success rate: 100.0%


In [47]:
filtered_entries = [entry for entry in results['results'] if "alaska building" in entry["original_address"].lower()]

In [7]:
results['results'][0:5]

[{'row_index': 0,
  'datapoint_id': 0,
  'original_address': 'Westleigh Lodge Care Home, Nel Pan Lane, Leigh (WN7 5JT)',
  'entities': [{'type': 'building_name',
    'text': 'Westleigh Lodge Care Home',
    'start': 0,
    'end': 25,
    'confidence': 0.99999654},
   {'type': 'street_name',
    'text': ' Nel Pan Lane',
    'start': 26,
    'end': 39,
    'confidence': 0.9999978},
   {'type': 'city',
    'text': ' Leigh',
    'start': 40,
    'end': 46,
    'confidence': 0.99988425},
   {'type': 'postcode',
    'text': 'WN7 5JT',
    'start': 48,
    'end': 55,
    'confidence': 0.9999841}],
  'parsed_components': {'building_name': ['Westleigh Lodge Care Home'],
   'street_name': [' Nel Pan Lane'],
   'city': [' Leigh'],
   'postcode': ['WN7 5JT']}},
 {'row_index': 1,
  'datapoint_id': 1,
  'original_address': 'Flat 1, 1a Canal Street, Manchester (M1 3HE)',
  'entities': [{'type': 'unit_type',
    'text': 'Flat',
    'start': 0,
    'end': 4,
    'confidence': 0.9999956},
   {'type': 'u

In [36]:
# Continue with post-parsing processing.
# `test_0` is the entity-level DataFrame built from the parsing results; it is
# reused further down the notebook as `all_entities`.
# `test` is the output of the library's parsing/expansion step and is the
# input to post_process_expanded_data in the next code cell.
test_0 = convert_to_entity_dataframe(results)
test = parsing_and_expansion_process(all_entities=test_0)


Processing 421,466 entities into DataFrame...
Processed 150,000/421,466 entities


Computing label counts...
✓ Named Entity Recognition processing complete
Total entities extracted: 421,466


In [39]:
# NOTE(review): `ocod_data` is both an input and the output here, so re-running
# this cell feeds the already post-processed frame back in. Presumably the raw
# frame must be reloaded with load_and_prep_OCOD_data before a re-run — confirm.
ocod_data = post_process_expanded_data(test, ocod_data)

In [65]:
results['results'][47]

{'row_index': 47,
 'datapoint_id': 47,
 'original_address': '3a, 5a and 8a Hesketh Drive, Standish, Wigan (WN6 0SF)',
 'entities': [{'type': 'street_number',
   'text': '3a',
   'start': 0,
   'end': 2,
   'confidence': 0.99989045},
  {'type': 'street_number',
   'text': ' 5a',
   'start': 3,
   'end': 6,
   'confidence': 0.9999326},
  {'type': 'street_number',
   'text': ' 8a',
   'start': 10,
   'end': 13,
   'confidence': 0.9999838},
  {'type': 'street_name',
   'text': ' Hesketh Drive',
   'start': 13,
   'end': 27,
   'confidence': 0.9999949},
  {'type': 'city',
   'text': ' Wigan',
   'start': 38,
   'end': 44,
   'confidence': 0.99999774},
  {'type': 'postcode',
   'text': 'WN6 0SF',
   'start': 46,
   'end': 53,
   'confidence': 0.9999707}],
 'parsed_components': {'street_number': ['3a', ' 5a', ' 8a'],
  'street_name': [' Hesketh Drive'],
  'city': [' Wigan'],
  'postcode': ['WN6 0SF']}}

In [61]:
number = 48

print(results['results'][number]['original_address'])
results['results'][number]

1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 18, 19, 21 and 22 Heath House Close, Lowton, and 106 and 108 Stone Cross Lane North, Lowton, Warrington


{'row_index': 48,
 'datapoint_id': 48,
 'original_address': '1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 18, 19, 21 and 22 Heath House Close, Lowton, and 106 and 108 Stone Cross Lane North, Lowton, Warrington',
 'entities': [{'type': 'street_number',
   'text': '1',
   'start': 0,
   'end': 1,
   'confidence': 0.9999207},
  {'type': 'street_number',
   'text': ' 2',
   'start': 2,
   'end': 4,
   'confidence': 0.99931705},
  {'type': 'street_number',
   'text': ' 3',
   'start': 5,
   'end': 7,
   'confidence': 0.99909544},
  {'type': 'street_number',
   'text': ' 4',
   'start': 8,
   'end': 10,
   'confidence': 0.9993315},
  {'type': 'street_number',
   'text': ' 5',
   'start': 11,
   'end': 13,
   'confidence': 0.99914265},
  {'type': 'street_number',
   'text': ' 6',
   'start': 14,
   'end': 16,
   'confidence': 0.9993337},
  {'type': 'street_number',
   'text': ' 7',
   'start': 17,
   'end': 19,
   'confidence': 0.99954295},
  {'type': 'street_number',
   'text': ' 8',
   'start

In [13]:
def spread_address_labels2(df, all_multi_ids):
    """
    Reshape the long entity table into one row per address, with one column
    per label class.

    Only rows whose ``datapoint_id`` is in ``all_multi_ids`` are kept. When an
    address has several entities with the same label, the first ``label_text``
    is the one retained. The address string (``text``) is re-attached as the
    last column.
    """
    # Restrict to the requested addresses; work on a copy of the selection.
    selected = df.loc[df["datapoint_id"].isin(all_multi_ids)].copy()

    # Long -> wide: datapoint_id becomes a regular column again after the pivot.
    wide = selected.pivot_table(
        index="datapoint_id",
        columns="label",
        values="label_text",
        aggfunc="first",
    ).reset_index()

    # One (datapoint_id, text) pair per address, used to restore the address text.
    address_text = selected[["datapoint_id", "text"]].drop_duplicates()

    return wide.merge(address_text, on="datapoint_id", how="left")

In [27]:
import numpy as np

def identify_multi_addresses2(all_entities):
    """
    Identify different types of address patterns and add classification columns.

    This function analyzes the all_entities dataframe to classify addresses as:
    - Single addresses (one property)
    - Multi-unit addresses (multiple units/flats within a property)
    - Multi-property addresses (multiple separate properties)

    Args:
        all_entities (pd.DataFrame): DataFrame containing address entities with columns:
            datapoint_id, text, label, label_text

    Returns:
        pd.DataFrame: Original dataframe with added boolean columns:
            - is_single_address: True for single property addresses
            - is_multi_unit: True for addresses with multiple units/flats
            - is_multi_property: True for addresses with multiple properties
            - is_multi_address: True for any multi-address (unit or property)
    """

    # Matches a whole label such as "2-24" or "2 to 4".
    xx_to_yy_regex = r"^\d+\s?(?:-|to)\s?\d+$"

    # One row per address. NOTE: this frame keeps the row labels of
    # ``all_entities`` as its index, not datapoint_id.
    multi_check_df = all_entities[["datapoint_id", "text"]].drop_duplicates()
    multi_check_df["comma_count"] = multi_check_df["text"].str.count(",")

    # Identify land/plot and business addresses (typically single properties).
    # na=False keeps both flags strictly boolean even if an address is missing.
    multi_check_df["land"] = multi_check_df["text"].str.contains(
        r"^(?:land|plot|airspace|car|parking)", case=False, na=False
    )

    multi_check_df["business"] = multi_check_df["text"].str.contains(
        r"cinema|hotel|office|centre|\bpub|holiday\s?inn|travel\s?lodge|business|cafe|^shop| shop|restaurant|home|^stores?\b|^storage\b|company|ltd|limited|plc|retail|leisure|industrial|hall of|trading|commercial|works",
        case=False,
        na=False,
    )

    # Count occurrences of each label type per datapoint
    label_counts = (
        all_entities[["datapoint_id", "label"]]
        .groupby(["datapoint_id", "label"])
        .size()
        .unstack(fill_value=0)
    )

    # Merge the per-label counts onto the per-address frame
    multi_check_df = multi_check_df.merge(
        label_counts, left_on="datapoint_id", right_index=True, how="left"
    ).fillna(0)

    # Count xx-to-yy patterns in street numbers and unit IDs
    is_range = all_entities["label_text"].str.contains(xx_to_yy_regex, na=False)

    def _range_counts(label):
        """Number of xx-to-yy entities of `label` per address, in multi_check_df row order."""
        per_id = (
            all_entities[is_range & (all_entities["label"] == label)]
            .groupby("datapoint_id")
            .size()
        )
        # The counts are keyed by datapoint_id while multi_check_df is indexed
        # by the original row labels. Assigning the Series directly would align
        # on those unrelated labels (wrong rows / NaN), so the counts are
        # looked up per datapoint_id and assigned positionally instead.
        return per_id.reindex(multi_check_df["datapoint_id"], fill_value=0).to_numpy()

    multi_check_df["xx_to_yy_street_counts"] = _range_counts("street_number")
    multi_check_df["xx_to_yy_unit_counts"] = _range_counts("unit_id")

    # Ensure required columns exist (a label absent from the whole dataset has
    # no count column after the merge)
    required_columns = ["building_name", "unit_id", "street_number"]
    for col in required_columns:
        if col not in multi_check_df.columns:
            multi_check_df[col] = 0

    # Apply classification logic. np.select takes the first matching condition,
    # so the order below is the rule priority.
    conditions = [
        multi_check_df["land"],  # Land/plot addresses
        multi_check_df["business"],  # Business addresses
        (multi_check_df["building_name"] == 1) & (multi_check_df["unit_id"] == 0),  # Single building, no units
        (multi_check_df["xx_to_yy_unit_counts"] > 0) | (multi_check_df["xx_to_yy_street_counts"] > 0),  # Range patterns
        multi_check_df["street_number"] > 1,  # Multiple street numbers
        multi_check_df["unit_id"] > 1,  # Multiple unit IDs
        (multi_check_df["street_number"] <= 1) & (multi_check_df["xx_to_yy_street_counts"] == 0) & (multi_check_df["unit_id"] <= 1),  # Single address indicators
    ]

    choices = ["single", "single", "single", "multi", "multi", "multi", "single"]
    multi_check_df["address_class"] = np.select(conditions, choices, default="unknown")

    # Create boolean classification columns
    multi_check_df["is_single_address"] = multi_check_df["address_class"] == "single"
    multi_check_df["is_multi_unit"] = (
        (multi_check_df["address_class"] == "multi") &
        (multi_check_df["unit_id"] > 0)
    )
    multi_check_df["is_multi_property"] = (
        (multi_check_df["address_class"] == "multi") &
        (multi_check_df["unit_id"] == 0)
    )
    multi_check_df["is_multi_address"] = (
        multi_check_df["is_multi_unit"] |
        multi_check_df["is_multi_property"]
    )

    # Select only the columns we need to merge back
    classification_columns = [
        "datapoint_id", "is_single_address", "is_multi_unit",
        "is_multi_property", "is_multi_address"
    ]

    # Merge classification back to original dataframe
    result = all_entities.merge(
        multi_check_df[classification_columns],
        on="datapoint_id",
        how="left"
    )

    return result

In [None]:
from enhance_ocod.address_parsing import (identify_multi_addresses, spread_address_labels, 
ensure_required_columns, add_backfill_blockers, backfill_address_labels, final_parsed_addresses)

# Work on a copy so the entity frame built earlier (`test_0`) is left untouched.
all_entities = test_0.copy()

# Label columns passed to ensure_required_columns below.
required_columns = [
    "building_name",
    "street_name",
    "street_number",
    "number_filter",
    "unit_id",
    "unit_type",
    "city",
    "postcode",
]

# Continue with existing logic
# The library's identify_multi_addresses is unpacked into three values here,
# unlike identify_multi_addresses2 above, which returns a single DataFrame.
multi_unit_id, multi_property, all_multi_ids = identify_multi_addresses(
    all_entities
)
# `df12` (library implementation) is not used again in this cell; only `df1`
# (the notebook's spread_address_labels2) feeds the steps below.
# NOTE(review): presumably kept for a side-by-side comparison — confirm or remove.
df12 = spread_address_labels(all_entities, all_multi_ids)
df1 = spread_address_labels2(all_entities, all_multi_ids)
# The columns are filled with an empty string as at the moment all columns should be strings
# Ensuring string prevents errors later when cleaning is performed on street_name and other variables
# This is not being changed to default behaviour as I may need to implement more significant changes later
df2 = ensure_required_columns(df1, required_columns, "")

# Blockers prevent the filling of wrong information. As an example if a building is going to back fill up
# previous addresses it should not back fill past another street as this is highly unlikely to be the same building
df3 = add_backfill_blockers(df2)
df4 = backfill_address_labels(df3)



In [34]:
test = identify_multi_addresses2(all_entities)

In [35]:
test

Unnamed: 0,datapoint_id,label,start,end,text,label_text,label_id_count,is_single_address,is_multi_unit,is_multi_property,is_multi_address
0,0,building_name,0,25,"Westleigh Lodge Care Home, Nel Pan Lane, Leigh...",Westleigh Lodge Care Home,0,True,False,False,False
1,0,street_name,26,39,"Westleigh Lodge Care Home, Nel Pan Lane, Leigh...",Nel Pan Lane,0,True,False,False,False
2,0,city,40,46,"Westleigh Lodge Care Home, Nel Pan Lane, Leigh...",Leigh,0,True,False,False,False
3,0,postcode,48,55,"Westleigh Lodge Care Home, Nel Pan Lane, Leigh...",WN7 5JT,0,True,False,False,False
4,1,unit_type,0,4,"Flat 1, 1a Canal Street, Manchester (M1 3HE)",Flat,0,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
421461,94087,unit_id,7,10,"Storage 17, Discovery Dock Apartments East, 3 ...",17,0,True,False,False,False
421462,94087,building_name,11,42,"Storage 17, Discovery Dock Apartments East, 3 ...",Discovery Dock Apartments East,0,True,False,False,False
421463,94087,street_name,45,63,"Storage 17, Discovery Dock Apartments East, 3 ...",South Quay Square,0,True,False,False,False
421464,94087,city,64,71,"Storage 17, Discovery Dock Apartments East, 3 ...",London,0,True,False,False,False


In [59]:
# Build the final parsed-address table from the back-filled frame (`df4`) and
# the multi-address id sets produced by identify_multi_addresses.
# NOTE: the name `df` is reused further down the notebook for the price-paid
# sample, so what `df` holds depends on which cell ran last.
df = final_parsed_addresses(
    df4,
    all_entities,
    multi_property,
    multi_unit_id,
    all_multi_ids,
    expand_addresses=True,
)

i= 1000  expand time,0.073 filter time0.049 make_dataframe_time 0.495
i= 2000  expand time,0.147 filter time0.098 make_dataframe_time 0.992
i= 3000  expand time,0.221 filter time0.149 make_dataframe_time 1.493


 '17' '18' '19' '20' '21' '22' '23' '24' '25' '26' '27' '28' '29' '30'
 '31' '32' '33' '34' '35' '36' '120' '121' '122' '123' '124' '125' '126'
 '127' '128' '129' '130' '131' '132' '65' '66' '67' '68' '69' '70' '71'
 '72' '73' '74' '75' '76' '77' '78' '79' '80' '81' '82' '83' '84' '85'
 '86' '87' '88' '89' '90' '91' '92' '93' '94' '1' '2' '3' '4' '5' '6' '7'
 '8' '9' '10' '11' '12' '13' '2' '4' '6' '8' '10' '12' '14' '16' '18' '20'
 '22' '1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12' '2' '3' '4' '5'
 '6' '7' '8' '9' '10' '11' '12' '13' '14' '15' '16' '17' '18' '19' '20'
 '21' '22' '23' '24' '25' '26' '27' '28' '29' '30' '31' '32' '33' '34'
 '35' '36' '37' '38' '39' '40' '41' '42' '43' '44' '45' '46' '47' '48'
 '49' '50' '51' '52' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12' '13'
 '14' '15' '16' '17' '18' '19' '20' '21' '22' '23' '24' '25' '26' '27'
 '28' '29' '30' '31' '32' '33' '34' '35' '36' '37' '38' '39' '40' '41'
 '42' '43' '44' '45' '46' '47' '48' '49' '50' '51' '52' '53' '

In [60]:
df

Unnamed: 0,index,datapoint_id,building_name,city,number_filter,postcode,street_name,street_number,unit_id,unit_type,text
0,0,13,block,Peterborough,even,,Miller Way,2,,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (..."
1,0,13,block,Peterborough,even,,Miller Way,4,,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (..."
2,0,13,block,Peterborough,even,,Miller Way,6,,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (..."
3,0,13,block,Peterborough,even,,Miller Way,8,,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (..."
4,0,13,block,Peterborough,even,,Miller Way,10,,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (..."
...,...,...,...,...,...,...,...,...,...,...,...
86447,,94083,,London,,EC3M 7AW,Lime Street,,,,"2nd Floor, 52 Lime Street, London (EC3M 7AW)"
86448,,94084,,London,,EC2M 3AD,Bishopsgate,155,,,"155 Bishopsgate, London (EC2M 3AD)"
86449,,94085,,London,,EC3M 7AF,Lime Street,,,,"9th and 10th floors, 52 Lime Street, London (E..."
86450,,94086,,London,,EC3A 7BA,Bevis Marks,,,,"part of Tenth Floor, 6 Bevis Marks, London (EC..."


In [10]:
ocod_data.loc[ocod_data['title_number']=='CB400630']

Unnamed: 0,title_number,within_title_id,unique_id,within_larger_title,tenure,unit_id,unit_type,building_name,street_number,street_name,postcode,city,district,county,region,price_paid,property_address,country_incorporated
0,CB400630,1,CB400630-1,True,Freehold,,,,2-24,Miller Way,,Peterborough,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (...",GUERNSEY
0,CB400630,2,CB400630-2,True,Freehold,,,,15-25,Hammonds Drive,,Peterborough,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (...",GUERNSEY
1,CB400630,3,CB400630-3,True,Freehold,,,,1-19,Rudd Close,,Peterborough,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (...",GUERNSEY
2,CB400630,4,CB400630-4,True,Freehold,,,,1-19,Rudd Close,,Peterborough,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (...",GUERNSEY
3,CB400630,5,CB400630-5,True,Freehold,,,,,,,Peterborough,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (...",GUERNSEY


In [6]:

ocod_data

Unnamed: 0,title_number,within_title_id,unique_id,within_larger_title,tenure,unit_id,unit_type,building_name,street_number,street_name,postcode,city,district,county,region,price_paid,property_address,country_incorporated
0,CB400630,1,CB400630-1,True,Freehold,,,,2-24,Miller Way,,Peterborough,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (...",GUERNSEY
1,SYK594298,1,SYK594298-1,True,Freehold,,,,1-25,Acorn Way,S66 3XH,Rotherham,ROTHERHAM,SOUTH YORKSHIRE,YORKS AND HUMBER,,"1-25 (excluding 13) Acorn Way, Sunnyside, Roth...",JERSEY
2,MAN211565,1,MAN211565-1,True,Freehold,,,,2 to 4,Junction Close,BL6 5GN,Bolton,BOLTON,GREATER MANCHESTER,NORTH WEST,,"2 to 4 and 6 to 11 (inclusive), Junction Close...",GUERNSEY
3,BM408625,1,BM408625-1,True,Freehold,,,,33-63,Stadium Approach,,Aylesbury,BUCKINGHAMSHIRE,BUCKINGHAMSHIRE,SOUTH EAST,51756.0,"33-63 (odd), Stadium Approach, Aylesbury",GUERNSEY
4,LN169893,1,LN169893-1,True,Freehold,,,,5-7,Princes Square,W2 4NP,London,CITY OF WESTMINSTER,GREATER LONDON,GREATER LONDON,,"5-7 Princes Square, London (W2 4NP)",JERSEY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86447,AGL470128,1,AGL470128-1,False,Leasehold,,,,,Lime Street,EC3M 7AW,London,CITY OF LONDON,GREATER LONDON,GREATER LONDON,,"2nd Floor, 52 Lime Street, London (EC3M 7AW)",IRELAND
86448,AGL473126,1,AGL473126-1,False,Leasehold,,,,155,Bishopsgate,EC2M 3AD,London,CITY OF LONDON,GREATER LONDON,GREATER LONDON,,"155 Bishopsgate, London (EC2M 3AD)",JAPAN
86449,AGL473595,1,AGL473595-1,False,Leasehold,,,,,Lime Street,EC3M 7AF,London,CITY OF LONDON,GREATER LONDON,GREATER LONDON,,"9th and 10th floors, 52 Lime Street, London (E...",AUSTRALIA
86450,AGL475468,1,AGL475468-1,False,Leasehold,,,,,Bevis Marks,EC3A 7BA,London,CITY OF LONDON,GREATER LONDON,GREATER LONDON,,"part of Tenth Floor, 6 Bevis Marks, London (EC...",INDIA


In [32]:
import pandas as pd

# Column names supplied to read_csv via `names=`: the CSV is read without a
# header row, so the first line of the file is treated as data.
price_paid_headers = [
    "transaction_unique_identifier",
    "price",
    "date_of_transfer",
    "postcode",
    "property_type",
    "old_new",
    "duration",
    "paon",
    "saon",
    "street",
    "locality",
    "town",
    "district",
    "county",
    "ppd_category_type",
    "record_status",
]

# Peek at the first 500 rows only. The path is built from price_paid_dir
# (same location as the previous hard-coded absolute path) so the data root is
# defined in one place.
df = pd.read_csv(
    price_paid_dir / "price_paid_complete_may_2025.csv",
    nrows=500,
    names=price_paid_headers,
)

In [33]:
df

Unnamed: 0,transaction_unique_identifier,price,date_of_transfer,postcode,property_type,old_new,duration,paon,saon,street,locality,town,district,county,ppd_category_type,record_status
0,{7130F2C1-B3E4-41A5-9103-3DC40A6A5466},166500,1995-11-22 00:00,CM23 4PA,D,Y,F,19,,MAYFLOWER GARDENS,BISHOP'S STORTFORD,BISHOP'S STORTFORD,EAST HERTFORDSHIRE,HERTFORDSHIRE,A,A
1,{76F6D626-1D6C-4744-B12E-3A364DB17E29},59000,1995-09-27 00:00,L12 0AY,D,N,L,7,,TRENT CLOSE,LIVERPOOL,LIVERPOOL,LIVERPOOL,MERSEYSIDE,A,A
2,{E8BA1D5E-93DC-4896-AAA2-3A365F979FD9},118000,1995-12-15 00:00,SL3 8XX,D,N,F,24,,SOUTHWOLD SPUR,SLOUGH,SLOUGH,SLOUGH,SLOUGH,A,A
3,{32158129-2288-4763-8DEA-487BEBDF257F},48500,1995-01-27 00:00,CV12 8TF,S,N,F,5,,CHELTENHAM CLOSE,BEDWORTH,BEDWORTH,NUNEATON AND BEDWORTH,WARWICKSHIRE,A,A
4,{CB756459-6205-4740-9184-487BF87F37BD},27500,1995-04-20 00:00,SY11 1HP,S,N,F,105,,BEATRICE STREET,OSWESTRY,OSWESTRY,OSWESTRY,SHROPSHIRE,A,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,{27A3B2EB-5B33-421E-A6F6-3DC7F14D67F9},52500,1995-08-11 00:00,PO30 3AT,T,Y,F,16,,CALLOWAY CLOSE,ARRETON,NEWPORT,ISLE OF WIGHT,ISLE OF WIGHT,A,A
496,{BF46287F-5625-423A-BEE7-3A3A33104AE3},71500,1995-07-20 00:00,LS29 0TJ,S,Y,F,7,,BROADFIELD WAY,ADDINGHAM,ILKLEY,BRADFORD,WEST YORKSHIRE,A,A
497,{F296BE23-80D0-4EBB-9DFE-3A3A3E1CB239},46950,1995-06-30 00:00,DL6 1HF,T,N,F,245,,ASHLANDS ROAD,NORTHALLERTON,NORTHALLERTON,HAMBLETON,NORTH YORKSHIRE,A,A
498,{043606DE-7F63-47D6-BC9B-3A3A4B937397},82500,1995-07-31 00:00,NR16 2QH,D,Y,F,40,,BREWSTERS,EAST HARLING,NORWICH,BRECKLAND,NORFOLK,A,A


# Geolocation

In [40]:

###############
# Geolocate
###############
print(f"Geolocating {zip_file.name}...")

# NOTE(review): `ocod_data` is overwritten with the preprocessed frame, so a
# re-run passes the already-preprocessed data back in — confirm that is safe.
ocod_data = preprocess_expanded_ocod_data(ocod_data, postcode_district_lookup)


Geolocating OCOD_FULL_2022_02.zip...


In [41]:
# Load the price-paid data for a fixed list of years.
# NOTE(review): the years are hard-coded while the snapshot being processed is
# OCOD_FULL_2022_02 — confirm these are the intended years for this file.
price_paid_df = load_and_process_pricepaid_data(
    file_path=str(price_paid_path),
    processed_dir=processed_price_paid_dir,
    postcode_district_lookup=postcode_district_lookup,
    years_needed=[2024, 2023, 2022],
)

# Overwrites `ocod_data` with the frame returned by add_missing_lads_ocod.
ocod_data = add_missing_lads_ocod(ocod_data, price_paid_df) 

Processed data found. Skipping preprocessing.


Loaded 737255 records for year 2024
Loaded 845990 records for year 2023
Loaded 1068645 records for year 2022


In [6]:
from enhance_ocod.price_paid_process import building_gazetteer

# Cache location for the building gazetteer. The directory and file names are
# spelled "gazeteer" on disk; keep that spelling so an existing cache is found.
data_folder = SCRIPT_DIR.parent / "data" / "gazeteer"
output_file = data_folder / "building_gazeteer.parquet"

# Build-once cache: reuse the parquet file when present, otherwise build the
# gazetteer and save it for next time.
# Check if the file exists
if output_file.exists():
    # Load the existing file
    building_gazetteer_df = pd.read_parquet(output_file)
    print(f"Loaded existing file: {output_file}")
else:
    # Create the folder if it doesn't exist
    data_folder.mkdir(parents=True, exist_ok=True)

    # Run the function to create the gazetteer
    building_gazetteer_df = building_gazetteer()

    # Save the created dataframe
    building_gazetteer_df.to_parquet(output_file)
    print(f"Created new building gazetteer and saved to: {output_file}")

31it [00:58,  1.89s/it]


Created new building gazetteer and saved to: /teamspace/studios/this_studio/enhance_ocod/data/gazeteer/building_gazeteer.parquet


In [28]:
building_gazetteer_df.loc[building_gazetteer_df['building_name'].str.contains('shalewood')]

Unnamed: 0,building_name,oa11cd,lsoa11cd,msoa11cd,lad11cd


In [43]:
ocod_data.loc[ocod_data['property_address'].str.contains('Deals Gateway')]

Unnamed: 0,title_number,within_title_id,unique_id,within_larger_title,tenure,unit_id,unit_type,building_name,street_number,street_name,...,price_paid,property_address,country_incorporated,postcode2,oa11cd,lsoa11cd,msoa11cd,street_number2,street_name2,lad11cd
149,TGL411644,1,TGL411644-1,False,Leasehold,101-114,,,,Deals Gateway,...,,"Ground to ninth Floor Flats being 101-114, 201...",JERSEY,,,,,,DealGateway,E09000023
40791,TGL227654,1,TGL227654-1,False,Leasehold,702,Flat,Alaska Building,,Deals Gateway,...,359160.0,"Flat 702, Alaska Building, Deals Gateway, Lond...",BRITISH VIRGIN ISLANDS,se137qu,E00174025,E01033325,E02000660,,DealGateway,E09000023
58496,TGL227195,1,TGL227195-1,False,Leasehold,513,Flat,Alaska Building,,Deals Gateway,...,243780.0,"Flat 513 Alaska Building, Deals Gateway, Londo...",BRITISH VIRGIN ISLANDS,se137qu,E00174025,E01033325,E02000660,,DealGateway,E09000023
61964,TGL432332,1,TGL432332-1,False,Leasehold,,,Washington Building,,Deals Gateway,...,,"Washington Building, Deals Gateway, London (SE...",JERSEY,se137se,E00174015,E01033325,E02000660,,DealGateway,E09000023
63557,TGL432327,1,TGL432327-1,False,Leasehold,,,Brooklyn Building,,Blackheath Road,...,,"Brooklyn Building, 32 Blackheath Road, London ...",JERSEY,se108ga,E00176525,E01001644,E02006931,,BlackheathRoad,E09000011


In [21]:
ocod_data.loc[ocod_data['postcode2'].isna() & ocod_data['building_name'].notna()].copy()

Unnamed: 0,title_number,within_title_id,unique_id,within_larger_title,tenure,unit_id,unit_type,building_name,street_number,street_name,...,price_paid,property_address,country_incorporated,postcode2,oa11cd,lsoa11cd,msoa11cd,street_number2,street_name2,lad11cd
3,MAN214542,1,MAN214542-1,False,Freehold,,,Shalewood Court,2-12,,...,3902021.0,"2-12 (even) Shalewood Court, 1-11 (odd) Shalem...",GUERNSEY,,,,,12,,E08000010
4,NGL783077,1,NGL783077-1,False,Freehold,,,Staple Inn Hall,335 to 338,,...,,"335 to 338 High Holborn, 1 to 4 Holborn, Stapl...",CYPRUS,,,,,338,,E09000001
11,BK455927,1,BK455927-1,False,Freehold,,,Court,33 to 37,,...,,"33 to 37 (Inclusive),Blenheim Court, and 130 t...",GUERNSEY,,,,,37,,E06000040
24,CH613311,1,CH613311-1,False,Freehold,1-10,lands,Merlin Court,,,...,,"1-10 (incl) and 12 Merlin Court, Crewe and Apa...",GUERNSEY,,,,,,,E06000049
45,SL71971,1,SL71971-1,False,Freehold,,land,Colemere House Farm,1,,...,,"Colemere House Farm, Spring Farm, Kenwick Farm...",BRITISH VIRGIN ISLANDS,,,,,1,,E06000051
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20077,AGL102462,1,AGL102462-1,False,Freehold,,,Hendon Delivery Office,,Brampton Grove,...,4000000.0,"Hendon Delivery Office, 144 Brampton Grove, Lo...",GUERNSEY,,,,,,BramptonGrove,E09000003
20078,AGL121548,1,AGL121548-1,False,Leasehold,,Flat,First,,High Street,...,247500.0,"First, Second and Third Floor Flat, 174 High S...",JERSEY,,,,,,HighStreet,E09000009
20082,AGL345741,1,AGL345741-1,False,Freehold,,land,Oakleigh Court,,Church Hill Road,...,,"land at Oakleigh Court, Church Hill Road, Barnet",GUERNSEY,,,,,,ChurchHillRoad,E09000003
20084,AGL355106,1,AGL355106-1,False,Leasehold,,,Queensbridge House,,Upper Thames Street,...,411641.0,"Queensbridge House, Upper Thames Street, London",JERSEY,,,,,,UpperThameStreet,E09000001


In [26]:
# Experiment: try to locate rows that have a building name but no `postcode2`
# by looking the building up in the gazetteer within the same local authority.
temp_ocod = ocod_data.loc[ocod_data['postcode2'].isna() & ocod_data['building_name'].notna()].copy()
# Lower-cased before the join.
# NOTE(review): presumably the gazetteer's building_name is lower case too — verify.
temp_ocod['building_name'] = temp_ocod['building_name'].str.lower()
# Left join keeps every unlocated row; gazetteer columns that clash with
# existing ones get the "_lookup" suffix.
# NOTE(review): if the gazetteer holds several rows per (building_name, lad11cd),
# this join will duplicate OCOD rows — confirm key uniqueness.
temp_ocod = temp_ocod.merge(building_gazetteer_df, on = ['building_name', 'lad11cd'], suffixes = ("",'_lookup'), how = 'left')
temp_ocod.loc[:, ['building_name','lad11cd', 'lsoa11cd', 'lsoa11cd_lookup', 'property_address']]

Unnamed: 0,building_name,lad11cd,lsoa11cd,lsoa11cd_lookup,property_address
0,shalewood court,E08000010,,,"2-12 (even) Shalewood Court, 1-11 (odd) Shalem..."
1,staple inn hall,E09000001,,,"335 to 338 High Holborn, 1 to 4 Holborn, Stapl..."
2,court,E06000040,,,"33 to 37 (Inclusive),Blenheim Court, and 130 t..."
3,merlin court,E06000049,,,"1-10 (incl) and 12 Merlin Court, Crewe and Apa..."
4,colemere house farm,E06000051,,,"Colemere House Farm, Spring Farm, Kenwick Farm..."
...,...,...,...,...,...
5721,hendon delivery office,E09000003,,,"Hendon Delivery Office, 144 Brampton Grove, Lo..."
5722,first,E09000009,,,"First, Second and Third Floor Flat, 174 High S..."
5723,oakleigh court,E09000003,,,"land at Oakleigh Court, Church Hill Road, Barnet"
5724,queensbridge house,E09000001,,,"Queensbridge House, Upper Thames Street, London"


In [33]:
temp_ocod.loc[temp_ocod['building_name'].str.contains('arizona'), ['building_name','lad11cd', 'lsoa11cd', 'lsoa11cd_lookup', 'property_address']]

Unnamed: 0,building_name,lad11cd,lsoa11cd,lsoa11cd_lookup,property_address


In [None]:
# The matched frame is stored under a new name (`ocod_data2`); `ocod_data`
# itself is left unchanged by this cell.
ocod_data2 = street_and_building_matching(ocod_data, price_paid_df, voa_businesses)

In [None]:






# Continue from the street/building-matched frame (`ocod_data2`, produced by
# the previous cell). Passing `ocod_data` here would silently discard that
# matching stage, because its result was never written back to `ocod_data`.
ocod_data = substreet_matching(ocod_data2, price_paid_df, voa_businesses)

# Clean up price paid data
# NOTE: after this, cells that need `price_paid_df` must reload it first.
del price_paid_df
gc.collect()

###########
# Classify
###########
print(f"Classifying {zip_file.name}...")
ocod_data = counts_of_businesses_per_oa_lsoa(ocod_data, voa_businesses)
ocod_data = voa_address_match_all_data(ocod_data, voa_businesses)

ocod_data = classification_type1(ocod_data)
ocod_data = classification_type2(ocod_data)

ocod_data = contract_ocod_after_classification(
    ocod_data, class_type="class2", classes=["residential"]
)

# Columns written to the output file, in output order.
columns = [
    "title_number",
    "within_title_id",
    "within_larger_title",
    "unique_id",
    "unit_id",
    "unit_type",
    "building_name",
    "street_number",
    "street_name",
    "postcode",
    "city",
    "district",
    "region",
    "property_address",
    "oa11cd",
    "lsoa11cd",
    "msoa11cd",
    "lad11cd",
    "country_incorporated",
    "class",
    "class2",
]

# Keep only the output columns and rename the two "within title" fields.
ocod_data = ocod_data.loc[:, columns].rename(
    columns={"within_title_id": "nested_id", "within_larger_title": "nested_title"}
)
# Save results
ocod_data.to_parquet(out_path)
print(f"Saved processed data to {out_path}")

print("All files processed.")


# New parsing process

In [70]:
import pandas as pd
from typing import List, Dict, Any
import multiprocessing as mp
from functools import partial
import numpy as np

# Canonical output column order, kept at module level so that every parsing
# function in this cell emits the same layout.
DESIRED_COLUMN_ORDER = [
    'unit_type', 'unit_id', 'number_filter', 'building_name',
    'street_number', 'street_name', 'postcode', 'city',
    'datapoint_id', 'original_address',
]


def parse_single_address(address_data: Dict[str, Any]) -> pd.DataFrame:
    """
    Expand one tagged address into one DataFrame row per unit / street number.

    Entities are walked in order of their ``start`` offset. Every ``unit_id``
    and every ``street_number`` produces a row; that row inherits the more
    general components (lower hierarchy level) from the context seen so far
    merged with the entities that still lie ahead. When the address has
    neither units nor street numbers a single row is built from whatever
    context was collected.

    Args:
        address_data: Dictionary with the keys 'entities' (list of dicts with
            'type', 'text' and 'start'), 'datapoint_id', 'row_index' and
            'original_address'.

    Returns:
        DataFrame with the columns of DESIRED_COLUMN_ORDER, in that order.
        'row_index' is looked up but is not part of the returned columns.

    Raises:
        KeyError: if one of the required keys is missing from address_data.
    """
    # Smaller level = more general component. 'unit_type' is never inherited
    # forward; it is attached to a unit_id by searching backwards.
    levels = {
        'city': 0,
        'postcode': 1,
        'street_name': 2,
        'street_number': 3,
        'building_name': 4,
        'number_filter': 5,
        'unit_id': 6,
        'unit_type': 7,
    }

    raw_entities = address_data['entities']

    # Nothing was tagged: emit a single metadata-only row.
    if not raw_entities:
        blank = {name: [None] for name in DESIRED_COLUMN_ORDER}
        blank.update(
            datapoint_id=[address_data['datapoint_id']],
            row_index=[address_data['row_index']],
            original_address=[address_data['original_address']],
        )
        return pd.DataFrame(blank)[DESIRED_COLUMN_ORDER]

    ordered = sorted(raw_entities, key=lambda ent: ent['start'])
    total = len(ordered)

    running_context = {}   # entity type -> most recent entity dict of that type
    rows = []

    def preceding_unit_type(unit_entity):
        """Return the unit_type entity that starts before ``unit_entity``, if any."""
        cutoff = unit_entity['start']
        return next(
            (
                candidate
                for candidate in reversed(ordered)
                if candidate['start'] < cutoff and candidate['type'] == 'unit_type'
            ),
            None,
        )

    def build_row(primary, context):
        """Row for ``primary`` plus every more general component found in ``context``."""
        kind = primary['type']
        row = {kind: primary['text'].strip()}

        if kind == 'unit_id':
            label = preceding_unit_type(primary)
            if label:
                row['unit_type'] = label['text'].strip()

        rank = levels[kind]
        for name, level in levels.items():
            if level < rank and name in context:
                row[name] = context[name]['text'].strip()
        return row

    def context_looking_ahead(first):
        """Copy of the running context overlaid with the entities from ``first`` on."""
        merged = running_context.copy()
        for upcoming in ordered[first:]:
            if upcoming['type'] != 'unit_type':
                merged[upcoming['type']] = upcoming
        return merged

    pos = 0
    while pos < total:
        current = ordered[pos]
        kind = current['type']

        if kind == 'unit_type':
            # Picked up later through preceding_unit_type().
            pos += 1

        elif kind == 'unit_id':
            rows.append(build_row(current, context_looking_ahead(pos + 1)))
            pos += 1

        elif kind == 'street_number':
            # Gather the run of consecutive street numbers.
            run_start = pos
            while pos < total and ordered[pos]['type'] == 'street_number':
                pos += 1
            number_run = ordered[run_start:pos]

            lookahead = context_looking_ahead(run_start)

            # Scan forward to the first street_name/postcode: it closes the run
            # and the main cursor resumes right after it. Other context-type
            # entities met on the way are recorded as well.
            for offset, follower in enumerate(ordered[pos:], start=pos):
                follower_kind = follower['type']
                if follower_kind in ('street_name', 'postcode'):
                    running_context[follower_kind] = follower
                    lookahead[follower_kind] = follower
                    pos = offset + 1
                    break
                if follower_kind not in ('unit_type', 'street_number', 'unit_id'):
                    running_context[follower_kind] = follower
                    lookahead[follower_kind] = follower

            for number in number_run:
                # Make the number available to later, more specific entities.
                running_context['street_number'] = number
                rows.append(build_row(number, lookahead))

        else:
            running_context[kind] = current
            pos += 1

    # Context was collected but no unit / street number produced a row.
    if not rows and running_context:
        fallback = {
            name: running_context[name]['text'].strip()
            for name in ('city', 'postcode', 'street_name', 'building_name', 'number_filter')
            if name in running_context
        }
        if fallback:
            rows.append(fallback)

    # Still nothing: keep one placeholder row so the address is not lost.
    if not rows:
        rows = [{}]

    frame = pd.DataFrame(rows)

    frame['datapoint_id'] = address_data['datapoint_id']
    frame['row_index'] = address_data['row_index']
    frame['original_address'] = address_data['original_address']

    for name in DESIRED_COLUMN_ORDER:
        if name not in frame.columns:
            frame[name] = None

    return frame[DESIRED_COLUMN_ORDER]

def process_address_batch(address_data_list: List[Dict[str, Any]]) -> pd.DataFrame:
    """
    Parse a batch of addresses and stack the per-address frames.

    Each record is passed to ``parse_single_address``. A record that fails to
    parse does not abort the batch: it contributes a single row holding its
    metadata plus an ``error`` column with the exception text.

    Args:
        address_data_list: List of address data dictionaries.

    Returns:
        DataFrame with the DESIRED_COLUMN_ORDER columns first, followed by any
        extra columns (such as 'error'). An empty input yields an empty frame
        with the DESIRED_COLUMN_ORDER columns.
    """
    frames = []

    for address_data in address_data_list:
        try:
            frames.append(parse_single_address(address_data))
        except Exception as exc:
            # Read metadata with .get(): a missing key may be the very reason
            # parsing failed, and indexing it again here would raise a second,
            # uncaught KeyError and take the whole batch down.
            meta = address_data if isinstance(address_data, dict) else {}
            error_row = {col: [None] for col in DESIRED_COLUMN_ORDER}
            error_row['datapoint_id'] = [meta.get('datapoint_id')]
            error_row['original_address'] = [meta.get('original_address')]
            error_row['error'] = [str(exc)]
            # Build the frame in one step, 'error' last.
            frames.append(
                pd.DataFrame(error_row, columns=[*DESIRED_COLUMN_ORDER, 'error'])
            )

    if not frames:
        return pd.DataFrame(columns=DESIRED_COLUMN_ORDER)

    combined_df = pd.concat(frames, ignore_index=True)

    # Standard columns first, then extras (like 'error') in their existing order.
    final_columns = [col for col in DESIRED_COLUMN_ORDER if col in combined_df.columns]
    final_columns.extend(
        col for col in combined_df.columns if col not in DESIRED_COLUMN_ORDER
    )
    return combined_df[final_columns]

def parse_all_addresses(address_data_list: List[Dict[str, Any]], 
                       use_multiprocessing: bool = True, 
                       n_cores: int = None,
                       chunk_size: int = 1000,
                       show_progress: bool = True) -> pd.DataFrame:
    """
    Parse every address, fanning the work out over a process pool when useful.

    Inputs shorter than ``chunk_size`` (or any input when
    ``use_multiprocessing`` is False) are handled in the current process by a
    single ``process_address_batch`` call. Otherwise the list is cut into
    ``chunk_size``-sized slices that are mapped over a ``multiprocessing.Pool``.

    Args:
        address_data_list: List of address data dictionaries
        use_multiprocessing: Whether worker processes may be used
        n_cores: Number of worker processes (None = mp.cpu_count())
        chunk_size: Number of addresses per batch
        show_progress: Whether to print progress messages

    Returns:
        Combined DataFrame with all parsed addresses
    """

    def report(message):
        # Progress output is optional; route every message through one gate.
        if show_progress:
            print(message)

    if not address_data_list:
        return pd.DataFrame(columns=DESIRED_COLUMN_ORDER)

    total_addresses = len(address_data_list)
    report(f"Processing {total_addresses:,} addresses...")

    run_in_pool = use_multiprocessing and total_addresses >= chunk_size
    if not run_in_pool:
        report("Using single-threaded processing...")
        return process_address_batch(address_data_list)

    worker_count = mp.cpu_count() if n_cores is None else n_cores
    report(f"Using multiprocessing with {worker_count} cores...")

    chunks = [
        address_data_list[start:start + chunk_size]
        for start in range(0, total_addresses, chunk_size)
    ]
    report(f"Split into {len(chunks)} chunks of ~{chunk_size} addresses each")

    with mp.Pool(processes=worker_count) as pool:
        report("Processing chunks...")
        chunk_frames = pool.map(process_address_batch, chunks)

    report("Combining results...")

    # Only chunks that actually produced rows take part in the concat.
    populated = [frame for frame in chunk_frames if not frame.empty]

    if populated:
        combined = pd.concat(populated, ignore_index=True)
        standard = [col for col in DESIRED_COLUMN_ORDER if col in combined.columns]
        extras = [col for col in combined.columns if col not in DESIRED_COLUMN_ORDER]
        final_result = combined[standard + extras]
    else:
        final_result = pd.DataFrame(columns=DESIRED_COLUMN_ORDER)

    report(f"✅ Completed! Processed {len(final_result):,} address rows from {total_addresses:,} original addresses")

    return final_result

In [73]:
# Run the DataFrame-per-address parser over every record, 10,000 addresses per chunk.
# NOTE(review): `results` is not assigned in this part of the notebook — confirm it is loaded earlier.
out = parse_all_addresses(results['results'],
chunk_size = 10000)

Processing 94,088 addresses...
Using multiprocessing with 4 cores...
Split into 10 chunks of ~10000 addresses each
Processing chunks...
Combining results...
✅ Completed! Processed 109,695 address rows from 94,088 original addresses


In [74]:
import pandas as pd
from typing import List, Dict, Any
import multiprocessing as mp
from functools import partial
import numpy as np
import time # For timing

# Define the desired column order at module level so it's consistent everywhere
DESIRED_COLUMN_ORDER = [
    'unit_type', 'unit_id', 'number_filter', 'building_name',
    'street_number', 'street_name', 'postcode', 'city',
    'datapoint_id',  'original_address'
]

# Define hierarchy levels (lower number = higher in hierarchy, more general)
HIERARCHY = {
    'city': 0,
    'postcode': 1,
    'street_name': 2,
    'street_number': 3,
    'building_name': 4,
    'number_filter': 5,
    'unit_id': 6,
    'unit_type': 7
}

def parse_single_address_optimized(address_data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Parse a single address into a list of row dictionaries.

    Entities are processed in order of their 'start' offset. Every 'unit_id'
    and every 'street_number' yields one row. A row inherits the more general
    components (lower HIERARCHY level) from the context seen so far merged
    with the entities that still lie ahead, exactly as the DataFrame-based
    ``parse_single_address`` does. Without that look-ahead, "Flat 1, 10 High
    Street, London" would produce a unit row with no street, and a street row
    with no city.

    Args:
        address_data: Dictionary with the keys 'entities' (list of dicts with
            'type', 'text' and 'start'), 'datapoint_id', 'row_index' and
            'original_address'.

    Returns:
        List of dictionaries. Each has every DESIRED_COLUMN_ORDER key
        (None when unknown) plus 'row_index'. At least one row is returned.

    Raises:
        KeyError: if one of the required keys is missing from address_data.
    """
    entities = address_data['entities']
    datapoint_id = address_data['datapoint_id']
    row_index = address_data['row_index']
    original_address = address_data['original_address']

    def blank_row():
        """Row with every output column set to None and the metadata filled in."""
        row = {col: None for col in DESIRED_COLUMN_ORDER}
        row['datapoint_id'] = datapoint_id
        row['row_index'] = row_index
        row['original_address'] = original_address
        return row

    # Handle empty entities
    if not entities:
        return [blank_row()]

    sorted_entities = sorted(entities, key=lambda x: x['start'])

    current_context = {}    # entity type -> stripped text of the latest entity seen
    address_rows_data = []  # Collect dictionaries here

    def find_backwards_unit_type(unit_entity_start):
        """Stripped text of the unit_type that starts before the given offset, if any."""
        for candidate in reversed(sorted_entities):
            if candidate['start'] < unit_entity_start and candidate['type'] == 'unit_type':
                return candidate['text'].strip()
        return None

    def context_with_lookahead(start_index):
        """Copy of the running context overlaid with the entities from start_index on."""
        merged = dict(current_context)
        for upcoming in sorted_entities[start_index:]:
            if upcoming['type'] != 'unit_type':
                merged[upcoming['type']] = upcoming['text'].strip()
        return merged

    def create_row_dict(primary_entity_type, primary_entity_text, context_dict, unit_type_val=None):
        """Row for the primary entity plus every more general component in context_dict."""
        row = blank_row()

        # A missing primary type (context-only row) inherits everything available.
        primary_level = HIERARCHY.get(primary_entity_type, 100)
        for entity_type, level in HIERARCHY.items():
            if level < primary_level and entity_type in context_dict:
                row[entity_type] = context_dict[entity_type]

        if primary_entity_type:
            row[primary_entity_type] = primary_entity_text
        if unit_type_val:
            row['unit_type'] = unit_type_val
        return row

    i = 0
    while i < len(sorted_entities):
        entity = sorted_entities[i]
        entity_type = entity['type']
        entity_text = entity['text'].strip()

        if entity_type == 'unit_type':
            # Attached to a unit_id through find_backwards_unit_type().
            i += 1
            continue

        if entity_type == 'unit_id':
            # Units usually precede their street/postcode, so look ahead for them.
            address_rows_data.append(
                create_row_dict(
                    'unit_id', entity_text, context_with_lookahead(i + 1),
                    find_backwards_unit_type(entity['start']),
                )
            )
            i += 1

        elif entity_type == 'street_number':
            # Collect the run of consecutive street numbers.
            start_idx = i
            while i < len(sorted_entities) and sorted_entities[i]['type'] == 'street_number':
                i += 1
            street_numbers = sorted_entities[start_idx:i]

            # Snapshot taken before the boundary scan, including everything ahead.
            full_context = context_with_lookahead(start_idx)

            # The first street_name/postcode after the run closes it; the main
            # pointer resumes right after that entity.
            # NOTE(review): this jump also skips any unit_id/street_number lying
            # between the run and the boundary (same as parse_single_address) —
            # confirm that is intended.
            j = i
            while j < len(sorted_entities):
                next_entity = sorted_entities[j]
                next_type = next_entity['type']
                if next_type in ('street_name', 'postcode'):
                    next_text = next_entity['text'].strip()
                    current_context[next_type] = next_text
                    full_context[next_type] = next_text
                    i = j + 1
                    break
                if next_type not in ('unit_type', 'street_number', 'unit_id'):
                    next_text = next_entity['text'].strip()
                    current_context[next_type] = next_text
                    full_context[next_type] = next_text
                j += 1

            for street_num_entity in street_numbers:
                number_text = street_num_entity['text'].strip()
                # Make the number available to later, more specific entities.
                current_context['street_number'] = number_text
                address_rows_data.append(
                    create_row_dict('street_number', number_text, full_context)
                )

        else:
            current_context[entity_type] = entity_text
            i += 1

    # Handle case where we have context but no primary entities (street_number/unit_id)
    if not address_rows_data and current_context:
        address_rows_data.append(create_row_dict(None, None, current_context))

    # If still no rows after all processing, keep one metadata-only row.
    if not address_rows_data:
        return [blank_row()]

    return address_rows_data

def process_address_batch_optimized(address_data_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Process a batch of addresses, returning a flat list of row dictionaries.

    Each record is passed to ``parse_single_address_optimized``. A record that
    fails to parse does not abort the batch: it contributes one row holding
    its metadata plus an 'error' key with the exception text.

    Args:
        address_data_list: List of address data dictionaries.

    Returns:
        List of row dictionaries for the whole batch, in input order.
    """
    all_batch_rows = []

    for address_data in address_data_list:
        try:
            all_batch_rows.extend(parse_single_address_optimized(address_data))
        except Exception as exc:
            # Read metadata with .get(): a missing key may be the very reason
            # parsing failed, and indexing it again here would raise a second,
            # uncaught KeyError and take the whole batch down.
            meta = address_data if isinstance(address_data, dict) else {}
            error_row = {col: None for col in DESIRED_COLUMN_ORDER}
            error_row.update({
                'datapoint_id': meta.get('datapoint_id'),
                'row_index': meta.get('row_index'),
                'original_address': meta.get('original_address'),
                'error': str(exc),  # Add error information
            })
            all_batch_rows.append(error_row)

    return all_batch_rows

def parse_all_addresses_optimized(address_data_list: List[Dict[str, Any]],
                       use_multiprocessing: bool = True,
                       n_cores: int = None,
                       chunk_size: int = 1000,
                       show_progress: bool = True) -> pd.DataFrame:
    """
    Parse every address and assemble the row dictionaries into one DataFrame.

    Inputs shorter than ``chunk_size`` (or any input when
    ``use_multiprocessing`` is False) are handled in the current process.
    Otherwise the list is cut into ``chunk_size``-sized slices that are mapped
    over a ``multiprocessing.Pool``. The DataFrame is built once, from the
    full list of rows.

    Args:
        address_data_list: List of address data dictionaries
        use_multiprocessing: Whether worker processes may be used
        n_cores: Number of worker processes (None = mp.cpu_count())
        chunk_size: Number of addresses per batch
        show_progress: Whether to print progress and timing messages

    Returns:
        DataFrame with the DESIRED_COLUMN_ORDER columns first, followed by any
        extra columns found in the rows.
    """
    if not address_data_list:
        return pd.DataFrame(columns=DESIRED_COLUMN_ORDER)

    total_addresses = len(address_data_list)

    if show_progress:
        print(f"Processing {total_addresses:,} addresses...")
        start_time = time.time()

    run_in_pool = use_multiprocessing and total_addresses >= chunk_size

    if run_in_pool:
        worker_count = mp.cpu_count() if n_cores is None else n_cores

        if show_progress:
            print(f"Using multiprocessing with {worker_count} cores...")

        chunks = [
            address_data_list[offset:offset + chunk_size]
            for offset in range(0, total_addresses, chunk_size)
        ]

        if show_progress:
            print(f"Split into {len(chunks)} chunks of ~{chunk_size} addresses each")
            chunk_start_time = time.time()

        with mp.Pool(processes=worker_count) as pool:
            # One list of row dicts comes back per chunk
            rows_per_chunk = pool.map(process_address_batch_optimized, chunks)

        if show_progress:
            print(f"Chunks processed in {time.time() - chunk_start_time:.2f} seconds.")
            print("Combining results...")

        # Flatten the per-chunk lists into one list of rows
        all_rows = [row for chunk_rows in rows_per_chunk for row in chunk_rows]
    else:
        if show_progress:
            print("Using single-threaded processing...")
        all_rows = process_address_batch_optimized(address_data_list)

    if all_rows:
        # Single DataFrame construction for the whole result
        frame = pd.DataFrame(all_rows)

        # Standard columns first, extras (e.g. 'error') afterwards
        standard = [col for col in DESIRED_COLUMN_ORDER if col in frame.columns]
        extras = [col for col in frame.columns if col not in DESIRED_COLUMN_ORDER]
        final_result = frame[standard + extras]
    else:
        final_result = pd.DataFrame(columns=DESIRED_COLUMN_ORDER)

    if show_progress:
        end_time = time.time()
        print(f"✅ Completed! Processed {len(final_result):,} address rows from {total_addresses:,} original addresses in {end_time - start_time:.2f} seconds.")

    return final_result

In [75]:
# Run the list-of-dicts parser over the same records, 10,000 addresses per chunk.
# NOTE(review): `results` is not assigned in this part of the notebook — confirm it is loaded earlier.
out = parse_all_addresses_optimized(results['results'],
chunk_size = 10000)

Processing 94,088 addresses...
Using multiprocessing with 4 cores...
Split into 10 chunks of ~10000 addresses each
Chunks processed in 3.46 seconds.
Combining results...
✅ Completed! Processed 109,695 address rows from 94,088 original addresses in 3.68 seconds.
