# New address parser

In [1]:

entities =[{'type': 'unit_id',
    'text': ' 36 - 40',
    'start': 5,
    'end': 13,
    'confidence': 0.9996221},
{'type': 'number_filter',
    'text': 'even',
    'start': 15,
    'end': 19,
    'confidence': 0.999858},
{'type': 'unit_id',
    'text': ' 42-44',
    'start': 21,
    'end': 27,
    'confidence': 0.9986492},
{'type': 'building_name',
    'text': ' climb house',
    'start': 28,
    'end': 40,
    'confidence': 0.999928},
{'type': 'unit_id',
    'text': ' 1-5',
    'start': 47,
    'end': 51,
    'confidence': 0.997581},
{'type': 'building_name',
    'text': ' down buildings',
    'start': 51,
    'end': 66,
    'confidence': 0.99611604},
{'type': 'street_name',
    'text': ' chapel street',
    'start': 67,
    'end': 81,
    'confidence': 0.99992263},
{'type': 'postcode',
    'text': ' se45 6pq',
    'start': 90,
    'end': 99,
    'confidence': 0.99991864}]


In [66]:
################ ANTRHOPIC
import pandas as pd
from typing import List, Dict, Any, Optional

class AddressNode:
    def __init__(self, entity: Dict[str, Any]):
        self.entity = entity
        self.type = entity['type']
        self.text = entity['text']
        self.start = entity['start']
        self.end = entity['end']
        self.children = []
        self.parent = None
        self.unit_type = None
    
    def add_child(self, child_node):
        self.children.append(child_node)
        child_node.parent = self
    
    def get_full_address(self) -> Dict[str, str]:
        """Build complete address by traversing up to root"""
        address = {}
        current = self
        
        # Traverse up the tree collecting all components
        while current:
            address[current.type] = current.text.strip()
            if current.unit_type:
                address['unit_type'] = current.unit_type.text.strip()
            current = current.parent
        
        return address

class AddressGraph:
    def __init__(self, entities: List[Dict[str, Any]]):
        self.hierarchy_levels = {
            'city': 0,
            'postcode': 1, 
            'street_name': 2,
            'street_number': 3,
            'building_name': 4,
            'number_filter': 5,
            'unit_id': 6,
            'unit_type': 7
        }
        
        self.nodes = []
        self.node_by_type = {}
        
        self._build_graph(entities)
    
    def _build_graph(self, entities: List[Dict[str, Any]]):
        # Separate unit_types for special handling
        regular_entities = [e for e in entities if e['type'] != 'unit_type']
        unit_types = [e for e in entities if e['type'] == 'unit_type']
        
        # Create nodes for regular entities
        for entity in regular_entities:
            node = AddressNode(entity)
            self.nodes.append(node)
        
        # Sort nodes by hierarchy level, then by position
        self.nodes.sort(key=lambda x: (self.hierarchy_levels[x.type], x.start))
        
        # Group nodes by hierarchy level
        levels = {}
        for node in self.nodes:
            level = self.hierarchy_levels[node.type]
            if level not in levels:
                levels[level] = []
            levels[level].append(node)
        
        # Connect nodes level by level
        sorted_levels = sorted(levels.keys())
        
        for i, current_level in enumerate(sorted_levels[1:], 1):  # Skip level 0 (roots)
            parent_level = sorted_levels[i-1]
            
            for child in levels[current_level]:
                # Find best parent from previous level
                best_parent = self._find_best_parent(child, levels[parent_level])
                if best_parent:
                    best_parent.add_child(child)
                else:
                    # If no parent found in immediate level, look further up
                    for j in range(i-2, -1, -1):
                        best_parent = self._find_best_parent(child, levels[sorted_levels[j]])
                        if best_parent:
                            best_parent.add_child(child)
                            break
        
        # Handle unit types
        self._connect_unit_types(unit_types)
    
    def _find_best_parent(self, child: AddressNode, potential_parents: List[AddressNode]) -> Optional[AddressNode]:
        """Find the best parent for a child node"""
        valid_parents = []
        
        for parent in potential_parents:
            # For addresses, higher hierarchy (parent) often comes AFTER lower hierarchy (child)
            # Child should come before parent in the text
            if child.start < parent.start:  
                distance = parent.start - child.end    
                valid_parents.append((parent, distance))
        
        if not valid_parents:
            return None
        
        # Return parent with minimum distance
        return min(valid_parents, key=lambda x: x[1])[0]
    
    def _connect_unit_types(self, unit_types: List[Dict[str, Any]]):
        """Connect unit_type entities to their corresponding unit_id nodes"""
        unit_nodes = [n for n in self.nodes if n.type == 'unit_id']
        
        for unit_node in unit_nodes:
            # Find closest preceding unit_type
            best_unit_type = None
            min_distance = float('inf')
            
            for unit_type_entity in unit_types:
                if unit_type_entity['start'] < unit_node.start:
                    distance = unit_node.start - unit_type_entity['end']
                    if distance < min_distance and distance < 30:
                        min_distance = distance
                        best_unit_type = unit_type_entity
            
            if best_unit_type:
                unit_node.unit_type = AddressNode(best_unit_type)
    
    def get_addresses(self) -> pd.DataFrame:
        """Extract all complete addresses"""
        # Find leaf nodes (nodes with no children)
        leaf_nodes = [node for node in self.nodes if not node.children]
        
        # If no leaf nodes, use all nodes (fallback)
        if not leaf_nodes:
            leaf_nodes = self.nodes
        
        addresses = []
        for leaf in leaf_nodes:
            address = leaf.get_full_address()
            addresses.append(address)
        
        if not addresses:
            return pd.DataFrame()
        
        # Create DataFrame
        df = pd.DataFrame(addresses)
        
        # Define column order
        column_order = ['unit_type', 'unit_id', 'number_filter', 'building_name',
                       'street_number', 'street_name', 'postcode', 'city']
        
        existing_columns = [col for col in column_order if col in df.columns]
        df = df[existing_columns]
        
        return df
    
    def visualize_graph(self):
        """Visualize the graph structure"""
        # Find root nodes
        roots = [node for node in self.nodes if node.parent is None]
        
        def print_tree(node, level=0):
            indent = "  " * level
            unit_info = f" (unit_type: {node.unit_type.text})" if node.unit_type else ""
            print(f"{indent}{node.type}: '{node.text.strip()}'{unit_info}")
            for child in node.children:
                print_tree(child, level + 1)
        
        print("Address Graph:")
        for root in roots:
            print_tree(root)
        
        # Also show leaf nodes for debugging
        leaves = [node for node in self.nodes if not node.children]
        print(f"\nLeaf nodes: {[(n.type, n.text.strip()) for n in leaves]}")

def parse_addresses_to_dicts(entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Parse entities and return list of dictionaries instead of DataFrame"""
    if not entities:
        return []
    
    graph = AddressGraph(entities)
    
    # Find leaf nodes
    leaf_nodes = [node for node in graph.nodes if not node.children]
    if not leaf_nodes:
        leaf_nodes = graph.nodes
    
    # Convert each leaf to dictionary
    address_dicts = []
    for leaf in leaf_nodes:
        address = leaf.get_full_address()
        address_dicts.append(address)
    
    return address_dicts

def process_addresses(address_data_list: List[Dict]) -> pd.DataFrame:
    
    all_rows = []
    
    for address_data in address_data_list:
        try:
            entities = address_data['entities']
            
            if not entities:
                continue
            
            # Get parsed addresses as dictionaries (not DataFrame)
            parsed_addresses = parse_addresses_to_dicts(entities)
            
            # Add metadata to each parsed address
            metadata = {
                'row_index': address_data['row_index'],
                'datapoint_id': address_data['datapoint_id'],
                'property_address': address_data['original_address']
            }
            
            for address_dict in parsed_addresses:
                # Merge parsed address with metadata
                full_row = {**address_dict, **metadata}
                all_rows.append(full_row)
                
        except Exception as e:
            # Log error but continue processing
            print(f"Error processing address {address_data['row_index']}: {e}")
            continue
    
    # Single DataFrame creation at the end
    if not all_rows:
        return pd.DataFrame()
    
    df = pd.DataFrame(all_rows)
    
    # Define column order (address columns first, then metadata)
    address_columns = ['unit_type', 'unit_id', 'number_filter', 'building_name',
                      'street_number', 'street_name', 'postcode', 'city']
    metadata_columns = ['datapoint_id']
    
    # Reorder columns (only include columns that exist)
    all_columns = address_columns + metadata_columns
    existing_columns = [col for col in all_columns if col in df.columns]
    df = df[existing_columns]
    
    return df

# Test
def test_with_visualization():
    entities = [
        {'type': 'unit_type', 'text': 'Flat', 'start': 0, 'end': 4, 'confidence': 0.999},
        {'type': 'unit_id', 'text': ' 1A', 'start': 4, 'end': 7, 'confidence': 0.999},
        {'type': 'unit_type', 'text': ' Flat', 'start': 8, 'end': 12, 'confidence': 0.999},
        {'type': 'unit_id', 'text': ' 1B', 'start': 12, 'end': 15, 'confidence': 0.999},
        {'type': 'street_number', 'text': ' 25', 'start': 16, 'end': 19, 'confidence': 0.999},
        {'type': 'street_name', 'text': ' Oak Street', 'start': 19, 'end': 30, 'confidence': 0.999},
        {'type': 'city', 'text': ' London', 'start': 31, 'end': 38, 'confidence': 0.999}
    ]
    
    graph = AddressGraph(entities)
    graph.visualize_graph()
    result = graph.get_addresses()
    print("\nParsed Addresses:")
    print(result)



test_with_visualization()

graph = AddressGraph(entities)
graph.visualize_graph()
result = graph.get_addresses()
print("\nParsed Addresses:")
print(result)


Address Graph:
city: 'London'
  street_name: 'Oak Street'
    street_number: '25'
      unit_id: '1A' (unit_type: Flat)
      unit_id: '1B' (unit_type:  Flat)

Leaf nodes: [('unit_id', '1A'), ('unit_id', '1B')]

Parsed Addresses:
  unit_type unit_id street_number street_name    city
0      Flat      1A            25  Oak Street  London
1      Flat      1B            25  Oak Street  London
Address Graph:
postcode: 'se45 6pq'
  street_name: 'chapel street'
    building_name: 'climb house'
      number_filter: 'even'
        unit_id: '36 - 40'
      unit_id: '42-44'
    building_name: 'down buildings'
      unit_id: '1-5'

Leaf nodes: [('unit_id', '36 - 40'), ('unit_id', '42-44'), ('unit_id', '1-5')]

Parsed Addresses:
   unit_id number_filter   building_name    street_name  postcode
0  36 - 40          even     climb house  chapel street  se45 6pq
1    42-44           NaN     climb house  chapel street  se45 6pq
2      1-5           NaN  down buildings  chapel street  se45 6pq


In [44]:
temp_ents = {'summary': {'total_addresses': 1,
  'successful_parses': 1,
  'failed_parses': 0,
  'success_rate': 1.0,
  'batch_size_used': 512},
 'results': [{'row_index': 0,
   'datapoint_id': 0,
   'original_address': 'Flats 36 - 40 (even), 42-44, climb house, Flats 1-5 down buildings, chapel street, London, se45 6pq',
   'entities': [{'type': 'unit_id',
     'text': ' 36 - 40',
     'start': 5,
     'end': 13,
     'confidence': 0.9996221},
    {'type': 'number_filter',
     'text': 'even',
     'start': 15,
     'end': 19,
     'confidence': 0.999858},
    {'type': 'unit_id',
     'text': ' 42-44',
     'start': 21,
     'end': 27,
     'confidence': 0.9986492},
    {'type': 'building_name',
     'text': ' climb house',
     'start': 28,
     'end': 40,
     'confidence': 0.999928},
    {'type': 'unit_id',
     'text': ' 1-5',
     'start': 47,
     'end': 51,
     'confidence': 0.997581},
    {'type': 'building_name',
     'text': ' down buildings',
     'start': 51,
     'end': 66,
     'confidence': 0.99611604},
    {'type': 'street_name',
     'text': ' chapel street',
     'start': 67,
     'end': 81,
     'confidence': 0.99992263},
    {'type': 'postcode',
     'text': ' se45 6pq',
     'start': 90,
     'end': 99,
     'confidence': 0.99991864}],
   'parsed_components': {'unit_id': [' 36 - 40', ' 42-44', ' 1-5'],
    'number_filter': ['even'],
    'building_name': [' climb house', ' down buildings'],
    'street_name': [' chapel street'],
    'postcode': [' se45 6pq']}}]}

In [46]:
process_addresses(temp_ents['results'])


Unnamed: 0,unit_id,number_filter,building_name,street_name,postcode,row_index,datapoint_id,property_address
0,36 - 40,even,climb house,chapel street,se45 6pq,0,0,"Flats 36 - 40 (even), 42-44, climb house, Flat..."
1,42-44,,climb house,chapel street,se45 6pq,0,0,"Flats 36 - 40 (even), 42-44, climb house, Flat..."
2,1-5,,down buildings,chapel street,se45 6pq,0,0,"Flats 36 - 40 (even), 42-44, climb house, Flat..."


# New pipeline approach

In [55]:
from enhance_ocod.address_parsing import (
    load_and_prep_OCOD_data,
    parsing_and_expansion_process,
    post_process_expanded_data,
    load_postcode_district_lookup,
)
from enhance_ocod.locate_and_classify import (
    preprocess_expanded_ocod_data,
    add_missing_lads_ocod,
    load_voa_ratinglist,
    street_and_building_matching,
    substreet_matching,
    counts_of_businesses_per_oa_lsoa,
    voa_address_match_all_data,
    classification_type1,
    classification_type2,
    contract_ocod_after_classification,
)
from enhance_ocod.inference import convert_to_entity_dataframe
from enhance_ocod.price_paid_process import load_and_process_pricepaid_data
from pathlib import Path
from tqdm import tqdm
import time
import gc  # Add for memory management
import pandas as pd
import numpy as np
import pickle
# There is a warning related to bfill and ffill which is basically internal to pandas so silencing here
import warnings



warnings.filterwarnings("ignore", message=".*Downcasting object dtype arrays.*")

SCRIPT_DIR = Path("/teamspace/studios/this_studio/enhance_ocod/notebooks")

# ====== CONSTANT PATHS AND SETTINGS ======
input_dir = SCRIPT_DIR.parent / "data" / "ocod_history"
output_dir = SCRIPT_DIR.parent / "data" / "ocod_history_processed2"
model_path = (
    SCRIPT_DIR.parent / "models" / "address_parser_original_fullset" / "final_model"
)


ONSPD_dir = SCRIPT_DIR.parent / "data" / "onspd"
ONSPD_files = list(ONSPD_dir.glob("*"))
ONSPD_path = ONSPD_files[0] if ONSPD_files else None

price_paid_dir = SCRIPT_DIR.parent / "data" / "price_paid_data"
price_paid_files = list(price_paid_dir.glob("*"))
price_paid_path = price_paid_files[0] if price_paid_files else None


voa_dir = SCRIPT_DIR.parent / "data" / "voa"
voa_files = list(voa_dir.glob("*"))
voa_path = voa_files[0] if voa_files else None


processed_price_paid_dir = SCRIPT_DIR.parent / "data" / "processed_price_paid"
output_dir.mkdir(parents=True, exist_ok=True)

parsed_results_dir = SCRIPT_DIR.parent / "data" / "parsed_ocod_dicts2"
parsed_results_dir.mkdir(parents=True, exist_ok=True)

# List of all zip files in input_dir
#
# TESTING!!! only 10 files!
#
all_files = sorted([f for f in input_dir.glob("OCOD_FULL_*.zip")])


print("Loading common reference data...")
postcode_district_lookup = load_postcode_district_lookup(str(ONSPD_path))
voa_businesses = load_voa_ratinglist(str(voa_path), postcode_district_lookup)


zip_file = Path('/teamspace/studios/this_studio/enhance_ocod/data/ocod_history/OCOD_FULL_2022_02.zip')

Loading common reference data...


  postcode_district_lookup = pd.read_csv(f)[


Initial rows: 2293839


# Load results

In [75]:
out_name = zip_file.stem + ".parquet"
out_path = output_dir / out_name

# Define parsed results file path
parsed_results_file = parsed_results_dir / f"{zip_file.stem}_parsed_results.pkl"

print(f"Processing {zip_file.name}...")

# Load and process the OCOD data
ocod_data = load_and_prep_OCOD_data(str(zip_file))

###############
# Parse addresses
###############
if parsed_results_file.exists():
    print(f"Loading cached parsing results for {zip_file.name}...")
    with open(parsed_results_file, "rb") as f:
        results = pickle.load(f)
    print(
        f"Loaded cached results with success rate: {results['summary']['success_rate']:.1%}"
    )
else:
    from enhance_ocod.inference import parse_addresses_pipeline
    import torch
    torch.set_float32_matmul_precision("medium")
    print(f"Parsing addresses for {zip_file.name}...")
    start_time = time.time()

    results = parse_addresses_pipeline(
        df=ocod_data,
        short_batch_size=128,  # The default seems really slow, might be to do with loading not sure
        model_path=str(model_path),
        target_column="property_address",
    )

    end_time = time.time()
    print(f"Address parsing took {end_time - start_time:.2f} seconds")
    print(f"Success rate: {results['summary']['success_rate']:.1%}")

    # Save parsing results
    print(f"Saving parsing results to {parsed_results_file}...")
    with open(parsed_results_file, "wb") as f:
        pickle.dump(results, f)


Processing OCOD_FULL_2022_02.zip...


Loading cached parsing results for OCOD_FULL_2022_02.zip...
Loaded cached results with success rate: 100.0%


In [82]:
results['results'][0:3]

[{'row_index': 0,
  'datapoint_id': 0,
  'original_address': 'Westleigh Lodge Care Home, Nel Pan Lane, Leigh (WN7 5JT)',
  'entities': [{'type': 'building_name',
    'text': 'Westleigh Lodge Care Home',
    'start': 0,
    'end': 25,
    'confidence': 0.99999654},
   {'type': 'street_name',
    'text': ' Nel Pan Lane',
    'start': 26,
    'end': 39,
    'confidence': 0.9999978},
   {'type': 'city',
    'text': ' Leigh',
    'start': 40,
    'end': 46,
    'confidence': 0.99988425},
   {'type': 'postcode',
    'text': 'WN7 5JT',
    'start': 48,
    'end': 55,
    'confidence': 0.9999841}],
  'parsed_components': {'building_name': ['Westleigh Lodge Care Home'],
   'street_name': [' Nel Pan Lane'],
   'city': [' Leigh'],
   'postcode': ['WN7 5JT']}},
 {'row_index': 1,
  'datapoint_id': 1,
  'original_address': 'Flat 1, 1a Canal Street, Manchester (M1 3HE)',
  'entities': [{'type': 'unit_type',
    'text': 'Flat',
    'start': 0,
    'end': 4,
    'confidence': 0.9999956},
   {'type': 'u

In [67]:
processed_addresses_df = process_addresses(results['results'])

In [68]:
processed_addresses_df

Unnamed: 0,unit_type,unit_id,number_filter,building_name,street_number,street_name,postcode,city,datapoint_id
0,,,,,,,,Leigh,0
1,,,,Westleigh Lodge Care Home,,Nel Pan Lane,WN7 5JT,,0
2,,,,,,,,Manchester,1
3,Flat,1,,,,Canal Street,M1 3HE,,1
4,,,,,,,,Manchester,2
...,...,...,...,...,...,...,...,...,...
188036,,,,,,Lime Street,EC3M 7AF,,94085
188037,,,,,,,,London,94086
188038,,,,,,Bevis Marks,EC3A 7BA,,94086
188039,,,,,,,,London,94087


In [79]:
def post_process_expanded_data(expanded_data, ocod_data):
    """
    This function adds in additional meta-data from the ocod dataset and prepares the final expanded dataset to be
    exported for geo-location and classification
    it takes two arguements
    expanded_data is a pandas dataframe produced by the 'final_parsed_addresses' function
    """
    full_expanded_data = expanded_data.merge(
        ocod_data, how="left", left_on="datapoint_id", right_index=True
    )

    full_expanded_data["within_title_id"] = (
        full_expanded_data.groupby("title_number").cumcount() + 1
    )
    full_expanded_data["unique_id"] = [
        str(x) + "-" + str(y)
        for x, y in zip(
            full_expanded_data["title_number"], full_expanded_data["within_title_id"]
        )
    ]

    tmp_df = (
        full_expanded_data[["title_number", "within_title_id"]]
        .groupby("title_number")
        .max("within_title_id")
    ) > 1
    tmp_df.columns = tmp_df.columns.str.replace(
        "within_title_id", "within_larger_title"
    )  # could also be called nested_address
    full_expanded_data = full_expanded_data.merge(
        tmp_df, how="left", left_on="title_number", right_index=True
    )

    full_expanded_data["postcode"] = full_expanded_data["postcode"].str.upper()
    del tmp_df

    # re-order the columns and drop columns that are not needed

    full_expanded_data = full_expanded_data[
        [
            "title_number",
            "within_title_id",
            "unique_id",
            "within_larger_title",
            "tenure",
            "unit_id",
            "unit_type",
            "building_name",
            "street_number",
            "street_name",
            "postcode",
            "city",
            "district",
            "county",
            "region",
            "price_paid",
            "property_address",
            "country_incorporated",
        ]
    ]

    return full_expanded_data

In [80]:
post_processed_data = post_process_expanded_data(processed_addresses_df, ocod_data)

In [81]:
post_processed_data

Unnamed: 0,title_number,within_title_id,unique_id,within_larger_title,tenure,unit_id,unit_type,building_name,street_number,street_name,postcode,city,district,county,region,price_paid,property_address,country_incorporated
0,MAN296957,1,MAN296957-1,True,Leasehold,,,,,,,Leigh,WIGAN,GREATER MANCHESTER,NORTH WEST,,"Westleigh Lodge Care Home, Nel Pan Lane, Leigh...",ISLE OF MAN
1,MAN296957,2,MAN296957-2,True,Leasehold,,,Westleigh Lodge Care Home,,Nel Pan Lane,WN7 5JT,,WIGAN,GREATER MANCHESTER,NORTH WEST,,"Westleigh Lodge Care Home, Nel Pan Lane, Leigh...",ISLE OF MAN
2,MAN297185,1,MAN297185-1,True,Leasehold,,,,,,,Manchester,MANCHESTER,GREATER MANCHESTER,NORTH WEST,270000.0,"Flat 1, 1a Canal Street, Manchester (M1 3HE)",JERSEY
3,MAN297185,2,MAN297185-2,True,Leasehold,1,Flat,,,Canal Street,M1 3HE,,MANCHESTER,GREATER MANCHESTER,NORTH WEST,270000.0,"Flat 1, 1a Canal Street, Manchester (M1 3HE)",JERSEY
4,MAN299183,1,MAN299183-1,True,Leasehold,,,,,,,Manchester,MANCHESTER,GREATER MANCHESTER,NORTH WEST,243516.0,"Flat 201, 1 Regent Road, Manchester (M3 4AY)",BRITISH VIRGIN ISLANDS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188036,AGL473595,2,AGL473595-2,True,Leasehold,,,,,Lime Street,EC3M 7AF,,CITY OF LONDON,GREATER LONDON,GREATER LONDON,,"9th and 10th floors, 52 Lime Street, London (E...",AUSTRALIA
188037,AGL475468,1,AGL475468-1,True,Leasehold,,,,,,,London,CITY OF LONDON,GREATER LONDON,GREATER LONDON,,"part of Tenth Floor, 6 Bevis Marks, London (EC...",INDIA
188038,AGL475468,2,AGL475468-2,True,Leasehold,,,,,Bevis Marks,EC3A 7BA,,CITY OF LONDON,GREATER LONDON,GREATER LONDON,,"part of Tenth Floor, 6 Bevis Marks, London (EC...",INDIA
188039,AGL476501,1,AGL476501-1,True,Leasehold,,,,,,,London,TOWER HAMLETS,GREATER LONDON,GREATER LONDON,,"Storage 17, Discovery Dock Apartments East, 3 ...",JERSEY


In [83]:
import pandas as pd
from typing import List, Dict, Any, Optional

class AddressNode:
    def __init__(self, entity: Dict[str, Any]):
        self.entity = entity
        self.type = entity['type']
        self.text = entity['text']
        self.start = entity['start']
        self.end = entity['end']
        self.children = []
        self.parent = None
        self.unit_type = None
    
    def add_child(self, child_node):
        self.children.append(child_node)
        child_node.parent = self
    
    def get_full_address(self) -> Dict[str, str]:
        """Build complete address by traversing up to root"""
        address = {}
        current = self
        
        # Traverse up the tree collecting all components
        while current:
            address[current.type] = current.text.strip()
            if current.unit_type:
                address['unit_type'] = current.unit_type.text.strip()
            current = current.parent
        
        return address

class AddressGraph:
    def __init__(self, entities: List[Dict[str, Any]]):
        self.hierarchy_levels = {
            'city': 0,           # Always root
            'postcode': 1,       # Child of city
            'street_name': 2,    # Child of postcode or city
            'street_number': 3,  # Child of street_name
            'building_name': 4,  # Child of street_number or street_name
            'number_filter': 5,  # Child of building_name
            'unit_id': 6,        # Child of number_filter or building_name
            'unit_type': 7       # Special handling
        }
        
        self.nodes = []
        self.city_node = None  # Track the city node separately
        
        self._build_graph(entities)
    
    def _build_graph(self, entities: List[Dict[str, Any]]):
        # Separate unit_types for special handling
        regular_entities = [e for e in entities if e['type'] != 'unit_type']
        unit_types = [e for e in entities if e['type'] == 'unit_type']
        
        # Create nodes for regular entities
        for entity in regular_entities:
            node = AddressNode(entity)
            self.nodes.append(node)
            
            # Track city node separately
            if node.type == 'city':
                self.city_node = node
        
        # Sort nodes by hierarchy level, then by position
        self.nodes.sort(key=lambda x: (self.hierarchy_levels[x.type], x.start))
        
        # Group nodes by hierarchy level
        levels = {}
        for node in self.nodes:
            level = self.hierarchy_levels[node.type]
            if level not in levels:
                levels[level] = []
            levels[level].append(node)
        
        # Connect nodes level by level, but handle city specially
        sorted_levels = sorted(levels.keys())
        
        # If we have a city, make all non-city nodes connect to it eventually
        if self.city_node:
            # Connect other nodes starting from level 1
            for i, current_level in enumerate(sorted_levels[1:], 1):
                parent_level = sorted_levels[i-1]
                
                for child in levels[current_level]:
                    if child.type == 'city':  # Skip city nodes in regular processing
                        continue
                        
                    # Find best parent from previous level
                    best_parent = self._find_best_parent(child, levels[parent_level])
                    if best_parent:
                        best_parent.add_child(child)
                    else:
                        # If no parent found in immediate level, look further up
                        found_parent = False
                        for j in range(i-2, -1, -1):
                            if j == 0:  # Level 0 is city level - use special logic
                                self.city_node.add_child(child)
                                found_parent = True
                                break
                            else:
                                best_parent = self._find_best_parent(child, levels[sorted_levels[j]])
                                if best_parent:
                                    best_parent.add_child(child)
                                    found_parent = True
                                    break
                        
                        # If still no parent found, attach directly to city
                        if not found_parent:
                            self.city_node.add_child(child)
        else:
            # Original logic if no city node
            for i, current_level in enumerate(sorted_levels[1:], 1):
                parent_level = sorted_levels[i-1]
                
                for child in levels[current_level]:
                    best_parent = self._find_best_parent(child, levels[parent_level])
                    if best_parent:
                        best_parent.add_child(child)
                    else:
                        for j in range(i-2, -1, -1):
                            best_parent = self._find_best_parent(child, levels[sorted_levels[j]])
                            if best_parent:
                                best_parent.add_child(child)
                                break
        
        # Handle unit types
        self._connect_unit_types(unit_types)
    
    def _find_best_parent(self, child: AddressNode, potential_parents: List[AddressNode]) -> Optional[AddressNode]:
        """Find the best parent for a child node"""
        # Special handling: never make city a child of anything
        if child.type == 'city':
            return None
            
        valid_parents = []
        
        for parent in potential_parents:
            # Skip if trying to make city a non-root
            if parent.type == 'city':
                # City can be parent of anyone
                valid_parents.append((parent, 0))  # Give city priority with distance 0
            else:
                # For non-city relationships, maintain original position logic
                if child.start < parent.start:  
                    distance = parent.start - child.end    
                    valid_parents.append((parent, distance))
        
        if not valid_parents:
            return None
        
        # Return parent with minimum distance
        return min(valid_parents, key=lambda x: x[1])[0]
    
    def _connect_unit_types(self, unit_types: List[Dict[str, Any]]):
        """Connect unit_type entities to their corresponding unit_id nodes"""
        unit_nodes = [n for n in self.nodes if n.type == 'unit_id']
        
        for unit_node in unit_nodes:
            # Find closest preceding unit_type
            best_unit_type = None
            min_distance = float('inf')
            
            for unit_type_entity in unit_types:
                if unit_type_entity['start'] < unit_node.start:
                    distance = unit_node.start - unit_type_entity['end']
                    if distance < min_distance and distance < 30:
                        min_distance = distance
                        best_unit_type = unit_type_entity
            
            if best_unit_type:
                unit_node.unit_type = AddressNode(best_unit_type)
    
    def get_addresses(self) -> pd.DataFrame:
        """Extract all complete addresses"""
        # Find leaf nodes (nodes with no children)
        leaf_nodes = [node for node in self.nodes if not node.children]
        
        # If no leaf nodes, use all nodes (fallback)
        if not leaf_nodes:
            leaf_nodes = self.nodes
        
        addresses = []
        for leaf in leaf_nodes:
            address = leaf.get_full_address()
            addresses.append(address)
        
        if not addresses:
            return pd.DataFrame()
        
        # Create DataFrame
        df = pd.DataFrame(addresses)
        
        # Define column order
        column_order = ['unit_type', 'unit_id', 'number_filter', 'building_name',
                       'street_number', 'street_name', 'postcode', 'city']
        
        existing_columns = [col for col in column_order if col in df.columns]
        df = df[existing_columns]
        
        return df
    
    def visualize_graph(self):
        """Visualize the graph structure"""
        # Find root nodes
        roots = [node for node in self.nodes if node.parent is None]
        
        def print_tree(node, level=0):
            indent = "  " * level
            unit_info = f" (unit_type: {node.unit_type.text})" if node.unit_type else ""
            print(f"{indent}{node.type}: '{node.text.strip()}'{unit_info}")
            for child in node.children:
                print_tree(child, level + 1)
        
        print("Address Graph:")
        for root in roots:
            print_tree(root)
        
        # Also show leaf nodes for debugging
        leaves = [node for node in self.nodes if not node.children]
        print(f"\nLeaf nodes: {[(n.type, n.text.strip()) for n in leaves]}")

# Test with your example
def test_city_postcode_order():
    entities = [
        {'type': 'building_name', 'text': 'Westleigh Lodge Care Home', 'start': 0, 'end': 25, 'confidence': 0.99999654},
        {'type': 'street_name', 'text': ' Nel Pan Lane', 'start': 26, 'end': 39, 'confidence': 0.9999978},
        {'type': 'city', 'text': ' Leigh', 'start': 40, 'end': 46, 'confidence': 0.99988425},
        {'type': 'postcode', 'text': 'WN7 5JT', 'start': 48, 'end': 55, 'confidence': 0.9999841}
    ]
    
    print("Testing city-postcode order issue:")
    graph = AddressGraph(entities)
    graph.visualize_graph()
    result = graph.get_addresses()
    print("\nParsed Addresses:")
    print(result)

# Run the test
test_city_postcode_order()

Testing city-postcode order issue:
Address Graph:
city: 'Leigh'
  postcode: 'WN7 5JT'
    street_name: 'Nel Pan Lane'
      building_name: 'Westleigh Lodge Care Home'

Leaf nodes: [('building_name', 'Westleigh Lodge Care Home')]

Parsed Addresses:
               building_name   street_name postcode   city
0  Westleigh Lodge Care Home  Nel Pan Lane  WN7 5JT  Leigh
