# New address parser

In [1]:

entities =[{'type': 'unit_id',
    'text': ' 36 - 40',
    'start': 5,
    'end': 13,
    'confidence': 0.9996221},
{'type': 'number_filter',
    'text': 'even',
    'start': 15,
    'end': 19,
    'confidence': 0.999858},
{'type': 'unit_id',
    'text': ' 42-44',
    'start': 21,
    'end': 27,
    'confidence': 0.9986492},
{'type': 'building_name',
    'text': ' climb house',
    'start': 28,
    'end': 40,
    'confidence': 0.999928},
{'type': 'unit_id',
    'text': ' 1-5',
    'start': 47,
    'end': 51,
    'confidence': 0.997581},
{'type': 'building_name',
    'text': ' down buildings',
    'start': 51,
    'end': 66,
    'confidence': 0.99611604},
{'type': 'street_name',
    'text': ' chapel street',
    'start': 67,
    'end': 81,
    'confidence': 0.99992263},
{'type': 'postcode',
    'text': ' se45 6pq',
    'start': 90,
    'end': 99,
    'confidence': 0.99991864}]


In [2]:
import pandas as pd
from typing import List, Dict, Any, Optional

class AddressNode:
    def __init__(self, entity: Dict[str, Any]):
        self.entity = entity
        self.type = entity['type']
        self.text = entity['text']
        self.start = entity['start']
        self.end = entity['end']
        self.children = []
        self.parent = None
        self.unit_type = None
    
    def add_child(self, child_node):
        self.children.append(child_node)
        child_node.parent = self
    
    def get_full_address(self) -> Dict[str, str]:
        """Build complete address by traversing up to root"""
        address = {}
        current = self
        
        # Traverse up the tree collecting all components
        while current:
            address[current.type] = current.text.strip()
            if current.unit_type:
                address['unit_type'] = current.unit_type.text.strip()
            current = current.parent
        
        return address

class AddressGraph:
    def __init__(self, entities: List[Dict[str, Any]]):
        self.hierarchy_levels = {
            'city': 0,           # Always root
            'postcode': 1,       # Child of city
            'street_name': 2,    # Child of postcode or city
            'street_number': 3,  # Child of street_name
            'building_name': 4,  # Child of street_number or street_name
            'number_filter': 5,  # Child of building_name
            'unit_id': 6,        # Child of number_filter or building_name
            'unit_type': 7       # Special handling
        }
        
        self.nodes = []
        self.city_node = None  # Track the city node separately
        
        self._build_graph(entities)
    
    def _build_graph(self, entities: List[Dict[str, Any]]):
        # Separate unit_types for special handling
        regular_entities = [e for e in entities if e['type'] != 'unit_type']
        unit_types = [e for e in entities if e['type'] == 'unit_type']
        
        # Create nodes for regular entities
        for entity in regular_entities:
            node = AddressNode(entity)
            self.nodes.append(node)
            
            # Track city node separately
            if node.type == 'city':
                self.city_node = node
        
        # Sort nodes by hierarchy level, then by position
        self.nodes.sort(key=lambda x: (self.hierarchy_levels[x.type], x.start))
        
        # Group nodes by hierarchy level
        levels = {}
        for node in self.nodes:
            level = self.hierarchy_levels[node.type]
            if level not in levels:
                levels[level] = []
            levels[level].append(node)
        
        # Connect nodes level by level, but handle city specially
        sorted_levels = sorted(levels.keys())
        
        # If we have a city, make all non-city nodes connect to it eventually
        if self.city_node:
            # Connect other nodes starting from level 1
            for i, current_level in enumerate(sorted_levels[1:], 1):
                parent_level = sorted_levels[i-1]
                
                for child in levels[current_level]:
                    if child.type == 'city':  # Skip city nodes in regular processing
                        continue
                        
                    # Find best parent from previous level
                    best_parent = self._find_best_parent(child, levels[parent_level])
                    if best_parent:
                        best_parent.add_child(child)
                    else:
                        # If no parent found in immediate level, look further up
                        found_parent = False
                        for j in range(i-2, -1, -1):
                            if j == 0:  # Level 0 is city level - use special logic
                                self.city_node.add_child(child)
                                found_parent = True
                                break
                            else:
                                best_parent = self._find_best_parent(child, levels[sorted_levels[j]])
                                if best_parent:
                                    best_parent.add_child(child)
                                    found_parent = True
                                    break
                        
                        # If still no parent found, attach directly to city
                        if not found_parent:
                            self.city_node.add_child(child)
        else:
            # Original logic if no city node
            for i, current_level in enumerate(sorted_levels[1:], 1):
                parent_level = sorted_levels[i-1]
                
                for child in levels[current_level]:
                    best_parent = self._find_best_parent(child, levels[parent_level])
                    if best_parent:
                        best_parent.add_child(child)
                    else:
                        for j in range(i-2, -1, -1):
                            best_parent = self._find_best_parent(child, levels[sorted_levels[j]])
                            if best_parent:
                                best_parent.add_child(child)
                                break
        
        # Handle unit types
        self._connect_unit_types(unit_types)
    
    def _find_best_parent(self, child: AddressNode, potential_parents: List[AddressNode]) -> Optional[AddressNode]:
        """Find the best parent for a child node"""
        # Special handling: never make city a child of anything
        if child.type == 'city':
            return None
            
        valid_parents = []
        
        for parent in potential_parents:
            # Skip if trying to make city a non-root
            if parent.type == 'city':
                # City can be parent of anyone
                valid_parents.append((parent, 0))  # Give city priority with distance 0
            else:
                # For non-city relationships, maintain original position logic
                if child.start < parent.start:  
                    distance = parent.start - child.end    
                    valid_parents.append((parent, distance))
        
        if not valid_parents:
            return None
        
        # Return parent with minimum distance
        return min(valid_parents, key=lambda x: x[1])[0]
    
    def _connect_unit_types(self, unit_types: List[Dict[str, Any]]):
        """Connect unit_type entities to their corresponding unit_id nodes"""
        unit_nodes = [n for n in self.nodes if n.type == 'unit_id']
        
        for unit_node in unit_nodes:
            # Find closest preceding unit_type
            best_unit_type = None
            min_distance = float('inf')
            
            for unit_type_entity in unit_types:
                if unit_type_entity['start'] < unit_node.start:
                    distance = unit_node.start - unit_type_entity['end']
                    if distance < min_distance and distance < 30:
                        min_distance = distance
                        best_unit_type = unit_type_entity
            
            if best_unit_type:
                unit_node.unit_type = AddressNode(best_unit_type)
    
    def get_addresses(self) -> pd.DataFrame:
        """Extract all complete addresses"""
        # Find leaf nodes (nodes with no children)
        leaf_nodes = [node for node in self.nodes if not node.children]
        
        # If no leaf nodes, use all nodes (fallback)
        if not leaf_nodes:
            leaf_nodes = self.nodes
        
        addresses = []
        for leaf in leaf_nodes:
            address = leaf.get_full_address()
            addresses.append(address)
        
        if not addresses:
            return pd.DataFrame()
        
        # Create DataFrame
        df = pd.DataFrame(addresses)
        
        # Define column order
        column_order = ['unit_type', 'unit_id', 'number_filter', 'building_name',
                       'street_number', 'street_name', 'postcode', 'city']
        
        existing_columns = [col for col in column_order if col in df.columns]
        df = df[existing_columns]
        
        return df
    
    def visualize_graph(self):
        """Visualize the graph structure"""
        # Find root nodes
        roots = [node for node in self.nodes if node.parent is None]
        
        def print_tree(node, level=0):
            indent = "  " * level
            unit_info = f" (unit_type: {node.unit_type.text})" if node.unit_type else ""
            print(f"{indent}{node.type}: '{node.text.strip()}'{unit_info}")
            for child in node.children:
                print_tree(child, level + 1)
        
        print("Address Graph:")
        for root in roots:
            print_tree(root)
        
        # Also show leaf nodes for debugging
        leaves = [node for node in self.nodes if not node.children]
        print(f"\nLeaf nodes: {[(n.type, n.text.strip()) for n in leaves]}")


def parse_addresses_to_dicts(entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Parse entities and return list of dictionaries instead of DataFrame"""
    if not entities:
        return []
    
    graph = AddressGraph(entities)
    
    # Find leaf nodes
    leaf_nodes = [node for node in graph.nodes if not node.children]
    if not leaf_nodes:
        leaf_nodes = graph.nodes
    
    # Convert each leaf to dictionary
    address_dicts = []
    for leaf in leaf_nodes:
        address = leaf.get_full_address()
        address_dicts.append(address)
    
    return address_dicts

def process_addresses(address_data_list: List[Dict]) -> pd.DataFrame:
    
    all_rows = []
    
    for address_data in address_data_list:
        try:
            entities = address_data['entities']
            
            if not entities:
                continue
            
            # Get parsed addresses as dictionaries (not DataFrame)
            parsed_addresses = parse_addresses_to_dicts(entities)
            
            # Add metadata to each parsed address
            metadata = {
                'row_index': address_data['row_index'],
                'datapoint_id': address_data['datapoint_id'],
                'property_address': address_data['original_address']
            }
            
            for address_dict in parsed_addresses:
                # Merge parsed address with metadata
                full_row = {**address_dict, **metadata}
                all_rows.append(full_row)
                
        except Exception as e:
            # Log error but continue processing
            print(f"Error processing address {address_data['row_index']}: {e}")
            continue
    
    # Single DataFrame creation at the end
    if not all_rows:
        return pd.DataFrame()
    
    df = pd.DataFrame(all_rows)
    
    # Define column order (address columns first, then metadata)
    address_columns = ['unit_type', 'unit_id', 'number_filter', 'building_name',
                      'street_number', 'street_name', 'postcode', 'city']
    metadata_columns = ['datapoint_id']
    
    # Reorder columns (only include columns that exist)
    all_columns = address_columns + metadata_columns
    existing_columns = [col for col in all_columns if col in df.columns]
    df = df[existing_columns]
    
    return df

# Test
def test_with_visualization():
    entities = [
        {'type': 'unit_type', 'text': 'Flat', 'start': 0, 'end': 4, 'confidence': 0.999},
        {'type': 'unit_id', 'text': ' 1A', 'start': 4, 'end': 7, 'confidence': 0.999},
        {'type': 'unit_type', 'text': ' Flat', 'start': 8, 'end': 12, 'confidence': 0.999},
        {'type': 'unit_id', 'text': ' 1B', 'start': 12, 'end': 15, 'confidence': 0.999},
        {'type': 'street_number', 'text': ' 25', 'start': 16, 'end': 19, 'confidence': 0.999},
        {'type': 'street_name', 'text': ' Oak Street', 'start': 19, 'end': 30, 'confidence': 0.999},
        {'type': 'city', 'text': ' London', 'start': 31, 'end': 38, 'confidence': 0.999}
    ]
    
    graph = AddressGraph(entities)
    graph.visualize_graph()
    result = graph.get_addresses()
    print("\nParsed Addresses:")
    print(result)



test_with_visualization()

graph = AddressGraph(entities)
graph.visualize_graph()
result = graph.get_addresses()
print("\nParsed Addresses:")
print(result)


Address Graph:
city: 'London'
  street_name: 'Oak Street'
    street_number: '25'
      unit_id: '1A' (unit_type: Flat)
      unit_id: '1B' (unit_type:  Flat)

Leaf nodes: [('unit_id', '1A'), ('unit_id', '1B')]

Parsed Addresses:
  unit_type unit_id street_number street_name    city
0      Flat      1A            25  Oak Street  London
1      Flat      1B            25  Oak Street  London
Address Graph:
postcode: 'se45 6pq'
  street_name: 'chapel street'
    building_name: 'climb house'
      number_filter: 'even'
        unit_id: '36 - 40'
      unit_id: '42-44'
    building_name: 'down buildings'
      unit_id: '1-5'

Leaf nodes: [('unit_id', '36 - 40'), ('unit_id', '42-44'), ('unit_id', '1-5')]

Parsed Addresses:
   unit_id number_filter   building_name    street_name  postcode
0  36 - 40          even     climb house  chapel street  se45 6pq
1    42-44           NaN     climb house  chapel street  se45 6pq
2      1-5           NaN  down buildings  chapel street  se45 6pq


In [3]:
temp_ents = {'summary': {'total_addresses': 1,
  'successful_parses': 1,
  'failed_parses': 0,
  'success_rate': 1.0,
  'batch_size_used': 512},
 'results': [{'row_index': 0,
   'datapoint_id': 0,
   'original_address': 'Flats 36 - 40 (even), 42-44, climb house, Flats 1-5 down buildings, chapel street, London, se45 6pq',
   'entities': [{'type': 'unit_id',
     'text': ' 36 - 40',
     'start': 5,
     'end': 13,
     'confidence': 0.9996221},
    {'type': 'number_filter',
     'text': 'even',
     'start': 15,
     'end': 19,
     'confidence': 0.999858},
    {'type': 'unit_id',
     'text': ' 42-44',
     'start': 21,
     'end': 27,
     'confidence': 0.9986492},
    {'type': 'building_name',
     'text': ' climb house',
     'start': 28,
     'end': 40,
     'confidence': 0.999928},
    {'type': 'unit_id',
     'text': ' 1-5',
     'start': 47,
     'end': 51,
     'confidence': 0.997581},
    {'type': 'building_name',
     'text': ' down buildings',
     'start': 51,
     'end': 66,
     'confidence': 0.99611604},
    {'type': 'street_name',
     'text': ' chapel street',
     'start': 67,
     'end': 81,
     'confidence': 0.99992263},
    {'type': 'postcode',
     'text': ' se45 6pq',
     'start': 90,
     'end': 99,
     'confidence': 0.99991864}],
   'parsed_components': {'unit_id': [' 36 - 40', ' 42-44', ' 1-5'],
    'number_filter': ['even'],
    'building_name': [' climb house', ' down buildings'],
    'street_name': [' chapel street'],
    'postcode': [' se45 6pq']}}]}

In [4]:
process_addresses(temp_ents['results'])


Unnamed: 0,unit_id,number_filter,building_name,street_name,postcode,datapoint_id
0,36 - 40,even,climb house,chapel street,se45 6pq,0
1,42-44,,climb house,chapel street,se45 6pq,0
2,1-5,,down buildings,chapel street,se45 6pq,0


# New pipeline approach

In [5]:
from enhance_ocod.address_parsing import (
    load_and_prep_OCOD_data,
    parsing_and_expansion_process,
    post_process_expanded_data,
    load_postcode_district_lookup,
)
from enhance_ocod.locate_and_classify import (
    preprocess_expanded_ocod_data,
    add_missing_lads_ocod,
    load_voa_ratinglist,
    street_and_building_matching,
    substreet_matching,
    counts_of_businesses_per_oa_lsoa,
    voa_address_match_all_data,
    classification_type1,
    classification_type2,
    contract_ocod_after_classification,
)
from enhance_ocod.inference import convert_to_entity_dataframe
from enhance_ocod.price_paid_process import load_and_process_pricepaid_data
from pathlib import Path
from tqdm import tqdm
import time
import gc  # Add for memory management
import pandas as pd
import numpy as np
import pickle
# There is a warning related to bfill and ffill which is basically internal to pandas so silencing here
import warnings



warnings.filterwarnings("ignore", message=".*Downcasting object dtype arrays.*")

SCRIPT_DIR = Path("/teamspace/studios/this_studio/enhance_ocod/notebooks")

# ====== CONSTANT PATHS AND SETTINGS ======
input_dir = SCRIPT_DIR.parent / "data" / "ocod_history"
output_dir = SCRIPT_DIR.parent / "data" / "ocod_history_processed2"
model_path = (
    SCRIPT_DIR.parent / "models" / "address_parser_original_fullset" / "final_model"
)


ONSPD_dir = SCRIPT_DIR.parent / "data" / "onspd"
ONSPD_files = list(ONSPD_dir.glob("*"))
ONSPD_path = ONSPD_files[0] if ONSPD_files else None

price_paid_dir = SCRIPT_DIR.parent / "data" / "price_paid_data"
price_paid_files = list(price_paid_dir.glob("*"))
price_paid_path = price_paid_files[0] if price_paid_files else None


voa_dir = SCRIPT_DIR.parent / "data" / "voa"
voa_files = list(voa_dir.glob("*"))
voa_path = voa_files[0] if voa_files else None


processed_price_paid_dir = SCRIPT_DIR.parent / "data" / "processed_price_paid"
output_dir.mkdir(parents=True, exist_ok=True)

parsed_results_dir = SCRIPT_DIR.parent / "data" / "parsed_ocod_dicts2"
parsed_results_dir.mkdir(parents=True, exist_ok=True)

# List of all zip files in input_dir
#
# TESTING!!! only 10 files!
#
all_files = sorted([f for f in input_dir.glob("OCOD_FULL_*.zip")])


print("Loading common reference data...")
postcode_district_lookup = load_postcode_district_lookup(str(ONSPD_path))
voa_businesses = load_voa_ratinglist(str(voa_path), postcode_district_lookup)


zip_file = Path('/teamspace/studios/this_studio/enhance_ocod/data/ocod_history/OCOD_FULL_2022_02.zip')

Loading common reference data...


  postcode_district_lookup = pd.read_csv(f)[


Initial rows: 2293839


# Load results

In [6]:
out_name = zip_file.stem + ".parquet"
out_path = output_dir / out_name

# Define parsed results file path
parsed_results_file = parsed_results_dir / f"{zip_file.stem}_parsed_results.pkl"

print(f"Processing {zip_file.name}...")

# Load and process the OCOD data
ocod_data = load_and_prep_OCOD_data(str(zip_file))

###############
# Parse addresses
###############
if parsed_results_file.exists():
    print(f"Loading cached parsing results for {zip_file.name}...")
    with open(parsed_results_file, "rb") as f:
        results = pickle.load(f)
    print(
        f"Loaded cached results with success rate: {results['summary']['success_rate']:.1%}"
    )
else:
    from enhance_ocod.inference import parse_addresses_pipeline
    import torch
    torch.set_float32_matmul_precision("medium")
    print(f"Parsing addresses for {zip_file.name}...")
    start_time = time.time()

    results = parse_addresses_pipeline(
        df=ocod_data,
        short_batch_size=128,  # The default seems really slow, might be to do with loading not sure
        model_path=str(model_path),
        target_column="property_address",
    )

    end_time = time.time()
    print(f"Address parsing took {end_time - start_time:.2f} seconds")
    print(f"Success rate: {results['summary']['success_rate']:.1%}")

    # Save parsing results
    print(f"Saving parsing results to {parsed_results_file}...")
    with open(parsed_results_file, "wb") as f:
        pickle.dump(results, f)


Processing OCOD_FULL_2022_02.zip...


Loading cached parsing results for OCOD_FULL_2022_02.zip...
Loaded cached results with success rate: 100.0%


In [7]:
results['results'][0:3]

[{'row_index': 0,
  'datapoint_id': 0,
  'original_address': 'Westleigh Lodge Care Home, Nel Pan Lane, Leigh (WN7 5JT)',
  'entities': [{'type': 'building_name',
    'text': 'Westleigh Lodge Care Home',
    'start': 0,
    'end': 25,
    'confidence': 0.99999654},
   {'type': 'street_name',
    'text': ' Nel Pan Lane',
    'start': 26,
    'end': 39,
    'confidence': 0.9999978},
   {'type': 'city',
    'text': ' Leigh',
    'start': 40,
    'end': 46,
    'confidence': 0.99988425},
   {'type': 'postcode',
    'text': 'WN7 5JT',
    'start': 48,
    'end': 55,
    'confidence': 0.9999841}],
  'parsed_components': {'building_name': ['Westleigh Lodge Care Home'],
   'street_name': [' Nel Pan Lane'],
   'city': [' Leigh'],
   'postcode': ['WN7 5JT']}},
 {'row_index': 1,
  'datapoint_id': 1,
  'original_address': 'Flat 1, 1a Canal Street, Manchester (M1 3HE)',
  'entities': [{'type': 'unit_type',
    'text': 'Flat',
    'start': 0,
    'end': 4,
    'confidence': 0.9999956},
   {'type': 'u

In [176]:
processed_addresses_df = process_addresses(results['results'])

post_processed_data = processed_addresses_df.merge(
    ocod_data, how="left", left_on="datapoint_id", right_index=True
)[
        [
            "title_number",
            "tenure",
            "unit_id",
            "unit_type",
            "number_filter",
            "building_name",
            "street_number",
            "street_name",
            "postcode",
            "city",
            "district",
            "county",
            "region",
            "price_paid",
            "property_address",
            "country_incorporated",
        ]
    ]

post_processed_data["postcode"] = post_processed_data["postcode"].str.upper()

post_processed_data

Unnamed: 0,title_number,tenure,unit_id,unit_type,number_filter,building_name,street_number,street_name,postcode,city,district,county,region,price_paid,property_address,country_incorporated
0,MAN296957,Leasehold,,,,Westleigh Lodge Care Home,,Nel Pan Lane,WN7 5JT,Leigh,WIGAN,GREATER MANCHESTER,NORTH WEST,,"Westleigh Lodge Care Home, Nel Pan Lane, Leigh...",ISLE OF MAN
1,MAN297185,Leasehold,1,Flat,,,,Canal Street,M1 3HE,Manchester,MANCHESTER,GREATER MANCHESTER,NORTH WEST,270000.0,"Flat 1, 1a Canal Street, Manchester (M1 3HE)",JERSEY
2,MAN299183,Leasehold,201,Flat,,,,Regent Road,M3 4AY,Manchester,MANCHESTER,GREATER MANCHESTER,NORTH WEST,243516.0,"Flat 201, 1 Regent Road, Manchester (M3 4AY)",BRITISH VIRGIN ISLANDS
3,MAN300821,Freehold,,,,,,Gerard Street,WN4 9AA,Wigan,WIGAN,GREATER MANCHESTER,NORTH WEST,,"Land at 2a Gerard Street, Ashton In Makerfield...",DENMARK
4,MAN300971,Leasehold,111,Unit,,Timber Wharf,,Worsley Street,M15 4NZ,Manchester,MANCHESTER,GREATER MANCHESTER,NORTH WEST,342000.0,"Unit 111, Timber Wharf, Worsley Street, Manche...",JERSEY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119828,AGL470128,Leasehold,,,,,,Lime Street,EC3M 7AW,London,CITY OF LONDON,GREATER LONDON,GREATER LONDON,,"2nd Floor, 52 Lime Street, London (EC3M 7AW)",IRELAND
119829,AGL473126,Leasehold,,,,,155,Bishopsgate,EC2M 3AD,London,CITY OF LONDON,GREATER LONDON,GREATER LONDON,,"155 Bishopsgate, London (EC2M 3AD)",JAPAN
119830,AGL473595,Leasehold,,,,,,Lime Street,EC3M 7AF,London,CITY OF LONDON,GREATER LONDON,GREATER LONDON,,"9th and 10th floors, 52 Lime Street, London (E...",AUSTRALIA
119831,AGL475468,Leasehold,,,,,,Bevis Marks,EC3A 7BA,London,CITY OF LONDON,GREATER LONDON,GREATER LONDON,,"part of Tenth Floor, 6 Bevis Marks, London (EC...",INDIA


In [9]:
def add_geographic_metadata(df, postcode_district_lookup):
    """
    Add geographic area data and create standardized address fields, for entries with a postcode.
    
    This function enriches address data by merging geographic area codes 
    (LSOA, MSOA, LAD, etc.) based on postcode lookup, and creates 
    standardized versions of street numbers and street names for 
    improved data matching and consistency.
    
    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing address data with columns including 
        'postcode', 'street_number', and 'street_name'.
    postcode_district_lookup : pandas.DataFrame
        Lookup table containing postcode to geographic area mappings
        with 'postcode2' column for merging.
    
    Returns
    -------
    pandas.DataFrame
        Enhanced DataFrame with additional columns:
        - postcode2: Normalized postcode (lowercase, no spaces)
        - Geographic area columns from the lookup table
        - street_number2: Standardized street number (digits only)
        - street_name2: Standardized street name (no apostrophes, 
          's/ss' removed, no spaces)
    
    Notes
    -----
    The standardized street number extracts only digits for use in 
    LSOA matching where apartment letters and other characters are 
    not relevant.
    """

    ##add in the geographic area data like lsoa etc
    df["postcode2"] = (
        df["postcode"].str.lower().str.replace("\s", "", regex=True)
    )

    df = df.merge(
        postcode_district_lookup, "left", left_on="postcode2", right_on="postcode2"
    )

    # this is to ensure that the street number includes only digits as it is used in the LSOA matching where the only thing
    # that matters is the street number not whether it is a or b or whatever.
    df["street_number2"] = (
        df["street_number"]
        .str.replace(r"^.*(?=\b[0-9]+$)", "", regex=True)
        .str.replace(r"[^\d]", "", regex=True)
    )

    # This stripped out versionof street name is used several times throughout the notebook
    df["street_name2"] = (
        df.loc[:, "street_name"]
        .str.replace(r"'", "", regex=True)
        .str.replace(r"s(s)?(?=\s)", "", regex=True)
        .str.replace(r"\s", "", regex=True)
    )

    return df


In [177]:
pre_process_ocod = add_geographic_metadata(post_processed_data, postcode_district_lookup)

In [32]:
building_gazetteer = pd.read_parquet('/teamspace/studios/this_studio/enhance_ocod/data/gazeteer/building_gazeteer.parquet')
building_gazetteer['fraction'] = 1
district_gazetteer = pd.read_parquet('/teamspace/studios/this_studio/enhance_ocod/data/gazeteer/district_gazetteer.parquet')
street_gazetteer = pd.read_parquet('/teamspace/studios/this_studio/enhance_ocod/data/gazeteer/street_gazetteer.parquet')

In [54]:
def enhance_ocod_with_gazetteers(pre_process_ocod, building_gazetteer, district_gazetteer, street_gazetteer):
    """
    Enhance OCOD data by adding missing geographic codes using gazetteers.
    
    This function fills in missing LAD codes using district information, and missing
    LSOA codes using building and street gazetteers where appropriate.
    
    Parameters
    ----------
    pre_process_ocod : pandas.DataFrame
        The OCOD dataset that may contain missing geographic codes
    building_gazetteer : pandas.DataFrame
        Building gazetteer with columns: building_name, oa11cd, lsoa11cd, msoa11cd, lad11cd, fraction
    district_gazetteer : pandas.DataFrame
        District gazetteer with columns: district, lad11cd
    street_gazetteer : pandas.DataFrame
        Street gazetteer with columns: street_name2, lsoa11cd, oa11cd, msoa11cd, lad11cd, fraction
        
    Returns
    -------
    pandas.DataFrame
        Enhanced OCOD dataset with missing geographic codes filled where possible.
        Includes 'match_prob' column with fraction values and 'geog_match' categorical column
        indicating whether match was from 'building' or 'street' gazetteer.
        
    Notes
    -----
    - First adds missing LAD codes using district gazetteer
    - Then adds missing LSOA codes using building gazetteer (where building_name and lad11cd match)
    - Finally adds missing LSOA codes using street gazetteer (where street_name and lad11cd match)
    - Only fills missing values, does not overwrite existing ones
    - Geographic codes are added hierarchically (LAD first, then LSOA/OA/MSOA)
    - Performs case-insensitive matching by converting to lowercase
    - match_prob and geog_match columns track the source and quality of gazetteer matches
        
    Examples
    --------
    >>> enhanced_ocod = enhance_ocod_with_gazetteers(pre_process_ocod, 
    ...                                             building_gaz, 
    ...                                             district_gaz, 
    ...                                             street_gaz)
    """
    
    # Create a copy to avoid modifying the original
    enhanced_ocod = pre_process_ocod.copy()
    
    # Initialize new columns
    enhanced_ocod['match_prob'] = float('nan')
    enhanced_ocod['geog_match'] = None
    
    # Create lowercase versions of gazetteers for matching
    building_gazetteer_lower = building_gazetteer.copy()
    building_gazetteer_lower['building_name'] = building_gazetteer_lower['building_name'].str.lower()
    
    street_gazetteer_lower = street_gazetteer.copy()
    street_gazetteer_lower['street_name2'] = street_gazetteer_lower['street_name2'].str.lower()
    
    # Step 1: Add missing LAD codes using district gazetteer
    missing_lad_mask = enhanced_ocod['lad11cd'].isna()
    
    if missing_lad_mask.sum() > 0:
        missing_lad_data = enhanced_ocod[missing_lad_mask].copy()
        merged_districts = missing_lad_data.merge(
            district_gazetteer[['district', 'lad11cd']], 
            on='district', 
            how='left', 
            suffixes=('', '_new')
        )
        enhanced_ocod.loc[missing_lad_mask, 'lad11cd'] = merged_districts['lad11cd_new'].values
    
    # Step 2: Add missing LSOA codes using building gazetteer
    missing_lsoa_mask = enhanced_ocod['lsoa11cd'].isna()
    has_building_and_lad = (enhanced_ocod['building_name'].notna() & 
                           enhanced_ocod['lad11cd'].notna())
    building_fillable_mask = missing_lsoa_mask & has_building_and_lad
    
    if building_fillable_mask.sum() > 0:
        building_data = enhanced_ocod[building_fillable_mask].copy()
        merged_buildings = building_data.merge(
            building_gazetteer_lower[['building_name', 'lad11cd', 'oa11cd', 'lsoa11cd', 'msoa11cd', 'fraction']], 
            on=['building_name', 'lad11cd'], 
            how='left', 
            suffixes=('', '_new')
        )
        
        # Update geographic codes and tracking columns where matches found
        match_found_mask = merged_buildings['lsoa11cd_new'].notna()
        if match_found_mask.sum() > 0:
            building_indices = enhanced_ocod.index[building_fillable_mask][match_found_mask]
            
            for col in ['oa11cd', 'lsoa11cd', 'msoa11cd']:
                enhanced_ocod.loc[building_indices, col] = merged_buildings.loc[match_found_mask, f'{col}_new'].values
            
            enhanced_ocod.loc[building_indices, 'match_prob'] = merged_buildings.loc[match_found_mask, 'fraction'].values
            enhanced_ocod.loc[building_indices, 'geog_match'] = 'building'
    
    # Step 3: Add missing LSOA codes using street gazetteer
    missing_lsoa_mask = enhanced_ocod['lsoa11cd'].isna()
    has_street_and_lad = (enhanced_ocod['street_name2'].notna() & 
                         enhanced_ocod['lad11cd'].notna())
    street_fillable_mask = missing_lsoa_mask & has_street_and_lad
    
    if street_fillable_mask.sum() > 0:
        street_data = enhanced_ocod[street_fillable_mask].copy()
        merged_streets = street_data.merge(
            street_gazetteer_lower[['street_name2', 'lad11cd', 'oa11cd', 'lsoa11cd', 'msoa11cd', 'fraction']], 
            on=['street_name2', 'lad11cd'], 
            how='left', 
            suffixes=('', '_new')
        )
        
        # Update geographic codes and tracking columns where matches found
        match_found_mask = merged_streets['lsoa11cd_new'].notna()
        if match_found_mask.sum() > 0:
            street_indices = enhanced_ocod.index[street_fillable_mask][match_found_mask]
            
            for col in ['oa11cd', 'lsoa11cd', 'msoa11cd']:
                enhanced_ocod.loc[street_indices, col] = merged_streets.loc[match_found_mask, f'{col}_new'].values
            
            enhanced_ocod.loc[street_indices, 'match_prob'] = merged_streets.loc[match_found_mask, 'fraction'].values
            enhanced_ocod.loc[street_indices, 'geog_match'] = 'street'
    
    # Convert geog_match to categorical for memory efficiency
    enhanced_ocod['geog_match'] = enhanced_ocod['geog_match'].astype('category')
    
    return enhanced_ocod

In [179]:
pre_process_ocod2 = pre_process_ocod
pre_process_ocod2['building_name'] = pre_process_ocod2['building_name'].str.lower()
pre_process_ocod2['street_name2'] = pre_process_ocod2['street_name2'].str.lower()
enhanced  =  enhance_ocod_with_gazetteers(pre_process_ocod2, building_gazetteer, district_gazetteer, street_gazetteer)



In [180]:
enhanced.loc[enhanced['lsoa11cd'].notna(), ['unit_id', 'unit_type', 'building_name', 'street_number', 'street_name',
       'postcode', 'city', 'district', 'county', 'region', 'price_paid',
       'property_address', 'geog_match', 'match_prob']]

Unnamed: 0,unit_id,unit_type,building_name,street_number,street_name,postcode,city,district,county,region,price_paid,property_address,geog_match,match_prob
0,,,westleigh lodge care home,,Nel Pan Lane,WN7 5JT,Leigh,WIGAN,GREATER MANCHESTER,NORTH WEST,,"Westleigh Lodge Care Home, Nel Pan Lane, Leigh...",,
1,1,Flat,,,Canal Street,M1 3HE,Manchester,MANCHESTER,GREATER MANCHESTER,NORTH WEST,270000.0,"Flat 1, 1a Canal Street, Manchester (M1 3HE)",,
2,201,Flat,,,Regent Road,M3 4AY,Manchester,MANCHESTER,GREATER MANCHESTER,NORTH WEST,243516.0,"Flat 201, 1 Regent Road, Manchester (M3 4AY)",,
3,,,,,Gerard Street,WN4 9AA,Wigan,WIGAN,GREATER MANCHESTER,NORTH WEST,,"Land at 2a Gerard Street, Ashton In Makerfield...",,
4,111,Unit,timber wharf,,Worsley Street,M15 4NZ,Manchester,MANCHESTER,GREATER MANCHESTER,NORTH WEST,342000.0,"Unit 111, Timber Wharf, Worsley Street, Manche...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119828,,,,,Lime Street,EC3M 7AW,London,CITY OF LONDON,GREATER LONDON,GREATER LONDON,,"2nd Floor, 52 Lime Street, London (EC3M 7AW)",,
119829,,,,155,Bishopsgate,EC2M 3AD,London,CITY OF LONDON,GREATER LONDON,GREATER LONDON,,"155 Bishopsgate, London (EC2M 3AD)",,
119830,,,,,Lime Street,EC3M 7AF,London,CITY OF LONDON,GREATER LONDON,GREATER LONDON,,"9th and 10th floors, 52 Lime Street, London (E...",,
119831,,,,,Bevis Marks,EC3A 7BA,London,CITY OF LONDON,GREATER LONDON,GREATER LONDON,,"part of Tenth Floor, 6 Bevis Marks, London (EC...",,


In [62]:
enhanced.groupby('geog_match')['match_prob'].describe()

  enhanced.groupby('geog_match')['match_prob'].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
geog_match,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
building,1162.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
street,25004.0,0.696205,0.310967,0.02414,0.396552,0.7625,1.0,1.0


## Analysing the entries with no geo-match

In [63]:
# Get the missing LSOA records
missing_lsoa_addresses = enhanced.loc[enhanced['lsoa11cd'].isna(), 'property_address']

# Count how many contain each keyword
keyword_counts = {}
keywords = ['airspace', 'air space', 'land', 'plot', 'car park', 'carpark' ]

for keyword in keywords:
    count = missing_lsoa_addresses.str.contains(keyword, case=False, na=False).sum()
    keyword_counts[keyword] = count
    print(f"'{keyword}': {count}")

# If you want to see which addresses contain ANY of these keywords
any_keyword_mask = missing_lsoa_addresses.str.contains('|'.join(keywords), case=False, na=False)
any_keyword_count = any_keyword_mask.sum()
print(f"\nAddresses containing any of {keywords}: {any_keyword_count}")

# If you want to see the actual addresses that contain these keywords
addresses_with_keywords = missing_lsoa_addresses[any_keyword_mask]
print(f"\nSample addresses containing keywords:")
print(addresses_with_keywords.head(10).tolist())

# Summary counts
print(f"\nSummary:")
print(f"Total missing LSOA: {len(missing_lsoa_addresses)}")
print(f"Contains keywords: {any_keyword_count}")
print(f"Percentage: {any_keyword_count/len(missing_lsoa_addresses)*100:.1f}%")

'airspace': 42
'air space': 13
'land': 3474
'plot': 1256
'car park': 313
'carpark': 2

Addresses containing any of ['airspace', 'air space', 'land', 'plot', 'car park', 'carpark']: 4977

Sample addresses containing keywords:
['Land at Queensgate Centre, Peterborough', 'Land and Buildings on the North East Side of Surbiton Street, Sheffield', 'Land on the north west side of New Drove, North Brink, Wisbech', 'land at Britannia Road, Stockton-On-Tees', 'Land at Great Wood Skirmett, Henley-On-Thames', 'land at Maltby Court, Oldham', 'land lying to the West of Windermere Road, Linthorpe', 'Land forming part of 7 Blenavon Court, Yarm', 'Land at Portrack, Stockton-On-Tees', 'Land adjoining River Stour, Part Of Hands Farm, Newbold On Stour, Stratford-Upon-Avon']

Summary:
Total missing LSOA: 8230
Contains keywords: 4977
Percentage: 60.5%


# do some classification

In [57]:
voa_businesses = load_voa_ratinglist(str(voa_path), postcode_district_lookup)

Initial rows: 2293839


### New classification

create VOA business gazetteer

In [139]:
def add_business_matches(df, voa_businesses):
    """
    Add boolean columns for business matches based on building, street, and street number.
    
    Parameters:
    df: Main dataframe to add matches to
    
    Returns:
    df with added boolean columns: building_match, street_match, number_match
    """
    
    # Create a copy to avoid modifying the original
    enhanced_df = df.copy()
    
    # Initialize the new columns
    enhanced_df['building_match'] = False
    enhanced_df['street_match'] = False
    enhanced_df['number_match'] = False
    
    # Building match
    has_building_and_lad = (enhanced_df['building_name'].notna() & 
                           enhanced_df['lad11cd'].notna())
    
    if has_building_and_lad.sum() > 0:
        building_data = enhanced_df[has_building_and_lad].copy()
        building_match_df = voa_businesses[['building_name', 'lad11cd']].dropna().drop_duplicates()
        
        merged_buildings = building_data.merge(
            building_match_df, 
            on=['building_name', 'lad11cd'], 
            how='left', 
            indicator='_building_merge'
        )
        
        match_found_mask = merged_buildings['_building_merge'] == 'both'
        if match_found_mask.sum() > 0:
            building_indices = enhanced_df.index[has_building_and_lad][match_found_mask]
            enhanced_df.loc[building_indices, 'building_match'] = True
    
    # Street match
    has_street_and_lad = (enhanced_df['street_name2'].notna() & 
                         enhanced_df['lad11cd'].notna())
    
    if has_street_and_lad.sum() > 0:
        street_data = enhanced_df[has_street_and_lad].copy()
        street_match_df = voa_businesses[['street_name2', 'lad11cd']].dropna().drop_duplicates()
        
        merged_streets = street_data.merge(
            street_match_df, 
            on=['street_name2', 'lad11cd'], 
            how='left', 
            indicator='_street_merge'
        )
        
        match_found_mask = merged_streets['_street_merge'] == 'both'
        if match_found_mask.sum() > 0:
            street_indices = enhanced_df.index[has_street_and_lad][match_found_mask]
            enhanced_df.loc[street_indices, 'street_match'] = True
    
    # Street number match
    has_number_street_and_lad = (enhanced_df['street_number'].notna() & 
                                enhanced_df['street_name2'].notna() & 
                                enhanced_df['lad11cd'].notna())
    
    if has_number_street_and_lad.sum() > 0:
        number_data = enhanced_df[has_number_street_and_lad].copy()
        number_match_df = voa_businesses[['street_number', 'street_name2', 'lad11cd']].dropna().drop_duplicates()
        
        merged_numbers = number_data.merge(
            number_match_df, 
            on=['street_number', 'street_name2', 'lad11cd'], 
            how='left', 
            indicator='_number_merge'
        )
        
        match_found_mask = merged_numbers['_number_merge'] == 'both'
        if match_found_mask.sum() > 0:
            number_indices = enhanced_df.index[has_number_street_and_lad][match_found_mask]
            enhanced_df.loc[number_indices, 'number_match'] = True
    
    return enhanced_df

In [181]:
with_matches = add_business_matches(enhanced, voa_businesses)

In [185]:
def property_class(df):
    df["class"] = np.select(
        [
            df["property_address"].str.contains(r"^(?:land|plot)", case=False),
            df["property_address"].str.contains(
                r"^(?:[a-z\s]*)(?:garage|parking(?:\s)?space|parking space|car park(?:ing)?)",
                case=False,
            ),
            df["property_address"].str.contains(
                r"^(?:the airspace|airspace)", case=False
            ),
            df["property_address"].str.contains(
                r"penthouse|flat|apartment", case=False
            ),
            ~df["street_match"],# If there is no business on the street then it must be a residential
            df["property_address"].str.contains(
                r"cinema|hotel|office|centre|\bpub|holiday(?:\s)?inn|travel lodge|travelodge|medical|business|cafe|^shop| shop|service|logistics|building supplies|restaurant|home|^store(?:s)?\b|^storage\b|company|ltd|limited|plc|retail|leisure|industrial|hall of|trading|commercial|technology|works|club,|advertising|school|church|(?:^room)",
                case=False,
            ),
            df["property_address"].str.contains(
                r"^[a-z\s']+\b(?:land(?:s)?|plot(?:s)?)\b", case=False
            ),  # land with words before it
            df["building_name"].str.contains(
                r"\binn$|public house|^the\s\w+\sand\s\w+|(?:tavern$)",
                case=False,
                na=False,
            ),  # pubs in various guises
            df["building_match"],  # a business building was matched
            df["number_match"], # The street and number of a business was matched
            (~df["number_match"]) & df["street_match"] & df['street_number'].notna(), #The address has a street number but it does not match with any business on the street
        ],
        [
            "land",
            "carpark",
            "airspace",
            "residential",
            "residential",
            "business",
            "land",
            "business",
            "business",
            "business",
            "residential"
        ],
        default="unknown",
    )

    return df

In [186]:
classified = property_class(with_matches)

classified.groupby('class').size()

class
airspace        1719
business       16326
carpark         2996
land           18228
residential    68777
unknown        11787
dtype: int64

In [187]:
classified.loc[classified['class']=='unknown', ['title_number','building_name','property_address','street_name2', 'street_number',
'building_match','street_match', 'number_match','class']]

Unnamed: 0,title_number,building_name,property_address,street_name2,street_number,building_match,street_match,number_match,class
4,MAN300971,timber wharf,"Unit 111, Timber Wharf, Worsley Street, Manche...",worsleystreet,,False,True,False,unknown
16,CB400630,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (...",fengate,,False,True,False,unknown
76,MAN203328,southern tower,"Southern Tower, Bengal Mill, Bengal Street, Ma...",bengalstreet,,False,True,False,unknown
77,MAN203328,bengal mill,"Southern Tower, Bengal Mill, Bengal Street, Ma...",bengalstreet,,False,True,False,unknown
80,MAN204044,the vibe,"Unit 3, The Vibe, 175 Broughton Lane, Salford ...",broughtonlane,,False,True,False,unknown
...,...,...,...,...,...,...,...,...,...
119814,AGL355237,,"Level 10 West, 5 Broadgate, London (EC2M 2QS)",broadgate,,False,True,False,unknown
119827,AGL465195,ormond house,"4th Floor, Ormond House, 63 Queen Victoria Str...",queenvictoriastreet,,False,True,False,unknown
119828,AGL470128,,"2nd Floor, 52 Lime Street, London (EC3M 7AW)",limestreet,,False,True,False,unknown
119830,AGL473595,,"9th and 10th floors, 52 Lime Street, London (E...",limestreet,,False,True,False,unknown


In [188]:
classified

Unnamed: 0,title_number,tenure,unit_id,unit_type,number_filter,building_name,street_number,street_name,postcode,city,...,lsoa11cd,msoa11cd,street_number2,street_name2,match_prob,geog_match,building_match,street_match,number_match,class
0,MAN296957,Leasehold,,,,westleigh lodge care home,,Nel Pan Lane,WN7 5JT,Leigh,...,E01006282,E02001313,,nelpanlane,,,False,True,False,business
1,MAN297185,Leasehold,1,Flat,,,,Canal Street,M1 3HE,Manchester,...,E01033654,E02006912,,canalstreet,,,False,True,False,residential
2,MAN299183,Leasehold,201,Flat,,,,Regent Road,M3 4AY,Manchester,...,E01033673,E02006916,,regentroad,,,False,False,False,residential
3,MAN300821,Freehold,,,,,,Gerard Street,WN4 9AA,Wigan,...,E01006223,E02001322,,gerardstreet,,,False,True,False,land
4,MAN300971,Leasehold,111,Unit,,timber wharf,,Worsley Street,M15 4NZ,Manchester,...,E01033673,E02006916,,worsleystreet,,,False,True,False,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119828,AGL470128,Leasehold,,,,,,Lime Street,EC3M 7AW,London,...,E01032739,E02000001,,limestreet,,,False,True,False,unknown
119829,AGL473126,Leasehold,,,,,155,Bishopsgate,EC2M 3AD,London,...,E01032739,E02000001,155,bishopsgate,,,False,True,True,business
119830,AGL473595,Leasehold,,,,,,Lime Street,EC3M 7AF,London,...,E01032739,E02000001,,limestreet,,,False,True,False,unknown
119831,AGL475468,Leasehold,,,,,,Bevis Marks,EC3A 7BA,London,...,E01032739,E02000001,,bevimarks,,,False,True,False,unknown


# Classify as multi property and expand

In [200]:
from enhance_ocod.labelling.ner_regex import xx_to_yy_regex
from enhance_ocod.address_parsing import expand_dataframe_numbers

df = classified.copy()

residential_mask = df['class'] == 'residential'
multi_condition = (df['unit_id'].str.contains(xx_to_yy_regex, na=False) | 
                   (df['unit_id'].isna() & df['street_number'].str.contains(xx_to_yy_regex, na=False)))

df['is_multi'] = np.where(residential_mask & multi_condition, True, False)

In [199]:
df_res.loc[df['is_multi'],:]

Unnamed: 0,title_number,tenure,unit_id,unit_type,number_filter,building_name,street_number,street_name,postcode,city,...,lsoa11cd,msoa11cd,street_number2,street_name2,match_prob,geog_match,building_match,street_match,number_match,class
355,MAN253018,Freehold,1-9,,,,11,Daisygate Drive,,Altrincham,...,E01006091,E02001277,11,daisygatedrive,1.0,street,False,False,False,residential
356,MAN253018,Freehold,1-9,,,,11,Daisygate Drive,,Altrincham,...,E01006091,E02001277,11,daisygatedrive,1.0,street,False,False,False,residential
787,SGL751004,Leasehold,1 to 16,,,billsley court,,Dagmar Road,SE25 6HZ,London,...,E01001113,E02000206,,dagmarroad,,,False,True,False,residential
998,LA948925,Freehold,17-24,,,,1-3,Cable Street,LA1 1BP,Lancaster,...,E01025093,E02005234,3,cablestreet,,,False,True,True,residential
1159,SGL640152,Freehold,1 to 10,,,the briar,,Thicket Road,SM1 4QY,Sutton,...,E01004141,E02000851,,thicketroad,,,False,False,False,residential
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119062,SGL241656,Freehold,1 to 6,,,,,Reddown Road,CR5 1AN,Coulsdon,...,E01001017,E02000236,,reddownroad,,,False,True,False,residential
119269,SGL672616,Freehold,1-8,,,martin court,,Camden Gardens,SM1 2EQ,Sutton,...,E01004162,E02000857,,camdengardens,,,False,False,False,residential
119381,NGL988682,Leasehold,2-02,Apartment,,,,Newton Street,WC2B 5EL,London,...,E01000918,E02000193,,newtonstreet,,,False,True,False,residential
119452,AGL241067,Leasehold,10/12,Flat,,,,Mildmay Park,N1 4PG,London,...,E01002776,E02000567,,mildmaypark,,,False,True,False,residential


In [201]:
test = expand_dataframe_numbers(df, column_name = 'unit_id', print_every=1000, min_count=1)

TypeError: expected string or bytes-like object