# A lot of chaos has happened; this notebook is supposed to work it out and re-create the weakly labelled data

At the end I should be able to create a clearly labelled and docstringed .py file and delete this

Then I will need to 
- re-train model
- re-test results
- re-choose model
- re-parse everything again

In [1]:

from enhance_ocod.inference import parse_addresses_pipeline, convert_to_entity_dataframe
from enhance_ocod.address_parsing import (
    load_and_prep_OCOD_data, parsing_and_expansion_process, post_process_expanded_data, load_postcode_district_lookup)
from enhance_ocod.locate_and_classify import (preprocess_expandaded_ocod_data, 
    add_missing_lads_ocod, load_voa_ratinglist, street_and_building_matching, substreet_matching,
    counts_of_businesses_per_oa_lsoa, voa_address_match_all_data, classification_type1, classification_type2,
    contract_ocod_after_classification
)
from enhance_ocod.price_paid_process import load_and_process_pricepaid_data
import os
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import time
import gc  # Add for memory management

import pickle
import json
from pathlib import Path


import torch

# There is a warning related to bfill and ffill which is basically internal to pandas so silencing here
import warnings
warnings.filterwarnings('ignore', message='.*Downcasting object dtype arrays.*')

torch.set_float32_matmul_precision('medium')

# NOTE(review): `Path('..').parent` is a purely lexical operation that yields
# `Path('.')`, so SCRIPT_DIR is the absolute *current working directory*, not
# its parent. Every path below then uses `SCRIPT_DIR.parent`, i.e. the
# directory that contains the cwd, as the project root.
SCRIPT_DIR = Path('..').parent.absolute()

# ====== CONSTANT PATHS AND SETTINGS ======
# All inputs/outputs live under <project root>/data and <project root>/models.
input_dir = SCRIPT_DIR.parent / "data" / "ocod_history"
output_dir = SCRIPT_DIR.parent / "data" / "ocod_history_processed2"
model_path = SCRIPT_DIR.parent / "models" / "address_parser_original_fullset" / "final_model"
ONSPD_path = SCRIPT_DIR.parent / "data" / "ONSPD_FEB_2025.zip"
price_paid_path = SCRIPT_DIR.parent / "data" / "price_paid_data" / "price_paid_complete_may_2025.csv"
processed_price_paid_dir = SCRIPT_DIR.parent / "data" / "processed_price_paid"
voa_path = SCRIPT_DIR.parent / "data" / "2023_non_domestic_rating_list_entries.zip"
# Side effect: creates the output directory (and parents) if missing.
output_dir.mkdir(parents=True, exist_ok=True)

# Directory used to cache pickled address-parsing results between runs.
parsed_results_dir = SCRIPT_DIR.parent / "data" / "parsed_ocod_dicts"
parsed_results_dir.mkdir(parents=True, exist_ok=True)

# List of all OCOD zip files in input_dir, sorted by file name.
#
# NOTE: the test-subset selection further down is commented out, so every
# matching file is listed here (the old "only 10 files" testing note no
# longer applies).
#
all_files = sorted([f for f in input_dir.glob("OCOD_FULL_*.zip")])


# Uncomment the two lines below to restrict the run to a small sample of files.
#test_indices = [0, 25, 50, 75]
#all_files = [all_files[i] for i in test_indices if i < len(all_files)]
print(f"Found {len(all_files)} OCOD history files.")

# Load common data once (if these don't change between files)
print("Loading common reference data...")
postcode_district_lookup = load_postcode_district_lookup(str(ONSPD_path))
voa_businesses = load_voa_ratinglist(str(voa_path), postcode_district_lookup)

# NOTE(review): the next line is a bare string expression with no effect; it
# looks like a leftover path note, and it names `ocod_history_processed`
# whereas output_dir above points at `ocod_history_processed2`.
'../data/ocod_history_processed/OCOD_FULL_2022_02.parquet'
# Single OCOD release processed by the rest of this notebook; `all_files` is
# only counted above and is not iterated in the cells shown here.
zip_file = input_dir / "OCOD_FULL_2022_02.zip"
out_name = zip_file.stem + ".parquet"
out_path = output_dir / out_name

Found 99 OCOD history files.
Loading common reference data...


  postcode_district_lookup = pd.read_csv(f)[['pcds', 'oslaua', 'oa11', 'lsoa11', 'msoa11', 'ctry']]


Initial rows: 2282385


In [None]:

# End-to-end run for one OCOD release: parse addresses (with an on-disk cache),
# geolocate, classify, then write a parquet file to `out_path`.
# Relies on `zip_file`, `out_path`, the path constants and the reference data
# (`postcode_district_lookup`, `voa_businesses`) defined in the setup cell.

# Define parsed results file path
# NOTE(review): the cache name depends only on the zip file stem, so a cache
# written with a different model/settings would be silently reused. Delete the
# .pkl file to force re-parsing.
parsed_results_file = parsed_results_dir / f"{zip_file.stem}_parsed_results.pkl"


print(f"Processing {zip_file.name}...")

# Load and process the OCOD data
ocod_data = load_and_prep_OCOD_data(str(zip_file))

###############
# Parse addresses
###############
if parsed_results_file.exists():
    print(f"Loading cached parsing results for {zip_file.name}...")
    # NOTE(review): pickle.load can execute arbitrary code; this is only
    # acceptable because the file is a cache written by this notebook itself.
    with open(parsed_results_file, 'rb') as f:
        results = pickle.load(f)
    print(f"Loaded cached results with success rate: {results['summary']['success_rate']:.1%}")
else:
    print(f"Parsing addresses for {zip_file.name}...")
    start_time = time.time()

    results = parse_addresses_pipeline(
        df=ocod_data,
        short_batch_size = 128,# The default seems really slow, might be to do with loading not sure
        model_path=str(model_path),
        target_column="property_address",
    )

    end_time = time.time()
    print(f"Address parsing took {end_time - start_time:.2f} seconds")
    print(f"Success rate: {results['summary']['success_rate']:.1%}")
    
    # Save parsing results
    print(f"Saving parsing results to {parsed_results_file}...")
    with open(parsed_results_file, 'wb') as f:
        pickle.dump(results, f)

# Continue with post-parsing processing
# `test` is a scratch name for the intermediate entity DataFrame; it is
# deleted again right after being merged back into `ocod_data`.
test = convert_to_entity_dataframe(results)
test = parsing_and_expansion_process(all_entities=test)
ocod_data = post_process_expanded_data(test, ocod_data)

# Clean up
# Drop the large intermediates before the price-paid data is loaded.
del results, test
gc.collect()

###############
# Geolocate
###############
print(f"Geolocating {zip_file.name}...")

ocod_data = preprocess_expandaded_ocod_data(ocod_data, postcode_district_lookup)

# NOTE(review): the price-paid years are hard-coded to 2017-2019 regardless of
# which OCOD release is being processed - confirm this is intended.
price_paid_df = load_and_process_pricepaid_data(
    file_path=str(price_paid_path), 
    processed_dir=processed_price_paid_dir,
    postcode_district_lookup=postcode_district_lookup, 
    years_needed=[2017, 2018, 2019]
)

ocod_data = add_missing_lads_ocod(ocod_data, price_paid_df)
ocod_data = street_and_building_matching(ocod_data, price_paid_df, voa_businesses)
ocod_data = substreet_matching(ocod_data, price_paid_df, voa_businesses)

# Clean up price paid data
del price_paid_df
gc.collect()

###########
# Classify
###########
print(f"Classifying {zip_file.name}...")
ocod_data = counts_of_businesses_per_oa_lsoa(ocod_data, voa_businesses)
ocod_data = voa_address_match_all_data(ocod_data, voa_businesses)

ocod_data = classification_type1(ocod_data)
ocod_data = classification_type2(ocod_data)

# Presumably collapses the expanded rows again for everything except the
# listed classes - TODO confirm against contract_ocod_after_classification.
ocod_data = contract_ocod_after_classification(ocod_data, class_type='class2', classes=['residential'])

# Columns kept in the saved output, in this order.
columns = ['title_number', 'within_title_id', 'within_larger_title', 'unique_id', 
            'unit_id', 'unit_type', 'building_name', 'street_number', 'street_name', 
            'postcode', 'city', 'district', 'region', 'property_address', 'oa11cd', 
            'lsoa11cd', 'msoa11cd', 'lad11cd','country_incorporated' ,'class', 'class2']

# Select the output columns and rename the two "within title" columns.
ocod_data = ocod_data.loc[:, columns].rename(columns={
    'within_title_id': 'nested_id',
    'within_larger_title': 'nested_title'
})
# Save results
ocod_data.to_parquet(out_path)
print(f"Saved processed data to {out_path}")

# As the last expression of the cell, the return value of gc.collect() is
# what the notebook displays as the cell result.
gc.collect()

Processing OCOD_FULL_2022_02.zip...
Loading cached parsing results for OCOD_FULL_2022_02.zip...
Loaded cached results with success rate: 100.0%
Processing 421,285 entities into DataFrame...
Processed 350,000/421,285 entities
Computing label counts...
✓ Named Entity Recognition processing complete
Total entities extracted: 421,285
Geolocating OCOD_FULL_2022_02.zip...
Processed data found. Skipping preprocessing.
Loaded 1067153 records for year 2017
Loaded 1037132 records for year 2018
Loaded 1011344 records for year 2019
lad 100 of 280
lad 200 of 280
Classifying OCOD_FULL_2022_02.zip...
Processing 318 LADs
Creating street match lookup...
Creating address lookup...
Performing address matching...
Saved processed data to /teamspace/studios/this_studio/enhance_ocod/data/ocod_history_processed2/OCOD_FULL_2022_02.parquet


28

In [2]:
from enhance_ocod.labelling.weak_labelling import process_dataframe_batch, convert_weak_labels_to_standard_format, remove_overlapping_spans

# Re-point at the same OCOD release used above so this cell can run on its own.
zip_file = input_dir / "OCOD_FULL_2022_02.zip"


# Read the raw CSV directly from the zip archive (no load_and_prep_OCOD_data
# step). Later cells address the text column as 'Property Address' on this
# frame, i.e. the raw column name rather than 'property_address'.
df = pd.read_csv(zip_file)

  df = pd.read_csv(zip_file)


In [7]:
# Case-insensitive substring search over the weakly labelled entries.
# Requires `weakly_labelled_dict` (built by the process_dataframe_batch cell,
# which appears further down in this export) to exist in the kernel already.
search_term = "Ground to ninth Floor Flats being 101-114, 201-214, 301-314, 401-414, 501-514, 601-613 and 701-704 Alaska Building"
# Rebinds the notebook-level name `results` to the list of matching entries.
results = [entry for entry in weakly_labelled_dict if search_term.lower() in entry['text'].lower()]
print(results)

[{'text': 'Ground to ninth Floor Flats being 101-114, 201-214, 301-314, 401-414, 501-514, 601-613 and 701-704 Alaska Building, 101-114, 201-214,301-314, 401-412, 501-506 and 601-605 Arizona Building, 101-114, 201-214, 301-314, 401-414, 501-514, 601-614, 701-708, 801-804, 901-903 California Building, 101-108,     201-208, 301-307, 401-408, 501-508, 601-608, 701-708, 801-808 and 901-903 Colorado Building, 1-4, 101-109, 201-210, 301-310, 401-410, 501-510 and 601-605 Dakota Building, 1-7, 101-108, 201-208, 301-308, 401-408, 501-506 and 601-604 Idaho Building, 102-112, 201-212, 301-312, 401-412, 501-508 and 601-604 Indiana Building, 1-15, 101-116, 201-216, 301-315, 401-416, 501-510 Montana Building, 101-108, 201-208, 301-308, 401-408, 501-506 and 601-604 Nebraska Building, 1-10, 101-110, 201-210, 301-310 and 402-403 Utah Building, 1-10 and 101-110 Boston Building, 1-6, 101-106, 201-206, 301-306, 401-408 and 501-507 Madison Building, Deals Gateway, London', 'spans': [{'text': 'Alaska Buildin

In [3]:
from enhance_ocod.labelling.ner_spans import lfs  # Import your functions
from enhance_ocod.labelling.weak_labelling import create_flat_tag, create_commercial_park_tag

def debug_labelling_functions(text: str,
                              functions=None,
                              create_mock_row: bool = True) -> None:
    """
    Print, for one address string, the spans each labelling function produces.

    Every function is called with a single row built from ``text``. Its spans
    are printed as ``[start-end] label 'substring'``, and a final section lists
    every pair of spans (from any function) whose character ranges overlap.
    Nothing is returned; the report goes to stdout.

    Args:
        text: The address text to analyse.
        functions: Iterable of labelling functions taking a row and returning
            ``(start, end, label)`` tuples. Falls back to the module-level
            ``lfs`` list when None.
        create_mock_row: When True the row is taken from a one-row DataFrame
            passed through ``create_flat_tag`` and ``create_commercial_park_tag``;
            when False the row is a bare Series holding only ``text``.
    """
    import pandas as pd

    labellers = lfs if functions is None else functions

    # Build the row handed to every labelling function.
    if create_mock_row:
        frame = pd.DataFrame({'text': [text]})
        frame = create_flat_tag(frame, 'text')
        frame = create_commercial_park_tag(frame, 'text')
        row = frame.iloc[0]
    else:
        row = pd.Series({'text': text})

    # Report header.
    print(f"TEXT: {text}")
    print(f"LENGTH: {len(text)}")
    if create_mock_row:
        print(f"FLAT_TAG: {row.get('flat_tag', 'N/A')}")
        print(f"COMMERCIAL_PARK_TAG: {row.get('commercial_park_tag', 'N/A')}")
    print("=" * 80)

    collected = []

    for labeller in labellers:
        # A failing labelling function is reported and skipped so the
        # remaining functions are still shown.
        try:
            found = labeller(row)
            print(f"\n{labeller.__name__.upper()}:")

            if found:
                for begin, finish, tag in found:
                    snippet = text[begin:finish]
                    print(f"  [{begin:3d}-{finish:3d}] {tag:15s} '{snippet}'")
                    collected.append((begin, finish, tag, labeller.__name__))
            else:
                print("  No spans found")

        except Exception as err:
            print(f"\n{labeller.__name__.upper()}: ERROR - {err}")

    print("\n" + "=" * 80)
    print("OVERLAP ANALYSIS:")

    # Order by start offset, then test every later span against each earlier one.
    ordered = sorted(collected, key=lambda item: item[0])
    clashes = [
        (first, second)
        for position, first in enumerate(ordered)
        for second in ordered[position + 1:]
        # Two half-open ranges overlap when each one ends after the other starts.
        if first[1] > second[0] and second[1] > first[0]
    ]

    if not clashes:
        print("No overlaps found")
        return

    print(f"Found {len(clashes)} overlaps:")
    for first, second in clashes:
        print("  OVERLAP:")
        for source in (first, second):
            begin, finish, tag, origin = source
            print(f"    {origin:20s} [{begin:3d}-{finish:3d}] {tag:15s} '{text[begin:finish]}'")
        print()

In [5]:
# Worked example: a long multi-building address (the "Alaska Building" entry
# searched for in another cell), used to inspect which labelling functions
# fire on it and where their spans overlap.
target_text = 'Ground to ninth Floor Flats being 101-114, 201-214, 301-314, 401-414, 501-514, 601-613 and 701-704 Alaska Building, 101-114, 201-214,301-314, 401-412, 501-506 and 601-605 Arizona Building, 101-114, 201-214, 301-314, 401-414, 501-514, 601-614, 701-708, 801-804, 901-903 California Building, 101-108,     201-208, 301-307, 401-408, 501-508, 601-608, 701-708, 801-808 and 901-903 Colorado Building, 1-4, 101-109, 201-210, 301-310, 401-410, 501-510 and 601-605 Dakota Building, 1-7, 101-108, 201-208, 301-308, 401-408, 501-506 and 601-604 Idaho Building, 102-112, 201-212, 301-312, 401-412, 501-508 and 601-604 Indiana Building, 1-15, 101-116, 201-216, 301-315, 401-416, 501-510 Montana Building, 101-108, 201-208, 301-308, 401-408, 501-506 and 601-604 Nebraska Building, 1-10, 101-110, 201-210, 301-310 and 402-403 Utah Building, 1-10 and 101-110 Boston Building, 1-6, 101-106, 201-206, 301-306, 401-408 and 501-507 Madison Building, Deals Gateway, London'

# Uses the default labelling functions (`lfs`) and a mock row with flat/commercial tags.
debug_labelling_functions(target_text)

TEXT: Ground to ninth Floor Flats being 101-114, 201-214, 301-314, 401-414, 501-514, 601-613 and 701-704 Alaska Building, 101-114, 201-214,301-314, 401-412, 501-506 and 601-605 Arizona Building, 101-114, 201-214, 301-314, 401-414, 501-514, 601-614, 701-708, 801-804, 901-903 California Building, 101-108,     201-208, 301-307, 401-408, 501-508, 601-608, 701-708, 801-808 and 901-903 Colorado Building, 1-4, 101-109, 201-210, 301-310, 401-410, 501-510 and 601-605 Dakota Building, 1-7, 101-108, 201-208, 301-308, 401-408, 501-506 and 601-604 Idaho Building, 102-112, 201-212, 301-312, 401-412, 501-508 and 601-604 Indiana Building, 1-15, 101-116, 201-216, 301-315, 401-416, 501-510 Montana Building, 101-108, 201-208, 301-308, 401-408, 501-506 and 601-604 Nebraska Building, 1-10, 101-110, 201-210, 301-310 and 402-403 Utah Building, 1-10 and 101-110 Boston Building, 1-6, 101-106, 201-206, 301-306, 401-408 and 501-507 Madison Building, Deals Gateway, London
LENGTH: 952
FLAT_TAG: True
COMMERCIAL_PAR

In [3]:


# Weakly label every address in the raw OCOD frame `df` (text column
# 'Property Address'), in batches of 5000 rows.
# NOTE: despite the `_dict` suffix, other cells iterate this object as a
# sequence of entries and read entry['text'] from each.
weakly_labelled_dict = process_dataframe_batch(df, 
                           batch_size = 5000,
                           text_column  = 'Property Address',
                           include_function_name  = False,
                           save_intermediate  = False,
                           verbose  = True)
                           
# The return value is not captured, so this presumably edits the entries in
# place - TODO confirm against remove_overlapping_spans.
remove_overlapping_spans(weakly_labelled_dict)

Processing batches: 100%|██████████| 19/19 [01:32<00:00,  4.89s/it]



Processing complete:
  - Processed 94091 rows successfully
  - Found 476207 total spans
  - Average 5.06 spans per row


In [22]:
from typing import Dict, List, Optional, Tuple

def convert_weakly_labelled_list_to_dataframe(results: List[Dict], batch_size: int = 50000) -> pd.DataFrame:
    """
    Convert weakly labelled results to a long-format entity DataFrame.

    Works with data produced by the weak labelling function `process_dataframe_batch`.
    This function exists due to a structural mismatch with the `convert_to_entity_dataframe`
    function from the inference module. This should be resolved in future versions.
    Creates a long-format DataFrame where each row represents one extracted entity,
    suitable for further analysis or database storage.

    Args:
        results: List of dictionaries, each with a 'text' string and a 'spans'
            list. Every span is a dictionary with 'label', 'start' and 'end';
            'start'/'end' are character offsets into 'text'.
        batch_size: Progress is printed each time the running entity count
            passes another multiple of this value. Values <= 0 disable
            progress output.

    Returns:
        DataFrame with columns: datapoint_id, label, start, end, text, label_text,
        label_id_count. `datapoint_id` is the position of the source entry in
        `results`; `label_text` is `text[start:end]`; `label_id_count` numbers
        (from 0) repeated labels within the same datapoint. An empty DataFrame
        with these columns is returned when no spans are present.

    Raises:
        KeyError: If an entry lacks 'text'/'spans' or a span lacks
            'label'/'start'/'end'.

    Example:
    ```python
    import pandas as pd
    
    # Create example DataFrame with property addresses
    df = pd.DataFrame({
        'property_address': [
            'Westleigh Lodge Care Home, Nel Pan Lane, Leigh (WN7 5JT)',
            'Flat 1, 1a Canal Street, Manchester (M1 3HE)',
            'Flat 201, 1 Regent Road, Manchester (M3 4AY)',
            '15 Oak Avenue, Birmingham (B12 9QR)'
        ]
    })

    # Process DataFrame with property addresses
    processed_list = process_dataframe_batch(df, 
                                batch_size=5000,
                                text_column='property_address')
                                
    # Remove any overlapping spans
    remove_overlapping_spans(processed_list)

    # Convert to standard format
    processed_list = convert_weak_labels_to_standard_format(processed_list)

    processed_df = convert_weakly_labelled_list_to_dataframe(processed_list)

    print(processed_df)
    ```
    
    Note: In the example above the input DataFrame (df) must contain a
    'property_address' column.
    """
    column_order = ['datapoint_id', 'label', 'start', 'end', 'text', 'label_text', 'label_id_count']

    total_entities = sum(len(result["spans"]) for result in results)

    if total_entities == 0:
        print("Warning: No entities found in results!")
        return pd.DataFrame(columns=column_order)

    print(f'Processing {total_entities:,} entities into DataFrame...')

    # Column-wise accumulators; building the frame from flat lists is much
    # cheaper than appending rows to a DataFrame.
    datapoint_ids = []
    labels = []
    starts = []
    ends = []
    texts = []
    label_texts = []

    processed = 0
    # Entity count at which the next progress line is due. Counts grow by a
    # variable amount per entry, so an exact-multiple check (`% batch_size == 0`)
    # would only fire by coincidence; compare against a moving threshold instead.
    next_report = batch_size

    # The list index doubles as the datapoint_id.
    for datapoint_id, result in enumerate(results):
        original_address = result["text"]
        spans = result["spans"]

        if not spans:
            continue

        for span in spans:
            start = span['start']
            end = span['end']
            datapoint_ids.append(datapoint_id)
            labels.append(span['label'])
            starts.append(start)
            ends.append(end)
            texts.append(original_address)
            # Recover the entity text from its character offsets.
            label_texts.append(original_address[start:end])

        processed += len(spans)
        if batch_size > 0 and processed >= next_report:
            print(f'Processed {processed:,}/{total_entities:,} entities')
            # Jump to the first multiple of batch_size beyond the current count.
            next_report = (processed // batch_size + 1) * batch_size

    all_entities = pd.DataFrame({
        'datapoint_id': datapoint_ids,
        'label': labels,
        'start': starts,
        'end': ends,
        'text': texts,
        'label_text': label_texts
    })

    print('Computing label counts...')
    # Add counter for multiple entities of same type within same address
    all_entities['label_id_count'] = all_entities.groupby(['datapoint_id', 'label'], sort=False).cumcount()

    print('✓ Named Entity Recognition processing complete')
    print(f'Total entities extracted: {len(all_entities):,}')

    return all_entities

In [None]:
from enhance_ocod.inference import convert_to_entity_dataframe

# Flatten the weak labels for the full dataset into one row per entity, using
# the notebook-local converter rather than the inference module's
# convert_to_entity_dataframe.
temp = convert_weakly_labelled_list_to_dataframe(weakly_labelled_dict)

Processing 424,440 entities into DataFrame...
Processed 350,000/424,440 entities
Processed 400,000/424,440 entities
Computing label counts...
✓ Named Entity Recognition processing complete
Total entities extracted: 424,440


In [15]:
# Summary statistics of the per-address repeat counter; a large max means some
# single address carries many entities with the same label.
temp['label_id_count'].describe()

count    424440.000000
mean          0.274265
std           2.502990
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max         109.000000
Name: label_id_count, dtype: float64

In [23]:
# Runs the docstring example of convert_weakly_labelled_list_to_dataframe end to end.
# NOTE(review): `df` here must have a 'property_address' column, i.e. it has
# to be the small example frame from that docstring, not the raw OCOD frame
# (whose column is 'Property Address'). The cell that rebinds `df` is not part
# of this export.
processed_list = process_dataframe_batch(df, 
                           batch_size = 5000,
                           text_column  = 'property_address')
                           
# Return value not captured - presumably modifies processed_list in place (TODO confirm).
remove_overlapping_spans(processed_list)

processed_list = convert_weak_labels_to_standard_format(processed_list)


processed_df = convert_weakly_labelled_list_to_dataframe(processed_list)

Processing batches: 100%|██████████| 1/1 [00:00<00:00, 136.85it/s]


Processing complete:
  - Processed 4 rows successfully
  - Found 19 total spans
  - Average 4.75 spans per row
Processing 18 entities into DataFrame...
Computing label counts...
✓ Named Entity Recognition processing complete
Total entities extracted: 18





In [7]:

# Convert the full weakly labelled output to the "standard" format; `test` is
# a scratch name that is also used for an unrelated DataFrame in the pipeline cell.
test = convert_weak_labels_to_standard_format(weakly_labelled_dict)

In [10]:
# Known failure: this call raised
# "TypeError: list indices must be integers or slices, not str" when run,
# i.e. the inference converter indexes its argument with a string key while
# `test` is a list. This structural mismatch is why the notebook defines
# convert_weakly_labelled_list_to_dataframe for weak-label output.
temp = convert_to_entity_dataframe(test)

TypeError: list indices must be integers or slices, not str