In [1]:

from enhance_ocod.inference import parse_addresses_pipeline, convert_to_entity_dataframe
from enhance_ocod.address_parsing import (
    load_and_prep_OCOD_data, parsing_and_expansion_process, post_process_expanded_data, load_postcode_district_lookup)
from enhance_ocod.locate_and_classify import (preprocess_expandaded_ocod_data, 
    add_missing_lads_ocod, load_voa_ratinglist, street_and_building_matching, substreet_matching,
    counts_of_businesses_per_oa_lsoa, voa_address_match_all_data, classification_type1, classification_type2,
    contract_ocod_after_classification
)
from enhance_ocod.price_paid_process import load_and_process_pricepaid_data
import os
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import time
import gc  # Add for memory management

import pickle
import json
from pathlib import Path


import torch

# There is a warning related to bfill and ffill which is basically internal to pandas so silencing here
import warnings
warnings.filterwarnings('ignore', message='.*Downcasting object dtype arrays.*')

# Let torch pick faster, lower-precision kernels for float32 matrix multiplications.
torch.set_float32_matmul_precision('medium')

# `Path('..').parent` is lexically `Path('.')`, so this is the absolute current
# working directory. All project paths below hang off its parent directory.
SCRIPT_DIR = Path('..').parent.absolute()

# ====== CONSTANT PATHS AND SETTINGS ======
data_dir = SCRIPT_DIR.parent / "data"

input_dir = data_dir / "ocod_history"
output_dir = data_dir / "ocod_history_processed2"
model_path = SCRIPT_DIR.parent / "models" / "address_parser_original_fullset" / "final_model"
ONSPD_path = data_dir / "ONSPD_FEB_2025.zip"
price_paid_path = data_dir / "price_paid_data" / "price_paid_complete_may_2025.csv"
processed_price_paid_dir = data_dir / "processed_price_paid"
voa_path = data_dir / "2023_non_domestic_rating_list_entries.zip"
output_dir.mkdir(parents=True, exist_ok=True)

# Cache folder for the pickled address-parsing results (one file per OCOD release)
parsed_results_dir = data_dir / "parsed_ocod_dicts"
parsed_results_dir.mkdir(parents=True, exist_ok=True)

# Every OCOD history zip in input_dir, in sorted (filename) order.
all_files = sorted(input_dir.glob("OCOD_FULL_*.zip"))

# To smoke-test on a handful of files only, uncomment the two lines below.
#test_indices = [0, 25, 50, 75]
#all_files = [all_files[i] for i in test_indices if i < len(all_files)]
print(f"Found {len(all_files)} OCOD history files.")

# Load common data once (if these don't change between files)
print("Loading common reference data...")
postcode_district_lookup = load_postcode_district_lookup(str(ONSPD_path))
voa_businesses = load_voa_ratinglist(str(voa_path), postcode_district_lookup)

# Single release pinned for this notebook run; its output is written to
# output_dir under the same stem with a .parquet extension.
zip_file = input_dir / "OCOD_FULL_2022_02.zip"
out_name = zip_file.stem + ".parquet"
out_path = output_dir / out_name

Found 99 OCOD history files.
Loading common reference data...


  postcode_district_lookup = pd.read_csv(f)[['pcds', 'oslaua', 'oa11', 'lsoa11', 'msoa11', 'ctry']]


Initial rows: 2282385


In [None]:

def _load_or_parse_addresses(ocod_data, zip_file):
    """Return the address-parsing results for one OCOD release, using the pickle cache.

    If a cached result file for ``zip_file`` exists in ``parsed_results_dir`` it is
    loaded; otherwise ``parse_addresses_pipeline`` is run on the
    ``property_address`` column of ``ocod_data`` and the result is pickled for
    next time.

    Args:
        ocod_data: Frame returned by ``load_and_prep_OCOD_data`` for this release.
        zip_file: Path of the OCOD zip; only its name/stem are used here.

    Returns:
        The results object produced by ``parse_addresses_pipeline`` (read as a
        mapping with a ``'summary'`` -> ``'success_rate'`` entry).
    """
    # Define parsed results file path
    parsed_results_file = parsed_results_dir / f"{zip_file.stem}_parsed_results.pkl"

    if parsed_results_file.exists():
        print(f"Loading cached parsing results for {zip_file.name}...")
        # NOTE: pickle.load can execute arbitrary code from the file. Only load
        # caches written by this notebook; never point this at untrusted files.
        with open(parsed_results_file, 'rb') as f:
            results = pickle.load(f)
        print(f"Loaded cached results with success rate: {results['summary']['success_rate']:.1%}")
        return results

    print(f"Parsing addresses for {zip_file.name}...")
    start_time = time.time()

    results = parse_addresses_pipeline(
        df=ocod_data,
        short_batch_size=128,  # The default seems really slow, might be to do with loading not sure
        model_path=str(model_path),
        target_column="property_address",
    )

    end_time = time.time()
    print(f"Address parsing took {end_time - start_time:.2f} seconds")
    print(f"Success rate: {results['summary']['success_rate']:.1%}")

    # Save parsing results
    print(f"Saving parsing results to {parsed_results_file}...")
    with open(parsed_results_file, 'wb') as f:
        pickle.dump(results, f)

    return results


def process_ocod_file(zip_file, out_path):
    """Parse, geolocate and classify one OCOD release and save it as parquet.

    The original cell body used a bare ``continue`` to skip already-processed
    files, which is a SyntaxError outside a loop. Wrapping the steps in a
    function lets the skip be an early ``return`` and lets the same code be
    called for one file or from a loop over ``all_files``.

    Relies on the reference data and paths set up in the configuration cell
    (``postcode_district_lookup``, ``voa_businesses``, ``model_path``,
    ``price_paid_path``, ``processed_price_paid_dir``, ``parsed_results_dir``).

    Args:
        zip_file: Path to the ``OCOD_FULL_*.zip`` release to process.
        out_path: Destination parquet path; if it already exists nothing is done.

    Returns:
        The processed frame that was written to ``out_path``, or ``None`` when
        the file was skipped because ``out_path`` already exists.
    """
    if out_path.exists():
        print(f"Skipping {zip_file.name}: already processed.")
        return None

    print(f"Processing {zip_file.name}...")

    # Load and process the OCOD data
    ocod_data = load_and_prep_OCOD_data(str(zip_file))

    ###############
    # Parse addresses
    ###############
    results = _load_or_parse_addresses(ocod_data, zip_file)

    # Continue with post-parsing processing
    entities = convert_to_entity_dataframe(results)
    entities = parsing_and_expansion_process(all_entities=entities)
    ocod_data = post_process_expanded_data(entities, ocod_data)

    # Clean up the large intermediates before the memory-heavy matching steps
    del results, entities
    gc.collect()

    ###############
    # Geolocate
    ###############
    print(f"Geolocating {zip_file.name}...")

    ocod_data = preprocess_expandaded_ocod_data(ocod_data, postcode_district_lookup)

    price_paid_df = load_and_process_pricepaid_data(
        file_path=str(price_paid_path),
        processed_dir=processed_price_paid_dir,
        postcode_district_lookup=postcode_district_lookup,
        years_needed=[2017, 2018, 2019]
    )

    ocod_data = add_missing_lads_ocod(ocod_data, price_paid_df)
    ocod_data = street_and_building_matching(ocod_data, price_paid_df, voa_businesses)
    ocod_data = substreet_matching(ocod_data, price_paid_df, voa_businesses)

    # Clean up price paid data
    del price_paid_df
    gc.collect()

    ###########
    # Classify
    ###########
    print(f"Classifying {zip_file.name}...")
    ocod_data = counts_of_businesses_per_oa_lsoa(ocod_data, voa_businesses)
    ocod_data = voa_address_match_all_data(ocod_data, voa_businesses)

    ocod_data = classification_type1(ocod_data)
    ocod_data = classification_type2(ocod_data)

    ocod_data = contract_ocod_after_classification(ocod_data, class_type='class2', classes=['residential'])

    # Columns kept in the saved file, in output order
    columns = ['title_number', 'within_title_id', 'within_larger_title', 'unique_id',
               'unit_id', 'unit_type', 'building_name', 'street_number', 'street_name',
               'postcode', 'city', 'district', 'region', 'property_address', 'oa11cd',
               'lsoa11cd', 'msoa11cd', 'lad11cd', 'country_incorporated', 'class', 'class2']

    ocod_data = ocod_data.loc[:, columns].rename(columns={
        'within_title_id': 'nested_id',
        'within_larger_title': 'nested_title'
    })
    # Save results
    ocod_data.to_parquet(out_path)
    print(f"Saved processed data to {out_path}")

    gc.collect()

    return ocod_data


# Run the pipeline for the single release selected in the configuration cell.
# `ocod_data` is None if that release had already been processed.
ocod_data = process_ocod_file(zip_file, out_path)


print("All files process