In [None]:

import gc
import pickle
from pathlib import Path

import pandas as pd
import torch

from enhance_ocod.address_parsing import (
    create_unique_id,
    expand_dataframe_numbers,
    load_and_prep_OCOD_data,
    load_postcode_district_lookup,
    process_addresses,
)
from enhance_ocod.locate_and_classify import (
    add_business_matches,
    add_geographic_metadata,
    drop_non_residential_duplicates,
    enhance_ocod_with_gazetteers,
    fill_unknown_classes_by_group,
    get_default_property_rules,
    load_voa_ratinglist,
    property_class,
)

# Hide warnings whose message matches the "Downcasting object dtype arrays"
# pattern (presumably the pandas deprecation notice - confirm); every other
# warning is still shown.
import warnings
warnings.filterwarnings(
    action="ignore",
    message=".*Downcasting object dtype arrays.*",
)

# Select the "medium" float32 matmul precision mode for torch
# (see torch.set_float32_matmul_precision for the speed/accuracy trade-off).
torch.set_float32_matmul_precision("medium")

# Relative to the current working directory. Only SCRIPT_DIR.parent is used
# below, so data folders resolve to ``../data/<dirname>``.
SCRIPT_DIR = Path('../notebooks')

# ====== PATHS ======
def get_first_file_in_data_dir(dirname):
    """Return the first regular file in ``data/<dirname>``, or None if there is none.

    Entries are sorted by path so the choice is the same on every run and every
    filesystem (``Path.glob`` yields entries in arbitrary order). Sub-directories
    and hidden dotfiles such as ``.DS_Store`` or ``.gitkeep`` are skipped, since
    they are never the intended data file.

    Args:
        dirname: Name of the sub-directory under ``SCRIPT_DIR.parent / "data"``.

    Returns:
        A ``pathlib.Path`` to the alphabetically first visible file, or ``None``
        when the directory is missing or holds no such file.
    """
    data_dir = SCRIPT_DIR.parent / "data" / dirname
    if not data_dir.is_dir():
        return None
    candidates = sorted(
        entry
        for entry in data_dir.glob("*")
        if entry.is_file() and not entry.name.startswith(".")
    )
    return candidates[0] if candidates else None

# Reference inputs: whichever file the helper picks from each data folder
# (None when the folder has no file).
ONSPD_path = get_first_file_in_data_dir(dirname="onspd")
voa_path = get_first_file_in_data_dir(dirname="voa")

# ====== SPECIFY SINGLE FILE TO DEBUG ======
# TODO: Update this path to point to your specific parsed results file
parsed_results_file = SCRIPT_DIR.parent.joinpath("data", "parsed_ocod_dicts", "OCOD_FULL_2022_02_parsed_results.pkl")
original_ocod_file = SCRIPT_DIR.parent.joinpath("data", "ocod_history", "OCOD_FULL_2022_02.zip")
# NOTE(review): output_file is defined here but not written in the cells shown.
output_file = SCRIPT_DIR.parent.joinpath("data", "debug_output.parquet")

print("Loading common reference data...")
# Fail fast with a clear message: get_first_file_in_data_dir returns None when a
# data folder has no file, and str(None) would otherwise hand the literal path
# "None" to the loaders below.
if ONSPD_path is None:
    raise FileNotFoundError("No ONSPD file found in the 'onspd' data directory")
if voa_path is None:
    raise FileNotFoundError("No VOA rating list file found in the 'voa' data directory")

# The VOA loader takes the postcode lookup as its second argument, so the
# lookup has to be built first.
postcode_district_lookup = load_postcode_district_lookup(str(ONSPD_path))
voa_businesses = load_voa_ratinglist(str(voa_path), postcode_district_lookup)

# Gazetteer parquet files all live in data/gazetteer next to the other inputs.
gazetteer_dir = SCRIPT_DIR.parent.joinpath('data', 'gazetteer')
building_file = gazetteer_dir.joinpath('building_gazetteer.parquet')
district_file = gazetteer_dir.joinpath('district_gazetteer.parquet')
street_file = gazetteer_dir.joinpath('street_gazetteer.parquet')

# Read each gazetteer into its own DataFrame (building, district, street).
print("Loading gazetteer files...")
building_gazetteer = pd.read_parquet(path=building_file)
district_gazetteer = pd.read_parquet(path=district_file)
street_gazetteer = pd.read_parquet(path=street_file)

In [None]:
# ====== LOAD PARSED RESULTS ======
print(f"Loading parsed results from {parsed_results_file}...")
# SECURITY: pickle.load can execute arbitrary code while deserialising. Only
# point `parsed_results_file` at pickles produced by a trusted run of this
# project's own parsing step.
with open(parsed_results_file, "rb") as f:
    results = pickle.load(f)
# `results` is read as a dict with a 'summary' entry (holding 'success_rate')
# and a 'results' entry (the parsed addresses passed to process_addresses).
print(f"Loaded results with success rate: {results['summary']['success_rate']:.1%}")

# ====== LOAD ORIGINAL OCOD DATA FOR MERGING ======
print(f"Loading original OCOD data from {original_ocod_file}...")
ocod_data = load_and_prep_OCOD_data(str(original_ocod_file))

# ====== START PROCESSING FROM RESULTS ======
print("Processing addresses from results...")
processed_addresses_df = process_addresses(results['results'])

# Columns carried forward from the merged frame into the rest of the pipeline.
merged_columns = [
    "title_number",
    "tenure",
    "unit_id",
    "unit_type",
    "number_filter",
    "building_name",
    "street_number",
    "street_name",
    "postcode",
    "city",
    "district",
    "county",
    "region",
    "price_paid",
    "property_address",
    "country_incorporated",
]

# Left join keeps every parsed address row; `datapoint_id` is matched against
# the index of the original OCOD frame.
post_processed_data = processed_addresses_df.merge(
    ocod_data, how="left", left_on="datapoint_id", right_index=True
)[merged_columns]

# Clean up memory: the raw OCOD frame is no longer needed after the merge.
del ocod_data
gc.collect()

print("Adding geographic metadata...")
# Upper-case postcodes before the lookup (presumably the lookup keys are
# upper-case - confirm against add_geographic_metadata).
post_processed_data["postcode"] = post_processed_data["postcode"].str.upper()
pre_process_ocod = add_geographic_metadata(post_processed_data, postcode_district_lookup)

print("Enhancing with gazetteers...")
# Lower-case the text columns ahead of the gazetteer step in the next cell.
# `street_name2` is not among the merged columns, so it is expected to come
# from add_geographic_metadata.
pre_process_ocod['building_name'] = pre_process_ocod['building_name'].str.lower()
pre_process_ocod['street_name2'] = pre_process_ocod['street_name2'].str.lower()

In [None]:

# Combine the prepared OCOD frame with the building, district and street
# gazetteers (arguments are positional, in that order).
enhanced = enhance_ocod_with_gazetteers(
    pre_process_ocod,
    building_gazetteer,
    district_gazetteer,
    street_gazetteer,
)


In [None]:

# Classification pipeline: each step feeds the next, and the intermediate frames
# (`with_matches`, `classified`) stay in the namespace for inspection.
print("Adding business matches...")
with_matches = add_business_matches(enhanced, voa_businesses)

# First of two create_unique_id calls; it is applied again after expansion below.
print("Creating temporary unique ID...")
with_matches = create_unique_id(with_matches)

print("Classifying properties...")
rules = get_default_property_rules()
# A copy is passed so `with_matches` itself is not handed to property_class.
# include_rule_name=True presumably yields the `matched_rule` column selected
# below - confirm against property_class.
classified = property_class(with_matches.copy(), rules, include_rule_name=True)

print("Filling unknown classes by group...")
classified = fill_unknown_classes_by_group(classified)

print("Dropping non-residential duplicates...")
classified = drop_non_residential_duplicates(classified, class_col='class')

print("Expanding dataframe numbers...")
# NOTE: the name `ocod_data` is reused here for the expanded, classified frame;
# the raw OCOD frame of the same name was deleted in an earlier cell.
ocod_data = expand_dataframe_numbers(classified, class_var='class', print_every=10000, min_count=1)

print("Updating unique ID...")
ocod_data = create_unique_id(ocod_data)

# Select final columns
# .loc with this list raises KeyError if any upstream step failed to produce
# one of these columns.
columns = [
    "title_number",
    "multi_id",
    "unique_id",
    "unit_id",
    "unit_type",
    "building_name",
    "street_number",
    "street_name",
    "postcode",
    "city",
    "district",
    "region",
    "property_address",
    "oa11cd",
    "lsoa11cd",
    "msoa11cd",
    "lad11cd",
    "country_incorporated",
    "class",
    "matched_rule",
    "is_multi",
]

ocod_data = ocod_data.loc[:, columns]



In [None]:
# Flag rows with a missing LSOA code. The flag is stored on `ocod_data` itself
# because the cross-tabulation cells below read the `lsoa_na` column.
ocod_data['lsoa_na'] = ocod_data['lsoa11cd'].isnull()

# Number of rows for each flag value.
ocod_data.groupby(by='lsoa_na').size()

In [None]:
# Share of rows with and without an LSOA code within each property class
# (each column is normalised to sum to 1; margins adds an overall column).
pd.crosstab(
    index=ocod_data['lsoa_na'],
    columns=ocod_data['class'],
    margins=True,
    normalize='columns',
)

In [None]:
# Restrict to rows classified as residential.
residential_df = ocod_data.loc[ocod_data['class'].eq('residential')]

# Within residential rows: share with a missing LSOA code for each `is_multi`
# value (column-normalised).
pd.crosstab(
    index=residential_df['lsoa_na'],
    columns=residential_df['is_multi'],
    margins=True,
    normalize='columns',
)

In [None]:
# For each property class, the share of rows attributed to each matched rule
# (column-normalised).
pd.crosstab(
    index=ocod_data['matched_rule'],
    columns=ocod_data['class'],
    margins=True,
    normalize='columns',
)

In [None]:
# Spot-check: rows whose address mentions "the white horse" (case-insensitive).
# na=False treats missing addresses as non-matches; without it str.contains
# returns NaN for them and .loc rejects the mask with a ValueError.
ocod_data.loc[
    ocod_data['property_address'].str.contains('the white horse', case=False, na=False)
]