In [1]:
from enhance_ocod.address_parsing import (
    load_and_prep_OCOD_data,
    parsing_and_expansion_process,
    post_process_expanded_data,
    load_postcode_district_lookup,
)
from enhance_ocod.locate_and_classify import (
    preprocess_expandaded_ocod_data,
    add_missing_lads_ocod,
    load_voa_ratinglist,
    street_and_building_matching,
    substreet_matching,
    counts_of_businesses_per_oa_lsoa,
    voa_address_match_all_data,
    classification_type1,
    classification_type2,
    contract_ocod_after_classification,
)
from enhance_ocod.inference import convert_to_entity_dataframe
from enhance_ocod.price_paid_process import load_and_process_pricepaid_data
from pathlib import Path
from tqdm import tqdm
import time
import gc  # Add for memory management

import pickle
# pandas emits a warning about downcasting during bfill/ffill; it originates inside pandas itself, so it is silenced here.
import warnings

warnings.filterwarnings("ignore", message=".*Downcasting object dtype arrays.*")

# NOTE(review): absolute, machine-specific path; every other path below is
# derived from it, so this is the single place to change when relocating.
SCRIPT_DIR = Path("/teamspace/studios/this_studio/enhance_ocod/notebooks")

# ====== CONSTANT PATHS AND SETTINGS ======
input_dir = SCRIPT_DIR.parent / "data" / "ocod_history"
output_dir = SCRIPT_DIR.parent / "data" / "ocod_history_processed2"
model_path = (
    SCRIPT_DIR.parent / "models" / "address_parser_original_fullset" / "final_model"
)

# For each reference data set, take the first entry found in its directory.
# NOTE(review): the glob results are not sorted, so when a directory holds
# more than one entry the chosen file is simply whichever is yielded first.
ONSPD_dir = SCRIPT_DIR.parent / "data" / "onspd"
ONSPD_files = list(ONSPD_dir.glob("*"))
ONSPD_path = ONSPD_files[0] if ONSPD_files else None

price_paid_dir = SCRIPT_DIR.parent / "data" / "price_paid_data"
price_paid_files = list(price_paid_dir.glob("*"))
price_paid_path = price_paid_files[0] if price_paid_files else None

voa_dir = SCRIPT_DIR.parent / "data" / "voa"
voa_files = list(voa_dir.glob("*"))
voa_path = voa_files[0] if voa_files else None

processed_price_paid_dir = SCRIPT_DIR.parent / "data" / "processed_price_paid"
output_dir.mkdir(parents=True, exist_ok=True)

parsed_results_dir = SCRIPT_DIR.parent / "data" / "parsed_ocod_dicts2"
parsed_results_dir.mkdir(parents=True, exist_ok=True)

# Every OCOD snapshot archive in input_dir, in sorted order (no subsetting).
all_files = sorted(input_dir.glob("OCOD_FULL_*.zip"))

# Fail fast with a clear message: without this check a missing file would be
# passed to the loaders below as the literal string "None".
if ONSPD_path is None:
    raise FileNotFoundError(f"No ONSPD file found in {ONSPD_dir}")
if voa_path is None:
    raise FileNotFoundError(f"No VOA rating list file found in {voa_dir}")

print("Loading common reference data...")
postcode_district_lookup = load_postcode_district_lookup(str(ONSPD_path))
voa_businesses = load_voa_ratinglist(str(voa_path), postcode_district_lookup)


Loading common reference data...


  postcode_district_lookup = pd.read_csv(f)[


Initial rows: 2293839


In [2]:



# Single OCOD snapshot to process in this notebook. Built from input_dir
# (defined in the configuration cell) so it follows the configured data
# directory instead of repeating the absolute path.
zip_file = input_dir / "OCOD_FULL_2022_02.zip"

In [3]:
# Destination parquet file for this snapshot (written by the final cell).
out_name = zip_file.stem + ".parquet"
out_path = output_dir / out_name

# Pickle file used to cache the address-parsing results for this snapshot.
parsed_results_file = parsed_results_dir / f"{zip_file.stem}_parsed_results.pkl"

print(f"Processing {zip_file.name}...")

# Load and process the OCOD data
ocod_data = load_and_prep_OCOD_data(str(zip_file))

###############
# Parse addresses
###############
if parsed_results_file.exists():
    print(f"Loading cached parsing results for {zip_file.name}...")
    # SECURITY: pickle.load can execute arbitrary code contained in the file;
    # only load cache files that were produced by this notebook.
    with open(parsed_results_file, "rb") as f:
        results = pickle.load(f)
    print(
        f"Loaded cached results with success rate: {results['summary']['success_rate']:.1%}"
    )
else:
    # These imports only run on a cache miss, so the cached path never
    # imports torch or the inference pipeline.
    from enhance_ocod.inference import parse_addresses_pipeline
    import torch

    torch.set_float32_matmul_precision("medium")
    print(f"Parsing addresses for {zip_file.name}...")
    start_time = time.time()

    results = parse_addresses_pipeline(
        df=ocod_data,
        short_batch_size=128,  # The default seems really slow, might be to do with loading not sure
        model_path=str(model_path),
        target_column="property_address",
    )

    end_time = time.time()
    print(f"Address parsing took {end_time - start_time:.2f} seconds")
    print(f"Success rate: {results['summary']['success_rate']:.1%}")

    # Save parsing results.
    # Write to a temporary sibling file and rename it into place, so an
    # interrupted dump cannot leave a truncated .pkl that later runs would
    # pick up as a valid cache.
    print(f"Saving parsing results to {parsed_results_file}...")
    tmp_results_file = parsed_results_file.with_name(parsed_results_file.name + ".tmp")
    with open(tmp_results_file, "wb") as f:
        pickle.dump(results, f)
    tmp_results_file.replace(parsed_results_file)


Processing OCOD_FULL_2022_02.zip...


Loading cached parsing results for OCOD_FULL_2022_02.zip...
Loaded cached results with success rate: 100.0%


In [4]:
# Continue with post-parsing processing
# Step 1: feed the parser output (`results`) to convert_to_entity_dataframe.
test_0 = convert_to_entity_dataframe(results)
# Step 2: pass that frame on as `all_entities` to the parsing/expansion step.
test = parsing_and_expansion_process(all_entities=test_0)
# Step 3: rebinds ocod_data, using the current ocod_data as an input.
# NOTE(review): re-running this cell therefore feeds the already
# post-processed frame back in rather than the freshly loaded one.
ocod_data = post_process_expanded_data(test, ocod_data)

Processing 421,466 entities into DataFrame...
Processed 150,000/421,466 entities
Computing label counts...
✓ Named Entity Recognition processing complete
Total entities extracted: 421,466


In [10]:
# Spot check: show every row of ocod_data whose title_number is CB400630.
ocod_data[ocod_data["title_number"].eq("CB400630")]

Unnamed: 0,title_number,within_title_id,unique_id,within_larger_title,tenure,unit_id,unit_type,building_name,street_number,street_name,postcode,city,district,county,region,price_paid,property_address,country_incorporated
0,CB400630,1,CB400630-1,True,Freehold,,,,2-24,Miller Way,,Peterborough,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (...",GUERNSEY
0,CB400630,2,CB400630-2,True,Freehold,,,,15-25,Hammonds Drive,,Peterborough,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (...",GUERNSEY
1,CB400630,3,CB400630-3,True,Freehold,,,,1-19,Rudd Close,,Peterborough,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (...",GUERNSEY
2,CB400630,4,CB400630-4,True,Freehold,,,,1-19,Rudd Close,,Peterborough,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (...",GUERNSEY
3,CB400630,5,CB400630-5,True,Freehold,,,,,,,Peterborough,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (...",GUERNSEY


In [6]:

# Display the current ocod_data frame for inspection.
ocod_data

Unnamed: 0,title_number,within_title_id,unique_id,within_larger_title,tenure,unit_id,unit_type,building_name,street_number,street_name,postcode,city,district,county,region,price_paid,property_address,country_incorporated
0,CB400630,1,CB400630-1,True,Freehold,,,,2-24,Miller Way,,Peterborough,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (...",GUERNSEY
1,SYK594298,1,SYK594298-1,True,Freehold,,,,1-25,Acorn Way,S66 3XH,Rotherham,ROTHERHAM,SOUTH YORKSHIRE,YORKS AND HUMBER,,"1-25 (excluding 13) Acorn Way, Sunnyside, Roth...",JERSEY
2,MAN211565,1,MAN211565-1,True,Freehold,,,,2 to 4,Junction Close,BL6 5GN,Bolton,BOLTON,GREATER MANCHESTER,NORTH WEST,,"2 to 4 and 6 to 11 (inclusive), Junction Close...",GUERNSEY
3,BM408625,1,BM408625-1,True,Freehold,,,,33-63,Stadium Approach,,Aylesbury,BUCKINGHAMSHIRE,BUCKINGHAMSHIRE,SOUTH EAST,51756.0,"33-63 (odd), Stadium Approach, Aylesbury",GUERNSEY
4,LN169893,1,LN169893-1,True,Freehold,,,,5-7,Princes Square,W2 4NP,London,CITY OF WESTMINSTER,GREATER LONDON,GREATER LONDON,,"5-7 Princes Square, London (W2 4NP)",JERSEY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86447,AGL470128,1,AGL470128-1,False,Leasehold,,,,,Lime Street,EC3M 7AW,London,CITY OF LONDON,GREATER LONDON,GREATER LONDON,,"2nd Floor, 52 Lime Street, London (EC3M 7AW)",IRELAND
86448,AGL473126,1,AGL473126-1,False,Leasehold,,,,155,Bishopsgate,EC2M 3AD,London,CITY OF LONDON,GREATER LONDON,GREATER LONDON,,"155 Bishopsgate, London (EC2M 3AD)",JAPAN
86449,AGL473595,1,AGL473595-1,False,Leasehold,,,,,Lime Street,EC3M 7AF,London,CITY OF LONDON,GREATER LONDON,GREATER LONDON,,"9th and 10th floors, 52 Lime Street, London (E...",AUSTRALIA
86450,AGL475468,1,AGL475468-1,False,Leasehold,,,,,Bevis Marks,EC3A 7BA,London,CITY OF LONDON,GREATER LONDON,GREATER LONDON,,"part of Tenth Floor, 6 Bevis Marks, London (EC...",INDIA


In [32]:
import pandas as pd

# Column names supplied to read_csv through `names=`; with explicit names
# pandas treats the first line of the file as data rather than a header.
price_paid_headers = [
    "transaction_unique_identifier",
    "price",
    "date_of_transfer",
    "postcode",
    "property_type",
    "old_new",
    "duration",
    "paon",
    "saon",
    "street",
    "locality",
    "town",
    "district",
    "county",
    "ppd_category_type",
    "record_status",
]

# Read only the first 500 rows as a quick look at the raw price-paid file.
# The path is built from price_paid_dir (configuration cell) rather than
# repeating the absolute directory.
df = pd.read_csv(
    price_paid_dir / "price_paid_complete_may_2025.csv",
    nrows=500,
    names=price_paid_headers,
)

In [33]:
# Display the 500-row price-paid sample read in the previous cell.
df

Unnamed: 0,transaction_unique_identifier,price,date_of_transfer,postcode,property_type,old_new,duration,paon,saon,street,locality,town,district,county,ppd_category_type,record_status
0,{7130F2C1-B3E4-41A5-9103-3DC40A6A5466},166500,1995-11-22 00:00,CM23 4PA,D,Y,F,19,,MAYFLOWER GARDENS,BISHOP'S STORTFORD,BISHOP'S STORTFORD,EAST HERTFORDSHIRE,HERTFORDSHIRE,A,A
1,{76F6D626-1D6C-4744-B12E-3A364DB17E29},59000,1995-09-27 00:00,L12 0AY,D,N,L,7,,TRENT CLOSE,LIVERPOOL,LIVERPOOL,LIVERPOOL,MERSEYSIDE,A,A
2,{E8BA1D5E-93DC-4896-AAA2-3A365F979FD9},118000,1995-12-15 00:00,SL3 8XX,D,N,F,24,,SOUTHWOLD SPUR,SLOUGH,SLOUGH,SLOUGH,SLOUGH,A,A
3,{32158129-2288-4763-8DEA-487BEBDF257F},48500,1995-01-27 00:00,CV12 8TF,S,N,F,5,,CHELTENHAM CLOSE,BEDWORTH,BEDWORTH,NUNEATON AND BEDWORTH,WARWICKSHIRE,A,A
4,{CB756459-6205-4740-9184-487BF87F37BD},27500,1995-04-20 00:00,SY11 1HP,S,N,F,105,,BEATRICE STREET,OSWESTRY,OSWESTRY,OSWESTRY,SHROPSHIRE,A,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,{27A3B2EB-5B33-421E-A6F6-3DC7F14D67F9},52500,1995-08-11 00:00,PO30 3AT,T,Y,F,16,,CALLOWAY CLOSE,ARRETON,NEWPORT,ISLE OF WIGHT,ISLE OF WIGHT,A,A
496,{BF46287F-5625-423A-BEE7-3A3A33104AE3},71500,1995-07-20 00:00,LS29 0TJ,S,Y,F,7,,BROADFIELD WAY,ADDINGHAM,ILKLEY,BRADFORD,WEST YORKSHIRE,A,A
497,{F296BE23-80D0-4EBB-9DFE-3A3A3E1CB239},46950,1995-06-30 00:00,DL6 1HF,T,N,F,245,,ASHLANDS ROAD,NORTHALLERTON,NORTHALLERTON,HAMBLETON,NORTH YORKSHIRE,A,A
498,{043606DE-7F63-47D6-BC9B-3A3A4B937397},82500,1995-07-31 00:00,NR16 2QH,D,Y,F,40,,BREWSTERS,EAST HARLING,NORWICH,BRECKLAND,NORFOLK,A,A


In [24]:

###############
# Geolocate
###############
# Progress message naming the snapshot being processed.
print(f"Geolocating {zip_file.name}...")

# Rebinds ocod_data to the output of the preprocessing step, which also
# receives the postcode district lookup loaded in the first cell.
# NOTE(review): presumably this attaches geography columns via the lookup -
# confirm against enhance_ocod.locate_and_classify.
ocod_data = preprocess_expandaded_ocod_data(ocod_data, postcode_district_lookup)


Geolocating OCOD_FULL_2022_02.zip...


In [18]:
# Load the price-paid data for the listed years, using the raw file selected
# in the configuration cell and the processed-data directory.
# NOTE(review): years_needed is hard-coded to 2022-2024 while the snapshot
# being processed is OCOD_FULL_2022_02 - confirm these are the intended years.
price_paid_df = load_and_process_pricepaid_data(
    file_path=str(price_paid_path),
    processed_dir=processed_price_paid_dir,
    postcode_district_lookup=postcode_district_lookup,
    years_needed=[2024, 2023, 2022],
)

# Rebinds ocod_data using the price-paid frame.
# NOTE(review): presumably fills in missing local authority district codes -
# confirm against enhance_ocod.locate_and_classify.
ocod_data = add_missing_lads_ocod(ocod_data, price_paid_df)

Processed data found. Skipping preprocessing.
Loaded 737255 records for year 2024
Loaded 845990 records for year 2023
Loaded 1068645 records for year 2022


In [29]:
# Ad-hoc check: count price-paid rows whose street_name2 contains "dealsgateway".
price_paid_df['street_name2'].str.contains('dealsgateway').sum()

0

In [None]:
# Rebind ocod_data (not a side variable) so the substreet_matching cell that
# follows receives the street/building-matched frame. The result used to be
# bound to ocod_data2, which no later cell reads, so this step was silently
# dropped from the pipeline.
ocod_data = street_and_building_matching(ocod_data, price_paid_df, voa_businesses)

In [None]:






# ---- Sub-street matching ----
ocod_data = substreet_matching(ocod_data, price_paid_df, voa_businesses)

# The price-paid frame is not referenced again below: drop the name and
# trigger a garbage collection.
del price_paid_df
gc.collect()

# ---- Classification ----
print(f"Classifying {zip_file.name}...")
ocod_data = counts_of_businesses_per_oa_lsoa(ocod_data, voa_businesses)
ocod_data = voa_address_match_all_data(ocod_data, voa_businesses)
ocod_data = classification_type1(ocod_data)
ocod_data = classification_type2(ocod_data)
ocod_data = contract_ocod_after_classification(
    ocod_data,
    class_type="class2",
    classes=["residential"],
)

# ---- Select and rename the output columns ----
columns = [
    # title / nesting identifiers
    "title_number", "within_title_id", "within_larger_title", "unique_id",
    # parsed address components
    "unit_id", "unit_type", "building_name", "street_number", "street_name",
    "postcode", "city", "district", "region", "property_address",
    # geography codes
    "oa11cd", "lsoa11cd", "msoa11cd", "lad11cd",
    # ownership and classification
    "country_incorporated", "class", "class2",
]

ocod_data = ocod_data[columns].rename(
    columns=dict(within_title_id="nested_id", within_larger_title="nested_title")
)

# ---- Persist ----
ocod_data.to_parquet(out_path)
print(f"Saved processed data to {out_path}")

print("All files processed.")
