In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from plotnine import *
from pathlib import Path
from tqdm import tqdm
import seaborn as sns
import numpy as np
from datetime import datetime
from enhance_ocod.analysis import create_summarised_stats, create_mean_difference_by_groups
from enhance_ocod.address_parsing import (
    process_addresses,
    expand_dataframe_numbers,
    create_unique_id
)

# Project directories, given relative to the notebook's working directory.
data_folder = Path('..') / 'data'
figures_folder = Path('..') / 'figures' / 'figures'
# Create the figures directory (and any missing parents) if it is not there yet.
figures_folder.mkdir(parents=True, exist_ok=True)

# Folder with the processed OCOD history snapshots and every entry inside it.
OCOD_history_path = data_folder.joinpath('ocod_history_processed')

# NOTE: iterdir() yields entries in arbitrary order, so positions in this list
# are not guaranteed to be stable across machines or file systems.
list_of_files = [entry for entry in OCOD_history_path.iterdir()]

# Name of the column used for the property class (e.g. 'residential').
active_class_var = 'class'


LAD_COLUMN_CODE = 'LAD22CD' # change this according to the shapefile you are using
LAD_COLUMN_NAME = "LAD22NM"

# What is the issue with Guernsey?

In [None]:
# Per-snapshot tallies of Guernsey-incorporated residential rows; one frame per
# history file, concatenated into `guernsey_res_df` at the end of the cell.
guernsey_counts_per_file = []

# Only these columns are used below, so the rest of each snapshot is not loaded.
guernsey_columns = ['country_incorporated', active_class_var, 'title_number', 'msoa11cd']

for file in list_of_files:

    snapshot = pd.read_parquet(file, columns=guernsey_columns)

    is_guernsey_residential = (
        (snapshot['country_incorporated'] == 'GUERNSEY')
        & (snapshot[active_class_var] == 'residential')
    )

    # Number of rows per (title, MSOA) pair in this snapshot.
    # NOTE: groupby drops rows whose key is missing by default, so rows with a
    # null msoa11cd are not counted.
    title_counts = (
        snapshot.loc[is_guernsey_residential]
        .groupby(['title_number', 'msoa11cd'])
        .size()
        .reset_index(name='counts')
        .sort_values('counts')
    )
    # Keep the source file name so the snapshot date can be derived from it.
    title_counts['file'] = Path(file).stem

    guernsey_counts_per_file.append(title_counts)

# ignore_index=True gives every row a unique label instead of repeating each
# snapshot's (shuffled) positional labels.
guernsey_res_df = pd.concat(guernsey_counts_per_file, ignore_index=True)


In [None]:
# The file stem ends in a YYYY_MM tag; turn it into the first day of that month.
snapshot_month = guernsey_res_df['file'].str.extract(r'(\d{4}_\d{2})$', expand=False)
guernsey_res_df['date'] = pd.to_datetime(snapshot_month.str.replace('_', '-') + '-01')

In [None]:
# Keep only the title/MSOA groups with more than 500 rows in a single snapshot.
is_large_group = guernsey_res_df['counts'].gt(500)
guernsey_large_only = guernsey_res_df[is_large_group]

In [None]:
# Show the large groups in chronological order of their snapshot date.
guernsey_large_only.sort_values(by='date', ascending=True)

In [None]:
# Title under investigation.
PROBLEM_TITLE = 'AGL427518'

# Choose the snapshot by content rather than by position: `list_of_files` comes
# from Path.iterdir(), whose order is arbitrary, so a fixed index may point at a
# different file (possibly one without this title) on another machine.
problem_title_counts = guernsey_res_df.loc[guernsey_res_df['title_number'] == PROBLEM_TITLE]
if problem_title_counts.empty:
    raise ValueError(f"Title {PROBLEM_TITLE} was not found in any Guernsey residential snapshot")

# Use the snapshot in which the title has the most rows.
problem_file_stem = problem_title_counts.sort_values('counts', ascending=False)['file'].iloc[0]
files_by_stem = {Path(f).stem: f for f in list_of_files}

large_group_df = pd.read_parquet(files_by_stem[problem_file_stem])
large_group_df = large_group_df.loc[large_group_df['title_number'] == PROBLEM_TITLE]

# Raw address string recorded for the title.
large_group_df['property_address'].iloc[0]

In [None]:
# Show the rows of the loaded snapshot that belong to the title under investigation.
large_group_df

In [None]:
# Write the rows for the title under investigation to the data folder for manual checking.
guernsey_check_path = data_folder / 'guernsey_check.csv'
large_group_df.to_csv(guernsey_check_path)

From the results we can see that title AGL427518 is being parsed incorrectly, resulting in several thousand fake addresses.

# Is this a parsing failure or a model failure?



In [None]:
import pandas as pd
from enhance_ocod.inference import parse_addresses_basic

# One-row example frame holding the long multi-range apartment address
# examined in this section.
problem_address = "Apartments 201-209, 301-309, 401-409, 501-509, 601-609, 701-709, 801-809, 901-909, 1001-1009, 1101-1109, 1201-1209, 1301-1309, 1401-1409, 1501-1509, 1601-1609, 1701-1709, 1801-1809, 1901-1909, 2001-2009, 2201-2209, 2301-2309, 2401-2407, 2409, 2501-2509, 2601-2609, 2701-2709, 2801-2809, 2901-2909, 3001-3009, 3101-3109, 3201-3209, 3301-3309, 3401-3409, 3501-3509, 3601-3609, 3701-3709, 3801-3807, 3809, 3901-3909, 4001-4003, 4005-4009, 4101-4109, 4201-4209, 4301-4302 Arena Tower, 25 Crossharbour Plaza, London"

example_df = pd.DataFrame({
    'address': [problem_address],
    'datapoint_id': ['addr_001'],  # optional unique identifier
})

print("Example DataFrame:")
print(example_df)

# By default the fine-tuned model is downloaded from the Hugging Face model hub.
results = parse_addresses_basic(example_df)
print(f"Parsed {results['summary']['successful_parses']} addresses")

In [None]:
# Post-process the model output and attach two constant columns for this
# example (presumably read by expand_dataframe_numbers -- confirm against the helper).
parsed_problem_address = process_addresses(results['results']).assign(
    **{'class': 'residential', 'number_filter': 'all'}
)
parsed_problem_address

In [None]:
# Expand the parsed example address. print_every is set very high, presumably
# to keep progress output quiet -- confirm against the helper.
problem_expanded = expand_dataframe_numbers(
    parsed_problem_address,
    class_var='class',
    print_every=1_000_000,
    min_count=1,
)

In [None]:
# Inspect the frame returned by expand_dataframe_numbers for the example address.
problem_expanded