# This notebook explores the performance of the property type classification

In [None]:
from enhance_ocod.locate_and_classify import (
    load_voa_ratinglist,
    add_geographic_metadata,
    enhance_ocod_with_gazetteers,
    add_business_matches,
    property_class,
    get_default_property_rules,
    fill_unknown_classes_by_group,
    )

from enhance_ocod.address_parsing import (
    load_and_prep_OCOD_data,
    load_postcode_district_lookup,
    process_addresses,
    expand_dataframe_numbers,
    create_unique_id
)
import pandas as pd
from pathlib import Path
import pickle

# Anchor for the relative paths below. Only its parent ("..") is used, so
# every derived path is expressed relative to the directory above this one.
SCRIPT_DIR = Path("../data")

# Input and output directories for the OCOD history data.
input_dir = SCRIPT_DIR.parent.joinpath("data", "ocod_history")
output_dir = SCRIPT_DIR.parent.joinpath("data", "ocod_history_processed")

# Directory of the final address-parser model.
model_path = SCRIPT_DIR.parent.joinpath(
    "models", "address_parser_original_fullset", "final_model"
)



def get_first_file_in_data_dir(dirname, base_dir=None):
    """Return the first regular file in a data subdirectory, or None.

    Candidates are sorted by path so the choice is reproducible; the order in
    which ``Path.glob`` yields entries depends on the filesystem. Directories
    and hidden entries (names starting with ".", e.g. ``.gitkeep``) are
    skipped so the result is always a visible file.

    Args:
        dirname: Name of the subdirectory to search.
        base_dir: Directory that contains ``dirname``. Defaults to
            ``SCRIPT_DIR.parent / "data"``.

    Returns:
        The ``Path`` of the first file in sorted order, or ``None`` when the
        directory is missing or holds no matching file.
    """
    if base_dir is None:
        base_dir = SCRIPT_DIR.parent / "data"
    data_dir = Path(base_dir) / dirname
    files = sorted(
        entry
        for entry in data_dir.glob("*")
        if entry.is_file() and not entry.name.startswith(".")
    )
    return files[0] if files else None

# Pick one file from each of the onspd, price_paid_data and voa data
# directories (each value is None when its directory yields nothing).
ONSPD_path, price_paid_path, voa_path = (
    get_first_file_in_data_dir(subdir)
    for subdir in ("onspd", "price_paid_data", "voa")
)

# Data root and the directory of processed OCOD history files.
data_folder = Path("..") / "data"
OCOD_history_path = data_folder.joinpath("ocod_history_processed")

In [None]:
# Fail fast with a clear message: str(None) would otherwise hand the loader
# the literal path "None" when no file was found in the onspd directory.
if ONSPD_path is None:
    raise FileNotFoundError("No ONSPD file found in the 'onspd' data directory")

# Build the postcode district lookup from the ONSPD file.
postcode_district_lookup = load_postcode_district_lookup(str(ONSPD_path))

In [None]:
# Fail fast with a clear message: str(None) would otherwise hand the loader
# the literal path "None" when no file was found in the voa directory.
if voa_path is None:
    raise FileNotFoundError("No VOA rating list file found in the 'voa' data directory")

# Load the VOA rating list, passing the postcode district lookup to the loader.
voa_businesses = load_voa_ratinglist(str(voa_path), postcode_district_lookup)

In [None]:
# Read the three existing gazetteer tables (building, district, street).
gazetteer_dir = SCRIPT_DIR.parent.joinpath("data", "gazetteer")

print("Loading existing gazetteer files...")

# The building gazetteer gets a constant 'fraction' column equal to 1.
building_gazetteer = pd.read_parquet(
    gazetteer_dir / "building_gazetteer.parquet"
).assign(fraction=1)
district_gazetteer = pd.read_parquet(gazetteer_dir / "district_gazetteer.parquet")
street_gazetteer = pd.read_parquet(gazetteer_dir / "street_gazetteer.parquet")

In [None]:

# Path segments must not start with "/": pathlib treats such a segment as
# absolute and discards everything to its left (here, data_folder).
parsed_results_file = (
    data_folder / "parsed_ocod_dicts" / "OCOD_FULL_2022_02_parsed_results.pkl"
)

# NOTE: pickle.load can execute arbitrary code from the file; only load
# parsed-results files that were produced by a trusted run of this project.
with open(parsed_results_file, "rb") as f:
    results = pickle.load(f)

zip_file = data_folder / "ocod_history" / "OCOD_FULL_2022_02.zip"

ocod_data = load_and_prep_OCOD_data(str(zip_file))

# The pickled object stores the parsed addresses under the "results" key.
processed_addresses_df = process_addresses(results["results"])

# Columns kept after joining the parsed addresses back onto the OCOD rows.
output_columns = [
    "title_number",
    "tenure",
    "unit_id",
    "unit_type",
    "number_filter",
    "building_name",
    "street_number",
    "street_name",
    "postcode",
    "city",
    "district",
    "county",
    "region",
    "price_paid",
    "property_address",
    "country_incorporated",
]

# Left join: each parsed address row picks up the OCOD record whose index
# equals its datapoint_id.
post_processed_data = processed_addresses_df.merge(
    ocod_data, how="left", left_on="datapoint_id", right_index=True
)[output_columns]

In [None]:
# Upper-case the postcodes before combining with the postcode district lookup.
post_processed_data["postcode"] = post_processed_data["postcode"].str.upper()

# Attach geographic metadata using the postcode district lookup.
# NOTE(review): presumably joined on postcode — confirm in add_geographic_metadata.
pre_process_ocod = add_geographic_metadata(post_processed_data, postcode_district_lookup)

In [None]:
# Lower-case the building and street names before the gazetteer enhancement.
# NOTE(review): presumably the gazetteers store lower-case names — confirm.
pre_process_ocod['building_name'] = pre_process_ocod['building_name'].str.lower()
# NOTE(review): 'street_name2' is not among the columns selected when
# post_processed_data was built, so it is presumably added by
# add_geographic_metadata — confirm.
pre_process_ocod['street_name2'] = pre_process_ocod['street_name2'].str.lower()
# Enhance the OCOD rows using the building, district and street gazetteers.
enhanced  =  enhance_ocod_with_gazetteers(pre_process_ocod, building_gazetteer, district_gazetteer, street_gazetteer)

In [None]:
# Add business matches from the VOA rating list to the enhanced rows.
with_matches = add_business_matches(enhanced, voa_businesses)
# Regenerate the ids on the matched frame.
# NOTE(review): exact id scheme lives in create_unique_id — confirm there.
with_matches = create_unique_id(with_matches)


In [None]:
# Baseline classification using the package's default rule set.
rules = get_default_property_rules()
# Classify a copy so with_matches stays available for the alternative rule set.
default_classes  = property_class(with_matches.copy(), rules, include_rule_name=True)
# Fill classes that are still unknown, working by group.
# NOTE(review): grouping key is defined inside fill_unknown_classes_by_group — confirm.
default_classes = fill_unknown_classes_by_group(default_classes)
# NOTE(review): drop_non_residential_duplicates is not among the names imported
# at the top of this notebook; it must be imported for this cell to run on a
# fresh kernel.
default_classes = drop_non_residential_duplicates(default_classes, class_col='class')

In [None]:
# Number of rows per class under the default rules.
default_classes.groupby('class').size()

In [None]:
def _numbered_address_without_number_match(df):
    """Select rows whose street matched, whose number did not, and that have a street number."""
    number_not_matched = ~df["number_match"]
    return number_not_matched & df["street_match"] & df['street_number'].notna()


# Extra rule appended after the default rules.
_no_address_match_rule = {
    'rule_name': 'no address match residences',
    'condition': _numbered_address_without_number_match,
    'class': 'residential',
    'comments': 'when a property has a street number but does not match with any business on the same street it is a property, this primarily affects estate developments'
}
rules2 = rules + [_no_address_match_rule]

# Same pipeline as the default classification, run with the extended rules
# on a fresh copy of the matched data.
no_business_match = property_class(
    with_matches.copy(),
    rules2,
    include_rule_name=True,
)
no_business_match = fill_unknown_classes_by_group(no_business_match)
no_business_match = drop_non_residential_duplicates(
    no_business_match, class_col='class'
)

In [None]:
# Number of rows per class under the extended rule set.
no_business_match.groupby('class').size()

In [None]:
# Ground truth: one row per title number. The 'truth' label is copied into
# 'class' with 'domestic' rewritten to 'residential' so it lines up with the
# labels produced by the classifier.
class_gt = pd.read_csv(
    data_folder / 'training_data' / 'parsed_ground_truth_complete.csv'
).drop_duplicates(['title_number'])
class_gt['class'] = class_gt['truth'].str.replace('domestic', 'residential')

# Evaluate the in-memory default classification. No parquet snapshot is read
# here: the predictions come straight from default_classes.
class_pred_data_raw = default_classes

# Keep one prediction per title number that also appears in the ground truth.
in_ground_truth = class_pred_data_raw['title_number'].isin(class_gt['title_number'])
class_pred_data = class_pred_data_raw.loc[
    in_ground_truth, ['title_number', 'class']
].drop_duplicates('title_number')

# NOTE(review): evaluate_classification_predictions is not among the names
# imported at the top of this notebook; it must be imported for this cell to
# run on a fresh kernel.
evaluate_classification_predictions(class_gt, class_pred_data)

In [None]:
# Evaluate the extended-rule classification against the ground truth,
# keeping one prediction per title number present in class_gt.
class_pred_data_raw = no_business_match
titles_in_ground_truth = class_pred_data_raw['title_number'].isin(
    class_gt['title_number']
)
class_pred_data = class_pred_data_raw.loc[
    titles_in_ground_truth, ['title_number', 'class']
].drop_duplicates('title_number')

evaluate_classification_predictions(class_gt, class_pred_data)

In [None]:
# Expand the default classification and regenerate the ids.
# NOTE(review): presumably one row per individual address after expansion —
# confirm in expand_dataframe_numbers.
default_expanded_df = expand_dataframe_numbers(
    default_classes, class_var='class', print_every=10000, min_count=1
)
default_expanded_df = create_unique_id(default_expanded_df)

# Keep every expanded row whose title number appears in the ground truth
# (no de-duplication: a title may contribute several rows).
class_pred_data_raw = default_expanded_df
class_pred_data = class_pred_data_raw.loc[
    class_pred_data_raw['title_number'].isin(class_gt['title_number']),
    ['title_number', 'class'],
]

# Attach the ground-truth label to every expanded prediction row.
merged_data = class_pred_data.merge(
    class_gt, on='title_number', how='left', suffixes=('_pred', '_gt')
)

# Split into two row-aligned frames. The expanded ground truth gets its own
# name so class_gt keeps one row per title: overwriting it would make later
# merges on title_number many-to-many and this cell unsafe to re-run.
class_pred_data = merged_data[['title_number', 'class_pred']].rename(
    columns={'class_pred': 'class'}
)
class_gt_expanded = merged_data[['title_number', 'class_gt']].rename(
    columns={'class_gt': 'class'}
)

evaluate_classification_predictions(class_gt_expanded, class_pred_data)

In [None]:
# Expand the extended-rule classification and regenerate the ids.
no_business_expanded_df = create_unique_id(
    expand_dataframe_numbers(
        no_business_match, class_var='class', print_every=10000, min_count=1
    )
)

# Keep every expanded row whose title number appears in the ground truth.
class_pred_data_raw = no_business_expanded_df
titles_with_truth = class_pred_data_raw['title_number'].isin(class_gt['title_number'])
class_pred_data = class_pred_data_raw.loc[titles_with_truth, ['title_number', 'class']]

# Attach the ground-truth label to each expanded prediction row.
merged_data = class_pred_data.merge(
    class_gt,
    on='title_number',
    how='left',
    suffixes=('_pred', '_gt'),
)

# Split the merge into two row-aligned frames that both expose a 'class' column.
class_pred_data = merged_data[['title_number', 'class_pred']].rename(
    columns={'class_pred': 'class'}
)
class_gt_expanded = merged_data[['title_number', 'class_gt']].rename(
    columns={'class_gt': 'class'}
)

evaluate_classification_predictions(class_gt_expanded, class_pred_data)

# Location classification

In [None]:
# Residential rows that have no LSOA code (lsoa11cd is missing).
default_location = default_classes.loc[default_classes['lsoa11cd'].isna() & default_classes['class'].isin(['residential'])]

# Row count per class in that subset (only 'residential' can appear).
default_location.groupby('class').size()

In [None]:
# Row count of the unlocated residential subset, split by the 'is_multi' flag.
default_location.groupby('is_multi').size()