In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from plotnine import *
from pathlib import Path
from tqdm import tqdm
import seaborn as sns
import numpy as np
from datetime import datetime
from enhance_ocod.analysis import create_summarised_stats, create_mean_difference_by_groups

# --- Paths -------------------------------------------------------------------
data_folder = Path('../data')
figures_folder = Path('../figures/figures_new')
figures_folder.mkdir(parents=True, exist_ok=True)  # created on first run

OCOD_history_path = data_folder / 'ocod_history_processed_new'

# iterdir() yields entries in arbitrary, filesystem-dependent order. Sorting makes
# every loop over the snapshots deterministic. The stems end in YYYY_MM (see the
# date regex used further down), so lexical order is presumably also chronological
# -- this assumes all files share one prefix; verify against the folder contents.
list_of_files = sorted(OCOD_history_path.iterdir())

# --- Column names -------------------------------------------------------------
# Column that holds the property class in the processed OCOD files.
active_class_var = 'class'

LAD_COLUMN_CODE = 'LAD22CD' # change this according to the shapefile you are using
LAD_COLUMN_NAME = "LAD22NM"

# Single Example

In [None]:
# Load one monthly snapshot to use as the worked example.
target_file = OCOD_history_path.joinpath('OCOD_FULL_2022_02.parquet')
reference_year = pd.read_parquet(target_file)

In [None]:
# Residential rows of the example snapshot, flagged by whether a postcode is present.
postcode_test = (
    reference_year
    .loc[reference_year[active_class_var].isin(['residential'])]
    .assign(has_postcode=lambda frame: frame['postcode'].notna())
)

In [None]:
# Share of residential rows with and without a postcode.
postcode_test.groupby('has_postcode').size().div(len(postcode_test))

In [None]:
# Addresses of every record in the example snapshot that has no LSOA code.
missing_lsoa_addresses = reference_year.loc[reference_year['lsoa11cd'].isna(), 'property_address']

keywords = ['airspace', 'air space', 'land', 'plot', 'car park', 'carpark', 'unit', 'industrial', 'centre' ]

# Number of addresses containing each keyword. The match is a case-insensitive
# substring search, so e.g. 'land' also hits any longer word that contains it.
keyword_counts = {
    keyword: missing_lsoa_addresses.str.contains(keyword, case=False, na=False).sum()
    for keyword in keywords
}
for keyword, count in keyword_counts.items():
    print(f"'{keyword}': {count}")

# Addresses that contain at least one keyword (keywords joined into one alternation)
any_keyword_mask = missing_lsoa_addresses.str.contains('|'.join(keywords), case=False, na=False)
any_keyword_count = any_keyword_mask.sum()
print(f"\nAddresses containing any of {keywords}: {any_keyword_count}")

# A few of the matching addresses, for eyeballing
addresses_with_keywords = missing_lsoa_addresses[any_keyword_mask]
print("\nSample addresses containing keywords:")
print(addresses_with_keywords.head(10).tolist())

# Headline numbers
print("\nSummary:")
print(f"Total missing LSOA: {len(missing_lsoa_addresses)}")
print(f"Contains keywords: {any_keyword_count}")
print(f"Percentage: {any_keyword_count/len(missing_lsoa_addresses)*100:.1f}%")

In [None]:

# For every monthly snapshot, count the records without an LSOA code per class.
missing_lsoa_counts = []

for file in list_of_files:
    snapshot = pd.read_parquet(file)

    counts = (
        snapshot.loc[snapshot['lsoa11cd'].isna()]
        .groupby('class')
        .size()
        .reset_index(name='values')
    )
    # The file stem is kept so the snapshot date can be parsed from it later.
    counts['file'] = Path(file).stem

    missing_lsoa_counts.append(counts)

# ignore_index=True gives every row a unique label; the per-file frames each
# start at 0, so keeping their indexes would leave duplicate labels.
all_years = pd.concat(missing_lsoa_counts, ignore_index=True)


## Data without LSOA

We can see below that the values are relatively stable over time, and that residential data makes up about 74% of the records with missing geolocation. This means these properties are not priced, which reduces the overall estimated value of offshore-owned property.

In [None]:



# Turn the trailing 'YYYY_MM' of each file stem into a first-of-month timestamp.
all_years['date'] = all_years['file'].str.extract(r'(\d{4}_\d{2})$')[0]
all_years['date'] = all_years['date'].str.replace('_', '-') + '-01'
all_years['date'] = pd.to_datetime(all_years['date'])

# Spread the classes into columns: one row per date, missing combinations as 0.
# (An earlier pivot on 'file' was removed: its result was overwritten right here.)
all_years_pivot = all_years.pivot_table(index='date', columns='class', values='values', fill_value=0)
# Make 'date' a regular column again so it can be used as a plot aesthetic.
all_years_pivot = all_years_pivot.reset_index()

# Fraction of all missing-LSOA records (summed over every snapshot) per class
class_fractions = all_years.groupby('class')['values'].sum()
class_fractions = (class_fractions / class_fractions.sum()).round(2)
class_fractions = class_fractions.reset_index()
class_fractions.columns = ['class', 'fraction']

class_fractions

In [None]:
 (
     ggplot(all_years_pivot, aes(x='date', y='residential'))
     + geom_line()
     + labs(title='The total residential property which has not been\nassigned an LSOA')
     + scale_x_date(
         # one break per year across the observed date range
         breaks=pd.date_range(
             start=all_years_pivot['date'].min(),
             end=all_years_pivot['date'].max(),
             freq='Y',
         ),
         date_labels='%Y',  # show only the year
     )
 )

# Total over time

In [None]:
# Build the four summary tables used throughout the rest of the notebook.
(
    total_residential_df,
    total_per_region_df,
    total_incorp_df,
    total_resi_lad_df,
) = create_summarised_stats(list_of_files, active_class_var)

In [None]:
# Share of all counts (every region and date pooled) that falls in each class.
(
    total_per_region_df
    .groupby(active_class_var)['counts']
    .sum()
    .div(total_per_region_df['counts'].sum())
)

In [None]:

# Readable legend label derived from the boolean `is_multi` flag.
total_residential_df['type'] = np.where(total_residential_df['is_multi'], 'Multi', 'Single')

p = (
    ggplot(total_residential_df, aes(x='date', y='counts', color='type'))
    + geom_line()
    + labs(title="Total number of residential properties")
    + scale_x_date(
        breaks=pd.date_range(
            start=total_residential_df['date'].min(),
            end=total_residential_df['date'].max(),
            freq='Y',
        ),
        date_labels='%Y',  # show only the year
    )
)

p.save(filename=figures_folder / 'total_properties.png')

p

In [None]:

# Counts are summed over classes first, so each region has one value per date.
# The axis breaks reuse the date range of `total_residential_df`.
p = (
    ggplot(
        total_per_region_df.groupby(['region', 'date'])['counts'].sum().reset_index(),
        aes(x='date', y='counts', color='region'),
    )
    + geom_line()
    + labs(title="Total number of properties")
    + scale_x_date(
        breaks=pd.date_range(
            start=total_residential_df['date'].min(),
            end=total_residential_df['date'].max(),
            freq='Y',
        ),
        date_labels='%Y',
    )
)

p.save(filename=figures_folder / 'properties_by_region.png')

p

In [None]:
# Collapse the regional table to one count per class and date.
total_per_class_df = (
    total_per_region_df
    .groupby([active_class_var, 'date'])['counts']
    .sum()
    .reset_index()
)

# Residential is left out of this figure; only the other classes are drawn.
p = (
    ggplot(
        total_per_class_df.loc[total_per_class_df[active_class_var].ne('residential')],
        aes(x='date', y='counts', color=active_class_var),
    )
    + geom_line()
    + labs(title="Total properties by class excluding residential")
    + scale_x_date(
        breaks=pd.date_range(
            start=total_residential_df['date'].min(),
            end=total_residential_df['date'].max(),
            freq='Y',
        ),
        date_labels='%Y',  # show only the year
    )
)

p.save(filename=figures_folder / 'properties_by_class.png')

p

In [None]:
# Wide table: one row per date, one column per class.
temp_class = total_per_class_df.pivot(index='date', columns=active_class_var, values='counts')

# Residential share of each date's total. The row sum is evaluated before
# 'res_perc' is attached, so it only covers the class columns.
temp_class['res_perc'] = temp_class['residential'].div(temp_class.sum(axis=1))
temp_class


# Country of incorporation

In [None]:
# Shorten the longest country label so plot legends stay compact.
total_incorp_df['country_incorporated'] = total_incorp_df['country_incorporated'].str.replace("BRITISH VIRGIN ISLANDS", "BVI")

# Mean of `counts` over each country's rows.
country_totals = total_incorp_df.groupby('country_incorporated')['counts'].mean()

top_20_by_mean = country_totals.nlargest(20)
# NOTE: the name is historical -- this index holds the top 20, not the top 10.
top_10_countries = top_20_by_mean.index

# The top four dominate so massively there is no point in having anyone else
top_4_countries = country_totals.nlargest(4).index

filtered_df = total_incorp_df[total_incorp_df['country_incorporated'].isin(top_4_countries)]

# The label now matches what is printed: 20 rows, ranked by mean (not total) counts.
print("Top 20 countries by mean counts:")
print(top_20_by_mean)

## Total Fraction the top four make up

In [None]:
# Share of all counts held by the four listed jurisdictions. 'ISLE OF MAN' was
# misspelled 'ISLO OF MAN', which matched no rows and silently dropped it.
total_incorp_df.loc[total_incorp_df['country_incorporated'].isin(['BVI', 'JERSEY', 'GUERNSEY', 'ISLE OF MAN']),'counts'].sum()/total_incorp_df['counts'].sum()

In [None]:
# Time series for the four largest countries of incorporation only.
p = (
    ggplot(filtered_df, aes(x='date', y='counts', color='country_incorporated'))
    + geom_line()
    + labs(
        title="Total number of properties\nby country of incorporation",
        color='Country',
    )
    + scale_x_date(
        breaks=pd.date_range(
            start=filtered_df['date'].min(),
            end=filtered_df['date'].max(),
            freq='Y',
        ),
        date_labels='%Y',  # show only the year
    )
)

p.save(filename=figures_folder / 'properties_by_incorporation.png')

p


In [None]:
# shift() pairs each row with the previous row of the same country, so the rows
# must be in chronological order for the result to be a period-on-period change.
# Sorting here removes the reliance on the row order of `total_incorp_df`.
test = total_incorp_df.sort_values(['country_incorporated', 'date'])

# Relative change against the previous observation of the same country.
previous_counts = test.groupby('country_incorporated')['counts'].shift()
test['diff'] = (test['counts'] - previous_counts) / previous_counts

# Mean relative change per country (rebinds `country_totals` from the cell above).
country_totals = test.groupby('country_incorporated')['diff'].mean()

# NOTE: the name is historical -- this index holds the top 20, not the top 10.
top_10_countries = country_totals.nlargest(20).index

print(country_totals.nlargest(20))

In [None]:
# Count trajectories for three hand-picked countries.
p = (
    ggplot(
        test[test['country_incorporated'].isin(['LUXEMBOURG', 'NETHERLANDS', 'GERMANY'])],
        aes(x='date', y='counts', color='country_incorporated'),
    )
    + geom_line()
    + labs(title="Total number of properties\nby country of incorporation")
    + scale_x_date(
        breaks=pd.date_range(start=test['date'].min(), end=test['date'].max(), freq='Y'),
        date_labels='%Y',  # show only the year
    )
)

p

In [None]:
# 'all' series: counts summed over is_multi per LAD and date, then differenced
# between consecutive dates within each LAD.
df_sorted_all = (
    total_resi_lad_df
    .groupby(['lad11cd', 'date'])['counts']
    .sum()
    .reset_index()
    .sort_values(['lad11cd', 'date'])
    .assign(
        counts_diff=lambda frame: frame.groupby(['lad11cd'])['counts'].diff(),
        is_multi='all',
    )
)

# Per-type series: the same difference within each (LAD, is_multi) group. The
# boolean flag is relabelled only after the diff has been computed on it.
df_sorted = (
    total_resi_lad_df
    .sort_values(['lad11cd', 'date'])
    .assign(
        counts_diff=lambda frame: frame.groupby(['lad11cd', 'is_multi'])['counts'].diff(),
        is_multi=lambda frame: np.where(frame['is_multi'], 'multi', 'single'),
    )
)

df_sorted = pd.concat([df_sorted, df_sorted_all], ignore_index=True)

# Mean change between consecutive observations, scaled by the number of distinct dates.
df_diff_change = (
    df_sorted
    .groupby(['is_multi', 'lad11cd'])['counts_diff']
    .mean()
    .reset_index()
    .assign(
        total_mean_change=lambda frame: frame['counts_diff'] * total_resi_lad_df['date'].nunique()
    )
)

df_diff_change.sort_values('total_mean_change')

In [None]:
# `df_diff_change` is already built by the preceding cell with this exact
# groupby/mean and multiplier; the recomputation that used to live here was a
# verbatim duplicate, so only the sorted view is shown.
df_diff_change.sort_values('total_mean_change')

In [None]:
import geopandas as gpd

# Direct WFS URL with correct parameters
wfs_url = "https://dservices1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/services/Local_Authority_Districts_May_2022_UK_BSC_V3/WFSServer?service=WFS&version=2.0.0&request=GetFeature&typeName=Local_Authority_Districts_May_2022_UK_BSC_V3:LAD_MAY_2022_UK_BSC_V3&outputFormat=GEOJSON&srsName=EPSG:4326"

print("Loading Local Authority Districts data...")
try:
    gdf = gpd.read_file(wfs_url)
except Exception as e:
    # Report and re-raise: the map cells below need `gdf`, so swallowing the
    # error would only postpone the failure to an unrelated NameError.
    print(f"Error: {e}")
    raise

print(f"Successfully loaded {len(gdf)} Local Authority Districts")
print(f"Columns: {list(gdf.columns)}")
print(f"CRS: {gdf.crs}")

# Display first few rows. This also fails fast if the configured LAD columns
# are missing from the downloaded layer.
print("\nFirst 3 rows:")
print(gdf[[LAD_COLUMN_CODE, LAD_COLUMN_NAME]].head(3))

In [None]:
def months_between(date1, date2):
    """Return the number of calendar months from ``date1`` to ``date2``.

    Only the ``.year`` and ``.month`` attributes are read, so the day of month is
    ignored. The result is negative when ``date2`` falls in an earlier month.
    """
    year_gap = date2.year - date1.year
    month_gap = date2.month - date1.month
    return 12 * year_gap + month_gap

def _add_monthly_change(frame, group_keys):
    """Sort ``frame`` by LAD and date and add per-group change columns.

    Adds ``counts_diff`` (change in counts against the previous row of the group),
    ``prev_date``, ``months_diff`` (calendar months since ``prev_date``) and
    ``monthly_change`` (``counts_diff`` divided by ``months_diff``). The first row
    of every group has no predecessor, so all four are missing there.
    """
    out = frame.sort_values(['lad11cd', 'date'])
    out['counts_diff'] = out.groupby(group_keys)['counts'].diff()
    out['prev_date'] = out.groupby(group_keys)['date'].shift(1)
    out['months_diff'] = out.apply(
        lambda row: months_between(row['prev_date'], row['date']) if pd.notna(row['prev_date']) else np.nan,
        axis=1,
    )
    out['monthly_change'] = out['counts_diff'] / out['months_diff']
    return out


# 'all' series: counts summed over is_multi for each LAD and date
df_sorted_all = _add_monthly_change(
    total_resi_lad_df.groupby(['lad11cd', 'date'])['counts'].sum().reset_index(),
    ['lad11cd'],
)
df_sorted_all['is_multi'] = 'all'

# Per-type series; the boolean flag is relabelled only after the grouping is done
df_sorted = _add_monthly_change(total_resi_lad_df, ['lad11cd', 'is_multi'])
df_sorted['is_multi'] = np.where(df_sorted['is_multi'], 'multi', 'single')

# Keep just the columns needed for the summary and stack the three series
df_sorted_all_final = df_sorted_all[['lad11cd', 'date', 'monthly_change', 'is_multi']]
df_sorted_final = df_sorted[['lad11cd', 'date', 'monthly_change', 'is_multi']]

df_sorted_combined = pd.concat([df_sorted_final, df_sorted_all_final], ignore_index=True)

# Mean monthly change per series and LAD
df_diff_change = df_sorted_combined.groupby(['is_multi', 'lad11cd'])['monthly_change'].mean().reset_index()

# Length of the whole observation window in months
min_date = total_resi_lad_df['date'].min()
max_date = total_resi_lad_df['date'].max()
total_months = months_between(min_date, max_date)

# Scale the mean monthly change up to the full window
df_diff_change['total_mean_change'] = df_diff_change['monthly_change'] * total_months

df_diff_change.sort_values('total_mean_change')

In [None]:
# 1 where the LAD's total mean change is strictly positive, else 0.
temp = df_diff_change.assign(
    increase=lambda frame: frame['total_mean_change'].gt(0).astype(int)
)

# Summary of the flag per series; the mean is the share of LADs that increased.
temp.groupby('is_multi')['increase'].describe()

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np

# Keep districts whose LAD code contains 'E' or 'W', and drop E06000053 (the
# "silly isles" of the original note -- presumably the Isles of Scilly) so the
# map stays visually compact.
lad_codes = gdf[LAD_COLUMN_CODE]
gdf_filtered = gdf[lad_codes.str.contains('E|W', na=False) & lad_codes.ne('E06000053')]

# Two side-by-side panels on a light blue background
fig, axes = plt.subplots(1, 2, figsize=(20, 10), facecolor='#E6F3FF')

# One panel per series, in this order
categories = ['multi', 'single']

# Marker styling shared by the increase and decrease layers
marker_style = dict(s=100, alpha=0.8, edgecolor='black', linewidth=1, zorder=5)

for i, category in enumerate(categories):
    ax = axes[i]

    # Attach this series' change figures to the district geometries
    gdf_temp = gdf_filtered.merge(
        df_diff_change.loc[df_diff_change['is_multi'] == category],
        left_on=LAD_COLUMN_CODE,
        right_on='lad11cd',
        how='left',
    )

    # Districts without a figure count as zero change; values below -500 are floored
    gdf_temp['total_mean_change'] = gdf_temp['total_mean_change'].fillna(0).clip(lower=-500)

    ax.set_facecolor('#E6F3FF')

    # Base map: light gray districts with thin black borders
    gdf_filtered.plot(
        ax=ax,
        facecolor='lightgray',
        edgecolor='black',
        linewidth=0.3,
        alpha=0.3,
    )

    # After the zero-fill above no missing values remain; kept as a guard
    gdf_temp_clean = gdf_temp.dropna(subset=['total_mean_change'])

    # The 20 largest and the 20 smallest changes
    top_20_highest = gdf_temp_clean.nlargest(20, 'total_mean_change')
    top_20_lowest = gdf_temp_clean.nsmallest(20, 'total_mean_change')

    # Each selected district is drawn as a circle at its centroid:
    # red for the highest values, blue for the lowest.
    for ranked, colour, legend_label in (
        (top_20_highest, 'red', 'Increase'),
        (top_20_lowest, 'blue', 'Decrease'),
    ):
        centroids = ranked.geometry.centroid
        ax.scatter(centroids.x, centroids.y, c=colour, label=legend_label, **marker_style)

    # Legend only on the second panel
    if i == 1:
        ax.legend(
            loc='upper right',
            bbox_to_anchor=(1.15, 1),
            fontsize=12,
            frameon=True,
            fancybox=True,
            shadow=True,
        )

    ax.set_axis_off()

# Panel titles are placed with fig.text so they align across both maps
fig.text(0.25, 0.92, 'Multi', fontsize=14, fontweight='bold', ha='center')
fig.text(0.75, 0.92, 'Single', fontsize=14, fontweight='bold', ha='center')

# Overall title
fig.suptitle('Top 20 Highest and Lowest Change in property counts at local authority level',
             fontsize=16, fontweight='bold', y=0.98)

plt.tight_layout()
plt.savefig(figures_folder / 'change_maps.png')
plt.show()




In [None]:
# Distribution of the 'multi' change figures across the mapped districts
# (districts with no match stay missing and are skipped by describe()).
(
    gdf_filtered
    .merge(
        df_diff_change.loc[df_diff_change['is_multi'].eq('multi')],
        left_on=LAD_COLUMN_CODE,
        right_on='lad11cd',
        how='left',
    )['total_mean_change']
    .describe()
)

In [None]:
# Overall ('all') change per mapped district, sorted from largest decrease upwards.
temp = gdf_filtered.merge(df_diff_change.loc[df_diff_change['is_multi']=='all'],
                                  left_on=LAD_COLUMN_CODE,
                                  right_on='lad11cd',
                                  how='left').sort_values('total_mean_change')

# The ten districts with the lowest change. The name column comes from the
# configured LAD_COLUMN_NAME instead of a hard-coded "LAD22NM", so switching
# shapefile vintage only needs the config cell to change.
temp.loc[temp['total_mean_change'].notna(), [LAD_COLUMN_NAME, "total_mean_change"]].head(10)

# concentration

As we can see, the majority of change is concentrated in a very small number of local authorities. The cells below show the fraction of the total decrease accounted for by the 20 LADs with the biggest decreases, and the fraction of the total increase accounted for by the 20 LADs with the biggest increases.

In [None]:
# Fraction of the total decrease that comes from the 20 districts with the largest
# decreases. nsmallest() picks them explicitly, so the result no longer relies on
# `temp` having been sorted in another cell. Missing values fail the `< 0` test
# and are therefore excluded without a separate notna() check.
decreases = temp.loc[temp['total_mean_change'] < 0, 'total_mean_change']
decreases.nsmallest(20).sum() / decreases.sum()

In [None]:
# Fraction of the total increase that comes from the 20 districts with the largest
# increases. nlargest() picks them explicitly, so the result no longer relies on
# `temp` having been sorted in another cell. Missing values fail the `> 0` test
# and are therefore excluded without a separate notna() check.
increases = temp.loc[temp['total_mean_change'] > 0, 'total_mean_change']
increases.nlargest(20).sum() / increases.sum()

In [None]:
# Price differences grouped by LAD and by the is_multi flag.
lad_price_df = create_mean_difference_by_groups(
    ['lad11cd', 'is_multi'],
    ocod_path=OCOD_history_path,
    class_var=active_class_var,
)


In [None]:
# Price differences grouped by LAD only.
lad_price_all_df = create_mean_difference_by_groups(
    ['lad11cd'],
    ocod_path=OCOD_history_path,
    class_var=active_class_var,
)

In [None]:
# Number of rows for each combination of significance flag and is_multi.
(
    lad_price_df
    .groupby(['ratio_significantly_different', 'is_multi'])
    .size()
)

In [None]:
# Wide view: one row per LAD, with the weighted difference split by is_multi
# (False -> single_price, True -> nested_price).
(
    lad_price_df
    .pivot(index='lad11cd', columns='is_multi', values='mean_weighted_difference')
    .rename(columns={False: "single_price", True: "nested_price"})
)

In [None]:
# Same table with the two difference columns given descriptive names.
lad_price_all_df.rename(
    columns={
        'mean_weighted_difference': 'all_offshore_change',
        'mean_unweighted_difference': 'general_property_change',
    }
)

In [None]:
# Price change per LAD, split by is_multi (False -> single_price, True -> nested_price)
price_change_data = (
    lad_price_df
    .pivot(index='lad11cd', columns='is_multi', values='mean_weighted_difference')
    .rename(columns={False: "single_price", True: "nested_price"})
)

# LAD-level price change plus the ratio of the two renamed difference columns
price_change_data_all = lad_price_all_df.rename(
    columns={
        'mean_weighted_difference': 'all_offshore_change',
        'mean_unweighted_difference': 'general_property_change',
    }
)
price_change_data_all['price_ratio'] = (
    price_change_data_all['all_offshore_change'] / price_change_data_all['general_property_change']
)

# Volume change per LAD, one column per series present in df_diff_change
volume_change_data = (
    df_diff_change
    .loc[:, ['is_multi', 'lad11cd', 'total_mean_change']]
    .pivot(index='lad11cd', columns='is_multi', values='total_mean_change')
)

# NOTE(review): fillna(0) is applied to every column, including price_ratio, so a
# missing ratio becomes 0 and is labelled 'decrease' below -- confirm this is intended.
full_data_compare = (
    price_change_data
    .merge(volume_change_data, on='lad11cd')
    .merge(price_change_data_all, on='lad11cd')
    .fillna(0)
)

# Price: more than 5% above / below parity counts as increase / decrease
full_data_compare['price_change'] = np.select(
    [full_data_compare['price_ratio'] > 1.05, full_data_compare['price_ratio'] < 0.95],
    ['increase', 'decrease'],
    default='stable',
)

# Volume: a change in the 'all' series beyond +/-10 counts as increase / decrease
full_data_compare['volume_change'] = np.select(
    [full_data_compare['all'] > 10, full_data_compare['all'] < -10],
    ['increase', 'decrease'],
    default='stable',
)

In [None]:
# Summary statistics of the volume change ('all') and the price ratio.
full_data_compare.loc[:, ['all', 'price_ratio']].describe()

In [None]:
# Create crosstab with better labels
crosstab = pd.crosstab(
    full_data_compare['price_change'] ,
    full_data_compare['volume_change'] ,
    rownames=['volume increase'],
    colnames=['relative_price increase'],
    margins=False,
    normalize = False
)
print(crosstab)

In [None]:
# Create crosstab with better labels
crosstab = pd.crosstab(
    full_data_compare['price_change'] ,
    full_data_compare['volume_change'] ,
    rownames=['volume increase'],
    colnames=['relative_price increase'],
    margins=False,
    normalize = True
)
print(crosstab)

# Model analysis

In [None]:
import pandas as pd
import glob
from pathlib import Path
import numpy as np

# Find all 'overall' CSVs in the model_performance directory. glob() order depends
# on the filesystem, so the list is sorted to keep the table reproducible.
performance_dir = Path("../data/model_performance")
overall_files = sorted(performance_dir.glob("*overall*.csv"))
if not overall_files:
    # Fail with a clear message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f"No '*overall*.csv' files found in {performance_dir}")

# Load every file, tagging each row with the file it came from
dfs = []
for file in overall_files:
    df = pd.read_csv(file)
    df['source_file'] = file.name
    dfs.append(df)

combined = pd.concat(dfs, ignore_index=True)[['precision', 'recall', 'f1_score', 'source_file']]

# The configuration is encoded in the file name
combined['preprocessed'] = combined['source_file'].str.contains('preprocessed')
metric_columns = ['precision', 'recall', 'f1_score']
combined[metric_columns] = combined[metric_columns].round(2)

# 'devset' takes priority over 'regex'; anything else is weak-learning
combined['model_type'] = np.select(
    [
        combined['source_file'].str.contains('devset'),
        combined['source_file'].str.contains('regex'),
    ],
    ['conventional', 'regex'],
    default='weak-learning',
)

# Sorting on both keys fixes the row order within a model type as well
combined = (
    combined[['model_type', 'preprocessed', 'f1_score', 'precision', 'recall']]
    .sort_values(['model_type', 'preprocessed'])
    .rename(columns={
        'f1_score': 'F1',
        'model_type': 'Model Type',
        'preprocessed': 'Preprocessed',
        'precision': 'Precision',
        'recall': 'Recall',
    })
)

# Render the LaTeX table, save it next to the figures and echo it
latex_table = combined.to_latex(
    index=False,
    float_format='{:.2f}'.format,
    caption='Performance Comparison of Different Model Configurations',
    label='tab:model_performance',
    position='htbp',
    escape=False
)

table_file = figures_folder / 'model_performance_table.tex'
with open(table_file, 'w') as f:
    f.write(latex_table)

print(latex_table)


In [None]:


# Per-class metrics: keep the reporting columns, give them display names, and
# swap underscores in the class labels for spaces.
class_table = (
    pd.read_csv('../data/model_performance/test_original_fullset_class_performance.csv')
    [['class_name', 'precision', 'recall', 'f1_score', 'support']]
    .rename(columns={
        'f1_score': 'F1',
        'precision': 'Precision',
        'recall': 'Recall',
        'class_name': active_class_var,
    })
)
class_table[active_class_var] = class_table[active_class_var].str.replace("_", " ")

In [None]:
# Show the per-class performance table prepared in the previous cell.
class_table

In [None]:

# Render the per-class table as LaTeX (two-decimal floats, no index column).
latex_options = dict(
    index=False,
    float_format='{:.2f}'.format,
    caption='Performance is high across all classes indicating a well trained, reliable model',
    label='tab:class_performance',
    position='htbp',
    escape=False,
)
latex_table = class_table.to_latex(**latex_options)

# Save next to the figures, then echo the table.
table_file = figures_folder / 'class_performance_table.tex'
table_file.write_text(latex_table)

print(latex_table)

