In [None]:
import pandas as pd
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm
from plotnine import *
import seaborn as sns
from enhance_ocod.analysis import create_time_series_by_groups, create_mean_difference_by_groups
# --- Input locations ---------------------------------------------------------
# Price-paid parquet for 2024.
path = "../data/processed_price_paid/price_paid_2024.parquet"
# NOTE(review): the name says ONSPD but the file is an OCOD snapshot — confirm.
ONSPD_path = "../data/ocod_history_processed/OCOD_FULL_2017_03.parquet"
# Folder passed as `ocod_path` to create_time_series_by_groups.
active_ocod_path = "../data/ocod_history_processed"

# Column name passed as `class_var` to create_time_series_by_groups.
active_class_var = "class"

# --- Output location ---------------------------------------------------------
# Created up front (including parents) so later save/to_csv calls cannot fail
# on a missing directory.
figures_folder = Path("../figures/figures")
figures_folder.mkdir(parents=True, exist_ok=True)


# Get MSOA dwelling counts from 2021 census

https://statistics.ukdataservice.ac.uk/dataset/england-and-wales-census-2021-rm205-dwelling-type/resource/f7cde7e0-5c9b-4091-bfb6-b34db820f30e

In [None]:
# Download the RM205 dwelling-type workbook, shorten the two column names used
# downstream, and sum the observations per MSOA code.
# NOTE(review): the workbook is the 2021 census release but the code column is
# renamed to 'msoa11cd' — confirm the codes match what downstream joins expect.
msoa_dwelling = (
    pd.read_excel(
        'https://ukds-ckan.s3.eu-west-1.amazonaws.com/2021/ONS/dwelling-type/RM205-Dwelling-Type-2021-msoa-ONS.xlsx'
    )
    .rename(
        columns={
            'Middle layer Super Output Areas Code': 'msoa11cd',
            'Observation': 'dwellings',
        }
    )
    .groupby('msoa11cd')['dwellings']
    .sum()
    .reset_index()
)

In [None]:
# Distribution of the per-MSOA dwelling totals.
sns.histplot(data=msoa_dwelling, x='dwellings')

## Overall figures

The cells below show the total value of all residential property as well as the total residential value.

In [None]:
# grouping_vars=None — presumably yields one ungrouped, overall series
# (confirm against create_time_series_by_groups).
overall_figures = create_time_series_by_groups(
    msoa_dwellings=msoa_dwelling,
    grouping_vars=None,
    ocod_path=active_ocod_path,
    class_var=active_class_var,
)

In [None]:


# Show the headline columns for the rows whose index label equals the first or
# the last label of the frame.
overall_figures.loc[
    overall_figures.index.isin(overall_figures.index[[0, -1]]),
    [
        'date', 'year', 'month',
        'ocod_mean', 'ocod_median', 'ocod_ratio_mean',
        'total_value_ocod_mean',
    ],
]

In [None]:
# Headline columns for the rows whose index label equals the first or the last
# label of the overall series.
subset = overall_figures.loc[
    overall_figures.index.isin([overall_figures.index[0], overall_figures.index[-1]]),
    ['date', 'year', 'month', 'ocod_mean', 'ocod_median', 'ocod_ratio_mean', 'total_value_ocod_mean'],
]

# Percentage change between those rows (last row relative to the one before).
# 'year' and 'month' are calendar labels, so a percentage change on them is
# meaningless; they are dropped before the calculation. errors='ignore' covers
# the case where they are non-numeric and select_dtypes already removed them.
pct_change = (
    subset.select_dtypes(include='number')
    .drop(columns=['year', 'month'], errors='ignore')
    .pct_change()
    .iloc[-1]
    * 100
)

print("Data:")
print(subset)
print("\nPercentage Change:")
print(pct_change)

## Region figures

The cells below show the total value of all residential property as well as the total residential value.

In [None]:

# Same time series as the overall figures, grouped by 'region'.
df_msoa_region = create_time_series_by_groups(
    msoa_dwellings=msoa_dwelling,
    grouping_vars=['region'],
    ocod_path=active_ocod_path,
    class_var=active_class_var,
)


In [None]:
# Mean of total_value_ocod_mean per region, taken over all rows of the series.
fract_total = df_msoa_region.groupby('region')[['total_value_ocod_mean']].mean()

print(fract_total)

# Each region's share of the summed regional means: rounded to two decimals as
# a fraction, then scaled to a percentage.
fract_total.div(fract_total['total_value_ocod_mean'].sum()).round(2).mul(100)

In [None]:
# Regional ratio series with the 'GREATER LONDON' rows removed.
# Axis breaks still span the full date range of df_msoa_region.
p = (
    ggplot(
        df_msoa_region.loc[df_msoa_region['region'].ne('GREATER LONDON')],
        aes(x='date', y='ocod_ratio_mean', color='region'),
    )
    + geom_line()
    + labs(title="Relative value by region excluding London")
    + scale_x_date(
        breaks=pd.date_range(
            start=df_msoa_region['date'].min(),
            end=df_msoa_region['date'].max(),
            freq='YE',
        ),
        date_labels='%Y',  # Format to show only year
    )
)

p.save(filename=figures_folder / 'relative_value_excluding_london.png')

p

In [None]:
# Regional ratio series for every region, one coloured line each.
p = (
    ggplot(df_msoa_region, aes(x='date', y='ocod_ratio_mean', color='region'))
    + geom_line()
    + labs(title="Relative value by region")
    + scale_x_date(
        breaks=pd.date_range(
            start=df_msoa_region['date'].min(),
            end=df_msoa_region['date'].max(),
            freq='YE',
        ),
        date_labels='%Y',  # Format to show only year
    )
)

p.save(filename=figures_folder / 'relative_value_region.png')

p

In [None]:
# Time series grouped by the 'is_multi' flag.
df_msoa_nested = create_time_series_by_groups(
    msoa_dwellings=msoa_dwelling,
    grouping_vars=['is_multi'],
    ocod_path=active_ocod_path,
    class_var=active_class_var,
)

In [None]:
# Ratio series split by 'is_multi', one coloured line per value.
p = (
    ggplot(df_msoa_nested, aes(x='date', y='ocod_ratio_mean', color='is_multi'))
    + geom_line()
    + labs(
        title="Relative value of single and multiple properties",
        color='is multi',
    )
    + scale_x_date(
        breaks=pd.date_range(
            start=df_msoa_nested['date'].min(),
            end=df_msoa_nested['date'].max(),
            freq='YE',
        ),
        date_labels='%Y',  # Format to show only year
    )
)

p.save(filename=figures_folder / 'relative_value_nested.png')

p

In [None]:
# OCOD counts over time, split by 'is_multi'.
# The axis breaks are derived from df_msoa_nested, the frame being plotted.
# They were previously taken from df_msoa_region, which tied this cell to an
# unrelated frame and could misplace the ticks if the date ranges ever differ.
p = (
    ggplot(df_msoa_nested, aes(x='date', y='ocod_total_counts', color='is_multi'))
    + geom_line()
    + labs(title="Quantity by multiple property status")
    + scale_x_date(
        breaks=pd.date_range(
            start=df_msoa_nested['date'].min(),
            end=df_msoa_nested['date'].max(),
            freq='YE',
        ),
        date_labels='%Y',  # Format to show only year
    )
)

p

# Country of Incorporation
 

In [None]:
# Time series grouped by country of incorporation.
# Uses the notebook-level active_ocod_path (same folder as the literal that was
# repeated here before), so a change to that setting reaches this call as it
# does the other create_time_series_by_groups calls.
df_msoa_incorporated = create_time_series_by_groups(
    msoa_dwellings=msoa_dwelling,
    grouping_vars=['country_incorporated'],
    ocod_path=active_ocod_path,
    class_var=active_class_var,
)

In [None]:
# Persist the per-country series next to the figures (index column included).
df_msoa_incorporated.to_csv(path_or_buf=figures_folder / 'country.csv')

In [None]:
# Shorten the label in place so the legend stays compact. Later cells therefore
# see 'BVI' instead of 'BRITISH VIRGIN ISLANDS'.
df_msoa_incorporated['country_incorporated'] = (
    df_msoa_incorporated['country_incorporated']
    .str.replace("BRITISH VIRGIN ISLANDS", "BVI")
)

# Ratio series for the selected countries only. Both spellings of BVI are kept
# in the filter list, as in the original selection.
p = (
    ggplot(
        df_msoa_incorporated.loc[
            df_msoa_incorporated['country_incorporated'].isin(
                ['JERSEY', 'GUERNSEY', 'ISLE OF MAN', 'BRITISH VIRGIN ISLANDS', 'BVI']
            )
        ],
        aes(x='date', y='ocod_ratio_mean', color='country_incorporated'),
    )
    + geom_line()
    + labs(
        title="Relative value of the country of incorporation\nwith highest quantity of properties"
    )
    + scale_x_date(
        breaks=pd.date_range(
            start=df_msoa_incorporated['date'].min(),
            end=df_msoa_incorporated['date'].max(),
            freq='YE',
        ),
        date_labels='%Y',  # Format to show only year
    )
    + guides(color=guide_legend(title="country"))
)

p.save(filename=figures_folder / 'relative_value_incorporation.png')

p

In [None]:
# Rows for the 2025-05-01 snapshot. .copy() makes this an independent frame, so
# the column assignment below writes to it directly instead of to a slice of
# df_msoa_incorporated (which raises SettingWithCopyWarning).
incorp_last_month_df = df_msoa_incorporated[
    df_msoa_incorporated['date'] == pd.to_datetime('2025-05-01')
].copy()

# Each row's share of the snapshot's summed total_value_ocod_mean.
incorp_last_month_df['fraction_of_total_incorp_value'] = (
    incorp_last_month_df['total_value_ocod_mean']
    / incorp_last_month_df['total_value_ocod_mean'].sum()
)

# Numeric column totals over the last five rows after an ascending sort by
# total value (tail() defaults to five rows).
incorp_last_month_df.sort_values('total_value_ocod_mean').tail().sum(numeric_only=True)

In [None]:
# 2025-05-01 snapshot restricted to the selected countries (both BVI spellings
# are accepted).
top4_df = df_msoa_incorporated[
    df_msoa_incorporated['date'].eq(pd.to_datetime('2025-05-01'))
    & df_msoa_incorporated['country_incorporated'].isin(
        ['JERSEY', 'GUERNSEY', 'ISLE OF MAN', 'BRITISH VIRGIN ISLANDS', 'BVI']
    )
]

In [None]:
# Count and value columns for the four selected countries, all dates.
df_msoa_incorporated.loc[
    df_msoa_incorporated['country_incorporated'].isin(
        ['BVI', 'JERSEY', 'GUERNSEY', 'ISLE OF MAN']
    ),
    [
        'date',
        'ocod_total_counts',
        'total_dwelling_count',
        'total_value_ocod_mean',
        'total_value_dwelling_mean',
        'fraction_of_total_value',
        'country_incorporated',
    ],
]

In [None]:
# Countries of incorporation used for the ratio-change comparison; 'BVI' is
# the shortened label for the British Virgin Islands.
countries = [
    "JERSEY", "BVI", "GUERNSEY", "ISLE OF MAN",
    "GIBRALTAR", "LUXEMBOURG", "SINGAPORE", "IRELAND",
    "HONG KONG", "PANAMA", "CAYMAN ISLANDS", "SEYCHELLES",
    "NETHERLANDS", "CYPRUS", "BAHAMAS", "MAURITIUS",
    "GERMANY", "AUSTRALIA", "BERMUDA", "DELAWARE, U.S.A.",
]

In [None]:
# Restrict the per-country series to the countries of interest.
df = df_msoa_incorporated.loc[df_msoa_incorporated['country_incorporated'].isin(countries)]

# For each country, take ocod_ratio_mean at its earliest and latest date.
# Only the two columns the lambda reads are selected before apply(), so the
# function no longer operates on the grouping column; that behaviour is
# deprecated in pandas >= 2.2 and emits a FutureWarning.
grouped = df.groupby('country_incorporated')[['date', 'ocod_ratio_mean']].apply(
    lambda x: pd.Series({
        'min_date_value': x.loc[x['date'].idxmin(), 'ocod_ratio_mean'],
        'max_date_value': x.loc[x['date'].idxmax(), 'ocod_ratio_mean'],
    })
)

# Ratio of the latest value to the earliest value (>1 means the ratio rose).
grouped['ratio_change'] = grouped['max_date_value'] / grouped['min_date_value']
result = grouped['ratio_change']

result.sort_values().reset_index()

In [None]:

# Time series grouped by both 'region' and 'is_multi'.
df_msoa_region_nested = create_time_series_by_groups(
    msoa_dwellings=msoa_dwelling,
    grouping_vars=['region', 'is_multi'],
    ocod_path=active_ocod_path,
    class_var=active_class_var,
)

In [None]:
# One facet per region (London rows removed), line type keyed on 'is_multi'.
# The legend is hidden; the title explains the line types instead.
p = (
    ggplot(
        df_msoa_region_nested.loc[df_msoa_region_nested['region'].ne('GREATER LONDON')],
        aes(x='date', y='ocod_ratio_mean', linetype='is_multi'),
    )
    + geom_line()
    + labs(
        title="Relative value by region excluding London\nSolid line single property, dashed line multi property",
        linetype='nested',
        y='Cost relative to all residential property',
    )
    + scale_x_date(
        breaks=pd.date_range(
            start=df_msoa_region_nested['date'].min(),
            end=df_msoa_region_nested['date'].max(),
            freq='YE',
        ),
        date_labels='%Y',
    )
    + facet_wrap("region")
    + theme(
        axis_text_x=element_text(angle=90, hjust=1),
        legend_position='none',
    )
)

p.save(filename=figures_folder / 'relative_value_nested_region.png')

p

In [None]:
# Time series grouped by 'lad11cd'.
df_msoa_lad = create_time_series_by_groups(
    msoa_dwellings=msoa_dwelling,
    grouping_vars=['lad11cd'],
    ocod_path=active_ocod_path,
    class_var=active_class_var,
)


# All property percentage value

It looks like the fraction of the total value of residential property is broadly flat across the 10-year period, fluctuating between 1.3% and 1.4%.

In [None]:
# Sum the LAD-level counts and values per date, then derive the OCOD share of
# total dwelling value as a fraction and as a percentage rounded to 2 decimals.
all_property_value = (
    df_msoa_lad
    .groupby('date')[['ocod_total_counts', 'total_value_ocod_mean', 'total_value_dwelling_mean']]
    .sum()
    .reset_index()
    .assign(
        fraction_value=lambda d: d['total_value_ocod_mean'] / d['total_value_dwelling_mean'],
        percentage_value=lambda d: (d['fraction_value'] * 100).round(2),
    )
)


In [None]:
# Percentage of total dwelling value held via OCOD, one point per date.
p = (
    ggplot(all_property_value, aes(x='date', y='percentage_value'))
    + geom_line()
    + labs(title="Offshore residential property as a percentage of total value")
    + scale_x_date(
        breaks=pd.date_range(
            start=all_property_value['date'].min(),
            end=all_property_value['date'].max(),
            freq='YE',
        ),
        date_labels='%Y',  # Format to show only year
    )
)

p

In [None]:
# Each LAD's share of that date's summed total_value_ocod_mean.
# transform('sum') broadcasts the per-date total back onto every row using the
# built-in grouped sum, replacing a Python lambda evaluated once per date.
df_msoa_lad['fraction'] = (
    df_msoa_lad['total_value_ocod_mean']
    / df_msoa_lad.groupby('date')['total_value_ocod_mean'].transform('sum')
)

In [None]:
# Mean share and mean OCOD value per LAD, largest share first.
# A single descending sort is enough; the ascending sort that used to precede
# it was immediately overridden.
(
    df_msoa_lad
    .groupby('lad11cd')[['fraction', 'total_value_ocod_mean']]
    .mean()
    .sort_values('fraction', ascending=False)
)

In [None]:
# Combined mean share and mean OCOD value of the two LADs with the largest
# mean share. Sorted once, descending; the earlier ascending sort was redundant.
(
    df_msoa_lad
    .groupby('lad11cd')[['fraction', 'total_value_ocod_mean']]
    .mean()
    .sort_values('fraction', ascending=False)
    .head(2)
    .sum()
)

In [None]:
# Write the LAD-level series (index column included) to the data folder.
df_msoa_lad.to_csv(path_or_buf='../data/lad.csv')