In [1]:
import pandas as pd

### Old population data

In [8]:
# Load the borough population workbook (1939-2039).
# The first three sheet rows are skipped and only columns B:P are read. The real
# header is still spread over several rows at this point, so pandas labels the
# columns "Unnamed: N"; the header is rebuilt in the cleaning cell that follows.
population_data_file_path = '../data/raw/Population by borough 1939 to 2039.xlsx'
population_data = pd.read_excel(
    population_data_file_path,
    sheet_name='Population',
    skiprows=3,
    usecols='B:P',
    na_values=['n/a', 'N/A'],  # textual "not available" markers become NaN
    engine='openpyxl'
)

population_data

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15
0,,mid-year estimate,mid-year estimate,Census,,Projection,Projection,Projection,Projection,,Growth\n1939-2015,,,Growth\n2014-2015,
1,Area Name,1939,1988,2011,,2015,2021,2031,2039,,,,,,
2,,,,,,,,,,,total,%,,total,%
3,City of London,9,6.2,7.375,,8.101776,9.559778,10.843221,11.623187,,-0.898224,-0.099803,,0.211497,0.026157
4,Barking and Dagenham,184,156.6,185.911,,205.434874,224.332972,250.46011,271.193003,,21.434874,0.116494,,3.949193,0.019904
5,Barnet,296,289.5,356.386,,393.215093,415.066095,448.240994,469.008723,,97.215093,0.328429,,5.96088,0.015861
6,Bexley,179,217.7,231.997,,237.012319,243.193377,253.74122,258.484219,,58.012319,0.324091,,2.239583,0.009392
7,Brent,310,246.4,311.215,,326.338839,348.633248,374.68888,381.645253,,16.338839,0.052706,,2.624403,0.008207
8,Bromley,237,295.2,309.392,,320.083395,328.780563,342.514284,350.868707,,83.083395,0.350563,,3.484156,0.010872
9,Camden,301,179,220.338,,230.283248,241.602601,257.415289,266.825465,,-70.716752,-0.234939,,3.069207,0.01326


In [9]:
# NOTE: this cell replaces the raw `population_data` frame with its cleaned version,
# so it is meant to be run once, directly after the load cell.

# Combine the first two rows into a single header. Where the first row is empty the
# second row is used on its own. The growth-rate columns have no label in the second
# row, so their name becomes NaN and they are dropped at the end of this cell.
header_top = population_data.iloc[0]
header_bottom = population_data.iloc[1]
population_data.columns = [
    f"{top} {bottom}" if pd.notna(top) and pd.notna(bottom) else bottom
    for top, bottom in zip(header_top, header_bottom)
]

# Drop the three header rows now that the column names have been built
population_data = population_data.drop(index=[0, 1, 2])

# Drop all columns with all NaN values
population_data = population_data.dropna(axis=1, how='all')

# Drop all rows with all NaN values
population_data = population_data.dropna(axis=0, how='all')

# Drop the last 6 rows
population_data = population_data.drop(index=population_data.index[-6:])

# Reset the index
population_data = population_data.reset_index(drop=True)

# Rename columns: lowercase, with spaces and hyphens replaced by underscores.
# Non-string names (the NaN placeholders) are left untouched.
population_data.columns = [
    col.strip().lower().replace(" ", "_").replace("-", "_") if isinstance(col, str) else col
    for col in population_data.columns
]

# The population columns are expressed in thousands: convert them to whole persons.
population_columns = [
    'mid_year_estimate_1939',
    'mid_year_estimate_1988',
    'census_2011',
    'projection_2015',
    'projection_2021',
    'projection_2031',
    'projection_2039'
]
for col in population_columns:
    thousands = (
        population_data[col]
        .astype(str)
        .str.replace(' ', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )
    # Round before casting: a bare astype(int) truncates, which turns e.g.
    # 8.101776 thousand into 8101 instead of 8102 and can lose a unit to
    # floating-point error (x.9999999 -> x).
    population_data[col] = (thousands * 1000).round().astype(int)

# Drop the columns with NaN as the name
population_data = population_data.loc[:, ~population_data.columns.isna()]

In [None]:
# # Exclude aggregate areas from the dataset
# exclude_names = ["inner london", "outer london", "greater london"]
# population_data = population_data[~population_data["area_name"].str.lower().isin(exclude_names)]

# # Store as CSV file
# population_data.to_csv(
#     '../data/processed/population_by_borough_old.csv',
#     index=False,
#     header=True,
#     encoding='utf-8'
# )

### New population data - population density by borough dataset

In [10]:
# Read the borough-level density CSV and preview its first five rows
housing_density_data_file_path = '../data/raw/housing-density-borough.csv'
housing_density_data = pd.read_csv(filepath_or_buffer=housing_density_data_file_path)
housing_density_data.head(5)

Unnamed: 0,Code,Name,Year,Source,Population,Inland_Area _Hectares,Total_Area_Hectares,Population_per_hectare,Square_Kilometres,Population_per_square_kilometre
0,E09000001,City of London,1999,ONS MYE,6581,290.4,314.9,22.7,2.9,2266.2
1,E09000001,City of London,2000,ONS MYE,7014,290.4,314.9,24.2,2.9,2415.3
2,E09000001,City of London,2001,ONS MYE,7359,290.4,314.9,25.3,2.9,2534.1
3,E09000001,City of London,2002,ONS MYE,7280,290.4,314.9,25.1,2.9,2506.9
4,E09000001,City of London,2003,ONS MYE,7115,290.4,314.9,24.5,2.9,2450.1


### TODO: Housing density data cleaning & verification 

In [11]:
# Collect the distinct area names used by each dataset
population_area_names = set(population_data['area_name'].unique())
density_area_names = set(housing_density_data['Name'].unique())

# Names shared by both datasets and names that appear in only one of them
overlap = population_area_names & density_area_names
only_in_population = population_area_names - density_area_names
only_in_density = density_area_names - population_area_names

# Print in sorted order so the output is identical from one run to the next
# (sets have no stable iteration order for strings).
print("Overlapping area names:", sorted(overlap))
print("Only in population_data:", sorted(only_in_population))
print("Only in housing_density_data:", sorted(only_in_density))

Overlapping area names: {'Greater London', 'Islington', 'Kensington and Chelsea', 'Ealing', 'Brent', 'Waltham Forest', 'Merton', 'Hillingdon', 'Bromley', 'Kingston upon Thames', 'Hounslow', 'Lewisham', 'Inner London', 'Richmond upon Thames', 'Haringey', 'Wandsworth', 'Hammersmith and Fulham', 'Lambeth', 'Tower Hamlets', 'Greenwich', 'Sutton', 'Redbridge', 'Westminster', 'Harrow', 'Hackney', 'Havering', 'Barnet', 'Camden', 'Barking and Dagenham', 'City of London', 'Newham', 'Enfield', 'Outer London', 'Southwark', 'Croydon', 'Bexley'}
Only in population_data: set()
Only in housing_density_data: {'London'}


#### It looks like the housing density dataset contains four aggregate areas: London, Greater London, Inner London and Outer London

In [12]:
# Filter for the relevant area names
area_names_to_compare = ['London', 'Greater London', 'Inner London', 'Outer London']
comparison_df = housing_density_data[housing_density_data['Name'].isin(area_names_to_compare)]

# Pivot the data for easier comparison: rows=Year, columns=Name, values=Population
pivot_df = comparison_df.pivot(index='Year', columns='Name', values='Population')
pivot_df

Name,Greater London,Inner London,London,Outer London
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1999,,2750716.0,7153912.0,4403196.0
2000,,2804949.0,7236712.0,4431763.0
2001,,2859375.0,7322403.0,4463028.0
2002,,2890923.0,7376671.0,4485748.0
2003,,2898563.0,7394817.0,4496254.0
2004,,2915912.0,7432730.0,4516818.0
2005,,2957048.0,7519009.0,4561961.0
2006,,2989670.0,7597825.0,4608155.0
2007,,3030764.0,7693473.0,4662709.0
2008,,3078914.0,7812161.0,4733247.0


In [13]:
# Inner London + Outer London for every year in the pivot table
inner_plus_outer = pivot_df['Inner London'].add(pivot_df['Outer London'])

# Years up to and including 2010: compare the sum against the 'London' column
years_london = pivot_df.index[pivot_df.index <= 2010]
london_sum = inner_plus_outer.loc[years_london]
london_equal = (
    pivot_df.loc[years_london, 'London'].fillna(0).astype(float)
    .eq(london_sum.fillna(0).astype(float))
)
london_matches = london_equal.all()
print("London == Inner London + Outer London (<=2010):", london_matches)
if not london_matches:
    print("Years where equality fails (<=2010):", years_london[~london_equal].tolist())

# Years from 2011 onwards: compare the sum against the 'Greater London' column
years_greater = pivot_df.index[pivot_df.index >= 2011]
greater_sum = inner_plus_outer.loc[years_greater]
greater_equal = (
    pivot_df.loc[years_greater, 'Greater London'].fillna(0).astype(float)
    .eq(greater_sum.fillna(0).astype(float))
)
greater_matches = greater_equal.all()
print("Greater London == Inner London + Outer London (>=2011):", greater_matches)
if not greater_matches:
    print("Years where equality fails (>=2011):", years_greater[~greater_equal].tolist())

London == Inner London + Outer London (<=2010): True
Greater London == Inner London + Outer London (>=2011): False
Years where equality fails (>=2011): [2017, 2025, 2026, 2034, 2036, 2040, 2050]


In [14]:
# List the figures for every year (>=2011) in which Greater London differs from
# Inner London + Outer London
failed_years = years_greater[~greater_equal]
for year in failed_years:
    inner, outer, greater = pivot_df.loc[year, ['Inner London', 'Outer London', 'Greater London']]
    sum_inner_outer = inner + outer
    print(f"Year {year}: Inner London = {inner:.1f}, Outer London = {outer:.1f}, Sum = {sum_inner_outer:.1f}, Greater London = {greater:.1f}, Diff = {greater - sum_inner_outer:.1f}")

Year 2017: Inner London = 3575589.0, Outer London = 5328414.0, Sum = 8904003.0, Greater London = 8904004.0, Diff = 1.0
Year 2025: Inner London = 3885100.0, Outer London = 5765576.0, Sum = 9650676.0, Greater London = 9650677.0, Diff = 1.0
Year 2026: Inner London = 3920962.0, Outer London = 5811462.0, Sum = 9732424.0, Greater London = 9732425.0, Diff = 1.0
Year 2034: Inner London = 4175372.0, Outer London = 6152435.0, Sum = 10327807.0, Greater London = 10327806.0, Diff = -1.0
Year 2036: Inner London = 4232419.0, Outer London = 6232369.0, Sum = 10464788.0, Greater London = 10464787.0, Diff = -1.0
Year 2040: Inner London = 4333810.0, Outer London = 6384923.0, Sum = 10718733.0, Greater London = 10718734.0, Diff = 1.0
Year 2050: Inner London = 4536953.0, Outer London = 6671295.0, Sum = 11208248.0, Greater London = 11208247.0, Diff = -1.0


#### So 'London' (used up to 2010) plays the role of 'Greater London' (used from 2011), and Inner + Outer London adds up to the reported total to within ±1 in every year

#### What could cause discrepancies between the reported Inner/Outer London totals and the borough aggregates? Have the Inner/Outer London borough assignments changed over time?
Classifications for Inner and Outer London have changed over time due to population shifts and administrative reorganizations.

Until 1889, the City of London was the only part of London with an administrative existence separate from the historic counties. From 1889 to 1965, the County of London, carved from the historic counties of Middlesex, Surrey, and Kent, administered an area comprising present-day Inner London plus the outer boroughs of Newham and Haringey. The boundaries established in 1889 responded to the rapid development of suburban areas in the 19th century.

By the mid-20th century, London's suburban population had spread far beyond the County of London's boundaries. To address this shift, the present boroughs were established in 1965 by amalgamating existing boroughs and districts to form the new metropolitan county of Greater London. The steepest population decline occurred in the densest areas, with Inner London boroughs losing more than one-third of their population in the postwar decades. London's population began slowly growing again in the 1990s.

Sources: 

1. [Greater London: Introduction](https://www.britannica.com/place/Greater-London#ref1)

2. [London: Demographic trends](https://www.britannica.com/place/London/City-layout#ref13491)

3. [London: City layout](https://www.britannica.com/place/London/City-layout#ref13487)

In [15]:
# Boroughs counted as Inner London in this notebook.
inner_london_boroughs = {
    # "Greenwich" is left out: it is apparently not an inner borough
    "Camden",  "Hackney", "Hammersmith and Fulham", "Islington",
    "Kensington and Chelsea", "Lambeth", "Lewisham", "Southwark",
    "Tower Hamlets", "Wandsworth", "Westminster",
    # https://www.britannica.com/topic/Inner-London says these are also inner boroughs
    "Haringey", "Newham", "City of London"
}

# Prepare borough lists.
# "City of London" is already a member of the set above, so no extra union is needed.
# Sorting gives the list a reproducible order (plain list(set) order changes between runs).
inner_boroughs = sorted(inner_london_boroughs)
# Everything in the density dataset that is neither an inner borough nor an aggregate area
outer_boroughs = set(density_area_names) - set(inner_boroughs) - set(area_names_to_compare)

In [16]:
# Create the borough containment dataset: one row per borough with the aggregate
# area (Inner London / Outer London) it belongs to.
# The boroughs are sorted so the exported file is identical from run to run;
# iterating a set of strings directly yields a different order in each session.
rows = [
    {"borough": borough_name, "aggregate": "Inner London"}
    for borough_name in sorted(inner_boroughs)
]
rows += [
    {"borough": borough_name, "aggregate": "Outer London"}
    for borough_name in sorted(outer_boroughs)
]

# Create DataFrame and save as CSV (explicit columns keep the header even if `rows` is empty)
boroughs_aggregate_df = pd.DataFrame(rows, columns=["borough", "aggregate"])
boroughs_aggregate_df.to_csv("../data/processed/boroughs_containment.csv", index=False)

In [None]:
# Check for discrepancies between Inner/Outer London aggregates and dataset totals

def _borough_sum_discrepancies(data, borough_names, aggregate_name, tolerance=1):
    """Compare per-year borough population sums with a reported aggregate row.

    Parameters
    ----------
    data : DataFrame with 'Name', 'Year' and 'Population' columns.
    borough_names : collection of 'Name' values whose populations are summed.
    aggregate_name : 'Name' value of the row holding the reported total.
    tolerance : largest absolute difference that is still accepted.

    Returns
    -------
    list of dict, one per year whose absolute difference exceeds `tolerance`,
    ordered by year. Years with no reported total (no row, or a missing value)
    are skipped; a year with a reported total but no borough rows counts the
    borough sum as 0.
    """
    borough_sums = (
        data.loc[data['Name'].isin(borough_names)]
        .groupby('Year')['Population']
        .sum()
    )
    # min_count=1 keeps "no value reported" as NaN instead of turning it into 0,
    # so such years can really be skipped.
    reported_totals = (
        data.loc[data['Name'] == aggregate_name]
        .groupby('Year')['Population']
        .sum(min_count=1)
        .dropna()
    )
    borough_sums = borough_sums.reindex(reported_totals.index, fill_value=0)
    differences = borough_sums - reported_totals
    flagged = differences[differences.abs() > tolerance]
    return [
        {
            'Year': year,
            'Area': aggregate_name,
            'Sum of Boroughs': int(borough_sums.loc[year]),
            'Dataset Total': int(reported_totals.loc[year]),
            'Difference': int(difference),
        }
        for year, difference in flagged.items()
    ]


# All years present in the dataset (also used by later cells)
years = sorted(housing_density_data['Year'].unique())

# Collect the results for both aggregates, ordered by year and then by area
discrepancies = sorted(
    _borough_sum_discrepancies(housing_density_data, inner_boroughs, 'Inner London')
    + _borough_sum_discrepancies(housing_density_data, outer_boroughs, 'Outer London'),
    key=lambda record: (record['Year'], record['Area']),
)

# Display as table
if discrepancies:
    print(pd.DataFrame(discrepancies).to_string(index=False))
else:
    print("No discrepancies found between borough sums and dataset totals for Inner/Outer London.")

 Year         Area  Sum of Boroughs  Dataset Total  Difference
 2017 Outer London          5328416        5328414           2
 2020 Inner London          3710559        3710562          -3
 2025 Outer London          5765574        5765576          -2
 2026 Inner London          3920964        3920962           2
 2026 Outer London          5811464        5811462           2
 2029 Inner London          4015874        4015872           2
 2031 Inner London          4082436        4082434           2
 2031 Outer London          6031302        6031304          -2
 2036 Inner London          4232417        4232419          -2
 2036 Outer London          6232367        6232369          -2
 2038 Inner London          4284519        4284516           3
 2041 Outer London          6418875        6418878          -3
 2042 Inner London          4380023        4380025          -2
 2042 Outer London          6452243        6452241           2
 2044 Inner London          4422226        4422228     

In [20]:
def _overwrite_population(area_name, yearly_totals):
    """Store each year's total in the Population cell of the `area_name` row for that year."""
    is_area = housing_density_data['Name'] == area_name
    for year, total in yearly_totals.items():
        target_rows = is_area & (housing_density_data['Year'] == year)
        housing_density_data.loc[target_rows, 'Population'] = total


# Per-year population sums over the inner and the outer boroughs
is_inner_borough = housing_density_data['Name'].isin(inner_boroughs)
is_outer_borough = housing_density_data['Name'].isin(outer_boroughs)
inner_agg = housing_density_data[is_inner_borough].groupby('Year')['Population'].sum()
outer_agg = housing_density_data[is_outer_borough].groupby('Year')['Population'].sum()

# Step 1: replace the reported Inner/Outer London totals with the borough sums
_overwrite_population('Inner London', inner_agg)
_overwrite_population('Outer London', outer_agg)

# Step 2: fold the 'London' rows into 'Greater London' ...
housing_density_data.loc[housing_density_data['Name'] == 'London', 'Name'] = 'Greater London'

# ... and set Greater London to Inner + Outer London for every year
greater_agg = inner_agg.add(outer_agg, fill_value=0)
_overwrite_population('Greater London', greater_agg)

In [28]:
# Check that the Inner/Outer London rows match the borough sums, and that
# Inner London + Outer London matches Greater London, for every year.

def _population_by_year(data, names, min_count=0):
    """Sum 'Population' per 'Year' over the rows whose 'Name' is in `names`.

    Years without a matching row are absent from the result. With `min_count=1`
    a year whose matching rows hold no value stays NaN instead of becoming 0.
    """
    selected = data.loc[data['Name'].isin(names)]
    return selected.groupby('Year')['Population'].sum(min_count=min_count)


def _mismatch_records(computed, reported, computed_label, reported_label, tolerance=1):
    """Return one record per year where |computed - reported| exceeds `tolerance`.

    Only years that have a reported value are checked; a year missing from
    `computed` is treated as 0. Records are ordered by year.
    """
    reported = reported.dropna()
    computed = computed.reindex(reported.index, fill_value=0)
    differences = computed - reported
    flagged = differences[differences.abs() > tolerance]
    return [
        {
            'Year': year,
            computed_label: int(computed.loc[year]),
            reported_label: int(reported.loc[year]),
            'Difference': int(difference),
        }
        for year, difference in flagged.items()
    ]


# Reported totals as currently stored in the dataset
inner_reported = _population_by_year(housing_density_data, ['Inner London'], min_count=1)
outer_reported = _population_by_year(housing_density_data, ['Outer London'], min_count=1)
greater_reported = _population_by_year(housing_density_data, ['Greater London'], min_count=1)

# Borough sums versus the Inner/Outer London rows
inner_discrepancies = _mismatch_records(
    _population_by_year(housing_density_data, inner_boroughs),
    inner_reported,
    'Sum of Boroughs',
    'Dataset Total',
)
outer_discrepancies = _mismatch_records(
    _population_by_year(housing_density_data, outer_boroughs),
    outer_reported,
    'Sum of Boroughs',
    'Dataset Total',
)

if not inner_discrepancies and not outer_discrepancies:
    print("All aggregates now match the reported totals for Inner and Outer London.")
else:
    if inner_discrepancies:
        print("Inner London discrepancies:")
        print(pd.DataFrame(inner_discrepancies).to_string(index=False))
    if outer_discrepancies:
        print("Outer London discrepancies:")
        print(pd.DataFrame(outer_discrepancies).to_string(index=False))

# Also check if Inner London + Outer London equals Greater London for each year
greater_discrepancies = _mismatch_records(
    inner_reported.add(outer_reported, fill_value=0),
    greater_reported,
    'Inner + Outer',
    'Greater London',
)

if not greater_discrepancies:
    print("All aggregates now match for Inner + Outer London = Greater London.")
else:
    print("Greater London discrepancies (Inner + Outer != Greater London):")
    print(pd.DataFrame(greater_discrepancies).to_string(index=False))

All aggregates now match the reported totals for Inner and Outer London.
All aggregates now match for Inner + Outer London = Greater London.


In [33]:
# Drop the 'Code' column together with every column whose name mentions
# "hectare" (matched case-insensitively)
hectare_columns = [name for name in housing_density_data.columns if 'hectare' in name.lower()]
cols_to_drop = ['Code', *hectare_columns]
housing_density_data = housing_density_data.drop(columns=cols_to_drop)

In [35]:
# Write the cleaned housing density dataset to the processed data folder
# (UTF-8 encoded, without the DataFrame index)
processed_density_path = '../data/processed/housing_density_borough.csv'
housing_density_data.to_csv(processed_density_path, index=False, encoding='utf-8')

TODO: compare the population figures (mostly projections) against the 2022 figures and against the population_by_borough_old dataset we already have

TODO: perhaps add a type column that makes it easy to tell whether a population value is a measurement or a projection