In [27]:
import pandas as pd
businesses_data_file_path = '../data/processed/businesses_with_boroughs.csv'
population_data_file_path = '../data/processed/population_by_borough.csv'

businesses_data = pd.read_csv(businesses_data_file_path)
population_data = pd.read_csv(population_data_file_path)

businesses_data_boroughs = businesses_data['area'].unique()
population_data_boroughs = population_data['area_name'].unique()

overlapping_boroughs = set(businesses_data_boroughs).intersection(set(population_data_boroughs))
print(f'{len(overlapping_boroughs)} boroughs are present in both datasets (out of {len(businesses_data_boroughs)} in businesses data and {len(population_data_boroughs)} in population data):')

33 boroughs are present in both datasets (out of 34 in businesses data and 36 in population data):


In [28]:
matching_boroughs = set(businesses_data_boroughs).intersection(population_data_boroughs)
only_in_businesses = set(businesses_data_boroughs) - set(population_data_boroughs)
only_in_population = set(population_data_boroughs) - set(businesses_data_boroughs)

# Create a DataFrame
max_len = max(len(matching_boroughs), len(only_in_businesses), len(only_in_population))
data = {
    "Matching Boroughs": list(matching_boroughs) + [""] * (max_len - len(matching_boroughs)),
    "Only in Businesses Data": list(only_in_businesses) + [""] * (max_len - len(only_in_businesses)),
    "Only in Population Data": list(only_in_population) + [""] * (max_len - len(only_in_population)),
}

boroughs_comparison_df = pd.DataFrame(data)

boroughs_comparison_df

Unnamed: 0,Matching Boroughs,Only in Businesses Data,Only in Population Data
0,Bexley,,Inner London
1,Greenwich,,Greater London
2,Wandsworth,,Outer London
3,Harrow,,
4,Croydon,,
5,City of London,,
6,Islington,,
7,Waltham Forest,,
8,Newham,,
9,Merton,,


In [None]:
# There are 40 businesses with missing area in the businesses data. 
# Let's check if we can find these in the businesses_with_streets.csv file.

# Load the businesses_with_streets.csv file
businesses_with_streets = pd.read_csv('../data/processed/businesses_with_streets.csv')

# Get osm_ids with missing area in businesses_data
missing_area_osm_ids = businesses_data[businesses_data['area'].isnull()]['osm_id']

# Filter the other file for these osm_ids
matching_rows = businesses_with_streets[businesses_with_streets['osm_id'].isin(missing_area_osm_ids)]

# Check for available location data
has_geometry = matching_rows['geometry'].notnull().sum()
has_street_name = matching_rows['street_name'].notnull().sum()

print(f"{has_geometry} out of {len(missing_area_osm_ids)} have geometry.")
print(f"{has_street_name} out of {len(missing_area_osm_ids)} have street_name.")

40 out of 40 have geometry.
11 out of 40 have street_name.


### So with geometry value we should be able to map them to the correct Boroughs

In [23]:
# Inner/ Outer/ Greater London are subsets of boroughs to indicate the total population over many/all boroughs.
# https://commons.wikimedia.org/wiki/File:Outer_Inner_London_Boroughs.png
# Define Inner London boroughs
inner_london_boroughs = {
    "Camden", "Greenwich", "Hackney", "Hammersmith and Fulham", "Islington",
    "Kensington and Chelsea", "Lambeth", "Lewisham", "Southwark",
    "Tower Hamlets", "Wandsworth", "Westminster"
}

# Clean whitespace
population_data["area_name"] = population_data["area_name"].str.strip()

# Exclude aggregate rows from manual sum calculation
boroughs_only = population_data[
    ~population_data["area_name"].isin(["Inner London", "Outer London", "Greater London"])
]

# Compute manual inner and outer London sums
manual_inner = boroughs_only[
    boroughs_only["area_name"].isin(inner_london_boroughs)
]["census_2011"].sum()

manual_outer = boroughs_only[
    ~boroughs_only["area_name"].isin(inner_london_boroughs)
]["census_2011"].sum()

# Get reported values from dataset
reported_inner = population_data.loc[
    population_data["area_name"] == "Inner London", "census_2011"
].values[0]

reported_outer = population_data.loc[
    population_data["area_name"] == "Outer London", "census_2011"
].values[0]

reported_total = population_data.loc[
    population_data["area_name"] == "Greater London", "census_2011"
].values[0]

# Print and compare
print(f"Computed Inner London: {manual_inner}")
print(f"Reported Inner London: {reported_inner}")
print("Difference:", manual_inner - reported_inner)

print(f"\nComputed Outer London: {manual_outer}")
print(f"Reported Outer London: {reported_outer}")
print("Difference:", manual_outer - reported_outer)

print(f"\nCpmputed Greater London: {manual_inner + manual_outer}")
print(f"Reported Greater London: {reported_total}")
print("Difference:", (manual_inner + manual_outer) - reported_total)

Computed Inner London: 2916173
Reported Inner London: 3231901
Difference: -315728

Computed Outer London: 5257768
Reported Outer London: 4942040
Difference: 315728

Cpmputed Greater London: 8173941
Reported Greater London: 8173941
Difference: 0


### Probably there are some differing conceptions of what Inner and Outer London is exactly. 
### Doesn't matter to us, the totals seem to match so the data is at least internally coherent. 
### Exclude the Inner, Outer and Greater London locations from the graph nodes for now.  