### The goal of this notebook is to process the raw business survival rate data so that it can be appended to the Population nodes in the knowledge graph

In [8]:
import pandas as pd

# Load the raw survival-rate table. The listed tokens are read as missing values
# in addition to pandas' defaults.
missing_markers = [';', 'NA', ':', '']
df = pd.read_csv(
    "../data/raw/business-survival-rates.csv",
    na_values=missing_markers,
)

# Report the frame's dimensions, then the count of missing entries per column.
print(df.shape)
print("\nNull values per column:")
df.isnull().sum()

(1020, 14)

Null values per column:


code                        0
area                        0
year                        0
births                      0
1_year_survival_number      0
1_year_survival_rate        0
2_year_survival_number     51
2_year_survival_rate       51
3_year_survival_number    102
3_year_survival_rate      102
4_year_survival_number    153
4_year_survival_rate      153
5_year_survival_number    204
5_year_survival_rate      204
dtype: int64

In [9]:
# Distinct, ascending years and area names found in the survival-rate data.
# Sorting before `unique()` yields the sorted distinct values directly, because
# `unique()` returns values in order of first appearance.
years = df['year'].dropna().sort_values().unique()
boroughs = df['area'].dropna().sort_values().unique()

print(f"\nYear range: {years[0]} - {years[-1]}")
print(f"Number of boroughs: {len(boroughs)}")


Year range: 2002 - 2021
Number of boroughs: 51


In [10]:
# Sanity check: warn about every numeric column that holds a negative value.
numeric_cols = df.select_dtypes(include='number').columns
negative_mask = df[numeric_cols].lt(0).any()
for col in negative_mask[negative_mask].index:
    print(f"Warning: Negative values found in column '{col}'")

# Summary statistics for the measures only (identifier and year columns excluded).
print("\nSummary statistics:")
df.drop(['code', 'area', 'year'], axis='columns').describe()


Summary statistics:


Unnamed: 0,births,1_year_survival_number,1_year_survival_rate,2_year_survival_number,2_year_survival_rate,3_year_survival_number,3_year_survival_rate,4_year_survival_number,4_year_survival_rate,5_year_survival_number,5_year_survival_rate
count,1020.0,1020.0,1020.0,969.0,969.0,918.0,918.0,867.0,867.0,816.0,816.0
mean,31464.833333,29271.906863,92.721863,23345.784314,74.449536,18388.382353,58.541612,15109.607843,48.366897,12772.708333,41.065319
std,78218.477841,72921.977582,3.492929,57964.803993,4.32723,45672.562878,4.695703,37576.22139,4.451732,31818.114641,4.174375
min,435.0,410.0,60.5,335.0,46.8,250.0,33.4,205.0,21.5,170.0,16.6
25%,1575.0,1445.0,92.1,1145.0,72.3,890.0,55.8,725.0,45.8,600.0,38.7
50%,2517.5,2330.0,93.5,1815.0,74.5,1400.0,58.6,1130.0,48.6,942.5,41.2
75%,20406.25,19268.75,94.7,15550.0,77.2,12386.25,61.6,10140.0,51.2,8615.0,43.6
max,397540.0,378505.0,100.0,282775.0,88.2,215085.0,71.1,178505.0,62.5,152495.0,56.2


In [19]:
# Load the processed businesses dataset and collect its distinct area names.
businesses_data_file_path = '../data/processed/businesses_with_boroughs.csv'
businesses_df = pd.read_csv(businesses_data_file_path)
businesses_boroughs = businesses_df['area'].dropna().unique()

# Build both name sets once so the two differences below read symmetrically.
business_area_set = set(businesses_boroughs)
survival_area_set = set(boroughs)

print(f"Number of boroughs in businesses dataset: {len(businesses_boroughs)}")
print(f"Number of boroughs in survival rates dataset: {len(boroughs)}")

# Areas present in the businesses data but absent from the survival rates.
missing_in_survival = business_area_set - survival_area_set
print(f"Boroughs in businesses but not in survival rates: {missing_in_survival}")

# Areas present in the survival rates but absent from the businesses data.
missing_in_businesses = survival_area_set - business_area_set
print(f"Boroughs in survival rates but not in businesses: {missing_in_businesses}")

Number of boroughs in businesses dataset: 33
Number of boroughs in survival rates dataset: 51
Boroughs in businesses but not in survival rates: set()
Boroughs in survival rates but not in businesses: {'Scotland', 'West Midlands', 'United Kingdom', 'Outer London', 'North East', 'Great Britain', 'South East', 'East', 'England', 'Northern Ireland', 'Yorkshire And The Humber', 'London', 'England And Wales', 'East Midlands', 'Inner London', 'South West', 'North West', 'Wales'}


TODO: remove the irrelevant 'boroughs' from the survival rates dataset. Perhaps keep the aggregate / nation-wide rows: [London, Inner/Outer London], renaming 'London' to 'Greater London'.

In [20]:
# Display the survival-rate frame for a visual check of its rows and columns.
df

Unnamed: 0,code,area,year,births,1_year_survival_number,1_year_survival_rate,2_year_survival_number,2_year_survival_rate,3_year_survival_number,3_year_survival_rate,4_year_survival_number,4_year_survival_rate,5_year_survival_number,5_year_survival_rate
0,E09000001,City of London,2002,1145,1025,89.5,915.0,79.9,760.0,66.4,660.0,57.6,600.0,52.4
1,E09000002,Barking and Dagenham,2002,435,410,94.3,335.0,77.0,250.0,57.5,205.0,47.1,175.0,40.2
2,E09000003,Barnet,2002,2330,2185,93.8,1805.0,77.5,1290.0,55.4,1030.0,44.2,855.0,36.7
3,E09000004,Bexley,2002,765,710,92.8,595.0,77.8,470.0,61.4,385.0,50.3,325.0,42.5
4,E09000005,Brent,2002,1635,1530,93.6,1135.0,69.4,810.0,49.5,625.0,38.2,525.0,32.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1015,S92000003,Scotland,2021,18910,17730,93.8,,,,,,,,
1016,N92000002,Northern Ireland,2021,6655,6000,90.2,,,,,,,,
1017,K04000001,England And Wales,2021,338430,316190,93.4,,,,,,,,
1018,K03000001,Great Britain,2021,357340,333920,93.4,,,,,,,,


In [29]:
# Add aggregates (renaming 'London' to 'Greater London'), filter out irrelevant borough rows
aggregates = {'Greater London', 'Inner London', 'Outer London'}
df['area'] = df['area'].replace({'London': 'Greater London'})
boroughs_to_keep = set(businesses_boroughs) | aggregates
print(f"Number of boroughs to keep: {len(boroughs_to_keep)}")

df_filtered = df[df['area'].isin(boroughs_to_keep)].copy()

# Drop columns containing 'code' / 'survival number'
cols_to_drop = ['code'] + [col for col in df_filtered.columns if 'number' in col.lower()]
df_filtered = df_filtered.drop(columns=cols_to_drop)
df_filtered

Number of boroughs to keep: 36


Unnamed: 0,area,year,births,1_year_survival_rate,2_year_survival_rate,3_year_survival_rate,4_year_survival_rate,5_year_survival_rate
0,City of London,2002,1145,89.5,79.9,66.4,57.6,52.4
1,Barking and Dagenham,2002,435,94.3,77.0,57.5,47.1,40.2
2,Barnet,2002,2330,93.8,77.5,55.4,44.2,36.7
3,Bexley,2002,765,92.8,77.8,61.4,50.3,42.5
4,Brent,2002,1635,93.6,69.4,49.5,38.2,32.1
...,...,...,...,...,...,...,...,...
1000,Wandsworth,2021,2440,94.7,,,,
1001,Westminster,2021,7145,96.1,,,,
1002,Inner London,2021,44930,94.4,,,,
1003,Outer London,2021,40375,93.7,,,,


In [30]:
# Persist the filtered survival rates as CSV; the row index is not written.
df_filtered.to_csv(
    '../data/processed/boroughs_business_survival_rate.csv',
    index=False,
)