# MSOA Data Preprocessing

In [1]:
# import libraries
import pandas as pd
import geopandas as gpd
import os
import numpy as np

## Load data

### Read population data (2019 CSV)

source:
https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/middlesuperoutputareamidyearpopulationestimates

In [2]:
# Location of the 2019 MSOA population estimates CSV.
url = 'https://raw.githubusercontent.com/LingruFeng/dissertation/main/data_raw/msoa_population_2019estimated.csv'

# Read the table and expose the "All Ages" column under the shorter name
# "population", which the later joins select by name.
df = pd.read_csv(url, low_memory=False).rename(columns={"All Ages": "population"})

# Report the size of the loaded table (rows x columns).
print(f"Data frame is {df.shape[0]:,} x {df.shape[1]}")

Data frame is 6,791 x 3


### Read MSOA boundary data

source:
https://geoportal.statistics.gov.uk/datasets/ons::middle-layer-super-output-areas-december-2011-boundaries-super-generalised-clipped-bsc-ew-v3/about

In [3]:
# Read the MSOA boundary polygons and reproject them to EPSG:27700
# (British National Grid) in one step.
msoa_boundary_url = 'https://github.com/LingruFeng/dissertation/blob/main/data_raw/MSOA_boundary.gpkg?raw=true'
msoa = gpd.read_file(msoa_boundary_url).to_crs('epsg:27700')

Drop the columns that are not needed.

In [4]:
# Attribute columns of the boundary layer that are not carried forward.
drop_column = ['MSOA11NMW','BNG_E','BNG_N','LONG','LAT','Shape__Length','Shape__Area']

# Rebind the name to the trimmed table rather than mutating it in place.
msoa = msoa.drop(columns=drop_column)

Filter the data to keep only the MSOAs in England.

In [5]:
# Keep only the rows whose MSOA code starts with 'E', then show the
# resulting (rows, columns) shape.
is_england = msoa['MSOA11CD'].str.startswith('E')
msoa = msoa.loc[is_england]
msoa.shape

(6791, 4)

Join the population data to the boundary data on the MSOA code.

In [6]:
# Only the code and the population total are needed from the population table.
population_lookup = df[['MSOA Code', 'population']]

# Left join so every boundary row is kept, then discard the duplicated join
# key and the OBJECTID column.
MSOA = pd.merge(
    left=msoa,
    right=population_lookup,
    how='left',
    left_on='MSOA11CD',
    right_on='MSOA Code',
).drop(columns=['MSOA Code', 'OBJECTID'])

In [7]:
# Display the boundary table with the joined population column.
MSOA

Unnamed: 0,MSOA11CD,MSOA11NM,geometry,population
0,E02000001,City of London 001,"POLYGON ((532947.881 181895.061, 533412.520 18...",9721
1,E02000002,Barking and Dagenham 001,"POLYGON ((549002.599 190873.761, 549122.828 18...",7735
2,E02000003,Barking and Dagenham 002,"POLYGON ((548956.371 189063.484, 549087.474 18...",11174
3,E02000004,Barking and Dagenham 003,"POLYGON ((551945.701 186027.855, 551672.764 18...",6687
4,E02000005,Barking and Dagenham 004,"POLYGON ((549420.553 187442.648, 548985.314 18...",10432
...,...,...,...,...
6786,E02006930,Greenwich 037,"POLYGON ((540585.068 178133.897, 540368.424 17...",9127
6787,E02006931,Greenwich 038,"POLYGON ((538775.968 177841.277, 539072.243 17...",9806
6788,E02006932,Liverpool 060,"POLYGON ((335757.632 390987.474, 335739.219 39...",17515
6789,E02006933,Liverpool 061,"POLYGON ((335096.788 389638.891, 334715.024 38...",8539


### Read MSOA population-weighted centroids data

Use the population-weighted centroids as the demand points.

source:
https://geoportal.statistics.gov.uk/datasets/ons::middle-layer-super-output-areas-december-2011-population-weighted-centroids/about

In [8]:
# Read the MSOA population-weighted centroids and reproject them to
# EPSG:27700 (British National Grid) in one step.
centroids_url = 'https://github.com/LingruFeng/dissertation/blob/main/data_raw/MSOA_Population_Weighted_Centroids.gpkg?raw=true'
demand = gpd.read_file(centroids_url).to_crs('epsg:27700')

Filter the data to keep only the centroids in England.

In [9]:
# Keep only the centroids whose MSOA code starts with 'E', then show the
# resulting (rows, columns) shape.
is_england = demand['msoa11cd'].str.startswith('E')
demand = demand.loc[is_england]
demand.shape

(6791, 4)

In [10]:
# Attach the population total to each centroid by matching MSOA codes.
# A left join keeps every centroid row.
population_by_msoa = df[['MSOA Code', 'population']]
demand = pd.merge(
    left=demand,
    right=population_by_msoa,
    how='left',
    left_on='msoa11cd',
    right_on='MSOA Code',
)

In [11]:
# Discard the duplicated join key and the MSOA name column; rebind the name
# to the trimmed table rather than mutating it in place.
demand = demand.drop(columns=['MSOA Code', 'msoa11nm'])

In [12]:
# Display the centroid table with the joined population column.
demand

Unnamed: 0,objectid,msoa11cd,geometry,population
0,1,E02002536,POINT (445582.345 524175.434),9771
1,2,E02002537,POINT (446777.151 524256.841),8641
2,3,E02002534,POINT (461356.929 515118.900),5693
3,4,E02002535,POINT (446117.027 525455.836),9336
4,5,E02002532,POINT (461053.212 516175.379),6655
...,...,...,...,...
6786,6787,E02004669,POINT (393469.114 227500.260),7496
6787,6788,E02006096,POINT (332829.367 109219.836),7001
6788,6789,E02003088,POINT (343276.340 158947.520),8444
6789,6790,E02006070,POINT (331710.269 136880.200),8291


## Read vaccination site point data, clean and reformat

In [13]:
# Read each of the four vaccination-site layers and reproject it to
# EPSG:27700 (British National Grid), the same CRS as the MSOA layers.
hospital = gpd.read_file(
    'https://github.com/LingruFeng/dissertation/blob/main/data_raw/hospital_hub.gpkg?raw=true'
).to_crs('epsg:27700')

pharmacy = gpd.read_file(
    'https://github.com/LingruFeng/dissertation/blob/main/data_raw/pharmacy.gpkg?raw=true'
).to_crs('epsg:27700')

pcn = gpd.read_file(
    'https://github.com/LingruFeng/dissertation/blob/main/data_raw/primary_care_network.gpkg?raw=true'
).to_crs('epsg:27700')

vc = gpd.read_file(
    'https://github.com/LingruFeng/dissertation/blob/main/data_raw/vaccination_centre.gpkg?raw=true'
).to_crs('epsg:27700')

In [14]:
# Drop the columns that are not needed.
# These ten columns occur in all four site layers; each layer also has a few
# columns of its own that are removed together with them.
shared_columns = ['Address','Postcode','city','country','result_num','status','formatted_','place_id','location_t','latlong']

drop_columns = ['Region','ICS_STP','Trust_or_s'] + shared_columns
hospital = hospital.drop(columns=drop_columns)

drop_columns = ['Region','CCG','Site_name'] + shared_columns
pcn = pcn.drop(columns=drop_columns)

drop_columns = ['Name_of_si'] + shared_columns
pharmacy = pharmacy.drop(columns=drop_columns)

drop_columns = ['Centre'] + shared_columns
vc = vc.drop(columns=drop_columns)

In [15]:
# Format the dataframes: give every layer the same four 0/1 indicator
# columns, with a 1 only in the column named after the layer's own site type.
indicator_columns = ['hospital', 'pcn', 'pharmacy', 'vc']
layers_by_type = {'vc': vc, 'hospital': hospital, 'pharmacy': pharmacy, 'pcn': pcn}

for own_type, layer in layers_by_type.items():
    for column in indicator_columns:
        layer[column] = 1 if column == own_type else 0

In [16]:
# Print the number of rows (sites) in each layer, one line per site type.
labelled_layers = [
    ("Hospital Hub number:", hospital),
    ("Primary Care Network number:", pcn),
    ("Pharmacy:", pharmacy),
    ("Vaccination Centre number:", vc),
]
for label, layer in labelled_layers:
    print(label, len(layer))

Hospital Hub number: 267
Primary Care Network number: 1018
Pharmacy: 200
Vaccination Centre number: 115


In [17]:
# Stack the four site layers into a single table. ignore_index=True renumbers
# the rows from 0, and reset_index() then copies that numbering into an
# explicit "index" column, which is kept in the exported file.
site_layers = [hospital, pcn, pharmacy, vc]
site = pd.concat(site_layers, ignore_index=True).reset_index()

In [18]:
# Display the combined site table.
site

Unnamed: 0,index,geometry,hospital,pcn,pharmacy,vc
0,0,POINT (504748.029 248914.642),1,0,0,0
1,1,POINT (505059.230 222903.988),1,0,0,0
2,2,POINT (532504.110 176066.629),1,0,0,0
3,3,POINT (532504.110 176066.629),1,0,0,0
4,4,POINT (504396.617 222557.941),1,0,0,0
...,...,...,...,...,...,...
1595,1595,POINT (196928.058 72385.978),0,0,0,1
1596,1596,POINT (524814.936 172019.963),0,0,0,1
1597,1597,POINT (526813.753 181129.037),0,0,0,1
1598,1598,POINT (408626.628 90816.439),0,0,0,1


## Export the data

In [19]:
# Write the combined vaccination-site table to a GeoPackage file
# (relative path, so it is resolved against the current working directory).
site.to_file("vaccination_site.gpkg", driver="GPKG")

In [20]:
# Write the MSOA boundaries with the joined population column to a GeoPackage file.
MSOA.to_file("MSOA_Boundary_with_population.gpkg", driver="GPKG")

In [21]:
# Write the population-weighted centroids with the joined population column
# to a GeoPackage file.
demand.to_file("MSOA_Population_Weighted_Centroids_with_population.gpkg", driver="GPKG")

In [22]:
# Display the combined site table again after the export.
site

Unnamed: 0,index,geometry,hospital,pcn,pharmacy,vc
0,0,POINT (504748.029 248914.642),1,0,0,0
1,1,POINT (505059.230 222903.988),1,0,0,0
2,2,POINT (532504.110 176066.629),1,0,0,0
3,3,POINT (532504.110 176066.629),1,0,0,0
4,4,POINT (504396.617 222557.941),1,0,0,0
...,...,...,...,...,...,...
1595,1595,POINT (196928.058 72385.978),0,0,0,1
1596,1596,POINT (524814.936 172019.963),0,0,0,1
1597,1597,POINT (526813.753 181129.037),0,0,0,1
1598,1598,POINT (408626.628 90816.439),0,0,0,1


In [23]:
# Display the boundary/population table again after the export.
MSOA

Unnamed: 0,MSOA11CD,MSOA11NM,geometry,population
0,E02000001,City of London 001,"POLYGON ((532947.881 181895.061, 533412.520 18...",9721
1,E02000002,Barking and Dagenham 001,"POLYGON ((549002.599 190873.761, 549122.828 18...",7735
2,E02000003,Barking and Dagenham 002,"POLYGON ((548956.371 189063.484, 549087.474 18...",11174
3,E02000004,Barking and Dagenham 003,"POLYGON ((551945.701 186027.855, 551672.764 18...",6687
4,E02000005,Barking and Dagenham 004,"POLYGON ((549420.553 187442.648, 548985.314 18...",10432
...,...,...,...,...
6786,E02006930,Greenwich 037,"POLYGON ((540585.068 178133.897, 540368.424 17...",9127
6787,E02006931,Greenwich 038,"POLYGON ((538775.968 177841.277, 539072.243 17...",9806
6788,E02006932,Liverpool 060,"POLYGON ((335757.632 390987.474, 335739.219 39...",17515
6789,E02006933,Liverpool 061,"POLYGON ((335096.788 389638.891, 334715.024 38...",8539


In [24]:
# Display the centroid/population table again after the export.
demand

Unnamed: 0,objectid,msoa11cd,geometry,population
0,1,E02002536,POINT (445582.345 524175.434),9771
1,2,E02002537,POINT (446777.151 524256.841),8641
2,3,E02002534,POINT (461356.929 515118.900),5693
3,4,E02002535,POINT (446117.027 525455.836),9336
4,5,E02002532,POINT (461053.212 516175.379),6655
...,...,...,...,...
6786,6787,E02004669,POINT (393469.114 227500.260),7496
6787,6788,E02006096,POINT (332829.367 109219.836),7001
6788,6789,E02003088,POINT (343276.340 158947.520),8444
6789,6790,E02006070,POINT (331710.269 136880.200),8291
