# MSOA Data Preprocessing

In [1]:
# import the libraries used throughout the notebook
import pandas as pd
import geopandas as gpd
import os
import numpy as np

## load data

### read population data (2019 csv)

source:
https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/middlesuperoutputareamidyearpopulationestimates

In [4]:
# Fetch the 2019 MSOA population estimates and expose the all-ages total
# under the shorter column name "population".
url = 'https://raw.githubusercontent.com/LingruFeng/dissertation/main/data_raw/msoa_population_2019estimated.csv'
df = (
    pd.read_csv(url, low_memory=False)
    .rename(columns={"All Ages": "population"})
)
# Report rows x columns as a quick sanity check on the load.
print(f"Data frame is {df.shape[0]:,} x {df.shape[1]}")

Data frame is 6,791 x 3


### read MSOA boundary data

source:
https://geoportal.statistics.gov.uk/datasets/ons::middle-layer-super-output-areas-december-2011-boundaries-super-generalised-clipped-bsc-ew-v3/about

In [20]:
# Load the MSOA boundary layer and reproject it to EPSG:27700 in one step.
msoa = gpd.read_file(
    'https://github.com/LingruFeng/dissertation/blob/main/data_raw/MSOA_boundary.gpkg?raw=true'
).to_crs('epsg:27700')

Drop the columns that are not needed for the analysis.

In [21]:
# Attribute columns of the boundary layer that are not used downstream.
drop_column = [
    'MSOA11NMW',
    'BNG_E',
    'BNG_N',
    'LONG',
    'LAT',
    'Shape__Length',
    'Shape__Area',
]
# Rebind `msoa` to the trimmed frame instead of mutating it in place.
msoa = msoa.drop(columns=drop_column)

Filter the data to keep only the MSOAs in England.

In [22]:
# Keep only the rows whose MSOA code starts with 'E' (the England records).
msoa = msoa.loc[msoa['MSOA11CD'].str.startswith('E')]
# Show (rows, columns) of the filtered layer.
msoa.shape

(6791, 4)

Join the population data to the MSOA boundaries.

In [27]:
# Left-join the population table onto the boundary layer by MSOA code, so every
# boundary row is kept even if it has no matching population record.
test = msoa.merge(
    df,
    how='left',
    left_on='MSOA11CD',
    right_on='MSOA Code',
)

In [28]:
# Display the merged frame produced by the join.
test

Unnamed: 0,OBJECTID,MSOA11CD,MSOA11NM,geometry,MSOA Code,MSOA Name,All Ages
0,1,E02000001,City of London 001,"POLYGON ((532947.881 181895.061, 533412.520 18...",E02000001,City of London 001,9721
1,2,E02000002,Barking and Dagenham 001,"POLYGON ((549002.599 190873.761, 549122.828 18...",E02000002,Barking and Dagenham 001,7735
2,3,E02000003,Barking and Dagenham 002,"POLYGON ((548956.371 189063.484, 549087.474 18...",E02000003,Barking and Dagenham 002,11174
3,4,E02000004,Barking and Dagenham 003,"POLYGON ((551945.701 186027.855, 551672.764 18...",E02000004,Barking and Dagenham 003,6687
4,5,E02000005,Barking and Dagenham 004,"POLYGON ((549420.553 187442.648, 548985.314 18...",E02000005,Barking and Dagenham 004,10432
...,...,...,...,...,...,...,...
6786,6787,E02006930,Greenwich 037,"POLYGON ((540585.068 178133.897, 540368.424 17...",E02006930,Greenwich 037,9127
6787,6788,E02006931,Greenwich 038,"POLYGON ((538775.968 177841.277, 539072.243 17...",E02006931,Greenwich 038,9806
6788,6789,E02006932,Liverpool 060,"POLYGON ((335757.632 390987.474, 335739.219 39...",E02006932,Liverpool 060,17515
6789,6790,E02006933,Liverpool 061,"POLYGON ((335096.788 389638.891, 334715.024 38...",E02006933,Liverpool 061,8539


### read MSOA population weighted centroids data

Use the population-weighted centroids as the demand points.

source:
https://geoportal.statistics.gov.uk/datasets/ons::middle-layer-super-output-areas-december-2011-population-weighted-centroids/about

In [23]:
# Load the population-weighted centroid layer and reproject it to EPSG:27700.
demand = gpd.read_file(
    'https://github.com/LingruFeng/dissertation/blob/main/data_raw/MSOA_Population_Weighted_Centroids.gpkg?raw=true'
).to_crs('epsg:27700')

Drop the columns that are not needed for the analysis.

(7201,)

In [None]:
# Drop the unneeded attribute columns from the centroid (demand) layer.
# This cell previously repeated the drop on `msoa`; those columns were already
# removed from `msoa` in the boundary section, so it raised a KeyError and left
# `demand` untouched.
# NOTE(review): the column list is copied from the boundary layer and the
# centroid layer's schema is not visible here, so columns that are absent are
# skipped rather than treated as an error — confirm against the actual file.
drop_column = ['MSOA11NMW','BNG_E','BNG_N','LONG','LAT','Shape__Length','Shape__Area']
demand = demand.drop(columns=drop_column, errors='ignore')

Filter the data to keep only the MSOAs in England.