# Locating and classifying the expanded ocod dataset

This notebook runs through the process of locating properties withing the OA/LSOA system and classifying properties into one of the 5 types and 'unknown'

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
import pandas as pd
import numpy as np
import os
import re
import io
import zipfile
#from helper_functions import *
from locate_and_classify_helper_functions import *


In [2]:
print("load ONSPD")
postcode_district_lookup = load_postocde_district_lookup("./empty_homes_data/" + "ONSPD_NOV_2021_UK.zip", "Data/ONSPD_NOV_2021_UK.csv")
print("load expanded ocod")
ocod_data =  pd.read_csv("./empty_homes_data/OCOD_cleaned_expanded3.csv")
print("pre-process expanded ocod data")
ocod_data = preprocess_expandaded_ocod_data(ocod_data, postcode_district_lookup)
print("load and pre-process the Land Registry price paid dataset")
price_paid_df = load_and_process_pricepaid_data("./empty_homes_data/price_paid_files/", postcode_district_lookup)
print("add in missing Local authority codes to the ocoda dataset")
ocod_data = add_missing_lads_ocod(ocod_data, price_paid_df)
print("load and pre-process the voa business ratings list dataset")
voa_businesses = load_voa_ratinglist('./empty_homes_data/' +'uk-englandwales-ndr-2017-listentries-compiled-epoch-0029-baseline-csv.csv', postcode_district_lookup)
del postcode_district_lookup

load ONSPD


  postcode_district_lookup = pd.read_csv(f)[['pcds','oslaua','oa11','lsoa11', 'msoa11', 'ctry']]


load expanded ocod
pre-process expanded ocod data
load and pre-process the Land Registry price paid dataset
add in missing Local authority codes to the ocoda dataset
load and pre-process the voa business ratings list dataset


  voa_businesses =  pd.read_csv(file_path,


## Using price paid data to match names

The land registry does not use standardised LAD codes or names and 
the LAD names it uses appear to be wrong sometimes. I need to know the LADs so that I only try road matching within local authorities to minimise the chance of having the same road twice. To get around this I will use the substantially larger database of the price paid data to get all the land registry district names and match them to the onsp using the postcodes. This works as there are a large number of sales in each district most of them will have a postcode. There are cases where the wrong district or postcode is applied meaning a single district name can have two or more lad11cd's, to solve this I simply take the lad11cd with the largest number of counts.

The resulting OCOD data frame has a LAD11CD for each entry, and thus allows the road matching to work effectively

# Street and buildings to match lsoa

This section fills in missing lsoa11cd using knowledge of the LAD11cd and the streets within it. This takes data from price paid and voa

In [3]:
##
##This process is quite convoluted and there is certainly a more efficient and pythonic way
## however the order within each filling method is important to ensure that there are no duplicates
## as this causes the OCOD dataset to grow with duplicates
##
ocod_data = street_and_building_matching(ocod_data, price_paid_df, voa_businesses)

replace the missing lsoa using street matching
replace the missing lsoa using building matching
insert newly ID'd LSOA and OA
update missing LSOA and OA for nested properties where at least one nested property has an OA or LSOA


## Matching at sub street level

Some streets are on the boundary of LSOA this section uses the street number to match to the nearest lsoa.

In [4]:
#This takes some time
ocod_data3 = substreet_matching(ocod_data, price_paid_df, voa_businesses)
#percent of dataset without lsoa
ocod_data['lsoa11cd'].isnull().sum()/ocod_data.shape[0]

lad  100  of 244
lad  200  of 244


0.13455019044751895

## Add in counts of businesses per oa and LSOA

In [5]:
#This function allows areas with no  businesses to automatically exclude business from the classification
ocod_data = counts_of_businesses_per_oa_lsoa(ocod_data, voa_businesses)


## What still doesn't have an LSOA?
what still doesn't have lsoa and what properties do they have?

In [19]:
pd.crosstab(ocod_data['postcode'].notnull(), ocod_data['lsoa_building'].notnull())

lsoa_building,False,True
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
False,49390,7231
True,81337,21754


In [None]:
test = ocod_data

In [None]:
#observations localised with lsoa and/or oa
pd.crosstab(test['lsoa11cd'].notnull(),  test['oa11cd'].notnull())/ocod_data.shape[0]

In [None]:
#this is definately the problem then
pd.crosstab(test['lsoa_street'].notnull(),  test['lsoa_building'].notnull())

In [None]:
test2 = test[test['lsoa11cd'].isnull()]
pd.crosstab(test2['property_address'].str.startswith('land') , test2['lsoa_street'].notnull())

In [None]:
#this is definately the problem then
pd.crosstab(test['street_name'].notnull(),  test['lsoa11cd'].notnull())

In [None]:
test[test['lsoa11cd'].isnull() & test['street_name'].notnull()].to_csv('/tf/empty_homes_data/delete_me.csv')

In [21]:
#95.5% of sets have only a single lsoa, when grouped by street, town, district and locality
#when grouped by only street and district, this number is still 90%
#excluding town the number is still 0.95% but dropping locality gives a match on 91%, therefore using locality is the key
temp = price_paid_df.groupby(['street', 'district', 'lsoa11cd']).size().reset_index().groupby(['street', 'district']).size()\
.reset_index().rename(columns = {0:'counts'})

#temp.groupby('counts').size()/temp.shape[0]


# VOA matching businesses

The below chunk matches addresses to known businesses

In [7]:
ocod_data = voa_address_match_all_data(ocod_data, voa_businesses)

address matched  0 lads of 331
address matched  50 lads of 331
address matched  100 lads of 331
address matched  150 lads of 331
address matched  200 lads of 331
address matched  250 lads of 331
address matched  300 lads of 331


In [None]:
pd.crosstab(ocod_data['oa_busi_building'].notnull(), ocod_data['business_address'].notnull())

# Classify property type

This section classifies the the data into different property types. 

# Classification type 1

The land is classified by the rules below which search the address string or meta data using regex.
The classification is hierarchical with the first match being the classification type.
Therefore if a property is classified by rule three and rule 6, rule three will take precedent and the property would be classed as airspace

- Starts with land/plot (land)
- Parking spaces (carpark)
- Air space (airspace)
- Flats, penthouses. apartments (domestic)
- Address matched businesses (business)
- Keyword relating to business (business)
- Land with other words before it (land)
- Pubs (business)
- A business was match in the same building (business)
- Is in the same address as a building (business)
- No business in the OA (domestic)
- No business in the LSOA (domestic)

After classifying the properties, classes left unknown are completed using the properties that are classed from the same title number
This is possible as there are no conflicting property classes within a given title number. This shows the quality of the method

In [8]:
ocod_data = classification_type1(ocod_data)

In [8]:
multi_class_titles = ocod_data[~ocod_data['class'].isin(['unknown', 'airspace', 'carpark']) & (ocod_data['within_larger_title']==True)].groupby(['title_number', 'class']).\
size().reset_index().groupby('title_number').size().reset_index().rename(columns={0:'counts'})

#there are no within title-ids that have more than one class. This shows that this is a very accurate way of filling in missing class data
print(multi_class_titles[multi_class_titles['counts']>1])

multi_class_titles = multi_class_titles[multi_class_titles['counts']==1]
#multi_class_titles.groupby('counts').size()

ocod_data[ocod_data['title_number'].isin(multi_class_titles['title_number'])].groupby('class').size()
#[['street_number', 'street_name','property_address', "business_address"]]

     title_number  counts
10         126312       2
16         142155       2
17         146577       2
19         147442       2
20         148312       2
...           ...     ...
4287    WYK737596       2
4294    WYK792514       2
4299    WYK856042       2
4304     YEA16295       2
4320      YY38811       2

[669 rows x 2 columns]


class
business     8046
domestic    40157
land            9
dtype: int64

## Classification type 2

Classification type 2 only affects the properties of class 'unknown' in classification type 1.

These properties are assumed to beeither domestic or business.
They are heierarchically classified into domestic or 'unknown' using the following rules

- Street match == TRUE, Street name is known AND street number is known (domestic)
- Street match is FALSE AND street name is known (domestic)
- Building name is known (domestic)

All remaining addresses do not contain enough information to be classified and are classed as unknown

In [9]:
ocod_data = classification_type2(ocod_data)

In [None]:
#If there is a street match, and the property has a street and a street number OR a building name
#Then is is a domestic property

test = ocod_data[ocod_data['class2']=='unknown']
print(pd.crosstab((test['street_match']==True), (test['street_name'].notnull()==True) ))

ocod_data[(ocod_data['street_name'].isnull()==True) & (ocod_data['class2']=='unknown')].to_csv('/tf/empty_homes_data/delete_me.csv')

In [None]:
ocod_data.groupby('class').size()

In [None]:
pd.crosstab(ocod_data['unit_type'],(ocod_data['class2']=="domestic"))

## Contracting the dataset
Businesses, carparks and airpsace etc are classed as a single address independent of how many components they a made of.
This chunk strips down businesses that have been expanded back to a single address


In [10]:
ocod_data = contract_ocod_after_classification(ocod_data, class_type = 'class2', classes = ['domestic'] )


In [None]:
ocod_data.groupby('class2').size()

In [None]:
ocod_data.groupby('class2').size()/ocod_data.shape[0]

In [45]:
#non of the unknowns have a postcode. I guess this is obvious as if there is no matching VOA postcode you are classed as domestic
#pd.crosstab(ocod_data[ocod_data['class']=="unknown"].postcode.notnull(), ocod_data[ocod_data['class']=="unknown"].street_name.notnull())

In [46]:
pd.crosstab(ocod_data['tenure'], ocod_data['region'].str.lower())#.to_latex() #convert to copyable latex table

region,east anglia,east midlands,greater london,north,north west,south east,south west,wales,west midlands,yorks and humber
tenure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Freehold,3418,4680,21783,3304,16221,17722,5754,3395,5050,8460
Leasehold,434,1209,40133,882,5741,5496,1702,693,1858,2793


# Saving the enhanced expanded dataset

In [48]:
ocod_data.to_csv("/tf/empty_homes_data/enhanced_ocod_dataset.csv")

#Save the test set indices to create the ground truth
#this is commented out to avoid overwriting

#ocod_data.loc[ocod_data.title_number.isin(pd.read_csv("/tf/empty_homes_data/test_set_indices.csv")['title_number']) ,  
#              ['title_number','within_title_id','unit_type' ,'building_name', 'street_number', 'street_name','postcode' ,'property_address',  'lsoa11cd', 'class2']].to_csv('/tf/empty_homes_data/parsed_ground_truth_raw.csv')

93707

## creating the aggregated ocod dataset for sampling

This chunk outputs the data ready for analysis in R. The two files cover the base case and the case where nested and single properties are separated

In [55]:
ocod_data_lsoa = ocod_data
#ocod_data_lsoa['postcode2'] = ocod_data['postcode'].str.lower().str.replace("\s", "")

#ocod_data_lsoa = ocod_data.merge(postcode_district_lookup, 'left', left_on = "postcode2", right_on = "postcode2")

ocod_data_lsoa.groupby(['lad11cd', 'lsoa11cd', 'msoa11cd', 'class2']).size().reset_index().to_csv("/tf/empty_homes_data/ocod_lsoa.csv")
ocod_data_lsoa.groupby(['lad11cd', 'lsoa11cd', 'msoa11cd', 'class2', 'within_larger_title']).size().reset_index().to_csv("/tf/empty_homes_data/ocod_lsoa_by_nested_type.csv")



# Post creation analysis

In [18]:
pd.crosstab(ocod_data['class2'], ocod_data['region'].str.lower())#.to_latex() #convert to copyable latex table

region,east anglia,east midlands,greater london,north,north west,south east,south west,wales,west midlands,yorks and humber
class2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
airspace,50,162,94,15,215,185,266,290,94,303
business,550,1437,15806,799,2523,4453,1424,670,1423,1337
carpark,27,36,1597,24,518,194,89,9,50,57
domestic,2510,3791,48017,2742,16976,14306,4829,2607,4112,8439
land,639,919,2639,733,2215,4808,1270,629,1316,1314
unknown,64,115,734,157,319,453,207,89,150,133


In [19]:
pd.crosstab(ocod_data['class2'], ocod_data['tenure'])#.to_latex() #convert to copyable latex table

tenure,Freehold,Leasehold
class2,Unnamed: 1_level_1,Unnamed: 2_level_1
airspace,7,1667
business,20948,9474
carpark,252,2349
domestic,60744,47585
land,14619,1863
unknown,1534,887


In [20]:
temp_df = ocod_data[['title_number', 'tenure', 'within_larger_title']].drop_duplicates()

#most of titles containing nested addresses are free hold by about 3/2
pd.crosstab(temp_df['tenure'], temp_df['within_larger_title'])


within_larger_title,False,True
tenure,Unnamed: 1_level_1,Unnamed: 2_level_1
Freehold,46557,4208
Leasehold,41562,1381


In [21]:
#The analysis is based on nested addresses being domestic
temp_df = ocod_data[['title_number', 'tenure', 'property_address']][ocod_data['within_larger_title']==True]
temp_df['is_flat'] = temp_df['property_address'].str.contains(r"(flat|apartment|penthouse|unit)", case = False)

#pd.crosstab(temp_df['tenure'], temp_df['within_larger_title'])

temp_df.groupby('tenure').size()

#Of nested addresses freehold is more common by 3/2 50k to 24k
#most of theproperties are not flats however flats dominate the leasehold section
#flats are 1/3 of nested addresses but make up almost 3/4 of the leashold nested addresses
#note this does not include items marked as units
pd.crosstab(temp_df.tenure, temp_df.is_flat)

  temp_df['is_flat'] = temp_df['property_address'].str.contains(r"(flat|apartment|penthouse|unit)", case = False)


is_flat,False,True
tenure,Unnamed: 1_level_1,Unnamed: 2_level_1
Freehold,43388,7113
Leasehold,5186,16455


## Largest nested addresses

In [5]:
#The largest nested address
ocod_data.within_title_id.max()
ocod_data[ocod_data.within_title_id==ocod_data.within_title_id.max()].reset_index()['property_address'][0]



'Ground to ninth Floor Flats being 101-114, 201-214, 301-314, 401-414, 501-514, 601-613 and 701-704 Alaska Building, 101-114, 201-214,301-314, 401-412, 501-506 and 601-605 Arizona Building, 101-114, 201-214, 301-314, 401-414, 501-514, 601-614, 701-708, 801-804, 901-903 California Building, 101-108,     201-208, 301-307, 401-408, 501-508, 601-608, 701-708, 801-808 and 901-903 Colorado Building, 1-4, 101-109, 201-210, 301-310, 401-410, 501-510 and 601-605 Dakota Building, 1-7, 101-108, 201-208, 301-308, 401-408, 501-506 and 601-604 Idaho Building, 102-112, 201-212, 301-312, 401-412, 501-508 and 601-604 Indiana Building, 1-15, 101-116, 201-216, 301-315, 401-416, 501-510 Montana Building, 101-108, 201-208, 301-308, 401-408, 501-506 and 601-604 Nebraska Building, 1-10, 101-110, 201-210, 301-310 and 402-403 Utah Building, 1-10 and 101-110 Boston Building, 1-6, 101-106, 201-206, 301-306, 401-408 and 501-507 Madison Building, Deals Gateway, London'

# Whole location and classification pipeline

This provides a quick method to ouput the enhanced and expanded ocod dataset

In [1]:
from address_parsing_helper_functions import *
from locate_and_classify_helper_functions import *

root_path = "/home/jonno/empty_homes_data/"#str(args[1])


ocod_data = load_and_prep_OCOD_data(root_path + 'OCOD_FULL_2022_02.csv')

all_entities = spacy_pred_fn(spacy_model_path = root_path+'spacy_data/cpu/model-best', ocod_data = ocod_data.loc[0:1000,:])
full_expanded_data = parsing_and_expansion_process(all_entities, expand_addresses = True)


  ocod_data =  pd.read_csv(file_path,


Loading the spaCy model




Adding the datapoint id and title number meta data to the property address
predicting over the OCOD dataset using the pre-trained spaCy model
i= 0  expand time,0.0 filter time0.001 make_dataframe_time 0.002
i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


  uniques = Index(uniques)


In [12]:
all_entities#[all_entities.text[x][all_entities.start[x]:all_entities.end[x]] for x in range(0,all_entities.shape[0])]

Unnamed: 0,start,end,label,text,datapoint_id,label_text
0,0,25,building_name,"westleigh lodge care home, nel pan lane, leigh...",0,westleigh lodge care home
1,27,39,street_name,"westleigh lodge care home, nel pan lane, leigh...",0,nel pan lane
2,41,46,city,"westleigh lodge care home, nel pan lane, leigh...",0,leigh
3,48,55,postcode,"westleigh lodge care home, nel pan lane, leigh...",0,wn7 5jt
4,0,4,unit_type,"flat 1, 1a canal street, manchester (m1 3he)",1,flat
...,...,...,...,...,...,...
4344,47,54,city,"parking space 409, spectrum, blackfriars road,...",999,salford
4345,0,10,building_name,"acre house, town square, sale (m33 7wz)",1000,acre house
4346,12,23,street_name,"acre house, town square, sale (m33 7wz)",1000,town square
4347,25,29,city,"acre house, town square, sale (m33 7wz)",1000,sale


In [8]:
all_entities.text[0][all_entities.start[0]:all_entities.end[0]]

'westleigh lodge care home'

In [7]:
test = load_cleaned_labels('/home/jonno/empty_homes_data//full_dataset_no_overlaps.json')
test

Unnamed: 0,start,end,label_text,labelId,label,labellingFunctionId,groundTruthId,datapoint_id,text,label_id_count
0,0,25,westleigh lodge care home,10,building_name,73,,0,"westleigh lodge care home, nel pan lane, leigh...",0
1,27,39,nel pan lane,12,street_name,28,,0,"westleigh lodge care home, nel pan lane, leigh...",0
2,41,46,leigh,5,city,10,,0,"westleigh lodge care home, nel pan lane, leigh...",0
3,48,55,wn7 5jt,11,postcode,50,,0,"westleigh lodge care home, nel pan lane, leigh...",0
4,0,4,flat,15,unit_type,15,,1,"flat 1, 1a canal street, manchester (m1 3he)",0
...,...,...,...,...,...,...,...,...,...,...
436660,12,42,discovery dock apartments east,10,building_name,39,,94087,"storage 17, discovery dock apartments east, 3 ...",0
436661,44,45,3,13,street_number,7,,94087,"storage 17, discovery dock apartments east, 3 ...",0
436662,46,63,south quay square,12,street_name,28,,94087,"storage 17, discovery dock apartments east, 3 ...",0
436663,65,71,london,5,city,10,,94087,"storage 17, discovery dock apartments east, 3 ...",0


In [8]:
    print('Loading the spaCy model')
    nlp1 = spacy.load(root_path+'spacy_data/cpu/model-best') 

    print('Adding the datapoint id and title number meta data to the property address')
    ocod_context = [(ocod_data.loc[x,'property_address'], {'datapoint_id':x, 'title_number':str(ocod_data.title_number[x])}) for x in range(0,ocod_data.shape[0])]
    

Loading the spaCy model




Adding the datapoint id and title number meta data to the property address


In [6]:
all_entities
#full_expanded_data = parsing_and_expansion_process(all_entities, expand_addresses = True)


Unnamed: 0,start,end,label_text,text,datapoint_id
0,0,25,building_name,"westleigh lodge care home, nel pan lane, leigh...",0
1,27,39,street_name,"westleigh lodge care home, nel pan lane, leigh...",0
2,41,46,city,"westleigh lodge care home, nel pan lane, leigh...",0
3,48,55,postcode,"westleigh lodge care home, nel pan lane, leigh...",0
4,0,4,unit_type,"flat 1, 1a canal street, manchester (m1 3he)",1
...,...,...,...,...,...
434517,12,42,building_name,"storage 17, discovery dock apartments east, 3 ...",94087
434518,44,45,street_number,"storage 17, discovery dock apartments east, 3 ...",94087
434519,46,63,street_name,"storage 17, discovery dock apartments east, 3 ...",94087
434520,65,71,city,"storage 17, discovery dock apartments east, 3 ...",94087


In [2]:
from address_parsing_helper_functions import *
from locate_and_classify_helper_functions import *

root_path = "/home/jonno/empty_homes_data/"#str(args[1])


ocod_data = load_and_prep_OCOD_data(root_path + 'OCOD_FULL_2022_02.csv')

all_entities = spacy_pred_fn(spacy_model_path = root_path+'spacy_data/cpu/model-best', ocod_data = ocod_data)

full_expanded_data = parsing_and_expansion_process(all_entities, expand_addresses = True)

del all_entities #memory management

ocod_data = post_process_expanded_data(full_expanded_data, ocod_data)

del full_expanded_data #memory management

print("Load ONSPD")
postcode_district_lookup = load_postocde_district_lookup(root_path + "ONSPD_NOV_2021_UK.zip", "Data/ONSPD_NOV_2021_UK.csv")
print("Load expanded ocod")
ocod_data =  pd.read_csv(root_path+'OCOD_cleaned_expanded2.csv')
print("Pre-process expanded ocod data")
ocod_data = preprocess_expandaded_ocod_data(ocod_data, postcode_district_lookup)
print("Load and pre-process the Land Registry price paid dataset")
price_paid_df = load_and_process_pricepaid_data(root_path+'price_paid_files/', postcode_district_lookup)
print("Add in missing Local authority codes to the ocoda dataset")
ocod_data = add_missing_lads_ocod(ocod_data, price_paid_df)
print("Load and pre-process the voa business ratings list dataset")
voa_businesses = load_voa_ratinglist(root_path +'uk-englandwales-ndr-2017-listentries-compiled-epoch-0029-baseline-csv.csv', postcode_district_lookup)

del postcode_district_lookup #for memory purposes

print("Match street addresses and buildings")
ocod_data = street_and_building_matching(ocod_data, price_paid_df, voa_businesses)

#This takes some time
print('Sub-street matching, this takes some time')
ocod_data = substreet_matching(ocod_data, price_paid_df, voa_businesses)

del price_paid_df #for memory purposes
print('Add in businesses per oa and lsoa')
ocod_data = counts_of_businesses_per_oa_lsoa(ocod_data, voa_businesses)

print('Identify businesses using address matching')
ocod_data = voa_address_match_all_data(ocod_data, voa_businesses)

del voa_businesses #probably not necessary but still delete to save memory

print('Classification type 1')
ocod_data = classification_type1(ocod_data)
print('Classification type 2')
ocod_data = classification_type2(ocod_data)

print('Contract ocod dataset')
ocod_data = contract_ocod_after_classification(ocod_data, class_type = 'class2', classes = ['domestic'] )

ocod_data.to_csv(root_path+'OCOD_classes.csv')

  ocod_data =  pd.read_csv(file_path,


Loading the spaCy model


OSError: [E053] Could not read meta.json from /home/jonno/empty_homes_data/full_dataset_no_overlaps.json

In [3]:
ground_truth_df = pd.read_csv('/tf/empty_homes_data/Ground truth for offshore empties V2 includes comma space_labels.csv')


#I only need a small number of the columns to be able to calculate the F1 score
#Everything else just makes it confusing. 
#renaming is for consistancy
ground_truth_df = ground_truth_df.loc[ground_truth_df.loc[:,'result_type']=="span",[ 'result_type', 'label',
       'start', 'end', 'text', 'input:text', 'input:datapoint_id']].rename(
    columns = {'input:text':'property_address',
              'input:datapoint_id':'datapoint_id',
              'text':'label_text'})

In [16]:
unit_park = (ocod_data.property_address.str.contains('unit') & ocod_data.property_address.str.contains('park'))

ocod_data.loc[(unit_park==True)  & (ocod_data.class2.isin(['unknown', 'domestic'])), ['property_address', 'class', 'class2']]

Unnamed: 0,property_address,class,class2
1502,"unit 2, sovereign park, coronation road, londo...",unknown,domestic
1204,"unit 12, chiswick park, 566 chiswick high road...",unknown,domestic
2093,"unit 5, acton park estate, the vale, acton (w3...",unknown,unknown
221,"unit 3, polar park, bath road, harmondsworth, ...",domestic,domestic
618,"unit 3, dawley park, kestrel way, hayes (ub3 1hp)",unknown,domestic
...,...,...,...
405,"unit 401, bretton park way, dewsbury and parki...",unknown,unknown
94,"unit 2, brickyard lane, melton park, north fer...",unknown,unknown
576,"unit 4, park 32, park road, pontefract (wf8 4pr)",unknown,unknown
661,"unit 1, park 32, park road, pontefract (wf8 4ps)",unknown,unknown


In [5]:
from sklearn import metrics

In [6]:
gt_class = pd.read_csv('/tf/empty_homes_data/parsed_ground_truth_complete.csv').loc[:, ['title_number', 'truth']].drop_duplicates().\
merge(ocod_data.loc[:, ['title_number', 'class2']].drop_duplicates(), how = 'left')
label_names = list(np.unique(gt_class.truth.to_list()))

performance_df = metrics.precision_recall_fscore_support(gt_class.truth.to_list(),
                                        gt_class['class2'].to_list(), 
                                        labels = label_names)

performance_df = pd.DataFrame(np.round_(np.transpose(performance_df),2), columns = ["precision", "recall", "fscore", "support"])
performance_df['class'] = list(np.unique(gt_class.truth.to_list()))
performance_df[['class',"precision", "recall", "fscore", "support"]]

Unnamed: 0,class,precision,recall,fscore,support
0,airspace,1.0,0.93,0.96,14.0
1,business,0.98,0.8,0.89,287.0
2,carpark,1.0,0.96,0.98,26.0
3,domestic,0.89,0.97,0.93,483.0
4,land,1.0,0.99,1.0,179.0
5,unknown,0.0,0.0,0.0,9.0


In [11]:
gt_class = pd.read_csv('/tf/empty_homes_data/parsed_ground_truth_complete.csv').loc[:, ['title_number', 'truth']].drop_duplicates().\
merge(ocod_data.loc[:, ['title_number', 'class2']].drop_duplicates(), how = 'left')
label_names = list(np.unique(gt_class.truth.to_list()))

performance_df = metrics.precision_recall_fscore_support(gt_class.truth.to_list(),
                                        gt_class['class2'].to_list(), 
                                        labels = label_names)

performance_df = pd.DataFrame(np.round_(np.transpose(performance_df),2), columns = ["precision", "recall", "fscore", "support"])
performance_df['class'] = list(np.unique(gt_class.truth.to_list()))
performance_df[['class',"precision", "recall", "fscore", "support"]]#.to_latex()

Unnamed: 0,class,precision,recall,fscore,support
0,airspace,1.0,0.93,0.96,14.0
1,business,0.98,0.8,0.89,287.0
2,carpark,1.0,0.96,0.98,26.0
3,domestic,0.89,0.97,0.93,483.0
4,land,1.0,0.99,1.0,179.0
5,unknown,0.0,0.0,0.0,9.0


In [7]:
gt_class = pd.read_csv('/tf/empty_homes_data/parsed_ground_truth_complete.csv').loc[:, ['title_number', 'truth']].\
merge(ocod_data.loc[:, ['title_number', 'class2']].drop_duplicates(), how = 'left')
label_names = list(np.unique(gt_class.truth.to_list()))

performance_df = metrics.precision_recall_fscore_support(gt_class.truth.to_list(),
                                        gt_class['class2'].to_list(), 
                                        labels = label_names)

performance_df = pd.DataFrame(np.round_(np.transpose(performance_df),2), columns = ["precision", "recall", "fscore", "support"])
performance_df['class'] = list(np.unique(gt_class.truth.to_list()))
performance_df[['class',"precision", "recall", "fscore", "support"]]

Unnamed: 0,class,precision,recall,fscore,support
0,airspace,1.0,0.93,0.96,14.0
1,business,0.98,0.79,0.88,311.0
2,carpark,1.0,0.96,0.98,26.0
3,domestic,0.93,0.98,0.96,918.0
4,land,1.0,0.99,1.0,179.0
5,unknown,0.0,0.0,0.0,9.0


# Future work


The below are primarly nice to have things and would not change the output or results in any significant way

- I could re-insert the original street number in to the address when contracting this would be better for addresses that had been expanded but shouldn't have been. But defintaley isn't very important

- I could clean up the functions to remove the 'setting on copy' warning
- Create a verbose flag such that the messages and print outs of the functions are suppressed

In [1]:
!python ./full_ocod_parse_process.py ./empty_homes_data/

  ocod_data =  pd.read_csv(file_path,
i= 1000  expand time,0.176 filter time0.119 make_dataframe_time 0.888
i= 2000  expand time,0.35 filter time0.237 make_dataframe_time 1.882
i= 1000  expand time,0.191 filter time0.126 make_dataframe_time 1.028
  uniques = Index(uniques)
Load ONSPD
  postcode_district_lookup = pd.read_csv(f)[['pcds','oslaua','oa11','lsoa11', 'msoa11', 'ctry']]
Pre-process expanded ocod data
Load and pre-process the Land Registry price paid dataset
Add in missing Local authority codes to the ocoda dataset
Load and pre-process the voa business ratings list dataset
  voa_businesses =  pd.read_csv(file_path,
Match street addresses and buildings
replace the missing lsoa using street matching
replace the missing lsoa using building matching
insert newly ID'd LSOA and OA
update missing LSOA and OA for nested properties where at least one nested property has an OA or LSOA
Sub-street matching, this takes some time
lad  100  of 253
lad  200  of 253
Add in businesses per oa and

In [2]:
from address_parsing_helper_functions import *
from locate_and_classify_helper_functions import *

ocod_data = load_and_prep_OCOD_data('./empty_homes_data/' + 'OCOD_FULL_2022_02.csv')

#all_entities = spacy_pred_fn(spacy_model_path = "./empty_homes_data/spacy_data/cpu/model-best", ocod_data = ocod_data)
all_entities = load_cleaned_labels('./empty_homes_data/full_dataset_no_overlaps.json')

full_expanded_data = parsing_and_expansion_process(all_entities, expand_addresses = True)

ocod_data = post_process_expanded_data(full_expanded_data, ocod_data)

print("load ONSPD")
postcode_district_lookup = load_postocde_district_lookup("./empty_homes_data/" + "ONSPD_NOV_2021_UK.zip", "Data/ONSPD_NOV_2021_UK.csv")
print("pre-process expanded ocod data")
ocod_data = preprocess_expandaded_ocod_data(ocod_data, postcode_district_lookup)
print("load and pre-process the Land Registry price paid dataset")
price_paid_df = load_and_process_pricepaid_data("./empty_homes_data/price_paid_files/", postcode_district_lookup)
print("add in missing Local authority codes to the ocoda dataset")
ocod_data = add_missing_lads_ocod(ocod_data, price_paid_df)
print("load and pre-process the voa business ratings list dataset")
voa_businesses = load_voa_ratinglist('./empty_homes_data/' +'uk-englandwales-ndr-2017-listentries-compiled-epoch-0029-baseline-csv.csv', postcode_district_lookup)
del postcode_district_lookup

ocod_data = street_and_building_matching(ocod_data, price_paid_df, voa_businesses)



  ocod_data =  pd.read_csv(file_path,


i= 0  expand time,0.001 filter time0.0 make_dataframe_time 0.002
i= 1000  expand time,0.246 filter time0.156 make_dataframe_time 1.167
i= 2000  expand time,0.41 filter time0.259 make_dataframe_time 1.98
i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
i= 1000  expand time,0.215 filter time0.133 make_dataframe_time 1.132


  uniques = Index(uniques)


load ONSPD


  postcode_district_lookup = pd.read_csv(f)[['pcds','oslaua','oa11','lsoa11', 'msoa11', 'ctry']]


pre-process expanded ocod data
load and pre-process the Land Registry price paid dataset
add in missing Local authority codes to the ocoda dataset
load and pre-process the voa business ratings list dataset


  voa_businesses =  pd.read_csv(file_path,


replace the missing lsoa using street matching
replace the missing lsoa using building matching
insert newly ID'd LSOA and OA
update missing LSOA and OA for nested properties where at least one nested property has an OA or LSOA


In [30]:
def substreet_matching(ocod_data, price_paid_df, voa_businesses, print_lads = False, print_every = 100):
    """"
    Some streets are on the boundary of LSOA this section uses the street number to match to the nearest lsoa.
    """
    filled_lsoa_list = []
    i = 1
    unique_lad_codes = ocod_data[ocod_data['street_name'].notnull() & ocod_data['street_number'].notnull() & ocod_data['lsoa11cd'].isnull()]['lad11cd'].unique()

    for target_lad in unique_lad_codes:
        if print_lads: print(target_lad)
            
        if i%print_every==0: print("lad ", i, " of "+ str(round(len(unique_lad_codes), 3)))
        i = i+1
        
        #subset to the relevat rows within a single lad
        missing_lsoa_df = ocod_data[ocod_data['street_name'].notnull() & ocod_data['street_number'].notnull() & ocod_data['lsoa11cd'].isnull() & (ocod_data['lad11cd']==target_lad)].copy()
        missing_lsoa_df.loc[:,'street_number2'] = missing_lsoa_df.loc[:,'street_number'].str.replace(r"^.*(?=\b[0-9]+$)", "", regex = True).str.replace(r"[^\d]", "", regex = True)

        target_street_names = missing_lsoa_df['street_name2'].unique()

        temp_lsoa = pd.concat([
            #the price paid data with names changed
            price_paid_df[price_paid_df['street_name2'].isin(target_street_names )  & 
                                        (price_paid_df['lad11cd']==target_lad) ], 
        #voa data added in                
                        voa_businesses[(voa_businesses['lad11cd']==target_lad)]]
                                                        )[['street_name2', 'street_number', 'lsoa11cd', 'lad11cd']].dropna(axis = 0, how = 'any', inplace = False)

        temp_lsoa.loc[:,'street_number2'] = temp_lsoa.loc[:,'street_number'].str.replace(r"^.*(?=\b[0-9]+$)", "", regex = True).str.replace(r"[^\d]", "", regex = True)
        
        temp_lsoa  = create_all_street_addresses(temp_lsoa[temp_lsoa['street_name2'].isin(target_street_names ) & 
                                                temp_lsoa['street_number2'].notnull()], 
                                        target_lad, 
                                        ['street_name2', 'street_number2', 'lsoa11cd'])
        
        #this sextion of code maps LSOA codes to street addresses, It does this by matching the nearest known LSOA on that street
        #It assumes that the database of known lsoa in the price paid dataset covers all reasonable positions in the road
        #i.e there isn't a road with a street number 10000 and the nearest known number is 12. Putting in a cutoff is beyond the scope of this paper and 
        #So I am just going to ignore it.
        for target_road in target_street_names:
            #print(target_road)
            missing_lsoa_road = missing_lsoa_df[missing_lsoa_df['street_name2']== target_road ].copy()
            temp_road = temp_lsoa[temp_lsoa['street_name2'] ==target_road ]
            print(missing_lsoa_road)
            if len(temp_road)>0:
                print(missing_lsoa_road['street_number2'])
                missing_lsoa_road['lsoa11cd'] = [street_number_to_lsoa(temp_road, int(missing_lsoa_road.iloc[missing_lsoa_row]['street_number2'])) 
                                                for missing_lsoa_row 
                                                in range(0, len(missing_lsoa_road))]
                filled_lsoa_list = filled_lsoa_list + [missing_lsoa_road]
    

    #join the list back together
    temp_lsoa = pd.concat(filled_lsoa_list)

    #join the ocod dataset backtogether
    ocod_data = pd.concat([ocod_data[~ocod_data['unique_id'].isin(temp_lsoa['unique_id'])], temp_lsoa ] )

    ##
    ##Fill in the missing data in the nested addresses again, where at least one address in the nested group has an lsoa/os
    ##
    #Doing this grouped nested business a second time pushes lsoa ID over 90% which seems good enough for me

    #after all other lsoa adding methods are completed
    #all nested properties with missing lsoa have the lsoa of the other properties within their group added

    temp = ocod_data.loc[(ocod_data['lsoa11cd'].notnull()) & (ocod_data['within_larger_title']==True) ,['lsoa11cd', 'title_number']].\
    groupby(['lsoa11cd', 'title_number']).size().reset_index()
    temp = temp[['lsoa11cd', 'title_number']].rename(columns = {'lsoa11cd':'lsoa_nested2'})

    #there are a small number of nested addresses where there are multiple lsoa this prevents increasing the number of observations with these duplicates
    #I don't think it matters if a ver observations are in neighbouring lsoa, the general spatial coherence is maintained
    temp = temp.groupby('title_number')['lsoa_nested2'].first().reset_index()


    ocod_data = ocod_data.merge(temp[['title_number', 'lsoa_nested2']], 
                        how = "left",
                            on = "title_number")

    ocod_data.loc[ocod_data['lsoa11cd'].isnull(), 'lsoa11cd'] = ocod_data['lsoa_nested2'][ocod_data['lsoa11cd'].isnull()] 

    return ocod_data

In [29]:
int(NaN)

NameError: name 'NaN' is not defined

In [31]:
ocod_data2 = substreet_matching(ocod_data, price_paid_df, voa_businesses)

#This function allows areas with no  businesses to automatically exclude business from the classification
ocod_data = counts_of_businesses_per_oa_lsoa(ocod_data2, voa_businesses)

   title_number  within_title_id   unique_id  within_larger_title    tenure  \
87     CB335886                1  CB335886-1                False  Freehold   

   unit_id unit_type building_name street_number       street_name  ...  \
87     NaN      land           NaN             7  great north road  ...   

   street_number2    street_name2    lad11cd lsoa_street lsoa_building  \
87              7  greatnorthroad  E06000031         NaN           NaN   

   oa_building  oa_busi_building lsoa_busi_building lsoa_nested oa_nested  
87         NaN               NaN                NaN         NaN       NaN  

[1 rows x 32 columns]
    title_number  within_title_id   unique_id  within_larger_title     tenure  \
118     CB244681                1  CB244681-1                False  Leasehold   

    unit_id unit_type building_name street_number street_name  ...  \
118     NaN       NaN           NaN           1-3    broadway  ...   

    street_number2 street_name2    lad11cd lsoa_street lsoa_bu

     title_number  within_title_id    unique_id  within_larger_title  \
220     NGL824316                1  NGL824316-1                 True   
221     NGL824316                2  NGL824316-2                 True   
222     NGL824316                3  NGL824316-3                 True   
223     NGL824316                4  NGL824316-4                 True   
224     NGL824316                5  NGL824316-5                 True   
225     NGL824316                6  NGL824316-6                 True   
675     NGL824316                7  NGL824316-7                 True   
676     NGL824316                8  NGL824316-8                 True   
827     NGL821493                1  NGL821493-1                 True   
828     NGL821493                2  NGL821493-2                 True   
1066    AGL332861                1  AGL332861-1                False   
1113    NGL821705                1  NGL821705-1                False   
1135    NGL846681                1  NGL846681-1                F

ValueError: cannot convert float NaN to integer

In [49]:
test = ocod_data.loc[ocod_data.street_name.str.contains("london wall")==True,['property_address', 'street_number', "street_number2"]].astype(str)

#test['street_number'].str.replace(r"^.*(?=\b[0-9]+$)", "", regex = True).str.replace(r"[^\d]", "", regex = True)
test

Unnamed: 0,property_address,street_number,street_number2
220,parts of levels 1 to 6 (inclusive) being resid...,1,
221,parts of levels 1 to 6 (inclusive) being resid...,2,
222,parts of levels 1 to 6 (inclusive) being resid...,3,
223,parts of levels 1 to 6 (inclusive) being resid...,4,
224,parts of levels 1 to 6 (inclusive) being resid...,5,
225,parts of levels 1 to 6 (inclusive) being resid...,6,
437,54 to 63 london wall and 20 to 30 (even) copth...,54,
438,54 to 63 london wall and 20 to 30 (even) copth...,55,
439,54 to 63 london wall and 20 to 30 (even) copth...,56,
440,54 to 63 london wall and 20 to 30 (even) copth...,57,


In [1]:
from locate_and_classify_helper_functions import *

print("load ONSPD")
postcode_district_lookup = load_postocde_district_lookup("./empty_homes_data/" + "ONSPD_NOV_2021_UK.zip", "Data/ONSPD_NOV_2021_UK.csv")

print("load and pre-process the Land Registry price paid dataset")
price_paid_df = load_and_process_pricepaid_data("./empty_homes_data/price_paid_files/", postcode_district_lookup)

print("load and pre-process the voa business ratings list dataset")
voa_businesses = load_voa_ratinglist('./empty_homes_data/' +'uk-englandwales-ndr-2017-listentries-compiled-epoch-0029-baseline-csv.csv', postcode_district_lookup)

load ONSPD


  postcode_district_lookup = pd.read_csv(f)[['pcds','oslaua','oa11','lsoa11', 'msoa11', 'ctry']]


load and pre-process the Land Registry price paid dataset
load and pre-process the voa business ratings list dataset


  voa_businesses =  pd.read_csv(file_path,


In [2]:

print("load expanded ocod")
ocod_spacy =  pd.read_csv("./empty_homes_data/OCOD_cleaned_expanded_spacy.csv")
print("pre-process expanded ocod data")
ocod_spacy = preprocess_expandaded_ocod_data(ocod_spacy, postcode_district_lookup)
print("add in missing Local authority codes to the ocoda dataset")
ocod_spacy = add_missing_lads_ocod(ocod_spacy, price_paid_df)
del postcode_district_lookup

print("Match street addresses and buildings")
ocod_spacy = street_and_building_matching(ocod_spacy, price_paid_df, voa_businesses)



load expanded ocod
pre-process expanded ocod data
add in missing Local authority codes to the ocoda dataset
Match street addresses and buildings
replace the missing lsoa using street matching
replace the missing lsoa using building matching
insert newly ID'd LSOA and OA
update missing LSOA and OA for nested properties where at least one nested property has an OA or LSOA


In [3]:
print('Sub-street matching, this takes some time')
ocod_spacy_sub = substreet_matching(ocod_spacy, price_paid_df, voa_businesses)

Sub-street matching, this takes some time
lad  100  of 244
lad  200  of 244


In [3]:

print("load expanded ocod")

ocod_prog =  pd.read_csv("./empty_homes_data/OCOD_cleaned_expanded_programmatic_only.csv")
print("pre-process expanded ocod data")

ocod_prog  = preprocess_expandaded_ocod_data(ocod_prog, postcode_district_lookup)

print("add in missing Local authority codes to the ocoda dataset")

ocod_prog  = add_missing_lads_ocod(ocod_prog, price_paid_df)

del postcode_district_lookup



load ONSPD


  postcode_district_lookup = pd.read_csv(f)[['pcds','oslaua','oa11','lsoa11', 'msoa11', 'ctry']]


load expanded ocod
pre-process expanded ocod data
load and pre-process the Land Registry price paid dataset
add in missing Local authority codes to the ocoda dataset
load and pre-process the voa business ratings list dataset


  voa_businesses =  pd.read_csv(file_path,


In [4]:
print("Match street addresses and buildings")
ocod_spacy = street_and_building_matching(ocod_spacy, price_paid_df, voa_businesses)
ocod_prog = street_and_building_matching(ocod_prog, price_paid_df, voa_businesses)


Match street addresses and buildings
replace the missing lsoa using street matching
replace the missing lsoa using building matching
insert newly ID'd LSOA and OA
update missing LSOA and OA for nested properties where at least one nested property has an OA or LSOA
replace the missing lsoa using street matching
replace the missing lsoa using building matching
insert newly ID'd LSOA and OA
update missing LSOA and OA for nested properties where at least one nested property has an OA or LSOA


In [7]:

#This takes some time
print('Sub-street matching, this takes some time')
ocod_spacy_sub = substreet_matching(ocod_spacy, price_paid_df, voa_businesses)

Sub-street matching, this takes some time
lad  100  of 244
lad  200  of 244


In [5]:

#This takes some time
print('Sub-street matching, this takes some time')
ocod_prog_sub = substreet_matching(ocod_prog, price_paid_df, voa_businesses)

Sub-street matching, this takes some time
lad  100  of 253
lad  200  of 253


In [8]:
del price_paid_df #for memory purposes
print('Add in businesses per oa and lsoa')
ocod_spacy2 = counts_of_businesses_per_oa_lsoa(ocod_spacy_sub, voa_businesses)

print('Identify businesses using address matching')
ocod_spacy2 = voa_address_match_all_data(ocod_spacy2, voa_businesses)

Add in businesses per oa and lsoa
Identify businesses using address matching
address matched  0 lads of 331
address matched  50 lads of 331
address matched  100 lads of 331
address matched  150 lads of 331
address matched  200 lads of 331
address matched  250 lads of 331
address matched  300 lads of 331


In [2]:
from address_parsing_helper_functions import *
from locate_and_classify_helper_functions import *

print("load ONSPD")
postcode_district_lookup = load_postocde_district_lookup("./empty_homes_data/" + "ONSPD_NOV_2021_UK.zip", "Data/ONSPD_NOV_2021_UK.csv")
print("load expanded ocod")
ocod_spacy =  pd.read_csv("./empty_homes_data/OCOD_cleaned_expanded_spacy.csv")
print("pre-process expanded ocod data")
ocod_spacy = preprocess_expandaded_ocod_data(ocod_spacy, postcode_district_lookup)
print("load and pre-process the Land Registry price paid dataset")
price_paid_df = load_and_process_pricepaid_data("./empty_homes_data/price_paid_files/", postcode_district_lookup)
print("add in missing Local authority codes to the ocoda dataset")
ocod_spacy = add_missing_lads_ocod(ocod_spacy, price_paid_df)
print("load and pre-process the voa business ratings list dataset")
voa_businesses = load_voa_ratinglist('./empty_homes_data/' +'uk-englandwales-ndr-2017-listentries-compiled-epoch-0029-baseline-csv.csv', postcode_district_lookup)
del postcode_district_lookup
print("Match street addresses and buildings")
ocod_spacy = street_and_building_matching(ocod_spacy, price_paid_df, voa_businesses)

print('Sub-street matching, this takes some time')
ocod_spacy_sub = substreet_matching(ocod_spacy, price_paid_df, voa_businesses)

load ONSPD


  postcode_district_lookup = pd.read_csv(f)[['pcds','oslaua','oa11','lsoa11', 'msoa11', 'ctry']]


load expanded ocod
pre-process expanded ocod data
load and pre-process the Land Registry price paid dataset
add in missing Local authority codes to the ocoda dataset
load and pre-process the voa business ratings list dataset


  voa_businesses =  pd.read_csv(file_path,


Match street addresses and buildings
replace the missing lsoa using street matching
replace the missing lsoa using building matching
insert newly ID'd LSOA and OA
update missing LSOA and OA for nested properties where at least one nested property has an OA or LSOA
Sub-street matching, this takes some time
lad  100  of 244
lad  200  of 244


In [4]:
import address_parsing_helper_functions
dir(address_parsing_helper_functions)

['ALPHA',
 'ALPHA_LOWER',
 'ALPHA_UPPER',
 'CONCAT_QUOTES',
 'HYPHENS',
 'LIST_ELLIPSES',
 'LIST_ICONS',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'add_backfill_blockers',
 'backfill_address_labels',
 'compile_infix_regex',
 'expand_dataframe_numbers',
 'expand_multi_id',
 'filter_contiguous_numbers',
 'final_parsed_addresses',
 'identify_multi_addresses',
 'json',
 'load_and_prep_OCOD_data',
 'load_cleaned_labels',
 'load_data_with_overlaps_harry',
 'load_data_with_overlaps_jonno',
 'np',
 'parsing_and_expansion_process',
 'pd',
 'post_process_expanded_data',
 're',
 'remove_overlaps_harry',
 'remove_overlaps_jonno',
 'spacy',
 'spacy_pred_fn',
 'spread_address_labels',
 'time']

In [5]:
import locate_and_classify_helper_functions
dir(locate_and_classify_helper_functions)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'add_missing_lads_ocod',
 'classification_type1',
 'classification_type2',
 'clean_street_numbers',
 'contract_ocod_after_classification',
 'counts_of_businesses_per_oa_lsoa',
 'create_all_street_addresses',
 'create_lad_streetname2',
 'expand_dataframe_numbers',
 'expand_multi_id',
 'filter_contiguous_numbers',
 'find_filter_type',
 'io',
 'json',
 'load_and_process_pricepaid_data',
 'load_postocde_district_lookup',
 'load_voa_ratinglist',
 'massaged_address_match',
 'massaged_street_match',
 'np',
 'os',
 'pd',
 'preprocess_expandaded_ocod_data',
 're',
 'street_and_building_matching',
 'street_number_to_lsoa',
 'substreet_matching',
 'time',
 'voa_address_match_all_data',
 'zipfile']