In [1]:
# Import from the public IPython.display module: importing display/HTML from
# IPython.core.display is deprecated and emits a DeprecationWarning.
from IPython.display import display, HTML

# Widen the notebook container to 90% of the page so wide tables are easier to read.
display(HTML("<style>.container { width:90% !important; }</style>"))

  from IPython.core.display import display, HTML


# Introduction

This notebook is used for the following:

* Loading and processing the OCOD data
* Tokenising and performing NER on the OCOD dataset using the previously trained model
* Evaluating the performance of the trained model
* Expanding the OCOD dataset such that a single line represents a single street address
* Adding in additional information from the original OCOD dataset


In [6]:
import json
import pandas as pd
import re
#from helper_functions import *
from address_parsing_helper_functions import *
import numpy as np
from sklearn import metrics

# Load  and predict data

The two chunks below do all the processing needed to load, clean, and predict the OCOD dataset using a pre-trained spaCy model.

In [7]:
from address_parsing_helper_functions import *
from locate_and_classify_helper_functions import *

#!ipython full_ocod_parse_process.py "/home/jonno/data/"

In [12]:

# Load and prepare the OCOD csv.
ocod_data = load_and_prep_OCOD_data('./data/' + 'OCOD_FULL_2022_02.csv')

# Alternative sources of labels, kept for reference:
#   - data without overlapping spans, where the spans were removed by my own script
#       all_entities = load_cleaned_labels('./data/full_dataset_no_overlaps.json')
#   - data with overlapping spans
#       all_entities = load_data_with_overlaps_jonno('/tf/data/test.json')
#
# Other spaCy model paths that have been used:
#   "./data/spacy_cpu_model"
#   "./data/spacy_data/training/data_24_05_22/model-best"
#   "./data/spacy_data/training/data_25_05_22/model-best"
#   "./data/spacy_data/training/data_25_05_22/model-last"

# Predict the entities from the original dataset using the spaCy model.
all_entities = spacy_pred_fn(
    ocod_data = ocod_data,
    spacy_model_path = "./data/spacy_data/spacy_cpu_model",
)

  ocod_data =  pd.read_csv(file_path,


Loading the spaCy model




Adding the datapoint id and title number meta data to the property address
predicting over the OCOD dataset using the pre-trained spaCy model
extracting entity label text
Names Entity Recognition labelling complete


# Assessing quality

This section allows the spaCy model to be evaluated using the ground-truth dataset.

In [9]:
# Read the hand-labelled test set that serves as the ground truth.
ground_truth_df = pd.read_csv('./data/ground_truth_test_set_labels.csv')

# Only the "span" rows and a small number of the columns are needed to calculate the F1 score;
# everything else just makes it confusing. The renaming is for consistency.
ground_truth_df = (
    ground_truth_df
    .loc[ground_truth_df['result_type'] == "span",
         ['result_type', 'label', 'start', 'end', 'text', 'input:text', 'input:datapoint_id']]
    .rename(columns = {'input:text': 'property_address',
                       'input:datapoint_id': 'datapoint_id',
                       'text': 'label_text'})
)

# Attach the predicted label to every ground-truth span that has the same position in the
# same datapoint. Both frames have a 'label' column, so after the merge the ground-truth
# label is in 'label_x' and the predicted label is in 'label_y'.
ground_truth_df = ground_truth_df.merge(
    all_entities.loc[all_entities['datapoint_id'].isin(ground_truth_df['datapoint_id']),
                     ['start', 'end', 'datapoint_id', 'label']],
    on = ['start', 'end', 'datapoint_id'],
    how = "left",
)

# These are the elements that are most important to get right.
ground_truth_important_df = ground_truth_df.loc[
    ground_truth_df['label_x'].isin(['building_name', 'street_name', 'street_number', 'unit_id'])
]

NameError: name 'all_entities' is not defined

In [46]:
# Write ground_truth_df to csv; judging by the file name this is used to inspect
# where the predictions differ from the ground truth.
ground_truth_df.to_csv('./data/what_errors_dev.csv')

In [18]:
# Sorted, de-duplicated list of the ground-truth labels.
label_names = list(np.unique(ground_truth_df.label_x.to_list()))

# Micro-averaged precision, recall and F-score over the most important label classes.
# precision_recall_fscore_support returns a 4-tuple; the trailing support entry is dropped.
overall_score = pd.DataFrame({
    "values": [
        round(score, 2)
        for score in metrics.precision_recall_fscore_support(
            ground_truth_important_df.label_x.to_list(),
            ground_truth_important_df.label_y.to_list(),
            average = "micro",
        )[0:3]
    ],
    "metric": ["precision", "recall", "fscore"],
})

overall_score[['metric', 'values']]


Unnamed: 0,metric,values
0,precision,0.95
1,recall,0.95
2,fscore,0.95


The below chunk creates the performance table used in the paper

In [19]:

# Per-class precision, recall, F-score and support, one entry per label in label_names.
performance_df = metrics.precision_recall_fscore_support(
    ground_truth_df.label_x.to_list(),
    ground_truth_df.label_y.to_list(),
    labels = label_names,
)

# The result is a tuple of four arrays; transposing gives one row per class.
# np.round is used because the np.round_ alias was removed in NumPy 2.0.
performance_df = pd.DataFrame(
    np.round(np.transpose(performance_df), 2),
    columns = ["precision", "recall", "fscore", "support"],
)
# The rows follow the order of the `labels` argument, so the same list names the classes.
performance_df['class'] = label_names

# Print the table as LaTeX.
print(performance_df[['class', "precision", "recall", "fscore", "support"]].to_latex(index = False, escape = False))

\begin{tabular}{lrrrr}
\toprule
        class &  precision &  recall &  fscore &  support \\
\midrule
building_name &       0.93 &    0.89 &    0.91 &    383.0 \\
         city &       1.00 &    0.97 &    0.98 &    947.0 \\
number_filter &       1.00 &    1.00 &    1.00 &     23.0 \\
     postcode &       1.00 &    1.00 &    1.00 &    768.0 \\
  street_name &       1.00 &    0.95 &    0.97 &   1029.0 \\
street_number &       0.98 &    0.98 &    0.98 &    678.0 \\
      unit_id &       0.98 &    0.92 &    0.95 &    370.0 \\
    unit_type &       1.00 &    0.97 &    0.98 &    488.0 \\
\bottomrule
\end{tabular}



  print(performance_df[['class',"precision", "recall", "fscore", "support"]].to_latex(index = False, escape = False))


In [14]:

# Per-class precision, recall, F-score and support, one entry per label in label_names.
performance_df = metrics.precision_recall_fscore_support(
    ground_truth_df.label_x.to_list(),
    ground_truth_df.label_y.to_list(),
    labels = label_names,
)

# The result is a tuple of four arrays; transposing gives one row per class.
# np.round is used because the np.round_ alias was removed in NumPy 2.0.
performance_df = pd.DataFrame(
    np.round(np.transpose(performance_df), 2),
    columns = ["precision", "recall", "fscore", "support"],
)
# The rows follow the order of the `labels` argument, so the same list names the classes.
performance_df['class'] = label_names

# Display the table with the class name first.
performance_df[['class', "precision", "recall", "fscore", "support"]]

Unnamed: 0,class,precision,recall,fscore,support
0,building_name,0.93,0.89,0.91,383.0
1,city,1.0,0.97,0.98,947.0
2,number_filter,1.0,1.0,1.0,23.0
3,postcode,1.0,1.0,1.0,768.0
4,street_name,1.0,0.95,0.97,1029.0
5,street_number,0.98,0.98,0.98,678.0
6,unit_id,0.98,0.92,0.95,370.0
7,unit_type,1.0,0.97,0.98,488.0


In [7]:

# Per-class precision, recall, F-score and support, one entry per label in label_names.
performance_df = metrics.precision_recall_fscore_support(
    ground_truth_df.label_x.to_list(),
    ground_truth_df.label_y.to_list(),
    labels = label_names,
)

# The result is a tuple of four arrays; transposing gives one row per class.
# np.round is used because the np.round_ alias was removed in NumPy 2.0.
performance_df = pd.DataFrame(
    np.round(np.transpose(performance_df), 2),
    columns = ["precision", "recall", "fscore", "support"],
)
# The rows follow the order of the `labels` argument, so the same list names the classes.
performance_df['class'] = label_names

# Display the table with the class name first.
performance_df[['class', "precision", "recall", "fscore", "support"]]

Unnamed: 0,class,precision,recall,fscore,support
0,building_name,0.93,0.9,0.92,383.0
1,city,1.0,0.97,0.99,947.0
2,number_filter,1.0,1.0,1.0,23.0
3,postcode,1.0,1.0,1.0,768.0
4,street_name,1.0,0.95,0.97,1029.0
5,street_number,0.99,0.98,0.99,678.0
6,unit_id,0.99,0.95,0.97,370.0
7,unit_type,1.0,0.97,0.98,488.0


## Example of the data frame of labels

In [8]:
# Display the data frame of labels: each row is one labelled span (start, end, label_text,
# label) together with the datapoint_id and the full address text it belongs to.
all_entities

Unnamed: 0,start,end,label_text,labelId,label,labellingFunctionId,groundTruthId,datapoint_id,text,label_id_count
0,0,25,westleigh lodge care home,10,building_name,73,,0,"westleigh lodge care home, nel pan lane, leigh...",0
1,27,39,nel pan lane,12,street_name,28,,0,"westleigh lodge care home, nel pan lane, leigh...",0
2,41,46,leigh,5,city,10,,0,"westleigh lodge care home, nel pan lane, leigh...",0
3,48,55,wn7 5jt,11,postcode,50,,0,"westleigh lodge care home, nel pan lane, leigh...",0
4,0,4,flat,15,unit_type,15,,1,"flat 1, 1a canal street, manchester (m1 3he)",0
...,...,...,...,...,...,...,...,...,...,...
434483,12,42,discovery dock apartments east,10,building_name,39,,94087,"storage 17, discovery dock apartments east, 3 ...",0
434484,44,45,3,13,street_number,7,,94087,"storage 17, discovery dock apartments east, 3 ...",0
434485,46,63,south quay square,12,street_name,28,,94087,"storage 17, discovery dock apartments east, 3 ...",0
434486,65,71,london,5,city,10,,94087,"storage 17, discovery dock apartments east, 3 ...",0


## Identify multi versus single address observations

Some addresses may have the form "xx to yy" but should not be expanded, because they describe a single building that covers multiple street numbers. Items such as these need to be carefully removed before expansion.

In [9]:
# Matches a whole string that is a number range written as "xx-yy" or "xx to yy"
# (for example "2-24" or "50 to 55"), with at most one whitespace character either side of the separator.
# This regex is used in several places and is kept here as it was originally used in the function below.
xx_to_yy_regex = r'^\d+\s?(?:-|to)\s?\d+$'
# Identify the observations that contain more than one address.
# NOTE(review): presumably the three results are the ids of multi-unit, multi-property and
# all multi-address observations respectively — confirm against identify_multi_addresses.
multi_unit_id, multi_property, all_multi_ids = identify_multi_addresses(all_entities)

In [10]:
# Three steps, applied from the innermost call outwards:
#   1. spread the labels of the multi-address observations into a data frame,
#   2. add the backfill blockers,
#   3. backfill the address labels.
# Blockers prevent the filling of wrong information. As an example, if a building is going to
# back fill up previous addresses it should not back fill past another street, as this is
# highly unlikely to be the same building.
df = backfill_address_labels(
    add_backfill_blockers(
        spread_address_labels(all_entities, all_multi_ids)
    )
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['index'] = temp_df.index


In [11]:
# Display the spread and back-filled multi-address data frame produced by the previous cell.
df

Unnamed: 0,datapoint_id,building_name,city,number_filter,postcode,street_name,street_number,unit_id,unit_type,text
0,13,block,peterborough,even,,miller way,2-24,,,"2-24 (even) miller way, 15-25 hammonds drive (..."
1,13,block,peterborough,even,,miller way,block,,,"2-24 (even) miller way, 15-25 hammonds drive (..."
2,13,block,peterborough,block,,miller way,block,,,"2-24 (even) miller way, 15-25 hammonds drive (..."
3,13,block,peterborough,block,,hammonds drive,15-25,,,"2-24 (even) miller way, 15-25 hammonds drive (..."
4,13,block,peterborough,block,,hammonds drive,block,,,"2-24 (even) miller way, 15-25 hammonds drive (..."
...,...,...,...,...,...,...,...,...,...,...
41875,94046,block,sheffield,block,,doveholes drive,50 to 55,,,"50 to 55 (inclusive), 62 to 73 (inclusive) and..."
41876,94046,block,sheffield,block,,doveholes drive,62 to 73,,,"50 to 55 (inclusive), 62 to 73 (inclusive) and..."
41877,94046,block,sheffield,block,,doveholes drive,81 to 91,,,"50 to 55 (inclusive), 62 to 73 (inclusive) and..."
41878,94046,block,sheffield,block,,doveholes drive,block,,,"50 to 55 (inclusive), 62 to 73 (inclusive) and..."


In [60]:
# Build the final parsed addresses twice from the same inputs: first with the multi-address
# rows expanded, then without expansion.
full_expanded_data, full_not_expanded_data = (
    final_parsed_addresses(df, all_entities, multi_property, multi_unit_id, all_multi_ids,
                           expand_addresses = expand_flag)
    for expand_flag in (True, False)
)

i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
i= 1000  expand time,0.194 filter time0.12 make_dataframe_time 0.993
i= 2000  expand time,0.387 filter time0.241 make_dataframe_time 1.924
i= 0  expand time,0.001 filter time0.0 make_dataframe_time 0.001
i= 1000  expand time,0.191 filter time0.116 make_dataframe_time 1.076


  uniques = Index(uniques)
  uniques = Index(uniques)


In [61]:
# Show (rows, columns) of the expanded and the non-expanded data, one per line.
print(full_expanded_data.shape, full_not_expanded_data.shape, sep = "\n")

(159699, 11)
(109249, 11)


In [43]:
# Build the final parsed addresses twice from the same inputs: first with the multi-address
# rows expanded, then without expansion.
full_expanded_data, full_not_expanded_data = (
    final_parsed_addresses(df, all_entities, multi_property, multi_unit_id, all_multi_ids,
                           expand_addresses = expand_flag)
    for expand_flag in (True, False)
)

i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
i= 1000  expand time,0.191 filter time0.116 make_dataframe_time 0.938
i= 2000  expand time,0.38 filter time0.229 make_dataframe_time 1.837
i= 0  expand time,0.001 filter time0.0 make_dataframe_time 0.001
i= 1000  expand time,0.178 filter time0.104 make_dataframe_time 0.965


  uniques = Index(uniques)
  uniques = Index(uniques)


# Load OCOD data and combine necessary information with the expanded data

In [30]:
# Re-load the raw OCOD data so that its title-level columns can be joined onto the expanded addresses.
# NOTE(review): this path is absolute ('/tf/data/') whereas other cells read the same file
# name from './data/' — confirm which location is intended.
ocod_data = (
    pd.read_csv('/tf/data/' + 'OCOD_FULL_2022_02.csv', encoding_errors = 'ignore')
    # lower-case, underscore-separated column names
    .rename(columns = lambda x: x.lower().replace(" ", "_"))
    # empty addresses cannot be used. however there are only three so not a problem
    .dropna(subset = ['property_address'])
    # the fresh 0..n-1 index is what the merge below matches against datapoint_id
    .reset_index(drop = True)
    .loc[:, ['title_number', 'tenure', 'district', 'county',
             'region', 'multiple_address_indicator', 'price_paid', 'property_address']]
    # explicit copy so the column assignment below acts on an independent frame
    .copy()
)

# Clean the address text. Literal replacements pass regex = False explicitly because the
# default value of `regex` in Series.str.replace differs between pandas versions.
ocod_data['property_address'] = (
    ocod_data['property_address']
    .str.lower()
    # ensure there is a space after commas;
    # some numbers are written as 1,2,3,4,5 which causes issues during tokenisation
    .str.replace(',', ', ', regex = False)
    # remove multiple spaces (raw string, so that \s is not an invalid escape sequence)
    .str.replace(r'\s{2,}', ' ', regex = True)
    # typo in the data leads to a large number of fake flats
    .str.replace("stanley court ", "stanley court, ", regex = False)
    # these typos lead to some rather silly addresses
    .str.replace("100-1124", "100-112", regex = False)
    .str.replace("40a, 40, 40¨, 42, 44", "40a, 40, 40, 42, 44", regex = False)
)

# Join the OCOD columns onto the expanded data: datapoint_id is matched against the row index of ocod_data.
full_expanded_data = full_expanded_data.merge(ocod_data, how = "left", left_on = "datapoint_id", right_index = True)

  ocod_data =  pd.read_csv('/tf/empty_homes_data/' +


This is a quality test that checks that the property addresses are identical between the original ocod data and the expanded OCOD data.
It also acts as a proof that the addresses are not being re-ordered

In [32]:

# True when the lower-cased original address is identical to the parsed text in every row,
# i.e. the merge matched each expanded row to the correct OCOD row.
addresses_match = full_expanded_data['property_address'].str.lower().equals(full_expanded_data['text'])

# Stop here if the check fails, so that a mismatched merge cannot be saved unnoticed.
# If it fails, the below code shows which rows are not the same:
#full_expanded_data.loc[~((full_expanded_data['text']==full_expanded_data['property_address'].str.lower())), ['property_address', 'text', 'datapoint_id']]
assert addresses_match, "property_address and text differ for at least one row after the merge"

addresses_match

True

# Finalise expanded OCOD and save CSV
This is the final step of this notebook. It outputs a CSV for the next notebook to load and add in geolocation and classification.

In [35]:
# Number the addresses within each title (starting at 1) and combine the title number with
# that counter to build the per-row id "<title_number>-<within_title_id>".
full_expanded_data['within_title_id'] = full_expanded_data.groupby('title_number').cumcount() + 1
full_expanded_data['unique_id'] = (
    full_expanded_data['title_number'].astype(str)
    + '-'
    + full_expanded_data['within_title_id'].astype(str)
)

# Flag the rows whose title holds more than one address (could also be called nested_address).
# transform('max') broadcasts the per-title maximum back onto every row, which avoids the
# temporary frame and merge and keeps the cell safe to re-run.
full_expanded_data['within_larger_title'] = (
    full_expanded_data.groupby('title_number')['within_title_id'].transform('max') > 1
)

full_expanded_data['postcode'] = full_expanded_data['postcode'].str.upper()

# Re-order the columns and drop columns that are not needed. Cells equal to 'block' are
# replaced with missing values (np.nan; the np.NaN alias was removed in NumPy 2.0).
full_expanded_data = full_expanded_data[
    ['title_number', 'within_title_id', 'unique_id', 'within_larger_title', 'tenure',
     'unit_id', 'unit_type', 'building_name', 'street_number', 'street_name', 'postcode',
     'city', 'district', 'county', 'region',
     'multiple_address_indicator', 'price_paid', 'property_address']
].replace('block', np.nan)

# save as CSV
full_expanded_data.to_csv("./data/OCOD_cleaned_expanded2.csv")

# The entire parsing process
The below chunk goes through the entire parsing process and saves the resulting dataframe as a csv

In [51]:
# 1. Load and prepare the OCOD csv.
ocod_data = load_and_prep_OCOD_data('./data/' + 'OCOD_FULL_2022_02.csv')

# 2. Label the addresses with the spaCy model.
#    Alternative: all_entities = load_cleaned_labels('./data/full_dataset_no_overlaps.json')
all_entities = spacy_pred_fn(
    ocod_data = ocod_data,
    spacy_model_path = "./data/spacy_data/cpu/model-best",
)

# 3. Parse and expand the labelled addresses, then post-process the result with the OCOD data.
full_expanded_data = post_process_expanded_data(
    parsing_and_expansion_process(all_entities, expand_addresses = True),
    ocod_data,
)

# 4. Save the resulting data frame as a csv.
full_expanded_data.to_csv("./data/OCOD_cleaned_expanded_spacy.csv")

  ocod_data =  pd.read_csv(file_path,


Loading the spaCy model
Adding the datapoint id and title number meta data to the property address
predicting over the OCOD dataset using the pre-trained spaCy model
extracting entity label text
Names Entity Recognition labelling complete
i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
i= 1000  expand time,0.24 filter time0.143 make_dataframe_time 1.303
i= 2000  expand time,0.505 filter time0.306 make_dataframe_time 2.515
i= 0  expand time,0.001 filter time0.0 make_dataframe_time 0.002
i= 1000  expand time,0.353 filter time0.23 make_dataframe_time 1.725


  uniques = Index(uniques)


In [52]:
# Load two saved versions of the expanded data so their missing-value counts can be compared:
# the spaCy output written by the previous cell, and a "programmatic only" output.
# NOTE(review): the programmatic-only file is not written anywhere in this notebook — confirm where it comes from.
test_spacy = pd.read_csv("./data/OCOD_cleaned_expanded_spacy.csv")
test_prog = pd.read_csv("./data/OCOD_cleaned_expanded_programmatic_only.csv")

In [61]:
# Number of missing values in each column of the programmatic-only output.
# DataFrame.sum is used directly: np.sum on a DataFrame relies on the axis=None reduction
# behaviour that pandas has deprecated.
test_prog.isna().sum()

Unnamed: 0                         0
title_number                       0
within_title_id                    0
unique_id                          0
within_larger_title                0
tenure                             0
unit_id                       101723
unit_type                     112129
building_name                 105968
street_number                  57337
street_name                    13304
postcode                       57352
city                            9191
district                           0
county                             0
region                             0
multiple_address_indicator         0
price_paid                    117993
property_address                   0
dtype: int64

In [62]:
# Number of missing values in each column of the spaCy output.
# DataFrame.sum is used directly: np.sum on a DataFrame relies on the axis=None reduction
# behaviour that pandas has deprecated.
test_spacy.isna().sum()

Unnamed: 0                         0
title_number                       0
within_title_id                    0
unique_id                          0
within_larger_title                0
tenure                             0
unit_id                       104432
unit_type                     105295
building_name                 107086
street_number                  50541
street_name                    11541
postcode                       52998
city                           11323
district                           0
county                             0
region                             0
multiple_address_indicator         0
price_paid                    111323
property_address                   0
dtype: int64

In [14]:
# Show the rows whose street name contains "stadium approach," (note the trailing comma).
# na = False makes rows with a missing street name count as non-matches.
full_expanded_data.loc[
    full_expanded_data['street_name'].str.contains("stadium approach,", na = False),
    :,
]

Unnamed: 0,index,datapoint_id,building_name,city,number_filter,postcode,street_name,street_number,unit_id,unit_type,text
112,,123,,aylesbury,odd,,"stadium approach,",33-63,,,"33-63 (odd), stadium approach, aylesbury"
