In [1]:
# `IPython.display` is the public home of these helpers; importing them from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Widen the notebook container so wide DataFrames are easier to read.
display(HTML("<style>.container { width:90% !important; }</style>"))

# Introduction
* Raw CSV loaded and lightly processed. Output: a two-column CSV (property address, flat tag).
* Data labelled in programmatic. Output: JSON file of entities.
* The programmatic output JSON is cleaned, ordered, and has overlaps removed. Output: JSON file.
* **Clean json converted to dataframe and multi-addresses expanded.** Output: CSV
* Count and locate addresses
* Create address matcher and match businesses
* Classify address types

This notebook is used so that I can create the code necessary to expand the addresses so that a single property/dwelling is a single line. This means the data will follow the tidy data principles of one observation per line where an observation is what is commonly thought of as a property.

In [2]:
import json
import pandas as pd
import re
#from helper_functions import *
from address_parsing_helper_functions import *
import numpy as np
from sklearn import metrics

# Load data

Use this chunk when the file to be loaded has already had overlapping spans removed by the unit tagging and span cleaning script

In [3]:
# For data without overlapping spans, where the spans were removed by the denoising HMM.
#  ############### DENOISING NOT CURRENTLY AVAILABLE; COMPLETE WHEN POSSIBLE

# For data without overlapping spans, where the spans were removed by my own script.
all_entities = load_cleaned_labels('/tf/empty_homes_data/full_dataset_no_overlaps.json')
# For data that still contains overlapping spans:
#all_entities = load_data_with_overlaps_jonno('/tf/empty_homes_data/test.json')

This chunk loads the ground truth for checking the labelling quality

In [4]:
# Only a handful of columns are needed to score the labels, so keep the span
# rows, select those columns and rename them to match `all_entities`.
ground_truth_df = (
    pd.read_csv('/tf/empty_homes_data/Ground truth for offshore empties V2 includes comma space_labels.csv')
    .loc[
        lambda gt: gt['result_type'] == "span",
        ['result_type', 'label', 'start', 'end', 'text', 'input:text', 'input:datapoint_id'],
    ]
    .rename(
        columns={
            'input:text': 'property_address',
            'input:datapoint_id': 'datapoint_id',
            'text': 'label_text',
        }
    )
)

# Attach the predicted span that has the same start, end and datapoint_id.
# After the merge, `label_x` is the ground-truth label and `label_y` the
# predicted one (pandas' default suffixes for the overlapping column names).
ground_truth_df = ground_truth_df.merge(
    all_entities.loc[
        all_entities['datapoint_id'].isin(ground_truth_df['datapoint_id']),
        ['start', 'end', 'datapoint_id', 'label', 'label_text'],
    ],
    how="left",
    on=['start', 'end', 'datapoint_id'],
)

# Subset restricted to the four address-defining classes.
ground_truth_important_df = ground_truth_df[
    ground_truth_df['label_x'].isin(['building_name', 'street_name', 'street_number', 'unit_id'])
]

In [5]:
# Class names present in the ground truth (np.unique returns them sorted).
label_names = list(np.unique(ground_truth_df['label_x'].to_list()))

# Micro-averaged precision, recall and F-score across all labelled spans,
# rounded to two decimals.
overall_score = pd.DataFrame({
    "values": [
        round(score, 2)
        for score in metrics.precision_recall_fscore_support(
            ground_truth_df['label_x'].to_list(),
            ground_truth_df['label_y'].to_list(),
            average="micro",
        )[:3]
    ]
})
overall_score['metric'] = ["precision", "recall", "fscore"]

overall_score[['metric', 'values']]


Unnamed: 0,metric,values
0,precision,0.95
1,recall,0.95
2,fscore,0.95


In [7]:

# Per-class precision, recall, F-score and support, one row per entry of `label_names`.
# `np.round` is used because the `np.round_` alias was removed in NumPy 2.0.
performance_df = pd.DataFrame(
    np.round(
        np.transpose(
            metrics.precision_recall_fscore_support(
                ground_truth_df.label_x.to_list(),
                ground_truth_df.label_y.to_list(),
                labels=label_names,
            )
        ),
        2,
    ),
    columns=["precision", "recall", "fscore", "support"],
)
# The scores come back in the order given by `labels`, so the same list names the rows.
performance_df['class'] = label_names
performance_df[['class', "precision", "recall", "fscore", "support"]]

Unnamed: 0,class,precision,recall,fscore,support
0,building_name,0.98,0.83,0.9,382.0
1,city,1.0,0.97,0.99,947.0
2,number_filter,1.0,1.0,1.0,23.0
3,postcode,1.0,1.0,1.0,768.0
4,street_name,0.99,0.97,0.98,1029.0
5,street_number,0.99,0.93,0.96,677.0
6,unit_id,0.99,0.91,0.95,370.0
7,unit_type,1.0,0.97,0.98,488.0


In [6]:
# NOTE(review): this cell repeats the per-class metrics cell directly above and was
# executed out of order (In [6] after In [7]); consider keeping only one of them.
# Per-class precision, recall, F-score and support, one row per entry of `label_names`.
# `np.round` is used because the `np.round_` alias was removed in NumPy 2.0.
performance_df = pd.DataFrame(
    np.round(
        np.transpose(
            metrics.precision_recall_fscore_support(
                ground_truth_df.label_x.to_list(),
                ground_truth_df.label_y.to_list(),
                labels=label_names,
            )
        ),
        2,
    ),
    columns=["precision", "recall", "fscore", "support"],
)
# The scores come back in the order given by `labels`, so the same list names the rows.
performance_df['class'] = label_names
performance_df[['class', "precision", "recall", "fscore", "support"]]

Unnamed: 0,class,precision,recall,fscore,support
0,building_name,0.99,0.82,0.89,382.0
1,city,1.0,0.97,0.99,947.0
2,number_filter,1.0,1.0,1.0,23.0
3,postcode,1.0,1.0,1.0,768.0
4,street_name,0.99,0.97,0.98,1029.0
5,street_number,0.99,0.94,0.96,677.0
6,unit_id,0.99,0.93,0.96,370.0
7,unit_type,1.0,0.97,0.98,488.0


## Example of the data frame of labels

In [8]:
# Inspect the label table; each displayed row is one labelled span of an address.
all_entities

Unnamed: 0,start,end,label_text,labelId,label,labellingFunctionId,groundTruthId,datapoint_id,text,label_id_count
0,0,25,westleigh lodge care home,10,building_name,73,,0,"westleigh lodge care home, nel pan lane, leigh...",0
1,27,39,nel pan lane,12,street_name,28,,0,"westleigh lodge care home, nel pan lane, leigh...",0
2,41,46,leigh,5,city,10,,0,"westleigh lodge care home, nel pan lane, leigh...",0
3,48,55,wn7 5jt,11,postcode,50,,0,"westleigh lodge care home, nel pan lane, leigh...",0
4,0,4,flat,15,unit_type,15,,1,"flat 1, 1a canal street, manchester (m1 3he)",0
...,...,...,...,...,...,...,...,...,...,...
434483,12,42,discovery dock apartments east,10,building_name,39,,94087,"storage 17, discovery dock apartments east, 3 ...",0
434484,44,45,3,13,street_number,7,,94087,"storage 17, discovery dock apartments east, 3 ...",0
434485,46,63,south quay square,12,street_name,28,,94087,"storage 17, discovery dock apartments east, 3 ...",0
434486,65,71,london,5,city,10,,94087,"storage 17, discovery dock apartments east, 3 ...",0


## Identify multi versus single address observations

Some addresses may have the form xx to yy but should not be expanded as this is a building that covers multiple street numbers. Items such as these need to be carefully removed before expansion

In [9]:
# Matches a whole string that is a number range such as "2-24" or "50 to 55":
# digits, then "-" or "to" (each optionally surrounded by one whitespace character), then digits.
# This regex is used in several places and is kept here as it was originally used in the function below.
xx_to_yy_regex = r'^\d+\s?(?:-|to)\s?\d+$'
# NOTE(review): presumably returns the datapoint ids with several unit ids, the ids with several
# street numbers, and the ids that fall in either group — confirm in address_parsing_helper_functions.
multi_unit_id, multi_property, all_multi_ids = identify_multi_addresses(all_entities)

In [10]:
# NOTE(review): presumably reshapes the labels of the multi-address datapoints into one
# column per label class — confirm in address_parsing_helper_functions.
df = spread_address_labels(all_entities, all_multi_ids)
# Blockers prevent the filling of wrong information. For example, if a building name is
# back-filled onto previous addresses it should not be back-filled past another street,
# as that is highly unlikely to be the same building.
df = add_backfill_blockers(df)
# The blockers must be in place before back-filling, so the order of these calls matters.
df = backfill_address_labels(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['index'] = temp_df.index


In [11]:
# Inspect the spread and back-filled multi-address frame produced by the previous cell.
df

Unnamed: 0,datapoint_id,building_name,city,number_filter,postcode,street_name,street_number,unit_id,unit_type,text
0,13,block,peterborough,even,,miller way,2-24,,,"2-24 (even) miller way, 15-25 hammonds drive (..."
1,13,block,peterborough,even,,miller way,block,,,"2-24 (even) miller way, 15-25 hammonds drive (..."
2,13,block,peterborough,block,,miller way,block,,,"2-24 (even) miller way, 15-25 hammonds drive (..."
3,13,block,peterborough,block,,hammonds drive,15-25,,,"2-24 (even) miller way, 15-25 hammonds drive (..."
4,13,block,peterborough,block,,hammonds drive,block,,,"2-24 (even) miller way, 15-25 hammonds drive (..."
...,...,...,...,...,...,...,...,...,...,...
41875,94046,block,sheffield,block,,doveholes drive,50 to 55,,,"50 to 55 (inclusive), 62 to 73 (inclusive) and..."
41876,94046,block,sheffield,block,,doveholes drive,62 to 73,,,"50 to 55 (inclusive), 62 to 73 (inclusive) and..."
41877,94046,block,sheffield,block,,doveholes drive,81 to 91,,,"50 to 55 (inclusive), 62 to 73 (inclusive) and..."
41878,94046,block,sheffield,block,,doveholes drive,block,,,"50 to 55 (inclusive), 62 to 73 (inclusive) and..."


In [59]:
def final_parsed_addresses(df, all_entities, multi_property, multi_unit_id, all_multi_ids, expand_addresses=True):
    """
    Assemble the final parsed-address dataframe.

    The result is the concatenation, in this order, of:

    1. rows of ``df`` whose ``datapoint_id`` is in ``multi_property`` and whose
       ``street_number`` is a range ("xx-yy" / "xx to yy");
    2. rows of ``df`` whose ``datapoint_id`` is in ``multi_unit_id`` and whose
       ``unit_id`` is a range;
    3. rows of ``df`` in ``multi_property`` whose ``street_number`` is neither a
       range nor the placeholder ``'block'``;
    4. rows of ``df`` in ``multi_unit_id`` whose ``unit_id`` is neither a range
       nor ``'block'``;
    5. every observation of ``all_entities`` that is not in ``all_multi_ids``,
       pivoted so that each label class becomes a column.

    Range expansion is optional because other address parsers are not designed
    to perform such an expansion, which would make a comparison with them unfair.

    Parameters
    ----------
    df : pandas.DataFrame
        Multi-address rows; must contain ``datapoint_id``, ``street_number``
        and ``unit_id``.
    all_entities : pandas.DataFrame
        Label table; must contain ``datapoint_id``, ``label``, ``label_text``,
        ``label_id_count`` and ``text``.
    multi_property : list-like
        Datapoint ids selected through their ``street_number``.
    multi_unit_id : list-like
        Datapoint ids selected through their ``unit_id``.
    all_multi_ids : list-like
        Datapoint ids excluded from the single-address part.
    expand_addresses : bool, default True
        When truthy, pieces 1 and 2 are passed through
        ``expand_dataframe_numbers`` before concatenation.

    Returns
    -------
    pandas.DataFrame
        The concatenated pieces. The index is not reset, and the pieces taken
        from ``df`` carry their original row label in an ``index`` column.
    """
    xx_to_yy_regex = r'^\d+\s?(?:-|to)\s?\d+$'

    in_multi_property = df.datapoint_id.isin(multi_property)
    in_multi_unit = df.datapoint_id.isin(multi_unit_id)

    # Missing values are handled explicitly: a missing street number / unit id is
    # neither a range (na=False) nor a plain value (na=True, then negated), so such
    # rows are left out of all four pieces instead of depending on how NaN
    # propagates through `&` and `== False`.
    street_is_range = df.street_number.str.contains(xx_to_yy_regex, na=False)
    unit_is_range = df.unit_id.str.contains(xx_to_yy_regex, na=False)
    street_is_plain = ~df.street_number.str.contains(xx_to_yy_regex, na=True)
    unit_is_plain = ~df.unit_id.str.contains(xx_to_yy_regex, na=True)

    expanded_street = df[in_multi_property & street_is_range].reset_index()
    expanded_unit_id = df[in_multi_unit & unit_is_range].reset_index()

    # Generally expansion is required as it changes the format to 1 address per row.
    # N.B. not all expanded addresses are valid. Office blocks are 1 property but can
    # cover multiple street addresses. A matching and cleaning process is required to
    # identify what should be expanded and what should not.
    if expand_addresses:
        expanded_street = expand_dataframe_numbers(expanded_street, column_name="street_number")
        expanded_unit_id = expand_dataframe_numbers(expanded_unit_id, column_name="unit_id")

    # Unit ids and street numbers that do not have the xx to yy format have already
    # been expanded by spreading and backfilling; 'block' placeholders are dropped.
    expanded_street_simple = df[
        in_multi_property & street_is_plain & (df.street_number != 'block')
    ].reset_index()
    expanded_unit_id_simple = df[
        in_multi_unit & unit_is_plain & (df.unit_id != 'block')
    ].reset_index()

    # Single addresses: drop the multi-address observations and keep only the first
    # instance of each label, because a single address should have exactly one label
    # per class (which is also what makes the pivot below well defined).
    single_address_only = all_entities[
        ~all_entities['datapoint_id'].isin(all_multi_ids)
        & (all_entities['label_id_count'] == 0)
    ]
    single_address_wide = single_address_only.pivot(
        index='datapoint_id', columns='label', values='label_text'
    )
    # Bring the original address text back in, matched on datapoint_id.
    single_address_wide = single_address_wide.merge(
        single_address_only[['datapoint_id', 'text']].drop_duplicates(),
        how="left",
        left_on="datapoint_id",
        right_on="datapoint_id",
    )

    full_expanded_data = pd.concat([
        expanded_street,
        expanded_unit_id,
        expanded_street_simple,
        expanded_unit_id_simple,
        single_address_wide,
    ])

    return full_expanded_data

In [60]:
# Build the parsed addresses twice from the same inputs: with and without range expansion.
full_expanded_data = final_parsed_addresses(
    df, all_entities, multi_property, multi_unit_id, all_multi_ids,
    expand_addresses=True,
)
full_not_expanded_data = final_parsed_addresses(
    df, all_entities, multi_property, multi_unit_id, all_multi_ids,
    expand_addresses=False,
)

i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
i= 1000  expand time,0.194 filter time0.12 make_dataframe_time 0.993
i= 2000  expand time,0.387 filter time0.241 make_dataframe_time 1.924
i= 0  expand time,0.001 filter time0.0 make_dataframe_time 0.001
i= 1000  expand time,0.191 filter time0.116 make_dataframe_time 1.076


  uniques = Index(uniques)
  uniques = Index(uniques)


In [61]:
# Row/column counts of the expanded and the non-expanded result, one per line.
print(full_expanded_data.shape, full_not_expanded_data.shape, sep="\n")

(159699, 11)
(109249, 11)


In [43]:
# NOTE(review): these two calls repeat the earlier cell that builds the same two frames
# (and this cell was executed out of order); consider keeping only one copy.
full_expanded_data = final_parsed_addresses(df,all_entities ,multi_property, multi_unit_id, all_multi_ids, expand_addresses = True)
full_not_expanded_data = final_parsed_addresses(df,all_entities ,multi_property, multi_unit_id, all_multi_ids, expand_addresses = False)

i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
i= 1000  expand time,0.191 filter time0.116 make_dataframe_time 0.938
i= 2000  expand time,0.38 filter time0.229 make_dataframe_time 1.837
i= 0  expand time,0.001 filter time0.0 make_dataframe_time 0.001
i= 1000  expand time,0.178 filter time0.104 make_dataframe_time 0.965


  uniques = Index(uniques)
  uniques = Index(uniques)


# Load OCOD data and combine necessary information with the expanded data

In [30]:
# Load the raw OCOD extract and normalise the column names to lower snake_case.
# NOTE(review): the saved output shows a warning raised by this read_csv call — possibly
# mixed column dtypes; confirm and pass explicit dtypes if so.
ocod_data = pd.read_csv('/tf/empty_homes_data/' +
                        'OCOD_FULL_2022_02.csv',
                        encoding_errors='ignore').rename(columns=lambda x: x.lower().replace(" ", "_"))

# Empty addresses cannot be used; however there are only three so this is not a problem.
# The index is rebuilt afterwards because the merge at the end of this cell matches
# `datapoint_id` against this positional index.
ocod_data = ocod_data.dropna(subset='property_address').reset_index(drop=True)
ocod_data = ocod_data[['title_number', 'tenure', 'district', 'county',
                       'region', 'multiple_address_indicator', 'price_paid', 'property_address']]

# Every replacement states `regex=` explicitly so the result does not depend on the
# pandas version's default; only the whitespace rule is a real pattern, and it is a raw
# string because '\s' is not a valid escape in a normal string literal.
ocod_data['property_address'] = (
    ocod_data['property_address']
    .str.lower()
    # Ensure there is a space after commas: some numbers are written as 1,2,3,4,5,
    # which causes issues during tokenisation.
    .str.replace(',', ', ', regex=False)
    # Collapse runs of whitespace (including the doubles created by the line above).
    .str.replace(r'\s{2,}', ' ', regex=True)
    # A typo in the data leads to a large number of fake flats.
    .str.replace("stanley court ", "stanley court, ", regex=False)
    # These typos lead to some rather silly addresses.
    .str.replace("100-1124", "100-112", regex=False)
    .str.replace("40a, 40, 40¨, 42, 44", "40a, 40, 40, 42, 44", regex=False)
)

# NOTE: this rebinds `full_expanded_data` to the merged result, so re-running the cell
# without first rebuilding `full_expanded_data` merges the OCOD columns a second time.
full_expanded_data = full_expanded_data.merge(ocod_data, how="left", left_on="datapoint_id", right_index=True)

  ocod_data =  pd.read_csv('/tf/empty_homes_data/' +


This is a quality test that checks that the property addresses are identical between the original ocod data and the expanded OCOD data.
It also acts as a proof that the addresses are not being re-ordered

In [32]:

# Compares the lower-cased OCOD address with the parsed `text` of the same row.
full_expanded_data['property_address'].str.lower().equals(full_expanded_data['text']) # 'True' shows the merge worked, because the address strings are equal in every row
# If the above is not True, the line below shows which rows are not the same.
#full_expanded_data.loc[~((full_expanded_data['text']==full_expanded_data['property_address'].str.lower())), ['property_address', 'text', 'datapoint_id']]

True

# Finalise expanded OCOD and save CSV
This is the final step of this notebook. It outputs a CSV for the next notebook to load and add in geolocation and classification.

In [35]:
# Number the rows inside each title (1-based) and build a "<title_number>-<n>" identifier.
full_expanded_data['within_title_id'] = full_expanded_data.groupby('title_number').cumcount() + 1
full_expanded_data['unique_id'] = [
    f"{title}-{position}"
    for title, position in zip(full_expanded_data['title_number'], full_expanded_data['within_title_id'])
]

# Flag the titles that hold more than one row (could also be called nested_address).
# The column is selected before aggregating because the first positional parameter of
# GroupBy.max is `numeric_only`, not a column name.
full_expanded_data = full_expanded_data.merge(
    (full_expanded_data.groupby('title_number')['within_title_id'].max() > 1).rename('within_larger_title'),
    how="left",
    left_on="title_number",
    right_index=True,
)

full_expanded_data['postcode'] = full_expanded_data['postcode'].str.upper()

# Re-order the columns, drop the ones that are not needed and blank out the 'block'
# placeholders. `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
full_expanded_data = full_expanded_data[[
    'title_number', 'within_title_id', 'unique_id', 'within_larger_title', 'tenure',
    'unit_id', 'unit_type', 'building_name', 'street_number', 'street_name', 'postcode', 'city',
    'district', 'county', 'region', 'multiple_address_indicator', 'price_paid', 'property_address',
]].replace('block', np.nan)

# Save as CSV.
full_expanded_data.to_csv("/tf/empty_homes_data/OCOD_cleaned_expanded2.csv")

# The entire parsing process
The below chunk goes through the entire parsing process and saves the resulting dataframe as a csv

In [5]:
# Load the cleaned label spans.
all_entities = load_cleaned_labels('/tf/empty_homes_data/full_dataset_no_overlaps.json')

# NOTE(review): presumably wraps the identify / spread / backfill / expand steps that are
# run cell by cell earlier in this notebook — confirm in address_parsing_helper_functions.
full_expanded_data = parsing_and_expansion_process(all_entities, expand_addresses = True)

# NOTE(review): presumably performs the same loading and cleaning as the OCOD cell above — confirm.
ocod_data = load_and_prep_OCOD_data('/tf/empty_homes_data/' + 'OCOD_FULL_2022_02.csv')

# NOTE(review): presumably merges the OCOD columns and adds the title-level ids — confirm.
full_expanded_data = post_process_expanded_data(full_expanded_data, ocod_data)

# Writes to the same file as the step-by-step cells above.
full_expanded_data.to_csv("/tf/empty_homes_data/OCOD_cleaned_expanded2.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['index'] = temp_df.index


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
i= 1000  expand time,0.173 filter time0.11 make_dataframe_time 0.917
i= 2000  expand time,0.346 filter time0.22 make_dataframe_time 1.773
i= 0  expand time,0.001 filter time0.0 make_dataframe_time 0.001
i= 1000  expand time,0.182 filter time0.112 make_dataframe_time 1.03


  uniques = Index(uniques)
  ocod_data =  pd.read_csv(file_path,


In [14]:
# Rows whose parsed street name contains "stadium approach," (comma included);
# rows with a missing street name are not selected.
full_expanded_data.loc[
    full_expanded_data['street_name'].str.contains("stadium approach,", na=False),
    :,
]

Unnamed: 0,index,datapoint_id,building_name,city,number_filter,postcode,street_name,street_number,unit_id,unit_type,text
112,,123,,aylesbury,odd,,"stadium approach,",33-63,,,"33-63 (odd), stadium approach, aylesbury"
