In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Introduction
* Raw CSV loaded and lightly processed. Output: two column csv columns, property address, flat tag
* Data labelled in programmatic. Output: json file of entities.
* Data programmatic output json cleaned ordered and overlaps removed. Output: json file
* **Clean json converted to dataframe and multi-addresses expanded.** Output: CSV
* Count and locate addresses
* Create address matcher and match businesses
* Classify address types

This notebook is used so that I can create the code necessary to expand the addresses so that a single property/dwelling is a single line. This means the data will follow the tidy data principles of one observation per line where an observation is what is commonly thought of as a property.

In [2]:
import json
import pandas as pd
import re
from helper_functions import *
import numpy as np

In [4]:

with open('/tf/empty_homes_data/full_dataset_no_overlaps.json', "r") as read_file:
    all_entities_json = json.load(read_file)


In [5]:
all_entities = pd.json_normalize(all_entities_json, record_path = "labels",
                       meta = ['datapoint_id', 'text'])

all_entities['label_id_count'] = all_entities.groupby(['datapoint_id', 'label']).cumcount()

all_entities['label_text'] =all_entities['label_text'].str.replace("100-1124", "100-112")

## Example of the data frame of labels

In [6]:
all_entities

Unnamed: 0,start,end,label,label_text,datapoint_id,text,label_id_count
0,0,25,building_name,westleigh lodge care home,0,"westleigh lodge care home, nel pan lane, leigh...",0
1,27,39,street_name,nel pan lane,0,"westleigh lodge care home, nel pan lane, leigh...",0
2,41,46,city,leigh,0,"westleigh lodge care home, nel pan lane, leigh...",0
3,48,55,postcode,wn7 5jt,0,"westleigh lodge care home, nel pan lane, leigh...",0
4,0,4,unit_type,flat,1,"flat 1, 1a canal street, manchester (m1 3he)",0
...,...,...,...,...,...,...,...
377001,0,7,unit_type,storage,94087,"storage 17, discovery dock apartments east, 3 ...",0
377002,44,45,street_number,3,94087,"storage 17, discovery dock apartments east, 3 ...",0
377003,46,63,street_name,south quay square,94087,"storage 17, discovery dock apartments east, 3 ...",0
377004,65,71,city,london,94087,"storage 17, discovery dock apartments east, 3 ...",0


In [7]:
all_entities[all_entities['datapoint_id']==51352].reset_index()['text'][0]

'ground to ninth floor flats being 101-114, 201-214, 301-314, 401-414, 501-514, 601-613 and 701-704 alaska building, 101-114, 201-214,301-314, 401-412, 501-506 and 601-605 arizona building, 101-114, 201-214, 301-314, 401-414, 501-514, 601-614, 701-708, 801-804, 901-903 california building, 101-108,     201-208, 301-307, 401-408, 501-508, 601-608, 701-708, 801-808 and 901-903 colorado building, 1-4, 101-109, 201-210, 301-310, 401-410, 501-510 and 601-605 dakota building, 1-7, 101-108, 201-208, 301-308, 401-408, 501-506 and 601-604 idaho building, 102-112, 201-212, 301-312, 401-412, 501-508 and 601-604 indiana building, 1-15, 101-116, 201-216, 301-315, 401-416, 501-510 montana building, 101-108, 201-208, 301-308, 401-408, 501-506 and 601-604 nebraska building, 1-10, 101-110, 201-210, 301-310 and 402-403 utah building, 1-10 and 101-110 boston building, 1-6, 101-106, 201-206, 301-306, 401-408 and 501-507 madison building, deals gateway, london'

## Identify multi versus single address observations

Some addresses may have the form xx to yy but should not be expanded as this is a building that covers multiple street numbers. Items such as these need to be carefully removed before expansion

In [7]:
xx_to_yy_regex = r'^\d+(\s)?(-|to)(\s)?\d+$'

multi_check_df = all_entities[['datapoint_id', 'text', ]].drop_duplicates()
multi_check_df['comma_count'] = multi_check_df['text'].str.count(',')
multi_check_df['land'] = multi_check_df['text'].str.contains(r"^(land|plot|airspace|car|parking)", case = False)

multi_check_df['business'] = multi_check_df['text'].str.contains(r"(cinema)|(hotel)|(office)|(\bpub)|(business)|(cafe)|(^shop)|( shop)|(restaurant)|(home)|(\bstore\b)|(\bstorage\\b)|(company)|(ltd)|(limited)|(plc)", case = False)
temp_df = all_entities[['datapoint_id', 'label']].groupby(['datapoint_id', 'label']).value_counts().to_frame(name = "counts").reset_index().pivot(index = 'datapoint_id', columns = 'label', values = 'counts').fillna(0)
#test['datapoint_id'] = test.index

xx_to_yy_street_counts = all_entities['datapoint_id'][all_entities['label_text'].str.contains(
    xx_to_yy_regex)& (all_entities['label']=="street_number")
                            ].to_frame(name = 'datapoint_id').groupby('datapoint_id').size().to_frame(name = 'xx_to_yy_street_counts')

xx_to_yy_unit_counts = all_entities['datapoint_id'][all_entities['label_text'].str.contains(
    xx_to_yy_regex)& (all_entities['label']=="unit_id")
                            ].to_frame(name = 'datapoint_id').groupby('datapoint_id').size().to_frame(name = 'xx_to_yy_unit_counts')

multi_check_df = multi_check_df.merge(temp_df, how = 'left', left_on = "datapoint_id", right_index = True).\
merge(xx_to_yy_street_counts, how = 'left', left_on = "datapoint_id", right_index = True).\
merge(xx_to_yy_unit_counts, how = 'left', left_on = "datapoint_id", right_index = True).fillna(0)


del xx_to_yy_street_counts
del xx_to_yy_unit_counts

#separate the classes using logical rules
multi_check_df['class'] = np.select(
    [
        multi_check_df['land']== True,
        multi_check_df['business']== True,
        (multi_check_df['building_name']==1) & (multi_check_df['unit_id'] == 0), #this has to go infront of 'multi_check_df['xx_to_yy_unit_counts']>0'
        multi_check_df['xx_to_yy_unit_counts']>0,
        multi_check_df['street_number']>1,
        multi_check_df['unit_id']>1,
        (multi_check_df['street_number']<=1) & (multi_check_df['xx_to_yy_street_counts']<=1) & (multi_check_df['unit_id']<=1) ##This does most of the heavy lifting
    ], 
    [
        'single',
        'single',
        'single',
        'multi',
        'multi',
        'multi',
        'single',
        
    ], 
    default='unknown'
)
multi_check_df


  multi_check_df['land'] = multi_check_df['text'].str.contains(r"^(land|plot|airspace|car|parking)", case = False)
  multi_check_df['business'] = multi_check_df['text'].str.contains(r"(cinema)|(hotel)|(office)|(\bpub)|(business)|(cafe)|(^shop)|( shop)|(restaurant)|(home)|(\bstore\b)|(\bstorage\\b)|(company)|(ltd)|(limited)|(plc)", case = False)
  xx_to_yy_street_counts = all_entities['datapoint_id'][all_entities['label_text'].str.contains(
  xx_to_yy_unit_counts = all_entities['datapoint_id'][all_entities['label_text'].str.contains(


Unnamed: 0,datapoint_id,text,comma_count,land,business,building_name,city,number_filter,postcode,street_name,street_number,unit_id,unit_type,xx_to_yy_street_counts,xx_to_yy_unit_counts,class
0,0,"westleigh lodge care home, nel pan lane, leigh...",2,False,True,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,single
4,1,"flat 1, 1a canal street, manchester (m1 3he)",2,False,False,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,single
10,2,"flat 201, 1 regent road, manchester (m3 4ay)",2,False,False,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,single
16,3,"land at 2a gerard street, ashton in makerfield...",2,True,False,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,single
20,4,"unit 111, timber wharf, worsley street, manche...",3,False,False,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,single
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376984,94083,"2nd floor, 52 lime street, london (ec3m 7aw)",2,False,False,0.0,1.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,multi
376989,94084,"155 bishopsgate, london (ec2m 3ad)",1,False,False,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,single
376993,94085,"9th and 10th floors, 52 lime street, london (e...",2,False,False,0.0,1.0,0.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,multi
376999,94086,"part of tenth floor, 6 bevis marks, london (ec...",2,False,False,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,single


### Get the indexes of multi properties

In [8]:
multi_unit_id = set(multi_check_df['datapoint_id'][(multi_check_df['class']=='multi') &( multi_check_df['unit_id']>0)].tolist())
multi_property = set(multi_check_df['datapoint_id'][(multi_check_df['class']=='multi') &( multi_check_df['unit_id']==0)].tolist())
all_multi_ids = list(multi_unit_id) +list(multi_property)
multi_check_df.groupby('class').size()

class
multi      6002
single    87503
dtype: int64

# checking weird regex problems

In [10]:
number_list = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]



#these are just tests for the expansion functions

multi_id_string = "1to30"
number_list = expand_multi_id(multi_id_string)
print(number_list)
correct_numbers_even = filter_contiguous_numbers(number_list, "even")
correct_numbers_odd = filter_contiguous_numbers(number_list, "odd")
correct_numbers_all = filter_contiguous_numbers(number_list, None)
print("original list" + str(number_list), "\neven list", str(correct_numbers_even),
     "\nodd list", str(correct_numbers_odd),
     "\nall numbers", str(correct_numbers_all))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
original list[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30] 
even list [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30] 
odd list [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29] 
all numbers [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]


In [None]:
all_entities['label_text'].to_list()[0]

In [None]:
max([4,6])

## Spread label data

In [9]:
#pivot the columns so that each label class is it's own column and the value in the column is the text

temp_df = all_entities[all_entities.datapoint_id.isin(all_multi_ids)]

temp_df['index'] = temp_df.index
df = temp_df[['index', 'label', 'label_text']].pivot(index='index',columns='label',values='label_text')
#add the datapoint_id back in for each of joining
df = pd.concat([temp_df['datapoint_id'], df], axis=1).merge(temp_df[['datapoint_id', 'text']].drop_duplicates(), 
          how = "left",
          left_on = "datapoint_id", right_on = "datapoint_id")
del temp_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['index'] = temp_df.index


## Add blockers

Blockers prevent the filling of wrong information. As an example if a building is going to back fill up previous addresses it should not back fill past another street as this is highly unlikely to be the same building

In [10]:
df['building_name'][df['street_name'].notnull()] = 'block'
df['street_number'][df['street_name'].notnull()] = 'block' #for multi-flats inside a common building

#returns true if current number filter is null and the next row has street_number or unit id is not null
#prevents number filters propergsating back across roads and unit ids
number_filter_block = df['number_filter'].isnull() & (df['street_number'].shift().notnull() |df['unit_id'].shift().notnull())
df['number_filter'][number_filter_block] = 'block'

## Backfill 

Backfilling adds address information in. However, street address should only be back filled for multi addresses.
I need to work out how to do flat, which may be before or after the unit ID
Also I don't think this is a very good way of doing it at all. Using the pre-spread list is probably better and only working on the multi-addresses is probably a much faster and cleaner way. But I will have to think about how to do it

In [11]:
df['number_filter'] = df[['datapoint_id','number_filter']].groupby('datapoint_id').fillna(method ='bfill')
df['building_name'] = df[['datapoint_id','building_name']].groupby('datapoint_id').fillna(method ='bfill')
df['street_number'] = df[['datapoint_id','street_number']].groupby('datapoint_id').fillna(method ='bfill')
df['postcode'] = df[['datapoint_id','postcode']].groupby('datapoint_id').fillna(method ='bfill')
df['street_name'] = df[['datapoint_id','street_name']].groupby('datapoint_id').fillna(method ='bfill')
df['number_filter'] = df[['datapoint_id','number_filter']].groupby('datapoint_id').fillna(method ='bfill')
df['city'] = df[['datapoint_id','city']].groupby('datapoint_id').fillna(method ='bfill')
df['unit_type'] = df[['datapoint_id','unit_type']].groupby('datapoint_id').fillna(method ='bfill')

In [12]:
df[df['text'].str.contains('salisbury square')]

Unnamed: 0,datapoint_id,building_name,city,number_filter,postcode,street_name,street_number,unit_id,unit_type,text


In [13]:
df

Unnamed: 0,datapoint_id,building_name,city,number_filter,postcode,street_name,street_number,unit_id,unit_type,text
0,13,block,,even,,miller way,2-24,,,"2-24 (even) miller way, 15-25 hammonds drive (..."
1,13,block,,even,,miller way,block,,,"2-24 (even) miller way, 15-25 hammonds drive (..."
2,13,block,,block,,miller way,block,,,"2-24 (even) miller way, 15-25 hammonds drive (..."
3,13,block,,block,,fengate,15-25,,,"2-24 (even) miller way, 15-25 hammonds drive (..."
4,13,block,,odd,,fengate,1-19,,,"2-24 (even) miller way, 15-25 hammonds drive (..."
...,...,...,...,...,...,...,...,...,...,...
39735,94085,block,london,block,ec3m 7af,lime street,10,,,"9th and 10th floors, 52 lime street, london (e..."
39736,94085,block,london,block,ec3m 7af,lime street,52,,,"9th and 10th floors, 52 lime street, london (e..."
39737,94085,block,london,block,ec3m 7af,lime street,block,,,"9th and 10th floors, 52 lime street, london (e..."
39738,94085,,london,block,ec3m 7af,,,,,"9th and 10th floors, 52 lime street, london (e..."


In [14]:
expanded_street = df[df.datapoint_id.isin(multi_property) & df.street_number.str.contains(xx_to_yy_regex)].reset_index()
expanded_unit_id = df[df.datapoint_id.isin(multi_unit_id) & df.unit_id.str.contains(xx_to_yy_regex)].reset_index()

expanded_street = expand_dataframe_numbers(expanded_street, column_name = "street_number" )
expanded_unit_id = expand_dataframe_numbers(expanded_unit_id, column_name = "unit_id" )

  expanded_street = df[df.datapoint_id.isin(multi_property) & df.street_number.str.contains(xx_to_yy_regex)].reset_index()
  expanded_unit_id = df[df.datapoint_id.isin(multi_unit_id) & df.unit_id.str.contains(xx_to_yy_regex)].reset_index()


i= 0  expand time,0.001 filter time0.0 make_dataframe_time 0.001
i= 1000  expand time,0.163 filter time0.106 make_dataframe_time 0.805
i= 2000  expand time,0.323 filter time0.213 make_dataframe_time 1.609
i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
i= 1000  expand time,0.157 filter time0.101 make_dataframe_time 0.87
i= 2000  expand time,0.31 filter time0.202 make_dataframe_time 1.678


In [15]:
#unit id and street number that does does not have the xx to yy format and so has already been expanded by spreaing and backfilling
expanded_street_simple = df[df.datapoint_id.isin(multi_property) & (df.street_number.str.contains(xx_to_yy_regex)==False) & (df.street_number!='block')].reset_index()
expanded_unit_id_simple = df[df.datapoint_id.isin(multi_unit_id) & (df.unit_id.str.contains(xx_to_yy_regex)==False) & (df.unit_id!='block')].reset_index()

  expanded_street_simple = df[df.datapoint_id.isin(multi_property) & (df.street_number.str.contains(xx_to_yy_regex)==False) & (df.street_number!='block')].reset_index()
  expanded_unit_id_simple = df[df.datapoint_id.isin(multi_unit_id) & (df.unit_id.str.contains(xx_to_yy_regex)==False) & (df.unit_id!='block')].reset_index()


In [16]:
#pivot the columns so that each label class is it's own column and the value in the column is the text

#remove the multi-addresses
single_address_only =all_entities[~all_entities['datapoint_id'].isin(all_multi_ids)]
#remove all but the first instance of a label in the remaining instances
#this is because for single addresses there should be only a single label for each class
single_address_only =single_address_only[single_address_only['label_id_count']==0]
df2 = single_address_only.pivot(index='datapoint_id',columns='label',values='label_text')
#add the datapoint_id back in for each of joining
df2 = df2.merge(single_address_only[['datapoint_id', 'text']].drop_duplicates(), 
          how = "left",
          left_on = "datapoint_id", right_on = "datapoint_id")

df2

  uniques = Index(uniques)


Unnamed: 0,datapoint_id,building_name,city,number_filter,postcode,street_name,street_number,unit_id,unit_type,text
0,0,westleigh lodge care home,leigh,,wn7 5jt,nel pan lane,,,,"westleigh lodge care home, nel pan lane, leigh..."
1,1,,manchester,,m1 3he,canal street,1a,1,flat,"flat 1, 1a canal street, manchester (m1 3he)"
2,2,,manchester,,m3 4ay,regent road,1,201,flat,"flat 201, 1 regent road, manchester (m3 4ay)"
3,3,,wigan,,wn4 9aa,gerard street,,,land,"land at 2a gerard street, ashton in makerfield..."
4,4,,manchester,,m15 4nz,worsley street,,111,unit,"unit 111, timber wharf, worsley street, manche..."
...,...,...,...,...,...,...,...,...,...,...
87498,94081,heritage tower,london,,e14 3nw,east ferry road,118,807,flat,"flat 807, heritage tower, 118 east ferry road,..."
87499,94082,ormond house,london,,ec4n 4ua,queen victoria street,4t,,,"4th floor, ormond house, 63 queen victoria str..."
87500,94084,,london,,ec2m 3ad,bishopsgate,155,,,"155 bishopsgate, london (ec2m 3ad)"
87501,94086,,london,,ec3a 7ba,,,,,"part of tenth floor, 6 bevis marks, london (ec..."


In [17]:
full_expanded_data = pd.concat([expanded_street, 
           expanded_unit_id, 
           expanded_street_simple, 
           expanded_unit_id_simple, 
           df2, ])

In [18]:
ocod_data =  pd.read_csv('/tf/empty_homes_data/' +
                    'OCOD_FULL_2022_02.csv',
                   encoding_errors= 'ignore').rename(columns = lambda x: x.lower().replace(" ", "_"))
#empty addresses cannot be used. however there are only three so not a problem
ocod_data = ocod_data.dropna(subset = 'property_address')
ocod_data.reset_index(inplace = True, drop = True)
ocod_data = ocod_data[['title_number', 'tenure', 'district', 'county',
       'region', 'multiple_address_indicator', 'price_paid', 'property_address']]

  ocod_data =  pd.read_csv('/tf/empty_homes_data/' +


In [19]:
full_expanded_data = full_expanded_data.merge(ocod_data, how = "left", left_on = "datapoint_id", right_index = True)
full_expanded_data['property_address'].str.lower().equals(full_expanded_data['text']) #This shows the match works

True

In [20]:
full_expanded_data['within_title_id'] = full_expanded_data.groupby('title_number').cumcount()+1
full_expanded_data['unique_id'] = [str(x) + '-' + str(y) for x, y in zip(full_expanded_data['title_number'], full_expanded_data['within_title_id'])]

tmp_df =((full_expanded_data[['title_number', 'within_title_id']].groupby('title_number').max('within_title_id'))>1)
tmp_df.columns = tmp_df.columns.str.replace('within_title_id', 'within_larger_title') #could also be called nested_address
full_expanded_data = full_expanded_data.merge(tmp_df, how = "left", left_on = "title_number", right_index = True)


full_expanded_data['postcode'] =full_expanded_data['postcode'].str.upper()
del tmp_df

#re-order the columns and drop columns that are not needed

full_expanded_data =full_expanded_data[['title_number', 'within_title_id', 'unique_id', 'within_larger_title',  'tenure','unit_id', 'unit_type','building_name','street_number', 'street_name', 'postcode','city',  'district', 'county', 'region',
       'multiple_address_indicator', 'price_paid' ,'property_address']].replace('block', np.NaN)

#save as CSV
full_expanded_data.to_csv("/tf/empty_homes_data/OCOD_cleaned_expanded.csv")

In [21]:
full_expanded_data[full_expanded_data['street_name'].isnull()]['property_address'].to_csv('/tf/empty_homes_data/street_is_null.csv')

In [22]:
full_expanded_data[full_expanded_data.within_title_id==full_expanded_data.within_title_id.max()].reset_index()['property_address'][0]


'Ground to ninth Floor Flats being 101-114, 201-214, 301-314, 401-414, 501-514, 601-613 and 701-704 Alaska Building, 101-114, 201-214,301-314, 401-412, 501-506 and 601-605 Arizona Building, 101-114, 201-214, 301-314, 401-414, 501-514, 601-614, 701-708, 801-804, 901-903 California Building, 101-108,     201-208, 301-307, 401-408, 501-508, 601-608, 701-708, 801-808 and 901-903 Colorado Building, 1-4, 101-109, 201-210, 301-310, 401-410, 501-510 and 601-605 Dakota Building, 1-7, 101-108, 201-208, 301-308, 401-408, 501-506 and 601-604 Idaho Building, 102-112, 201-212, 301-312, 401-412, 501-508 and 601-604 Indiana Building, 1-15, 101-116, 201-216, 301-315, 401-416, 501-510 Montana Building, 101-108, 201-208, 301-308, 401-408, 501-506 and 601-604 Nebraska Building, 1-10, 101-110, 201-210, 301-310 and 402-403 Utah Building, 1-10 and 101-110 Boston Building, 1-6, 101-106, 201-206, 301-306, 401-408 and 501-507 Madison Building, Deals Gateway, London'

In [1]:
300000*700

210000000