In [465]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Introduction
* Raw CSV loaded and lightly processed. Output: two column csv columns, property address, flat tag
* Data labelled in programmatic. Output: json file of entities.
* Data programmatic output json cleaned ordered and overlaps removed. Output: json file
* **Clean json converted to dataframe and multi-addresses expanded.** Output: CSV
* Count and locate addresses
* Create address matcher and match businesses
* Classify address types

This notebook is used so that I can create the code necessary to expand the addresses so that a single property/dwelling is a single line. This means the data will follow the tidy data principles of one observation per line where an observation is what is commonly thought of as a property.

In [466]:
import json
import pandas as pd
import re
import numpy as np
import time #for profiling slow functions

In [467]:

with open('/tf/empty_homes_data/full_dataset_no_overlaps.json', "r") as read_file:
    all_entities_json = json.load(read_file)


In [468]:
all_entities_json[5954]

{'text': '4-22 (even) gemini road, 2-12 (even) saturn grove and land lying to the north west of saturn grove, salford (m6 6ha)',
 'labels': [{'start': 0,
   'end': 4,
   'label': 'street_number',
   'label_text': '4-22'},
  {'start': 6, 'end': 10, 'label': 'number_filter', 'label_text': 'even'},
  {'start': 12,
   'end': 23,
   'label': 'street_name',
   'label_text': 'gemini road'},
  {'start': 25, 'end': 29, 'label': 'street_number', 'label_text': '2-12'},
  {'start': 31, 'end': 35, 'label': 'number_filter', 'label_text': 'even'},
  {'start': 37,
   'end': 49,
   'label': 'street_name',
   'label_text': 'saturn grove'},
  {'start': 86,
   'end': 98,
   'label': 'street_name',
   'label_text': 'saturn grove'},
  {'start': 100, 'end': 107, 'label': 'city', 'label_text': 'salford'},
  {'start': 109, 'end': 115, 'label': 'postcode', 'label_text': 'm6 6ha'}],
 'datapoint_id': 5991}

In [469]:
all_entities = pd.json_normalize(all_entities_json, record_path = "labels",
                       meta = ['datapoint_id', 'text'])

all_entities['label_id_count'] = all_entities.groupby(['datapoint_id', 'label']).cumcount()

## Example of the data frame of labels

In [470]:
all_entities

Unnamed: 0,start,end,label,label_text,datapoint_id,text,label_id_count
0,27,39,street_name,nel pan lane,0,"westleigh lodge care home, nel pan lane, leigh...",0
1,41,46,city,leigh,0,"westleigh lodge care home, nel pan lane, leigh...",0
2,48,55,postcode,wn7 5jt,0,"westleigh lodge care home, nel pan lane, leigh...",0
3,0,4,unit_type,flat,1,"flat 1, 1a canal street, manchester (m1 3he)",0
4,5,6,unit_id,1,1,"flat 1, 1a canal street, manchester (m1 3he)",0
...,...,...,...,...,...,...,...
370057,36,42,city,london,94086,"part of tenth floor, 6 bevis marks, london (ec...",0
370058,44,52,postcode,ec3a 7ba,94086,"part of tenth floor, 6 bevis marks, london (ec...",0
370059,46,63,street_name,south quay square,94087,"storage 17, discovery dock apartments east, 3 ...",0
370060,65,71,city,london,94087,"storage 17, discovery dock apartments east, 3 ...",0


In [505]:
all_entities[all_entities['datapoint_id']==613].reset_index()['text'][0]

'flat 2, 131-139 the broadway, london (sw19 1qj)'

## This chunk is supposed to help only keep truly multi addresses

In [472]:
entity_counts = all_entities.groupby('datapoint_id').size().to_frame(name = "entity_counts")
unit_id_counts = all_entities[all_entities["label"]=="unit_id"].groupby('datapoint_id').size().to_frame(name = "unit_id_counts")
building_counts = all_entities[all_entities["label"]=="building_name"].groupby('datapoint_id').size().to_frame(name = "building_counts")
street_counts = all_entities[all_entities["label"]=="street_name"].groupby('datapoint_id').size().to_frame(name = "street_counts")

#counts the number of times the label unit_id occurs. does not count the total number of units
multi_unit_id_counts = all_entities[(all_entities["label"]=="unit_id") & (all_entities['label_text'].str.contains(r'\d+(\s)?(-|to)(\s)?\d+'))].groupby('datapoint_id').size().to_frame(name = "multi_unit_id_counts")
street_number_counts = all_entities[(all_entities["label"]=="street_number") & (all_entities['label_text'].str.contains(r'\d+(\s)?(-|to)(\s)?\d+'))].groupby('datapoint_id').size().to_frame(name = "street_number_counts")


test = pd.merge(entity_counts, unit_id_counts, left_index=True, right_index=True, how = "left")

test = pd.merge(test, building_counts, left_index=True, right_index=True, how = "left")
test = pd.merge(test, street_counts, left_index=True, right_index=True, how = "left")
test = pd.merge(test, multi_unit_id_counts, left_index=True, right_index=True, how = "left")
test = pd.merge(test, street_number_counts, left_index=True, right_index=True, how = "left").fillna(0)


test['is_multi'] = (test['unit_id_counts']>1) | (test['building_counts']>1) | (test['street_counts']>1) |(test['multi_unit_id_counts'] >0) | ((test['street_number_counts']>0) & (test['building_counts'] !=1) )

#the index of multi addresses
multi_address_ids = test[test['is_multi']==True].index.values.tolist()

  multi_unit_id_counts = all_entities[(all_entities["label"]=="unit_id") & (all_entities['label_text'].str.contains(r'\d+(\s)?(-|to)(\s)?\d+'))].groupby('datapoint_id').size().to_frame(name = "multi_unit_id_counts")
  street_number_counts = all_entities[(all_entities["label"]=="street_number") & (all_entities['label_text'].str.contains(r'\d+(\s)?(-|to)(\s)?\d+'))].groupby('datapoint_id').size().to_frame(name = "street_number_counts")


Unnamed: 0,0,1,2,3,4
0,,,,,
1,,,,,
2,,,,,
3,,,,,
4,,,,,
...,...,...,...,...,...
370057,,,,,
370058,,,,,
370059,,,,,
370060,,,,,


In [473]:
all_entities.loc[multi_address_ids]

Unnamed: 0,start,end,label,label_text,datapoint_id,text,label_id_count
13,25,35,city,manchester,2,"flat 201, 1 regent road, manchester (m3 4ay)",0
38,8,25,building_name,queensgate centre,9,"land at queensgate centre, peterborough",0
46,33,40,postcode,pe2 8ns,11,"121 hawksbill way, peterborough (pe2 8ns)",0
48,3,15,street_name,beluga close,12,"44 beluga close, peterborough (pe2 8ne)",0
56,46,49,number_filter,odd,13,"2-24 (even) miller way, 15-25 hammonds drive (...",1
...,...,...,...,...,...,...,...
94028,46,53,postcode,e18 1nb,23890,"122 onslow gardens, london and garden ground (...",0
94046,42,49,postcode,sw3 1pu,23894,"flat 6, 2 and 3 beaufort gardens, london (sw3 ...",0
94048,3,18,street_name,coulthard drive,23895,"21 coulthard drive, breage, helston (tr13 9pf)",0
94057,27,35,postcode,de22 1dt,23897,"5 the hill, darley abbey, (de22 1dt)",0


# checking weird regex problems

In [474]:
all_entities[all_entities['text'].str.contains(r"house")]

Unnamed: 0,start,end,label,label_text,datapoint_id,text,label_id_count
202,0,1,street_number,1,48,"1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 18,...",0
203,3,4,street_number,2,48,"1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 18,...",1
204,6,7,street_number,3,48,"1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 18,...",2
205,9,10,street_number,4,48,"1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 18,...",3
206,12,13,street_number,5,48,"1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 18,...",4
...,...,...,...,...,...,...,...
370037,0,2,street_number,4t,94082,"4th floor, ormond house, 63 queen victoria str...",0
370038,11,23,building_name,ormond house,94082,"4th floor, ormond house, 63 queen victoria str...",0
370039,28,49,street_name,queen victoria street,94082,"4th floor, ormond house, 63 queen victoria str...",0
370040,51,57,city,london,94082,"4th floor, ormond house, 63 queen victoria str...",0


In [475]:
all_entities['text'][all_entities['text'].str.contains("58 to 66")]

221975    58 to 66 (even numbers) great portland street,...
221976    58 to 66 (even numbers) great portland street,...
221977    58 to 66 (even numbers) great portland street,...
221978    58 to 66 (even numbers) great portland street,...
221979    58 to 66 (even numbers) great portland street,...
221980    58 to 66 (even numbers) great portland street,...
221981    58 to 66 (even numbers) great portland street,...
221982    58 to 66 (even numbers) great portland street,...
221983    58 to 66 (even numbers) great portland street,...
221984    58 to 66 (even numbers) great portland street,...
Name: text, dtype: object

In [498]:

number_filter = None

number_list = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]

def expand_multi_id(multi_id_string):
    #the function takes a string that is in the form '\d+(\s)?(-|to)(\s)?\d+'
    #and outputs a continguous list of numbers between the two numbers in the string
    multi_id_list = [int(x) for x in re.findall(r'\d+', multi_id_string)]
    #min and max has to be used becuase somtimes the numbers are in descending order 4-3... I don't know why someone would do that
    out = list(range(min(multi_id_list), max(multi_id_list)+1))
    return(out)

def filter_contiguous_numbers(number_list, number_filter):
    #this function filters a list of contiguous house numbers/unit_id's to be even, odd, or unchanged
    #it takes as an argument a list of integers and a filter condition.
    #these values are contained in the label dictionary and reformated dataframe
    #The function ouputs the correct list of integers according to the filter condition

    if number_filter == 'odd':
        out = [ x for x in number_list if x%2==1]
    elif number_filter == 'even':
        out = [ x for x in number_list if x%2==0]
    else:
        out = number_list
    return out

multi_id_string = "1to30"
number_list = expand_multi_id(multi_id_string)
print(number_list)
correct_numbers_even = filter_contiguous_numbers(number_list, "even")
correct_numbers_odd = filter_contiguous_numbers(number_list, "odd")
correct_numbers_all = filter_contiguous_numbers(number_list, None)
print("original list" + str(number_list), "\neven list", str(correct_numbers_even),
     "\nodd list", str(correct_numbers_odd),
     "\nall numbers", str(correct_numbers_all))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
original list[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30] 
even list [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30] 
odd list [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29] 
all numbers [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]


In [477]:
all_entities['label_text'].to_list()[0]

'nel pan lane'

In [497]:
max([4,6])

6

## Spread label data

In [515]:
#pivot the columns so that each label class is it's own column and the value in the column is the text
all_entities['index'] = all_entities.index
df = all_entities[['index', 'label', 'label_text']].pivot(index='index',columns='label',values='label_text')
#add the datapoint_id back in for each of joining
df = pd.concat([all_entities['datapoint_id'], df], axis=1).merge(all_entities[['datapoint_id', 'text']].drop_duplicates(), 
          how = "left",
          left_on = "datapoint_id", right_on = "datapoint_id")
df

Unnamed: 0,datapoint_id,building_name,city,number_filter,postcode,street_name,street_number,unit_id,unit_type,text
0,0,,,,,nel pan lane,,,,"westleigh lodge care home, nel pan lane, leigh..."
1,0,,leigh,,,,,,,"westleigh lodge care home, nel pan lane, leigh..."
2,0,,,,wn7 5jt,,,,,"westleigh lodge care home, nel pan lane, leigh..."
3,1,,,,,,,,flat,"flat 1, 1a canal street, manchester (m1 3he)"
4,1,,,,,,,1,,"flat 1, 1a canal street, manchester (m1 3he)"
...,...,...,...,...,...,...,...,...,...,...
370057,94086,,london,,,,,,,"part of tenth floor, 6 bevis marks, london (ec..."
370058,94086,,,,ec3a 7ba,,,,,"part of tenth floor, 6 bevis marks, london (ec..."
370059,94087,,,,,south quay square,,,,"storage 17, discovery dock apartments east, 3 ..."
370060,94087,,london,,,,,,,"storage 17, discovery dock apartments east, 3 ..."


## Add blockers

Blockers prevent the filling of wrong information. As an example if a building is going to back fill up previous addresses it should not back fill past another street as this is highly unlikely to be the same building

In [480]:
df['building_name'][df['street_name'].notnull()] = 'block'
df['street_number'][df['street_name'].notnull()] = 'block' #for multi-flats inside a common building

#returns true if current number filter is null and the next row has street_number or unit id is not null
#prevents number filters propergsating back across roads and unit ids
number_filter_block = df['number_filter'].isnull() & (df['street_number'].shift().notnull() |df['unit_id'].shift().notnull())
df['number_filter'][number_filter_block] = 'block'

0         False
1         False
2         False
3         False
4         False
          ...  
357947    False
357948    False
357949     True
357950    False
357951    False
Length: 357952, dtype: bool

## Backfill 

Backfilling adds address information in. However, street address should only be back filled for multi addresses.
I need to work out how to do flat, which may be before or after the unit ID
Also I don't think this is a very good way of doing it at all. Using the pre-spread list is probably better and only working on the multi-addresses is probably a much faster and cleaner way. But I will have to think about how to do it

In [481]:
df['number_filter'] = df[['datapoint_id','number_filter']].groupby('datapoint_id').fillna(method ='bfill')
df['building_name'] = df[['datapoint_id','building_name']].groupby('datapoint_id').fillna(method ='bfill')
df['street_number'] = df[['datapoint_id','street_number']].groupby('datapoint_id').fillna(method ='bfill')
df['postcode'] = df[['datapoint_id','postcode']].groupby('datapoint_id').fillna(method ='bfill')
df['street_name'] = df[['datapoint_id','street_name']].groupby('datapoint_id').fillna(method ='bfill')
df['number_filter'] = df[['datapoint_id','number_filter']].groupby('datapoint_id').fillna(method ='bfill')
df['city'] = df[['datapoint_id','city']].groupby('datapoint_id').fillna(method ='bfill')

In [482]:
#The regex that identifies which are multi rows. It is important not to select buildings that simply take up multiple 
#street numbers
xx_to_yy_regex = r'^\d+(\s)?(-|to)(\s)?\d+$' #the start and end symbol prrevent things like 1-0-3 messing everything up
multi_unit_rows = (df['unit_id'].str.contains(xx_to_yy_regex) & df['unit_id'].notnull())
multi_building_rows = (df['street_number'].str.contains(xx_to_yy_regex) & 
                                         (df['unit_id'].isnull() | 
                                          df['building_name'].isnull()|
                                          (df['building_name']=='block')
                                         ))

  multi_unit_rows = (df['unit_id'].str.contains(xx_to_yy_regex) & df['unit_id'].notnull())
  multi_building_rows = (df['street_number'].str.contains(xx_to_yy_regex) &


In [520]:
def expand_dataframe_numbers(df2, column_name, print_every = 1000, min_count = 1):
    
    temp_list = []
    expand_time = 0
    filter_time = 0
    make_dataframe_time = 0
    
    for i in range(0, df2.shape[0]):
        
                
        start_expand_time = time.time()
        numbers_list = expand_multi_id(df2.loc[i][column_name])
        end_expand_time = time.time()

        numbers_list = filter_contiguous_numbers(numbers_list, df2.loc[i]['number_filter'])

        end_filter_time = time.time()
        
        dataframe_len = len(numbers_list)
        
        #This prevents large properties counting as several properties
        if dataframe_len>min_count:
            tmp = pd.concat([df2.iloc[i].to_frame().T]*dataframe_len, ignore_index=True)
            
            tmp[column_name] = numbers_list
        else:
            tmp = df2.iloc[i].to_frame().T
            
        temp_list.append(tmp)
        end_make_dataframe_time = time.time()
        
        expand_time = expand_time + (end_expand_time - start_expand_time)
        filter_time =filter_time + (end_filter_time - end_expand_time)
        make_dataframe_time = make_dataframe_time +(end_make_dataframe_time - end_filter_time)
        
        if i%print_every==0: print("i=", i, " expand time,"+ str(round(expand_time, 3)) +
                           " filter time" + str(round(filter_time,3)) + 
                           " make_dataframe_time " + str(round(make_dataframe_time,3)))
    
    #once all the lines have been expanded concatenate them into a single dataframe
    start_concat_time = time.time()
    out = pd.concat(temp_list)
    end_concat_time = time.time

    return out

In [499]:
df_unit_expand = df.loc[np.where(multi_unit_rows)].reset_index()
#is multi street address but is not a flat. This is horribly flawed but at least it is a start. 
df_street_number = df.loc[np.where(multi_building_rows)].reset_index()

In [486]:
df_unit_expand.to_csv("/tf/empty_homes_data/test.csv")

In [521]:
expanded_street = expand_dataframe_numbers(df_street_number, column_name = "street_number", min_count = 4 )
expanded_unit_id = expand_dataframe_numbers(df_unit_expand, column_name = "unit_id" )

i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.002
i= 1000  expand time,0.205 filter time0.121 make_dataframe_time 0.805
i= 2000  expand time,0.395 filter time0.244 make_dataframe_time 1.546
i= 3000  expand time,0.571 filter time0.357 make_dataframe_time 2.266
i= 4000  expand time,0.753 filter time0.472 make_dataframe_time 2.993
i= 5000  expand time,0.925 filter time0.587 make_dataframe_time 4.874
i= 6000  expand time,1.092 filter time0.7 make_dataframe_time 5.592
i= 7000  expand time,1.265 filter time0.815 make_dataframe_time 6.33
i= 8000  expand time,1.444 filter time0.935 make_dataframe_time 7.06
i= 9000  expand time,1.616 filter time1.053 make_dataframe_time 7.769
i= 10000  expand time,1.791 filter time1.167 make_dataframe_time 8.705
i= 11000  expand time,1.963 filter time1.281 make_dataframe_time 9.435
i= 12000  expand time,2.129 filter time1.392 make_dataframe_time 10.119
i= 13000  expand time,2.3 filter time1.505 make_dataframe_time 10.827
i= 0  expand time,0.0 filte

In [522]:

#save to csv to quality check
expanded_unit_id.to_csv("/tf/empty_homes_data/expanded_unit.csv")

expanded_unit_id

Unnamed: 0,index,datapoint_id,building_name,city,number_filter,postcode,street_name,street_number,unit_id,unit_type
0,894,223,block,,block,,draybank road,1,1,
1,894,223,block,,block,,draybank road,1,2,
2,894,223,block,,block,,draybank road,1,3,
3,894,223,block,,block,,draybank road,1,4,
4,894,223,block,,block,,draybank road,1,5,
...,...,...,...,...,...,...,...,...,...,...
8,369687,93996,block,bushey,block,wd23 4jd,fuller close,42 to 55,24,
9,369687,93996,block,bushey,block,wd23 4jd,fuller close,42 to 55,25,
10,369687,93996,block,bushey,block,wd23 4jd,fuller close,42 to 55,26,
11,369687,93996,block,bushey,block,wd23 4jd,fuller close,42 to 55,27,


In [523]:
#np.where((unit_rows | multi_building_rows))

#df[(multi_unit_rows==False) | (multi_building_rows==False)]

multiaddress_datapoint_id = pd.concat([expanded_unit_id,expanded_street])['datapoint_id'].unique()
expanded_street


Unnamed: 0,index,datapoint_id,building_name,city,number_filter,postcode,street_name,street_number,unit_id,unit_type
0,51,13,block,,even,,miller way,2,,
1,51,13,block,,even,,miller way,4,,
2,51,13,block,,even,,miller way,6,,
3,51,13,block,,even,,miller way,8,,
4,51,13,block,,even,,miller way,10,,
...,...,...,...,...,...,...,...,...,...,...
15,369880,94048,block,doncaster,even,dn4 5pz,windermere drive,44,,
16,369880,94048,block,doncaster,even,dn4 5pz,windermere drive,46,,
17,369880,94048,block,doncaster,even,dn4 5pz,windermere drive,48,,
18,369880,94048,block,doncaster,even,dn4 5pz,windermere drive,50,,


In [509]:
#pivot the columns so that each label class is it's own column and the value in the column is the text

#remove the multi-addresses
single_address_only =all_entities[~all_entities['datapoint_id'].isin(multiaddress_datapoint_id)]
#remove all but the first instance of a label in the remaining instances
#this is because for single addresses there should be only a single label for each class
single_address_only =single_address_only[single_address_only['label_id_count']==0]
df2 = single_address_only.pivot(index='datapoint_id',columns='label',values='label_text')
#add the datapoint_id back in for each of joining
df2.merge(single_address_only[['datapoint_id', 'text']].drop_duplicates(), 
          how = "left",
          left_on = "datapoint_id", right_on = "datapoint_id")

  uniques = Index(uniques)


Unnamed: 0,datapoint_id,building_name,city,number_filter,postcode,street_name,street_number,unit_id,unit_type,text
0,0,,leigh,,wn7 5jt,nel pan lane,,,,"westleigh lodge care home, nel pan lane, leigh..."
1,1,,manchester,,m1 3he,canal street,1a,1,flat,"flat 1, 1a canal street, manchester (m1 3he)"
2,2,,manchester,,m3 4ay,regent road,1,201,flat,"flat 201, 1 regent road, manchester (m3 4ay)"
3,3,,wigan,,wn4 9aa,gerard street,,,land,"land at 2a gerard street, ashton in makerfield..."
4,4,,manchester,,m15 4nz,worsley street,,111,unit,"unit 111, timber wharf, worsley street, manche..."
...,...,...,...,...,...,...,...,...,...,...
84924,94083,,london,,ec3m 7aw,lime street,2n,,,"2nd floor, 52 lime street, london (ec3m 7aw)"
84925,94084,,london,,ec2m 3ad,bishopsgate,155,,,"155 bishopsgate, london (ec2m 3ad)"
84926,94085,,london,,ec3m 7af,lime street,9t,,,"9th and 10th floors, 52 lime street, london (e..."
84927,94086,,london,,ec3a 7ba,,,,,"part of tenth floor, 6 bevis marks, london (ec..."


In [513]:
pd.concat([expanded_street, expanded_unit_id, df2]).merge(all_entities[['datapoint_id', 'text']].drop_duplicates(), 
          how = "left",
          left_on = "datapoint_id", right_on = "datapoint_id").to_csv("/tf/empty_homes_data/expanded_parsed.csv")