# Analysing the cleaned OCOD dataset


In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
import pandas as pd
import numpy as np
import re
import io
import zipfile
from helper_functions import *



In [2]:
#!pip install thefuzz[speedup]
from thefuzz import process #an alternative could jellyfish, nltk, diff-match-patch

In [3]:
target_post_area = "Data/ONSPD_NOV_2021_UK.csv"

with zipfile.ZipFile("/tf/empty_homes_data/" + "ONSPD_NOV_2021_UK.zip") as zf:
    with io.TextIOWrapper(zf.open(target_post_area), encoding = 'latin-1') as f:
        postcode_district_lookup = pd.read_csv(f)[['pcds','oslaua','lsoa11', 'msoa11', 'ctry']]
        postcode_district_lookup = postcode_district_lookup[(postcode_district_lookup['ctry'] == 'E92000001') | (postcode_district_lookup['ctry'] == 'W92000004')]
        postcode_district_lookup.rename(columns = {'pcds':'postcode2',
                                                  'oslaua':'lad11cd'}, inplace = True)
        #spaces are removed because I don't know if the formatting is the same in the two datasets
        postcode_district_lookup['postcode2']= postcode_district_lookup['postcode2'].str.lower().str.replace(r"\s", r"", regex = True)
        postcode_district_lookup.drop('ctry',  axis =1, inplace = True)


  postcode_district_lookup = pd.read_csv(f)[['pcds','oslaua','lsoa11', 'msoa11', 'ctry']]


In [4]:
ocod_data =  pd.read_csv("/tf/empty_homes_data/OCOD_cleaned_expanded.csv")

##add in the geographic area data like lsoa etc
ocod_data['postcode2'] = ocod_data['postcode'].str.lower().str.replace("\s", "")

ocod_data = ocod_data.merge(postcode_district_lookup, 'left', left_on = "postcode2", right_on = "postcode2")

  ocod_data['postcode2'] = ocod_data['postcode'].str.lower().str.replace("\s", "")


In [5]:
ocod_data.columns

Index(['Unnamed: 0', 'title_number', 'within_title_id', 'unique_id',
       'within_larger_title', 'tenure', 'unit_id', 'unit_type',
       'building_name', 'street_number', 'street_name', 'postcode', 'city',
       'district', 'county', 'region', 'multiple_address_indicator',
       'price_paid', 'property_address', 'postcode2', 'lad11cd', 'lsoa11',
       'msoa11'],
      dtype='object')

## Using price paid data to match names

The land registry does not use standardised LAD codes or names and 
the LAD names it uses appear to be wrong sometimes. I need to know the LADs so that I only try road matching within local authorities to minimise the chance of having the same road twice. To get around this I will use the substantially larger database of the price paid data to get all the land registry district names and match them to the onsp using the postcodes. This works as there are a large number of sales in each district most of them will have a postcode. There are cases where the wrong district or postcode is applied meaning a single district name can have two or more lad11cd's, to solve this I simply take the lad11cd with the largest number of counts.

The resulting OCOD data frame has a LAD11CD for each entry, and thus allows the fuzzy road matching to work effectively

In [6]:
#https://www.gov.uk/guidance/about-the-price-paid-data#explanations-of-column-headers-in-the-ppd
price_paid_headers = ['Transaction unique identifier', 'Price', 'Date of Transfer', 'Postcode', 'Property Type', 
                     'Old New', 'Duration', 'PAON', 'SAON', 'Street',  'Locality', 'Town', 'District', 'County',
                     'PPD Category Type', 'Record Status - monthly file only']
#clean up to make working with them easier
price_paid_headers = [x.lower().replace(' ', '_') for x in price_paid_headers]

price_paid_df = pd.read_csv('/tf/empty_homes_data/price_paid_files/pp-2021.csv', names = price_paid_headers)

price_paid_df['postcode2'] = price_paid_df['postcode'].str.lower().str.replace(r"\s", r"", regex=True)

price_paid_df = price_paid_df.merge(postcode_district_lookup, 'left', left_on = "postcode2", right_on = "postcode2")

#when there are multiples take the lad11cd with the largest number of counts
lad_lookup = price_paid_df[['district', 'lad11cd']].dropna().groupby(['district', 'lad11cd']).size().reset_index()
lad_lookup.rename(columns = {0:'counts'}, inplace = True)
lad_lookup = lad_lookup.sort_values('counts', ascending=False).groupby('lad11cd').first().reset_index()
lad_lookup.drop('counts', axis = 1, inplace = True)

temp = ocod_data
temp = temp.drop('lad11cd', axis = 1)

temp = temp[temp['postcode'].isna()]

temp = temp.merge(lad_lookup, left_on = "district", right_on = "district")

temp['lad11cd'].isna().sum() #there are no na values showing all districts now have a lad code

#join the ocod data back together again
ocod_data = pd.concat( [temp, ocod_data[~ocod_data['postcode'].isna()]])

#tidy up
del temp
del lad_lookup
del price_paid_df

In [7]:
ocod_data

Unnamed: 0.1,Unnamed: 0,title_number,within_title_id,unique_id,within_larger_title,tenure,unit_id,unit_type,building_name,street_number,...,district,county,region,multiple_address_indicator,price_paid,property_address,postcode2,lsoa11,msoa11,lad11cd
0,0,CB400630,1,CB400630-1,True,Freehold,,,,2,...,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,N,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (...",,,,E06000031
1,1,CB400630,2,CB400630-2,True,Freehold,,,,4,...,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,N,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (...",,,,E06000031
2,2,CB400630,3,CB400630-3,True,Freehold,,,,6,...,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,N,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (...",,,,E06000031
3,3,CB400630,4,CB400630-4,True,Freehold,,,,8,...,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,N,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (...",,,,E06000031
4,4,CB400630,5,CB400630-5,True,Freehold,,,,10,...,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,N,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (...",,,,E06000031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162119,87498,AGL465007,1,AGL465007-1,False,Leasehold,807,flat,heritage tower,118,...,TOWER HAMLETS,GREATER LONDON,GREATER LONDON,N,479950.0,"Flat 807, Heritage Tower, 118 East Ferry Road,...",e143nw,E01004215,E02000891,E09000030
162120,87499,AGL465195,1,AGL465195-1,False,Leasehold,,,ormond house,4t,...,CITY OF LONDON,GREATER LONDON,GREATER LONDON,N,,"4th Floor, Ormond House, 63 Queen Victoria Str...",ec4n4ua,E01032739,E02000001,E09000001
162121,87500,AGL473126,1,AGL473126-1,False,Leasehold,,,,155,...,CITY OF LONDON,GREATER LONDON,GREATER LONDON,N,,"155 Bishopsgate, London (EC2M 3AD)",ec2m3ad,E01032739,E02000001,E09000001
162122,87501,AGL475468,1,AGL475468-1,False,Leasehold,,,,,...,CITY OF LONDON,GREATER LONDON,GREATER LONDON,N,,"part of Tenth Floor, 6 Bevis Marks, London (EC...",ec3a7ba,E01032739,E02000001,E09000001


In [8]:
ocod_data.groupby('unit_type').size()

unit_type
airspace              1551
apartment             5188
business               321
cafe                     6
car park space          51
car parking space      441
cinema                   8
flat                 14302
garage                 397
hotel                   75
land                  4645
office                  89
parking space         1094
penthouse               92
plot                  2466
pub                      1
restaurant              17
room                   959
storage                144
store                  173
suite                  134
unit                  3033
dtype: int64

In [9]:
ocod_data.postcode.isnull().sum()/ocod_data.shape[0] #How can location be given when missing postcode?

0.3582381387086428

In [10]:
#only 10k have neither postcode nor street name
#This means I can try to geomatch using street name. In the best case only 4% of addresses will not be matched
pd.crosstab(ocod_data.postcode.notnull(), ocod_data.street_name.notnull())#/ocod_data.shape[0]



street_name,False,True
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
False,10163,47916
True,14927,89118


## Load Voa and get business postcodes

This alllows businesses to be identified

In [11]:
VOA_headers_raw= ["Incrementing Entry Number", "Billing Authority Code", "NDR Community Code", 
 "BA Reference Number", "Primary And Secondary Description Code", "Primary Description Text",
"Unique Address Reference Number UARN", "Full Property Identifier", "Firms Name", "Number Or Name",
"Street", "Town", "Postal District", "County", "Postcode", "Effective Date", "Composite Indicator",
 "Rateable Value", "Appeal Settlement Code", "Assessment Reference", "List Alteration Date", "SCAT Code And Suffix",
 "Sub Street level 3", "Sub Street level 2", "Sub Street level 1", "Case Number", 
 "Current From Date", "Current To Date", 
]

#set to lower and replace spaces with underscore to turn the names into appropriate column names
VOA_headers = [x.lower().replace(" ", "_") for x in VOA_headers_raw]


voa_businesses =  pd.read_csv('/tf/empty_homes_data/' +
                    'uk-englandwales-ndr-2017-listentries-compiled-epoch-0029-baseline-csv.csv',
                   sep = "*",
                   encoding_errors= 'ignore',
                    header=None,
                   names = VOA_headers,
                    index_col = False,
                    #usecols = list(range(1,28))
                   )
voa_businesses['postcode'] = voa_businesses['postcode'].str.lower()
voa_businesses['street'] = voa_businesses['street'].str.lower()

#this removes advertising hordings which are irrelevant
voa_businesses = voa_businesses.loc[voa_businesses['primary_description_text'].str.contains("ADVERTISING")==False,:]
#remove several kinds of car parking space
voa_businesses = voa_businesses.loc[~voa_businesses['primary_and_secondary_description_code'].isin(['C0', 'CP', 'CX', 'MX']),:]
##
##
## Warning this removes a large amount of columns, these may be interesting for some people
##
##
voa_businesses = voa_businesses.iloc[:,4:15]
#Extract the street number
voa_businesses['street_number'] = voa_businesses['number_or_name'].str.extract(r"(\b[0-9\-]+$)")
voa_businesses['postcode2'] = voa_businesses['postcode'].str.lower().str.replace("\s", "")

#add in postcode data and LSOA etc data, this is useful for a range of tasks
voa_businesses = voa_businesses.merge(postcode_district_lookup, left_on = 'postcode2', right_on = "postcode2")

#Create a dataframe that contains the counts of businesses per postcode
postcode_counts_voa = voa_businesses.groupby('postcode').size().reset_index(name = 'business_counts')

#del voa_businesses

ocod_data = pd.merge(ocod_data, postcode_counts_voa, on = "postcode", how = "left")
ocod_data["business_counts"] = ocod_data["business_counts"].fillna(0)

#this is quite large and no longer necessary so is removed from the workspace to save memory
del postcode_district_lookup

  voa_businesses =  pd.read_csv('/tf/empty_homes_data/' +
  voa_businesses['postcode2'] = voa_businesses['postcode'].str.lower().str.replace("\s", "")


In [12]:
voa_businesses[voa_businesses['lad11cd'] == target_lad]

NameError: name 'target_lad' is not defined

## Creating a voa expander for individual LADs

In [123]:
def find_filter_type(street_num):
 #gets the highest street number and uses it to work out if the property is on the odd or even side of the street, or if that rule is ignore and it is all numbers
    values = [int(x) for x in street_num.split("-")]
    if (max(values)%2==0) & (min(values)%2==0):
        out = "even"
    elif (max(values)%2==1) & (min(values)%2==1):
        out = "odd"
    else:
        out = "all"
    
    return out

target_lad = 'E09000019'

def create_all_street_addresses(voa_businesses, target_lad):
    
    #creates a two column table where the first column is the street name and the second
    #column is the street number. The function expands address to get all numbers between for example 4-22
    #voa_businesses is a dataframe of the voa business listings and ratings dataset
    #target_lad is the ons code identifying which local authority will be used. 
    
    temp = voa_businesses[voa_businesses['lad11cd'] == target_lad].copy(deep = True)
    
    #remove anything in brackets
    temp['street_number'] = temp['street_number'].str.replace(r"\(.+\)", "", regex = True, case = False)
    
    #units often slip in as street numbers, this kills them off
    temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
    
    #replace @ and & with words
    temp['street_number'] = temp['street_number'].str.replace(r"@", " at ", regex = True, case = False).str.replace(r"&", " and ", regex = True, case = False)
    
    #replace "-" with spaces with a simple "-"
    temp['street_number'] = temp['street_number'].str.replace(r"(\s)?-(\s)?", "-", regex = True, case = False)
    
    #take only things after the last space includes cases where there is no space. Then remove all letters
    temp['street_number'] = temp['street_number'].str.extract(r"([^\s]+$)")[0].str.replace(r"([a-z]+)", "", regex = True, case = False)
    #remove dangling hyphens and slashes
    temp['street_number'] = temp['street_number'].str.replace(r"(\-$)|(^\-)|\\|\/", "", regex = True, case = False)
    #replace double hyphen... yes it happens
    temp['street_number'] = temp['street_number'].str.replace(r"\-\-", r"\-", regex = True, case = False)
    temp.loc[temp['street_number'].str.len() == 0, 'street_number'] = np.nan
    temp.loc[temp['street_number'].str.contains(r"\.", regex = True)==True, 'street_number'] = np.nan

    temp['is_multi'] = temp['street_number'].str.contains(r"\-", regex = True)

    temp_multi = temp.loc[temp['is_multi']==True]#, ['street_number', 'is_multi']]

    temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]
    
    #occasionally one of the dataframes is empty, this causes a error with the concatenation
    #this if statement gets around that
    not_multi_address = temp.loc[temp['is_multi']==False]
    
    if (temp_multi.shape[0]>0) & (not_multi_address.shape[0]>0):
        
        #expand the dataframe according to the correct rules
        temp_multi = expand_dataframe_numbers(temp_multi.reset_index(), 'street_number', print_every = 10000, min_count = 1)
        street_address_lookup = pd.concat([temp_multi, not_multi_address])[['street_name2', 'street_number']]
        
    elif (temp_multi.shape[0]==0) & (not_multi_address.shape[0]>0):
        street_address_lookup = not_multi_address
    else:
        #expand the dataframe according to the correct rules
        temp_multi = expand_dataframe_numbers(temp_multi.reset_index(), 'street_number', print_every = 10000, min_count = 1)
        street_address_lookup = temp_multi
        
    #i = 0
    #temp_list = []
    #for x in temp_multi['street_number'].unique():
    #   # print(i)
    #    i = i+1
    #    values = [int(x) for x in [x for x in x.split("-")]]
    #    temp_list = temp_list +[values]

    #street_address_lookup = pd.concat([temp_multi, temp.loc[temp['is_multi']==False]])[['street_name2', 'street_number']]
    return(street_address_lookup)


In [38]:
target_lad = 'W06000020'

temp = voa_businesses[voa_businesses['lad11cd'] == target_lad].copy(deep = True)
    
    #remove anything in brackets
temp['street_number'] = temp['number_or_name'].str.replace(r"\(.+\)", "", regex = True, case = False)
#units often slip in as street numbers, this kills them off
temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
    #replace @ and & with words
temp['street_number'] = temp['street_number'].str.replace(r"@", " at ", regex = True, case = False).str.replace(r"&", " and ", regex = True, case = False)
    
    #replace "-" with spaces with a simple "-"
temp['street_number'] = temp['street_number'].str.replace(r"(\s)?-(\s)?", "-", regex = True, case = False)
#replace double hyphen... yes it happens
temp['street_number'] = temp['street_number'].str.replace(r"--", "-", regex = True, case = False)
    
    #take only things after the last space includes cases where there is no space. Then remove all letters
temp['street_number'] = temp['street_number'].str.extract(r"([^\s]+$)")[0].str.replace(r"([a-z]+)", "", regex = True, case = False)
    #remove dangling hyphens and slashes
temp['street_number'] = temp['street_number'].str.replace(r"(\-$)|(^\-)|\\|\/", "", regex = True, case = False)
temp.loc[temp['street_number'].str.len() == 0, 'street_number'] = np.nan
temp.loc[temp['street_number'].str.contains(r"\.", regex = True)==True, 'street_number'] = np.nan

temp.loc[:,'is_multi'] = temp['street_number'].str.contains(r"\-", regex = True)

temp_multi = temp.loc[temp['is_multi']==True]#, ['street_number', 'is_multi']]

#temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]

i = 0
temp_list = []
for x in temp_multi['street_number']:#.unique():
        print(str(i)+ " "+x)
        i = i+1
        values = [int(x) for x in [x for x in x.split("-")]]
        temp_list = temp_list +[values]

0 97-98
1 18-19
2 11-17
3 11-13
4 11-13
5 11-13
6 5-7
7 5-8
8 18-22
9 43-45
10 10-12
11 39-40
12 48-50
13 51-52
14 38-40
15 8-11
16 2-3
17 26-28
18 130-131
19 17-20
20 12-13
21 5-6
22 31-33
23 7-9
24 18-19
25 1-5
26 17-25
27 1-4
28 328-331
29 44-47
30 59-61
31 1-3
32 3-6
33 9-11
34 60-61
35 2-3


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan


In [33]:
temp_multi.iloc[248]

primary_and_secondary_description_code                                             CG1
primary_description_text                          VEHICLE REPAIR WORKSHOP AND PREMISES
unique_address_reference_number_uarn                                        7063871000
full_property_identifier                  231--235, HAWTHORNE ROAD, BOOTLE, MERSEYSIDE
firms_name                                                      NATIONAL TYRE SERVICES
number_or_name                                                                231--235
street                                                                  hawthorne road
town                                                                            BOOTLE
postal_district                                                                    NaN
county                                                                      MERSEYSIDE
postcode                                                                       l20 3aw
street_number                              

In [321]:
all_street_addresses = create_all_street_addresses(voa_businesses, target_lad)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


In [332]:
test = ocod_data[(ocod_data['lad11cd']==target_lad)].copy(deep = True)
test['matches_business_address'] = test['street_name'].isin(all_street_addresses['street']) & test['street_number'].isin(all_street_addresses['street_number'])
test['matches_business_address'].sum()

In [341]:
business_address_matcher(test['street_name'], test['street_number'], voa_businesses, target_lad)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


10100     False
10101     False
10102     False
10103     False
10104     False
          ...  
161976     True
161983     True
162115     True
162116     True
162117     True
Length: 2141, dtype: bool

In [354]:
target_lad

'E06000031'

In [None]:
#filters to a single LAD
    #removes advertising hoardings which are irrelevant
    LAD_biz = voa_businesses.loc[(voa_data['lad11cd']==target_lad) & ~voa_data['primary_description_text'].str.contains("ADVERTISING")].copy(deep = True)
    
    LAD_biz.loc[:,'street_name2'] = LAD_biz['street'].copy(deep=True)
    #remove apostraphe's
    LAD_biz.loc[:,'street_name2'] = LAD_biz.loc[:,'street_name2'].str.replace(r"'", "", regex = True).\
    str.replace(r"s(s)?(?=\s)", "", regex = True).str.replace(r"\s", "", regex = True)
    
    #subset to target LAD
    ocod_data_road = ocod_data[ocod_data['lad11cd']==target_lad].copy(deep = True)
    #replace nan values to prevent crash    
    
    #create second column
    ocod_data_road['street_name2'] = ocod_data_road['street_name'].copy(deep=True)
    
    #replace nan values to prevent crash    
    ocod_data_road.loc[ocod_data_road.street_name.isna(),'street_name'2] ="xxxstreet name missingxxx"
    #clean street names of common matching errors
    #remove apostraphe's
    #remove trailing 's'
    #remove all spaces
    ocod_data_road['street_name2'] = ocod_data_road['street_name2'].str.replace(r"'", "", regex = True).\
    str.replace(r"s(s)?(?=\s)", "", regex = True).str.replace(r"\s", "", regex = True)
    
    ocod_data_road['match'] = ocod_data_road['street_name2'].isin(LAD_biz.street_name2.unique())

In [67]:
def business_address_matcher(street_name, street_number, voa_businesses, target_lad):
    #produces a logical vector indficating whether the street address has matched with a business
    #this allows positive ID of business addresses
    #street name a pandas series of street names
    #street_number a pandas series of corresponding street numbers
    #voa_businesses a dataframe of the voa listings and ratings dataset.
    #target_lad the ons code for the the local authority of interest
    
    
    ##
    ## create a dataframe of cleaned street names for target lad for VOA
    ##
    LAD_biz = voa_businesses.loc[(voa_businesses['lad11cd']==target_lad)].copy(deep = True)
    
    LAD_biz.loc[:,'street_name2'] = LAD_biz['street'].copy(deep=True)
    #clean street names of common matching errors
    #remove apostraphe's
    #remove trailing 's'
    #remove all spaces
    LAD_biz.loc[:,'street_name2'] = LAD_biz.loc[:,'street_name2'].str.replace(r"'", "", regex = True).\
    str.replace(r"s(s)?(?=\s)", "", regex = True).str.replace(r"\s", "", regex = True)
    
    ##
    ## create a dataframe of cleaned street names for target lad for OCOD
    ##
    #clean street names of common matching errors
    #remove apostraphe's
    #remove trailing 's'
    #remove all spaces    
    street_name2 = street_name.str.replace(r"'", "", regex = True).\
    str.replace(r"s(s)?(?=\s)", "", regex = True).str.replace(r"\s", "", regex = True)
    
    #subset voa data again to only include streets that are in the OCOD dataset
    #also remoe all streets that are nan
    LAD_biz = LAD_biz[LAD_biz['street_name2'].isin(street_name2.unique()) & LAD_biz['street_name2'].notna() ]
    
    all_street_addresses = create_all_street_addresses(LAD_biz, target_lad)
    
    #temp = street_name.isin(all_street_addresses['street_name2']) & street_number.isin(all_street_addresses['street_number'])
    return street_name.isin(all_street_addresses['street_name2']), street_number.isin(all_street_addresses['street_number'])#(temp)
   
    

In [116]:
def massaged_address_match(ocod_data, voa_data, target_lad):
##
## This exact match works pretty much as well as the fuzzy matcher but is much faster and clearer
##
    #filters to a single LAD
    #removes advertising hoardings which are irrelevant
    LAD_biz = voa_data.loc[(voa_data['lad11cd']==target_lad) & ~voa_data['primary_description_text'].str.contains("ADVERTISING")].copy(deep = True)
    
    LAD_biz.loc[:,'street_name2'] = LAD_biz['street'].copy(deep=True)
    #clean street names of common matching errors
    #remove apostraphe's
    #remove trailing 's'
    #remove all spaces
    LAD_biz.loc[:,'street_name2'] = LAD_biz.loc[:,'street_name2'].str.replace(r"'", "", regex = True).\
    str.replace(r"s(s)?(?=\s)", "", regex = True).str.replace(r"\s", "", regex = True)
    
    #subset to target LAD
    ocod_data_road = ocod_data[ocod_data['lad11cd']==target_lad].copy(deep = True)
    #replace nan values to prevent crash    
    
    #create second column
    ocod_data_road['street_name2'] = ocod_data_road['street_name'].copy(deep=True)
    
    #replace nan values to prevent crash    
    ocod_data_road.loc[ocod_data_road.street_name.isna(),'street_name2'] ="xxxstreet name missingxxx"
    #clean street names of common matching errors
    #remove apostraphe's
    #remove trailing 's'
    #remove all spaces
    ocod_data_road['street_name2'] = ocod_data_road['street_name2'].str.replace(r"'", "", regex = True).\
    str.replace(r"s(s)?(?=\s)", "", regex = True).str.replace(r"\s", "", regex = True)
    
    ocod_data_road['street_match'] = ocod_data_road['street_name2'].isin(LAD_biz.street_name2.unique())
    
    #remove irrelevant streets
    LAD_biz = LAD_biz[LAD_biz['street_name2'].isin(street_name2.unique()) & LAD_biz['street_name2'].notna() ]
    #create the database table
    all_street_addresses = create_all_street_addresses(LAD_biz, target_lad)
    
    #pre-make the new column and assign nan to all values. THis might make things a bit faster
    ocod_data_road['address_match'] = np.nan
    
    all_data_list = []
    
    #loop though all the streets with offshore property in and compare to the list
    #of street names with companies in them
    for target_street in ocod_data_road['street_name2'].unique():
            #subset to target street
            temp_ocod = ocod_data_road[ocod_data_road['street_name2']== target_street]
            temp_address = all_street_addresses[all_street_addresses['street_name2']== target_street]
            #if the data frame has a length of 0 then there are no companies in the street
            #the inference being all the properties are domestic.. not totally true but close enough
            #otherwise check the properties
            if temp_address.shape[0]>0:
                temp_ocod.loc[:,'address_match'] =  temp_ocod['street_number'].isin(temp_address['street_number'])
            else:
                temp_ocod.loc[:,'address_match'] = False
            all_data_list = all_data_list +[temp_ocod]
    
    ocod_data_road = pd.concat(all_data_list)
    ocod_data_road.loc[ocod_data_road['street_name2']== "xxxstreet name missingxxx",'street_name2'] = np.nan
    
     
    
    return(ocod_data_road)

In [18]:
street_name = ocod_data.loc[ocod_data['lad11cd'] == target_lad ,'street_name']

In [99]:
test = massaged_address_match(ocod_data, voa_businesses, target_lad)

  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_ocod.loc[:,'address_match'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_ocod.loc[:,'address_match'] =  temp_ocod['street_number'].isin(temp_address['street_number'])


address_match
False    1827
True      314
dtype: int64

In [391]:
all_street_addresses

Unnamed: 0,street_name2,street_number
0,highstreet,15
1,highstreet,17
1929209,kingstreet,35
1929210,kingstreet,35
1929211,kingstreet,35
1929212,kingstreet,35
1929213,kingstreet,35
1929214,kingstreet,35
1929215,kingstreet,35
1929216,kingstreet,35


In [58]:
all_lads =ocod_data.lad11cd.unique()
#all_lads = all_lads[all_lads != 'E06000031']
#all_lads = all_lads[all_lads != 'E08000017']

all_lads = [x for x in all_lads if str(x) != 'nan']
#see which roads match a road in voa data set for each local authority

matched_lads_list = []

for target_lad in all_lads:
    print(target_lad)
    temp = ocod_data[(ocod_data['lad11cd']==target_lad)].copy(deep = True)
    street_match, number_match = business_address_matcher(temp['street_name'], temp['street_number'], voa_businesses, target_lad)
    temp['street_match'] = street_match
    temp['number_match'] = number_match
    temp['matches_business_address'] = temp['street_match'] & temp['number_match']
    #temp['matches_business_address'] = business_address_matcher(temp['street_name'], temp['street_number'], voa_businesses, target_lad)
    matched_lads_list = matched_lads_list + [temp]


E06000031
i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


E07000032
i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


E08000019


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
E08000017
i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


E09000028


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
E09000020


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
E09000007


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
E06000049


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
E09000011
i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


E09000033


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.002
i= 10000  expand time,2.486 filter time1.563 make_dataframe_time 10.642
E09000019


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
E09000016
i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


E08000015
i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


E09000015
i= 0  expand time,0.001 filter time0.0 make_dataframe_time 0.001


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


E06000030
i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


E06000050


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
E08000010


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.002
E07000070
i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


E08000003


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
E06000057
i= 0  expand time,0.001 filter time0.0 make_dataframe_time 0.001


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


E06000047


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
E08000033
i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


E08000007


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
E08000035


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
E09000001


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
E07000098
i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


E06000044
i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.002


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


E07000103
i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


E08000001


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
E07000243
i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


E08000012


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.001 filter time0.0 make_dataframe_time 0.001
E09000012


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
E09000032


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
E06000045


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.001 make_dataframe_time 0.002
E08000006


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.002
E08000004
i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


E08000028


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
E08000013
i= 0  expand time,0.001 filter time0.001 make_dataframe_time 0.002


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


E07000117
i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


E07000119
i= 0  expand time,0.001 filter time0.0 make_dataframe_time 0.002


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


E06000016


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
E08000014


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan


ValueError: invalid literal for int() with base 10: '231\\'

In [122]:
all_lads =ocod_data.lad11cd.unique()
#all_lads = all_lads[all_lads != 'E06000031']
#all_lads = all_lads[all_lads != 'E08000017']

all_lads = [x for x in all_lads if str(x) != 'nan']
#see which roads match a road in voa data set for each local authority
for target_lad in all_lads:
    print(target_lad)
    #temp['matches_business_address'] = business_address_matcher(temp['street_name'], temp['street_number'], voa_businesses, target_lad)
    matched_lads_list = matched_lads_list + [massaged_address_match(ocod_data, voa_businesses, target_lad)]

    #matched_lads_list = [massaged_address_match(ocod_data, voa_businesses, target_lad) for target_lad in all_lads]


E06000031


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_ocod.loc[:,'address_match'] = False


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_ocod.loc[:,'address_match'] =  temp_ocod['street_number'].isin(temp_address['street_number'])


E07000032


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_ocod.loc[:,'address_match'] = False


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001
E08000019


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_ocod.loc[:,'address_match'] = False


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_ocod.loc[:,'address_match'] =  temp_ocod['street_number'].isin(temp_address['street_number'])


E08000017


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_ocod.loc[:,'address_match'] = False


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.002
E09000028


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_multi['number_filter'] = [find_filter_type(x) for x in temp_multi['street_number']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_ocod.loc[:,'address_match'] = False


i= 0  expand time,0.0 filter time0.0 make_dataframe_time 0.001


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_ocod.loc[:,'address_match'] =  temp_ocod['street_number'].isin(temp_address['street_number'])


E09000020


  temp.loc[temp['street_number'].str.contains(r"(unit|suite)", regex = True, case = False)==True, 'street_number'] = np.nan


ValueError: No objects to concatenate

In [114]:
test  = pd.concat(matched_lads_list)
test['street_match'].sum()

192

# Classify property type

In [138]:
ocod_data['class'] = np.select(
    [
        ocod_data['property_address'].str.contains(r"^(land|plot)", case = False),
        ocod_data['property_address'].str.contains(r"^((garage)|(parking(\s)?space)|(parking space)|(car park(ing)?(\sspace)))", case = False),
        ocod_data['property_address'].str.contains(r"^((the airspace)|(airspace))", case = False),
        ocod_data['property_address'].str.contains(r"(penthouse|flat|apartment)", case = False),
        ocod_data['property_address'].str.contains(r"(cinema)|(hotel)|(office)|centre|(\bpub)|(business)|(cafe)|(^shop)|( shop)|(restaurant)|(home)|(^store(s)?\b)|(^storage\\b)|(company)|(ltd)|(limited)|(plc)|(retail)|(leisure)|(industrial)|(hall of)", case = False), 
        (ocod_data['business_counts']==0 )& ocod_data.postcode.notnull()
    ], 
    [
        'land',
        'carpark',
        'airspace',
        'domestic',
        'business',
        'domestic'
    ], 
    default='unknown'
)

  ocod_data['property_address'].str.contains(r"^(land|plot)", case = False),
  ocod_data['property_address'].str.contains(r"^((garage)|(parking(\s)?space)|(parking space)|(car park(ing)?(\sspace)))", case = False),
  ocod_data['property_address'].str.contains(r"^((the airspace)|(airspace))", case = False),
  ocod_data['property_address'].str.contains(r"(penthouse|flat|apartment)", case = False),
  ocod_data['property_address'].str.contains(r"(cinema)|(hotel)|(office)|centre|(\bpub)|(business)|(cafe)|(^shop)|( shop)|(restaurant)|(home)|(\bstore\b)|(\bstorage\\b)|(company)|(ltd)|(limited)|(plc)|(retail)|(leisure)|(industrial)|(hall of)", case = False),


In [144]:
#if we can be relatively sure that all the nested properties are domestic then the amount of unclassified 
#properties drops to 2%
#What would allow us to be sure?
ocod_data.groupby('class').size()

class
airspace      1671
business      5568
carpark       2086
domestic    103978
land         16501
unknown      32320
dtype: int64

In [10]:
ocod_data.groupby('class').size()

class
airspace     1671
business    15167
carpark      2078
domestic    89600
land        16501
unknown     32343
dtype: int64

In [11]:
ocod_data.groupby('class').size()/ocod_data.shape[0]

class
airspace    0.010619
business    0.088453
carpark     0.013205
domestic    0.574549
land        0.104861
unknown     0.208312
dtype: float64

In [10]:
#non of the unknowns have a postcode. I guess this is obvious as if there is no matching VOA postcode you are classed as domestic
pd.crosstab(ocod_data[ocod_data['class']=="unknown"].postcode.notnull(), ocod_data[ocod_data['class']=="unknown"].street_name.notnull())

street_name,False,True
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
False,3034,29309


In [12]:
pd.crosstab(ocod_data['tenure'], ocod_data['region'].str.lower())#.to_latex() #convert to copyable latex table

region,east anglia,east midlands,greater london,north,north west,south east,south west,wales,west midlands,yorks and humber
tenure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Freehold,3330,4962,25457,3177,16421,17922,5465,3341,5083,8840
Leasehold,460,1282,42530,906,5855,5462,1677,667,1791,2732


In [15]:
pd.crosstab(ocod_data['class'], ocod_data['region'].str.lower())#.to_latex() #convert to copyable latex table

region,east anglia,east midlands,greater london,north,north west,south east,south west,wales,west midlands,yorks and humber
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
airspace,50,162,91,15,215,185,266,290,94,303
business,219,447,9323,461,1107,2010,566,430,583,758
carpark,10,7,1283,14,474,159,71,4,27,37
domestic,2547,3453,45543,2127,10352,13513,4212,1452,4032,5028
land,630,1143,2586,721,2186,4792,1239,617,1290,1297
unknown,343,1640,9948,1059,8752,3367,1430,1538,1242,4384


In [14]:
pd.crosstab(ocod_data['class'], ocod_data['tenure'])#.to_latex() #convert to copyable latex table

tenure,Freehold,Leasehold
class,Unnamed: 1_level_1,Unnamed: 2_level_1
airspace,7,1664
business,3061,10858
carpark,156,1922
domestic,47954,42457
land,14659,1842
unknown,28161,4619


In [16]:
temp_df = ocod_data[['title_number', 'tenure', 'within_larger_title']].drop_duplicates()

#most of titles containing nested addresses are free hold by about 3/2
pd.crosstab(temp_df['tenure'], temp_df['within_larger_title'])


within_larger_title,False,True
tenure,Unnamed: 1_level_1,Unnamed: 2_level_1
Freehold,46326,4343
Leasehold,41203,1633


In [16]:
#The analysis is based on nested addresses being domestic
temp_df = ocod_data[['title_number', 'tenure', 'property_address']][ocod_data['within_larger_title']==True]
temp_df['is_flat'] = temp_df['property_address'].str.contains(r"(flat|apartment|penthouse|unit)", case = False)

#pd.crosstab(temp_df['tenure'], temp_df['within_larger_title'])

temp_df.groupby('tenure').size()

#Of nested addresses freehold is more common by 3/2 50k to 24k
#most of theproperties are not flats however flats dominate the leasehold section
#flats are 1/3 of nested addresses but make up almost 3/4 of the leashold nested addresses
#note this does not include items marked as units
pd.crosstab(temp_df.tenure, temp_df.is_flat)

  temp_df['is_flat'] = temp_df['property_address'].str.contains(r"(flat|apartment|penthouse|unit)", case = False)


is_flat,False,True
tenure,Unnamed: 1_level_1,Unnamed: 2_level_1
Freehold,40400,6909
Leasehold,5489,16495


In [17]:
ocod_data[ocod_data.price_paid.notnull()==True].to_csv("/tf/empty_homes_data/ocod_price_paid.csv")
pd.crosstab(ocod_data.within_larger_title, ocod_data.price_paid.notnull())

price_paid,False,True
within_larger_title,Unnamed: 1_level_1,Unnamed: 2_level_1
False,57236,30831
True,57817,11476


In [11]:
#This is interesting as it shows that only about 15% of normal titles are missing a postcode whilst half of within larger title addresses are missing a postcode.
#This is probably because they cover multiple postcodes
pd.crosstab(ocod_data.postcode.notnull(), ocod_data.within_larger_title)

within_larger_title,False,True
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
False,19758,36771
True,68309,32522


## Road matching

In [18]:
ocod_data.groupby('district').size().sort_values(ascending=False)

district
CITY OF WESTMINSTER       15052
KENSINGTON AND CHELSEA     6868
TOWER HAMLETS              4048
WANDSWORTH                 3755
SHEFFIELD                  3690
                          ...  
DERBYSHIRE DALES             27
MELTON                       25
BOSTON                       16
RUTLAND                      13
ISLES OF SCILLY               4
Length: 331, dtype: int64

## Exploring road matching

Before matching the names I should edit the street names to remove all ' apostraphes as these are very unreliable, I should also remove spaces between 'gate' and whatever comes before it same with 'way'
It may also be an idea to remove 's' that appears at the end of a word but we can try this again later

In [68]:


def fuzzy_street_match(ocod_data, voa_data, target_lad):
##
## It's possible that this entire thing can be replaced with a simple isin() type command.
## I will be able to do the replacement if matching is basically bianry
##
    #filters to a single LAD
    #removes advertising hoardings which are irrelevant
    LAD_biz = voa_businesses.loc[(voa_data['lad11cd']==target_lad) & ~voa_data['primary_description_text'].str.contains("ADVERTISING")].copy(deep = True)
    
    LAD_biz.loc[:,'street_name2'] = LAD_biz['street'].copy(deep=True)
    #remove apostraphe's
    LAD_biz.loc[:,'street_name2'] = LAD_biz.loc[:,'street_name2'].str.replace(r"'", "", regex = True).str.replace(r"s(?=\s)", "", regex = True).str.replace(r"\s(?=way|gate)", "", regex = True)
    
    #subset to target LAD
    ocod_data_road = ocod_data[ocod_data['lad11cd']==target_lad].copy(deep = True)
    #replace nan values to prevent crash
    #ocod_data_road['street_name'] = 
    #ocod_data_road['street_name'].fillna("xxxstreet name missingxxx")
    
    ocod_data_road.loc[ocod_data_road.street_name.isna(),'street_name'] ="xxxstreet name missingxxx"
    
    #create second column
    ocod_data_road['street_name2'] = ocod_data_road['street_name'].copy(deep=True)
    #remove apostraphe's
    ocod_data_road['street_name2'] = ocod_data_road['street_name2'].str.replace(r"'", "", regex = True).str.replace(r"s(?=\s)", "", regex = True).str.replace(r"\s(?=way|gate)", "", regex = True)
    #remove trailing 's'
    #ocod_data_road['street_name2'] = ocod_data_road['street_name2'].str.remove(r"s(?=\s)")
    #remove space preceeding 'way' or 'gate'
    #ocod_data_road['street_name2'] = ocod_data_road['street_name2'].str.remove(r"\s(?=way|gate)")
    
    
    #using the below line cause a copy warning
    # ocod_data_road.loc[:,'street_name'].fillna("xxxstreet name missingxxx", inplace = False)

    #extract unique names
    unique_ocod_names = ocod_data_road.street_name2.unique()
    unique_voa_names = LAD_biz.street_name2.unique()

    #fuzzy match unique names
    street_matches_df = pd.DataFrame([process.extractOne(x, unique_voa_names) for x in unique_ocod_names], 
                                 columns = ['matched_road_name', 'similarity'])
    street_matches_df['street_name2'] = unique_ocod_names

    out = ocod_data_road.merge(street_matches_df, left_on = "street_name2", right_on = "street_name2")
    #out.loc[ocod_data_road.street_name == "xxxstreet name missingxxx",'street_name'] = None
    #remove the modified street name
    #out = out.drop('street_name2', axis = 1)
    return(out)


def massaged_street_match(ocod_data, voa_data, target_lad):
##
## This exact match works pretty much as well as the fuzzy matcher but is much faster and clearer
##
    #filters to a single LAD
    #removes advertising hoardings which are irrelevant
    LAD_biz = voa_businesses.loc[(voa_data['lad11cd']==target_lad) & ~voa_data['primary_description_text'].str.contains("ADVERTISING")].copy(deep = True)
    
    LAD_biz.loc[:,'street_name2'] = LAD_biz['street'].copy(deep=True)
    #remove apostraphe's
    LAD_biz.loc[:,'street_name2'] = LAD_biz.loc[:,'street_name2'].str.replace(r"'", "", regex = True).\
    str.replace(r"s(s)?(?=\s)", "", regex = True).str.replace(r"\s", "", regex = True)
    
    #subset to target LAD
    ocod_data_road = ocod_data[ocod_data['lad11cd']==target_lad].copy(deep = True)
    #replace nan values to prevent crash    
    
    #create second column
    ocod_data_road['street_name2'] = ocod_data_road['street_name'].copy(deep=True)
    
    #replace nan values to prevent crash    
    ocod_data_road.loc[ocod_data_road.street_name.isna(),'street_name2'] ="xxxstreet name missingxxx"
    #clean street names of common matching errors
    #remove apostraphe's
    #remove trailing 's'
    #remove all spaces
    ocod_data_road['street_name2'] = ocod_data_road['street_name2'].str.replace(r"'", "", regex = True).\
    str.replace(r"s(s)?(?=\s)", "", regex = True).str.replace(r"\s", "", regex = True)
    
    ocod_data_road['match'] = ocod_data_road['street_name2'].isin(LAD_biz.street_name2.unique())

    return(ocod_data_road)

In [145]:
#target_lad = 'E09000019'
#test= fuzzy_street_match(ocod_data, voa_businesses, target_lad)


In [91]:
target_lad = 'E09000019'
test= massaged_street_match(ocod_data, voa_businesses, target_lad)

test.columns#[test['street_name2']].groupby('match').size()

Index(['Unnamed: 0', 'title_number', 'within_title_id', 'unique_id',
       'within_larger_title', 'tenure', 'unit_id', 'unit_type',
       'building_name', 'street_number', 'street_name', 'postcode', 'city',
       'district', 'county', 'region', 'multiple_address_indicator',
       'price_paid', 'property_address', 'postcode2', 'lsoa11', 'msoa11',
       'lad11cd', 'business_counts', 'street_name2', 'match'],
      dtype='object')

In [81]:
test[test['street_name2']=='xxxstreetnamemissingxxx'].groupby('match').size()

match
False    344
dtype: int64

In [97]:

test[test['street_name'].str.contains(r"\s(?=way|gate)")][['street_name', 'street_name2', 'match' ]]

Unnamed: 0,street_name,street_name2,match
10207,york way,yorkway,True
10208,york way,yorkway,True
10209,york way,yorkway,True
10210,york way,yorkway,True
10211,york way,yorkway,True
...,...,...,...
151369,angel gate,angelgate,False
151527,sussex way,sussexway,True
153104,drummond way,drummondway,True
161766,drummond way,drummondway,True


In [54]:
#all_lads = ocod_data.lad11cd.unique()

#all_lads = [x for x in all_lads if str(x) != 'nan']
##see which roads match a road in voa data set for each local authority
#ocod_road_match= [fuzzy_street_match(ocod_data, voa_businesses, target_lad) for target_lad in all_lads]
#ocod_road_match = pd.concat(ocod_road_match)

In [87]:
all_lads = ocod_data.lad11cd.unique()

all_lads = [x for x in all_lads if str(x) != 'nan']
#see which roads match a road in voa data set for each local authority
ocod_road_match= [massaged_street_match(ocod_data, voa_businesses, target_lad) for target_lad in all_lads]
ocod_road_match = pd.concat(ocod_road_match)

In [89]:
ocod_road_match.groupby('match').size()

match
False    73922
True     87979
dtype: int64

In [142]:
test = ocod_road_match2[ocod_road_match2['street_name'] != 'xxxstreet name missingxxx']
pd.crosstab(test['match'], test['class'])

class,airspace,business,carpark,domestic,land,unknown
match,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
False,953,498,472,23764,9215,12796
True,455,3781,1275,65143,4143,14375


In [146]:
498/(498+3781)

0.11638233232063566

In [143]:

ocod_road_match2[(ocod_road_match2['match']==False) & (ocod_road_match2['class'] == "business")& (ocod_road_match2['street_name'] != 'xxxstreet name missingxxx')].to_csv("/tf/empty_homes_data/fuzzy_roads.csv")

In [136]:

ocod_road_match[(ocod_road_match['similarity']<93) & (ocod_road_match['class'] == "business") & (ocod_road_match['street_name'] != 'xxxstreet name missingxxx')].to_csv("/tf/empty_homes_data/fuzzy_roads.csv")

In [68]:
ocod_road_match[ocod_road_match['street_name'].str.contains(r"'")][['street_name', 'street_name2', 'matched_road_name' ]]

Unnamed: 0,street_name,street_name2,matched_road_name
304,land on the north east side of land's end road,land on the north east side of land end road,land end road
484,queen mary's road,queen mary road,queen mary road
1203,d'eynsford road,deynsford road,deynsford road
1206,land on the west side of page's walk and on th...,land on the west side of page walk and on the ...,page walk
1689,price's street,price street,princes street
...,...,...,...
23,east and south of duke's road,east and south of duke road,duke road
93,land on the east side of parker's farm road,land on the east side of parker farm road,parker farm road
37,o'gorman avenue,ogorman avenue,ogorman avenue
65,land on the east side of lug's lane,land on the east side of lug lane,lug lane


In [76]:
#values greater than or equal to 94 seem to be pretty good

#ocod_road_match[ocod_road_match['similarity']==96][['property_address', 'street_name2', 'matched_road_name']]

Unnamed: 0,property_address,street_name2,matched_road_name
3662,"127a Lowedges Road, Sheffield (S8 7LE)",lowedge road,low edge road
2064,"Flat 12, 20 St James's Road, London (SE16 4QJ)",st james road,st jame road
2065,"Flat 33, 30 St James's Road, London (SE16 4QJ)",st james road,st jame road
2066,"Flat 42, 30 St James's Road, London (SE16 4QJ)",st james road,st jame road
2067,"Flat 2, 20 St James's Road, London (SE16 4QJ)",st james road,st jame road
...,...,...,...
467,"Travelodge, Four Went Ways, Abington, Cambridg...",four wentway,four wentways
91,"Land on the south side of Myrtle Lodge, Milfor...",milford road,millford road
48,"Airspace above 35 Neville Road, Peacehaven (BN...",neville road,nevill road
67,"Units A, B, C, D, E, F, G, H, J, K, L and M, S...",old end lane,oldend lane


### creating the aggregated ocod dataset for sampling

In [14]:
ocod_data_lsoa = ocod_data
ocod_data_lsoa['postcode2'] = ocod_data['postcode'].str.lower().str.replace("\s", "")

ocod_data_lsoa = ocod_data.merge(postcode_district_lookup, 'left', left_on = "postcode2", right_on = "postcode2")

ocod_data_lsoa.groupby(['lad11cd', 'lsoa11', 'msoa11', 'class']).size().reset_index().to_csv("/tf/empty_homes_data/ocod_lsoa.csv")
ocod_data_lsoa.groupby(['lad11cd', 'lsoa11', 'msoa11', 'class', 'within_larger_title']).size().reset_index().to_csv("/tf/empty_homes_data/ocod_lsoa_by_nested_type.csv")

  ocod_data_lsoa['postcode2'] = ocod_data['postcode'].str.lower().str.replace("\s", "")


In [26]:

voa_businesses['postcode2'] = voa_businesses['postcode'].str.lower().str.replace("\s", "")

voa_businesses = voa_businesses.merge(postcode_district_lookup, 'left', left_on = "postcode2", right_on = "postcode2")
#postcode_district_lookup


  voa_businesses['postcode2'] = voa_businesses['postcode'].str.lower().str.replace("\s", "")


In [27]:
voa_businesses.columns

Index(['incrementing_entry_number', 'billing_authority_code',
       'ndr_community_code', 'ba_reference_number',
       'primary_and_secondary_description_code', 'primary_description_text',
       'unique_address_reference_number_uarn', 'full_property_identifier',
       'firms_name', 'number_or_name', 'street', 'town', 'postal_district',
       'county', 'postcode', 'effective_date', 'composite_indicator',
       'rateable_value', 'appeal_settlement_code', 'assessment_reference',
       'list_alteration_date', 'scat_code_and_suffix', 'sub_street_level_3',
       'sub_street_level_2', 'sub_street_level_1', 'case_number',
       'current_from_date', 'current_to_date', 'postcode2', 'lad11cd',
       'lad11nm'],
      dtype='object')

In [100]:
ocod_district = ocod_data[(ocod_data['district'].str.lower() == 'tower hamlets') & (ocod_data['class'] == "unknown") & ocod_data.street_name.notnull() ]

ocod_district = ocod_district[['title_number', 'unit_id', 'building_name','street_number', 'street_name', 'property_address' ]]
voa_district = voa_businesses[voa_businesses['lad11nm'].str.lower() =='tower hamlets']



In [99]:
ocod_data.groupby('district').size().to_csv('/tf/empty_homes_data/ocod_districts.csv')

In [101]:
ocod_district[~ocod_district.street_name.str.strip().isin(voa_district.street.str.lower().unique())].to_csv("/tf/empty_homes_data/delete_me.csv")

In [85]:
pd.crosstab(voa_businesses.postcode.isnull(),voa_businesses.postcode.isnull())

postcode,False,True
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
False,2552185,0
True,0,1414


## Largest nested addresses

In [None]:
#The largest nested address
ocod_data.within_title_id.max()

In [None]:
ocod_data[ocod_data.within_title_id==ocod_data.within_title_id.max()].reset_index()['property_address'][0]



In [102]:
ocod_data.to_csv("/tf/empty_homes_data/OCOD_classes.csv")

In [None]:
test  = ocod_data[ocod_data['class']=="unknown" ]

pd.crosstab(test.postcode.notnull(), test.street_name.notnull())

In [None]:
ocod_data[ ocod_data.street_name.isnull()].to_csv("/tf/empty_homes_data/OCOD_no_street.csv")

In [None]:
1/21