In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Introduction
* Raw CSV loaded and lightly processed. Output: two column csv columns, property address, flat tag
* Data labelled in programmatic. Output: json file of entities.
* Data programmatic output json cleaned ordered and overlaps removed. Output: json file
* **Clean json converted to dataframe and multi-addresses expanded.** Output: CSV
* Count and locate addresses
* Create address matcher and match businesses
* Classify address types

This notebook is used so that I can create the code necessary to expand the addresses so that a single property/dwelling is a single line. This means the data will follow the tidy data principles of one observation per line where an observation is what is commonly thought of as a property.

In [2]:
import json
import pandas as pd
import re
from helper_functions import *
import numpy as np

In [3]:

with open('/tf/empty_homes_data/humanloop_04_04_22_t1210.json', "r") as read_file:
    all_entities_json = json.load(read_file)

# with open('/tf/empty_homes_data/full_dataset_no_overlaps.json', "r") as read_file:
#     all_entities_json = json.load(read_file)


In [4]:
all_entities_json['datapoints'][8079]

{'programmatic': {'results': [{'start': 48,
    'end': 49,
    'text': '2',
    'labelId': '13',
    'label': 'street_number',
    'labellingFunctionId': 5,
    'groundTruthId': None},
   {'start': 50,
    'end': 51,
    'text': '3',
    'labelId': '13',
    'label': 'street_number',
    'labellingFunctionId': 5,
    'groundTruthId': None},
   {'start': 52,
    'end': 53,
    'text': '4',
    'labelId': '13',
    'label': 'street_number',
    'labellingFunctionId': 5,
    'groundTruthId': None},
   {'start': 54,
    'end': 55,
    'text': '5',
    'labelId': '13',
    'label': 'street_number',
    'labellingFunctionId': 5,
    'groundTruthId': None},
   {'start': 46,
    'end': 47,
    'text': '1',
    'labelId': '13',
    'label': 'street_number',
    'labellingFunctionId': 8,
    'groundTruthId': None},
   {'start': 215,
    'end': 221,
    'text': 'london',
    'labelId': '3',
    'label': 'city',
    'labellingFunctionId': 13,
    'groundTruthId': None},
   {'start': 0,
    'end': 

In [5]:
#output_dict = [x for x in all_entities_json if int(x['data_stuff.id']) == 42353]

# Transform python object back into json
#output_json = json.dumps(output_dict)

In [6]:
all_entities_json

{'datapoints': [{'programmatic': {'results': [{'start': 41,
      'end': 47,
      'text': 'leigh ',
      'labelId': '3',
      'label': 'city',
      'labellingFunctionId': 12,
      'groundTruthId': None},
     {'start': 27,
      'end': 39,
      'text': 'nel pan lane',
      'labelId': '12',
      'label': 'street_name',
      'labellingFunctionId': 24,
      'groundTruthId': None},
     {'start': 48,
      'end': 55,
      'text': 'wn7 5jt',
      'labelId': '11',
      'label': 'postcode',
      'labellingFunctionId': 36,
      'groundTruthId': None},
     {'start': 0,
      'end': 25,
      'text': 'westleigh lodge care home',
      'labelId': '10',
      'label': 'building_name',
      'labellingFunctionId': 30,
      'groundTruthId': None},
     {'start': 27,
      'end': 39,
      'text': 'nel pan lane',
      'labelId': '12',
      'label': 'street_name',
      'labellingFunctionId': 23,
      'groundTruthId': None},
     {'start': 0,
      'end': 25,
      'text': 'westlei

In [7]:

all_entities = pd.json_normalize(
    all_entities_json["datapoints"],
    record_path=["programmatic", "results"],
    meta=["data", "id"],
    #record_prefix="result_stuff.",
    meta_prefix="data_stuff.",
    errors="ignore",
)
all_entities = all_entities.rename(columns = {'data_stuff.id':'datapoint_id',
                                             'text':'label_text'})

all_entities["text"] = all_entities["data_stuff.data"].map(lambda x: x["text"])

#all_entities.drop(['data_stuff.data'], axis = 1, inplace = True)

all_entities = all_entities.sort_values(['datapoint_id', 'start'])

In [8]:
# all_entities = pd.json_normalize(all_entities_json, record_path = "labels",
#                        meta = ['datapoint_id', 'text'])

all_entities['label_id_count'] = all_entities.groupby(['datapoint_id', 'label']).cumcount()

all_entities['label_text'] =all_entities['label_text'].str.replace("100-1124", "100-112")

#strips leading and trailing whitespace. this makes matching better
#but means the start and end points are incorrect
#There may be a crafty way of fixing this
all_entities['label_text'] = all_entities['label_text'].str.strip()

all_entities.drop(columns = ['data_stuff.data'], inplace = True)

In [32]:
all_entities

Unnamed: 0,datapoint_id,start,end,label_text,labelId,label,labellingFunctionId,groundTruthId,text,label_id_count
0,0,0,25,westleigh lodge care home,10,building_name,30,,"westleigh lodge care home, nel pan lane, leigh...",0
1,0,27,39,nel pan lane,12,street_name,24,,"westleigh lodge care home, nel pan lane, leigh...",0
2,0,41,47,leigh,3,city,12,,"westleigh lodge care home, nel pan lane, leigh...",0
3,0,48,55,wn7 5jt,11,postcode,36,,"westleigh lodge care home, nel pan lane, leigh...",0
4,1,0,4,flat,15,unit_type,18,,"flat 1, 1a canal street, manchester (m1 3he)",0
...,...,...,...,...,...,...,...,...,...,...
414322,94087,12,42,discovery dock apartments east,10,building_name,54,,"storage 17, discovery dock apartments east, 3 ...",0
414323,94087,44,45,3,13,street_number,9,,"storage 17, discovery dock apartments east, 3 ...",0
414324,94087,46,63,south quay square,12,street_name,22,,"storage 17, discovery dock apartments east, 3 ...",0
414325,94087,65,72,london,3,city,12,,"storage 17, discovery dock apartments east, 3 ...",0


## remove overlapping entities

In [10]:

def remove_overlaps(x):
    #this functions is modified from 
    #https://stackoverflow.com/questions/57804145/combining-rows-with-overlapping-time-periods-in-a-pandas-dataframe
    x = x.copy()
    #create a unique label, this is used for joining the data back on 
    #and removes a reliance on the data being pre-sorted
    x['unique_label'] = [*range(0,x.shape[0])]
    #get the size of the spans
    x['diff'] = (x['end']-x['start'])

    
    startdf = pd.DataFrame({'position':x['start'], 'unique_label':x['unique_label'], 'what':1})
    enddf = pd.DataFrame({'position':x['end'], 'unique_label':x['unique_label'], 'what':-1})
    mergdf = pd.concat([startdf, enddf]).sort_values('position')
    mergdf['running'] = mergdf['what'].cumsum()
    mergdf['newwin'] = mergdf['running'].eq(1) & mergdf['what'].eq(1)
    mergdf['group'] = mergdf['newwin'].cumsum()
    
    #merge back on using uniqe label to ensure correct ordering
    x = x.merge(mergdf.loc[mergdf['what'].eq(1),['unique_label','group']], how = 'left', on = 'unique_label')
    #sort within group and keep only the largest
    x = x.sort_values('diff', ascending=False).groupby(['group', 'datapoint_id'], as_index=False).first()

    x.drop(['diff', 'unique_label', 'group'], axis = 1, inplace = True)

    return(x)


#ddf = remove_overlaps(df)
#takes about 20 minutes
all_entities = all_entities.groupby(['datapoint_id']).apply(remove_overlaps)


## Example of the data frame of labels

In [17]:
all_entities.reset_index(drop = True, inplace = True)

## Identify multi versus single address observations

Some addresses may have the form xx to yy but should not be expanded as this is a building that covers multiple street numbers. Items such as these need to be carefully removed before expansion

In [18]:
xx_to_yy_regex = r'^\d+(\s)?(-|to)(\s)?\d+$'

multi_check_df = all_entities[['datapoint_id', 'text', ]].drop_duplicates()
multi_check_df['comma_count'] = multi_check_df['text'].str.count(',')
multi_check_df['land'] = multi_check_df['text'].str.contains(r"^(land|plot|airspace|car|parking)", case = False)

multi_check_df['business'] = multi_check_df['text'].str.contains(r"(cinema)|(hotel)|(office)|centre|(\bpub)|holiday(\s)?inn|travel(\s)?lodge|(business)|(cafe)|(^shop)|( shop)|(restaurant)|(home)|(^store(s)?\b)|(^storage\b)|(company)|(ltd)|(limited)|(plc)|(retail)|(leisure)|(industrial)|(hall of)|trading|commercial|works", case = False)
temp_df = all_entities[['datapoint_id', 'label']].groupby(['datapoint_id', 'label']).value_counts().to_frame(name = "counts").reset_index().pivot(index = 'datapoint_id', columns = 'label', values = 'counts').fillna(0)
#test['datapoint_id'] = test.index

xx_to_yy_street_counts = all_entities['datapoint_id'][all_entities['label_text'].str.contains(
    xx_to_yy_regex)& (all_entities['label']=="street_number")
                            ].to_frame(name = 'datapoint_id').groupby('datapoint_id').size().to_frame(name = 'xx_to_yy_street_counts')

xx_to_yy_unit_counts = all_entities['datapoint_id'][all_entities['label_text'].str.contains(
    xx_to_yy_regex)& (all_entities['label']=="unit_id")
                            ].to_frame(name = 'datapoint_id').groupby('datapoint_id').size().to_frame(name = 'xx_to_yy_unit_counts')

multi_check_df = multi_check_df.merge(temp_df, how = 'left', left_on = "datapoint_id", right_index = True).\
merge(xx_to_yy_street_counts, how = 'left', left_on = "datapoint_id", right_index = True).\
merge(xx_to_yy_unit_counts, how = 'left', left_on = "datapoint_id", right_index = True).fillna(0)


del xx_to_yy_street_counts
del xx_to_yy_unit_counts

#separate the classes using logical rules
multi_check_df['class'] = np.select(
    [
        multi_check_df['land']== True,
        multi_check_df['business']== True,
        (multi_check_df['building_name']==1) & (multi_check_df['unit_id'] == 0), #this has to go infront of 'multi_check_df['xx_to_yy_unit_counts']>0'
        multi_check_df['xx_to_yy_unit_counts']>0,
        multi_check_df['street_number']>1,
        multi_check_df['unit_id']>1,
        (multi_check_df['street_number']<=1) & (multi_check_df['xx_to_yy_street_counts']<=1) & (multi_check_df['unit_id']<=1) ##This does most of the heavy lifting
    ], 
    [
        'single',
        'single',
        'single',
        'multi',
        'multi',
        'multi',
        'single',
        
    ], 
    default='unknown'
)
multi_check_df


  multi_check_df['land'] = multi_check_df['text'].str.contains(r"^(land|plot|airspace|car|parking)", case = False)
  multi_check_df['business'] = multi_check_df['text'].str.contains(r"(cinema)|(hotel)|(office)|centre|(\bpub)|holiday(\s)?inn|travel(\s)?lodge|(business)|(cafe)|(^shop)|( shop)|(restaurant)|(home)|(^store(s)?\b)|(^storage\b)|(company)|(ltd)|(limited)|(plc)|(retail)|(leisure)|(industrial)|(hall of)|trading|commercial|works", case = False)
  xx_to_yy_street_counts = all_entities['datapoint_id'][all_entities['label_text'].str.contains(
  xx_to_yy_unit_counts = all_entities['datapoint_id'][all_entities['label_text'].str.contains(


Unnamed: 0,datapoint_id,text,comma_count,land,business,building_name,city,number_filter,postcode,street_name,street_number,unit_id,unit_type,xx_to_yy_street_counts,xx_to_yy_unit_counts,class
0,0,"westleigh lodge care home, nel pan lane, leigh...",2,False,True,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,single
4,1,"flat 1, 1a canal street, manchester (m1 3he)",2,False,False,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,single
9,2,"flat 201, 1 regent road, manchester (m3 4ay)",2,False,False,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,single
14,3,"land at 2a gerard street, ashton in makerfield...",2,True,False,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,single
18,4,"unit 111, timber wharf, worsley street, manche...",3,False,False,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,single
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
414305,94083,"2nd floor, 52 lime street, london (ec3m 7aw)",2,False,False,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,single
414309,94084,"155 bishopsgate, london (ec2m 3ad)",1,False,False,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,single
414313,94085,"9th and 10th floors, 52 lime street, london (e...",2,False,False,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,single
414317,94086,"part of tenth floor, 6 bevis marks, london (ec...",2,False,False,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,single


### Get the indexes of multi properties

In [19]:
multi_unit_id = set(multi_check_df['datapoint_id'][(multi_check_df['class']=='multi') &( multi_check_df['unit_id']>0)].tolist())
multi_property = set(multi_check_df['datapoint_id'][(multi_check_df['class']=='multi') &( multi_check_df['unit_id']==0)].tolist())
all_multi_ids = list(multi_unit_id) +list(multi_property)
multi_check_df.groupby('class').size()

class
multi      5194
single    88894
dtype: int64

# checking weird regex problems

In [None]:
all_entities['label_text'].to_list()[0]

## Spread label data

In [20]:
#pivot the columns so that each label class is it's own column and the value in the column is the text

temp_df = all_entities[all_entities.datapoint_id.isin(all_multi_ids)]

temp_df['index'] = temp_df.index
df = temp_df[['index', 'label', 'label_text']].pivot(index='index',columns='label',values='label_text')
#add the datapoint_id back in for each of joining
df = pd.concat([temp_df['datapoint_id'], df], axis=1).merge(temp_df[['datapoint_id' ,'text']].drop_duplicates(), 
          how = "left",
          left_on = "datapoint_id", right_on = "datapoint_id")
del temp_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['index'] = temp_df.index


In [None]:
#df[df.text.str.contains('unit')==True].drop_duplicates()#.to_csv('/tf/empty_homes_data/delete_me.csv')

## Add blockers

Blockers prevent the filling of wrong information. As an example if a building is going to back fill up previous addresses it should not back fill past another street as this is highly unlikely to be the same building

In [21]:
df['building_name'][df['street_name'].notnull()] = 'block'
df['street_number'][df['street_name'].notnull()] = 'block' #for multi-flats inside a common building

#returns true if current number filter is null and the next row has street_number or unit id is not null
#prevents number filters propergsating back across roads and unit ids
number_filter_block = df['number_filter'].isnull() & (df['street_number'].shift().notnull() |df['unit_id'].shift().notnull())
df['number_filter'][number_filter_block] = 'block'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['building_name'][df['street_name'].notnull()] = 'block'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['street_number'][df['street_name'].notnull()] = 'block' #for multi-flats inside a common building
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['number_filter'][number_filter_block] = 'block'


## Backfill 

Backfilling adds address information in. However, street address should only be back filled for multi addresses.
I need to work out how to do flat, which may be before or after the unit ID
Also I don't think this is a very good way of doing it at all. Using the pre-spread list is probably better and only working on the multi-addresses is probably a much faster and cleaner way. But I will have to think about how to do it

In [22]:
df['number_filter'] = df[['datapoint_id','number_filter']].groupby('datapoint_id').fillna(method ='bfill')
df['building_name'] = df[['datapoint_id','building_name']].groupby('datapoint_id').fillna(method ='bfill')
df['street_number'] = df[['datapoint_id','street_number']].groupby('datapoint_id').fillna(method ='bfill')
df['postcode'] = df[['datapoint_id','postcode']].groupby('datapoint_id').fillna(method ='bfill')
df['street_name'] = df[['datapoint_id','street_name']].groupby('datapoint_id').fillna(method ='bfill')
df['number_filter'] = df[['datapoint_id','number_filter']].groupby('datapoint_id').fillna(method ='bfill')
df['city'] = df[['datapoint_id','city']].groupby('datapoint_id').fillna(method ='bfill')
df['unit_type'] = df[['datapoint_id','unit_type']].groupby('datapoint_id').fillna(method ='ffill') #should this will backwards or forwards? as mostly it is flat xx not xx flat?

In [None]:
df

In [23]:
expanded_street = df[df.datapoint_id.isin(multi_property) & df.street_number.str.contains(xx_to_yy_regex)].reset_index()
expanded_unit_id = df[df.datapoint_id.isin(multi_unit_id) & df.unit_id.str.contains(xx_to_yy_regex)].reset_index()

expanded_street = expand_dataframe_numbers(expanded_street, column_name = "street_number" )
expanded_unit_id = expand_dataframe_numbers(expanded_unit_id, column_name = "unit_id" )

  expanded_street = df[df.datapoint_id.isin(multi_property) & df.street_number.str.contains(xx_to_yy_regex)].reset_index()
  expanded_unit_id = df[df.datapoint_id.isin(multi_unit_id) & df.unit_id.str.contains(xx_to_yy_regex)].reset_index()


i= 0  expand time,0.001 filter time0.0 make_dataframe_time 0.001
i= 1000  expand time,0.203 filter time0.127 make_dataframe_time 0.985
i= 2000  expand time,0.387 filter time0.244 make_dataframe_time 1.873
i= 0  expand time,0.001 filter time0.0 make_dataframe_time 0.001
i= 1000  expand time,0.19 filter time0.122 make_dataframe_time 1.025


In [24]:
#unit id and street number that does does not have the xx to yy format and so has already been expanded by spreaing and backfilling
expanded_street_simple = df[df.datapoint_id.isin(multi_property) & (df.street_number.str.contains(xx_to_yy_regex)==False) & (df.street_number!='block')].reset_index()
expanded_unit_id_simple = df[df.datapoint_id.isin(multi_unit_id) & (df.unit_id.str.contains(xx_to_yy_regex)==False) & (df.unit_id!='block')].reset_index()

  expanded_street_simple = df[df.datapoint_id.isin(multi_property) & (df.street_number.str.contains(xx_to_yy_regex)==False) & (df.street_number!='block')].reset_index()
  expanded_unit_id_simple = df[df.datapoint_id.isin(multi_unit_id) & (df.unit_id.str.contains(xx_to_yy_regex)==False) & (df.unit_id!='block')].reset_index()


In [25]:
#pivot the columns so that each label class is it's own column and the value in the column is the text

#remove the multi-addresses
single_address_only =all_entities[~all_entities['datapoint_id'].isin(all_multi_ids)]
#remove all but the first instance of a label in the remaining instances
#this is because for single addresses there should be only a single label for each class
single_address_only =single_address_only[single_address_only['label_id_count']==0]
df2 = single_address_only.pivot(index='datapoint_id',columns='label',values='label_text')
#add the datapoint_id back in for each of joining
df2 = df2.merge(single_address_only[['datapoint_id', 'text']].drop_duplicates(), 
          how = "left",
          left_on = "datapoint_id", right_on = "datapoint_id")

df2

Unnamed: 0,datapoint_id,building_name,city,number_filter,postcode,street_name,street_number,unit_id,unit_type,text
0,0,westleigh lodge care home,leigh,,wn7 5jt,nel pan lane,,,,"westleigh lodge care home, nel pan lane, leigh..."
1,1,,manchester,,m1 3he,canal street,1a,,flat,"flat 1, 1a canal street, manchester (m1 3he)"
2,2,,manchester,,m3 4ay,regent road,1,,flat,"flat 201, 1 regent road, manchester (m3 4ay)"
3,3,,wigan,,wn4 9aa,gerard street,,,land,"land at 2a gerard street, ashton in makerfield..."
4,4,,manchester,,m15 4nz,worsley street,,111,unit,"unit 111, timber wharf, worsley street, manche..."
...,...,...,...,...,...,...,...,...,...,...
88889,94083,,london,,ec3m 7aw,lime street,52,,,"2nd floor, 52 lime street, london (ec3m 7aw)"
88890,94084,,london,,ec2m 3ad,bishopsgate,155,,,"155 bishopsgate, london (ec2m 3ad)"
88891,94085,,london,,ec3m 7af,lime street,52,,,"9th and 10th floors, 52 lime street, london (e..."
88892,94086,,london,,ec3a 7ba,bevis marks,6,,,"part of tenth floor, 6 bevis marks, london (ec..."


In [26]:
full_expanded_data = pd.concat([expanded_street, 
           expanded_unit_id, 
           expanded_street_simple, 
           expanded_unit_id_simple, 
           df2, ])

In [27]:
ocod_data =  pd.read_csv('/tf/empty_homes_data/' +
                    'OCOD_FULL_2022_02.csv',
                   encoding_errors= 'ignore').rename(columns = lambda x: x.lower().replace(" ", "_"))
#empty addresses cannot be used. however there are only three so not a problem
ocod_data = ocod_data.dropna(subset = 'property_address')
ocod_data.reset_index(inplace = True, drop = True)
ocod_data = ocod_data[['title_number', 'tenure', 'district', 'county',
       'region', 'multiple_address_indicator', 'price_paid', 'property_address']]

  ocod_data =  pd.read_csv('/tf/empty_homes_data/' +


In [28]:
full_expanded_data = full_expanded_data.merge(ocod_data, how = "left", left_on = "datapoint_id", right_index = True)
full_expanded_data['property_address'].str.lower().equals(full_expanded_data['text']) # When 'True' this shows the match works because the address string are equal

False

In [29]:
#Why is this 1 address not the same?

full_expanded_data.loc[~((full_expanded_data['text']==full_expanded_data['property_address'].str.lower())), ['property_address', 'text', 'datapoint_id']]

Unnamed: 0,property_address,text,datapoint_id
0,"202, Stanley Court 19-23, Stanley Street, Live...","202, stanley court, 19-23, stanley street, liv...",3102
1,"202, Stanley Court 19-23, Stanley Street, Live...","202, stanley court, 19-23, stanley street, liv...",3102
2,"202, Stanley Court 19-23, Stanley Street, Live...","202, stanley court, 19-23, stanley street, liv...",3102
3,"202, Stanley Court 19-23, Stanley Street, Live...","202, stanley court, 19-23, stanley street, liv...",3102
4,"202, Stanley Court 19-23, Stanley Street, Live...","202, stanley court, 19-23, stanley street, liv...",3102
8589,"40a, 40, 40¨, 42, 44 East Bond Street, 2, 4 Gr...","40a, 40, 40 ̈, 42, 44 east bond street, 2, 4 g...",57998
8590,"40a, 40, 40¨, 42, 44 East Bond Street, 2, 4 Gr...","40a, 40, 40 ̈, 42, 44 east bond street, 2, 4 g...",57998
8591,"40a, 40, 40¨, 42, 44 East Bond Street, 2, 4 Gr...","40a, 40, 40 ̈, 42, 44 east bond street, 2, 4 g...",57998
8592,"40a, 40, 40¨, 42, 44 East Bond Street, 2, 4 Gr...","40a, 40, 40 ̈, 42, 44 east bond street, 2, 4 g...",57998
8593,"40a, 40, 40¨, 42, 44 East Bond Street, 2, 4 Gr...","40a, 40, 40 ̈, 42, 44 east bond street, 2, 4 g...",57998


In [30]:
full_expanded_data.loc[:, 'property_address'].str.lower().loc[full_expanded_data['datapoint_id']==57998]

8589    40a, 40, 40¨, 42, 44 east bond street, 2, 4 gr...
8590    40a, 40, 40¨, 42, 44 east bond street, 2, 4 gr...
8591    40a, 40, 40¨, 42, 44 east bond street, 2, 4 gr...
8592    40a, 40, 40¨, 42, 44 east bond street, 2, 4 gr...
8593    40a, 40, 40¨, 42, 44 east bond street, 2, 4 gr...
8594    40a, 40, 40¨, 42, 44 east bond street, 2, 4 gr...
8595    40a, 40, 40¨, 42, 44 east bond street, 2, 4 gr...
Name: property_address, dtype: object

In [31]:
full_expanded_data['within_title_id'] = full_expanded_data.groupby('title_number').cumcount()+1
full_expanded_data['unique_id'] = [str(x) + '-' + str(y) for x, y in zip(full_expanded_data['title_number'], full_expanded_data['within_title_id'])]

tmp_df =((full_expanded_data[['title_number', 'within_title_id']].groupby('title_number').max('within_title_id'))>1)
tmp_df.columns = tmp_df.columns.str.replace('within_title_id', 'within_larger_title') #could also be called nested_address
full_expanded_data = full_expanded_data.merge(tmp_df, how = "left", left_on = "title_number", right_index = True)


full_expanded_data['postcode'] =full_expanded_data['postcode'].str.upper()
del tmp_df

#re-order the columns and drop columns that are not needed

full_expanded_data =full_expanded_data[['title_number', 'within_title_id', 'unique_id', 'within_larger_title',  'tenure','unit_id', 'unit_type','building_name','street_number', 'street_name', 'postcode','city',  'district', 'county', 'region',
       'multiple_address_indicator', 'price_paid' ,'property_address']].replace('block', np.NaN)

#save as CSV
full_expanded_data.to_csv("/tf/empty_homes_data/OCOD_cleaned_expanded2.csv")

In [33]:
full_expanded_data#[full_expanded_data['street_name'].isnull()]['property_address'].to_csv('/tf/empty_homes_data/street_is_null.csv')

Unnamed: 0,title_number,within_title_id,unique_id,within_larger_title,tenure,unit_id,unit_type,building_name,street_number,street_name,postcode,city,district,county,region,multiple_address_indicator,price_paid,property_address
0,CB400630,1,CB400630-1,True,Freehold,,,,2,miller way,,peterborough,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,N,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (..."
1,CB400630,2,CB400630-2,True,Freehold,,,,4,miller way,,peterborough,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,N,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (..."
2,CB400630,3,CB400630-3,True,Freehold,,,,6,miller way,,peterborough,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,N,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (..."
3,CB400630,4,CB400630-4,True,Freehold,,,,8,miller way,,peterborough,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,N,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (..."
4,CB400630,5,CB400630-5,True,Freehold,,,,10,miller way,,peterborough,CITY OF PETERBOROUGH,CITY OF PETERBOROUGH,EAST ANGLIA,N,,"2-24 (even) Miller Way, 15-25 Hammonds Drive (..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88889,AGL470128,1,AGL470128-1,False,Leasehold,,,,52,lime street,EC3M 7AW,london,CITY OF LONDON,GREATER LONDON,GREATER LONDON,N,,"2nd Floor, 52 Lime Street, London (EC3M 7AW)"
88890,AGL473126,1,AGL473126-1,False,Leasehold,,,,155,bishopsgate,EC2M 3AD,london,CITY OF LONDON,GREATER LONDON,GREATER LONDON,N,,"155 Bishopsgate, London (EC2M 3AD)"
88891,AGL473595,1,AGL473595-1,False,Leasehold,,,,52,lime street,EC3M 7AF,london,CITY OF LONDON,GREATER LONDON,GREATER LONDON,N,,"9th and 10th floors, 52 Lime Street, London (E..."
88892,AGL475468,1,AGL475468-1,False,Leasehold,,,,6,bevis marks,EC3A 7BA,london,CITY OF LONDON,GREATER LONDON,GREATER LONDON,N,,"part of Tenth Floor, 6 Bevis Marks, London (EC..."


In [None]:
full_expanded_data[full_expanded_data['property_address'].str.contains('Scottish Provident')]

In [None]:
300000*700