In [1]:
import pandas as pd
from enhance_ocod.inference import convert_to_entity_dataframe, parse_addresses_basic
from enhance_ocod.address_parsing import (
    load_and_prep_OCOD_data,
    parsing_and_expansion_process,
    post_process_expanded_data,
    load_postcode_district_lookup,
)
from pathlib import Path
import numpy as np

SCRIPT_DIR = Path("/teamspace/studios/this_studio/enhance_ocod/notebooks")

# ====== CONSTANT PATHS AND SETTINGS ======
input_dir = SCRIPT_DIR.parent / "data" / "ocod_history"
output_dir = SCRIPT_DIR.parent / "data" / "ocod_history_processed2"
model_path = (
    SCRIPT_DIR.parent / "models" / "address_parser_original_fullset" / "final_model"
)

In [2]:
address_str = "Flats 36 - 40 (even), 42-44, climb house, Flats 1-5 down buildings, chapel street, London, se45 6pq"

address_df = pd.DataFrame({'address':[address_str]}, index = [0])




In [3]:
example_out = parse_addresses_basic(
    address_df,
    model_path = model_path
) 

initial_df = convert_to_entity_dataframe(example_out)

expanded_df = parsing_and_expansion_process(all_entities=initial_df, expand_addresses=False)


Loading model and processing 1 addresses with batch_size=512


Device set to use cpu
Converting results: 100%|██████████| 1/1 [00:00<00:00, 5683.34it/s]

Processing 8 entities into DataFrame...
Computing label counts...
✓ Named Entity Recognition processing complete
Total entities extracted: 8
Added missing columns: ['street_number', 'unit_type', 'city']





In [4]:
initial_df

Unnamed: 0,datapoint_id,label,start,end,text,label_text,label_id_count
0,0,unit_id,5,13,"Flats 36 - 40 (even), 42-44, climb house, Flat...",36 - 40,0
1,0,number_filter,15,19,"Flats 36 - 40 (even), 42-44, climb house, Flat...",even,0
2,0,unit_id,21,27,"Flats 36 - 40 (even), 42-44, climb house, Flat...",42-44,1
3,0,building_name,28,40,"Flats 36 - 40 (even), 42-44, climb house, Flat...",climb house,0
4,0,unit_id,47,51,"Flats 36 - 40 (even), 42-44, climb house, Flat...",1-5,2
5,0,building_name,51,66,"Flats 36 - 40 (even), 42-44, climb house, Flat...",down buildings,1
6,0,street_name,67,81,"Flats 36 - 40 (even), 42-44, climb house, Flat...",chapel street,0
7,0,postcode,90,99,"Flats 36 - 40 (even), 42-44, climb house, Flat...",se45 6pq,0


In [39]:
example_out

{'summary': {'total_addresses': 1,
  'successful_parses': 1,
  'failed_parses': 0,
  'success_rate': 1.0,
  'batch_size_used': 512},
 'results': [{'row_index': 0,
   'datapoint_id': 0,
   'original_address': 'Flats 36 - 40 (even), 42-44, climb house, Flats 1-5 down buildings, chapel street, London, se45 6pq',
   'entities': [{'type': 'unit_id',
     'text': ' 36 - 40',
     'start': 5,
     'end': 13,
     'confidence': 0.9996221},
    {'type': 'number_filter',
     'text': 'even',
     'start': 15,
     'end': 19,
     'confidence': 0.999858},
    {'type': 'unit_id',
     'text': ' 42-44',
     'start': 21,
     'end': 27,
     'confidence': 0.9986492},
    {'type': 'building_name',
     'text': ' climb house',
     'start': 28,
     'end': 40,
     'confidence': 0.999928},
    {'type': 'unit_id',
     'text': ' 1-5',
     'start': 47,
     'end': 51,
     'confidence': 0.997581},
    {'type': 'building_name',
     'text': ' down buildings',
     'start': 51,
     'end': 66,
     'con

In [6]:
from enhance_ocod.address_parsing import (identify_multi_addresses, spread_address_labels, 
ensure_required_columns, add_backfill_blockers, backfill_address_labels, final_parsed_addresses)

In [18]:

required_columns = [
        "building_name",
        "street_name",
        "street_number",
        "number_filter",
        "unit_id",
        "unit_type",
        "city",
        "postcode",
    ]

# Continue with existing logic
multi_unit_id, multi_property, all_multi_ids = identify_multi_addresses(
    initial_df
)
spread_labels_df = spread_address_labels(initial_df, all_multi_ids)

# The columns are filled with an empty string as at the moment all columns should be strings
# Ensurinng string prevents errors later when cleaning is performed on street_name and other variables
# This is not being changed to default behaviour as I may need to implement more significant changes later
has_columns_df = ensure_required_columns(spread_labels_df, required_columns, "")

# Blockers prevent the filling of wrong information. As an example if a building is going to back fill up
# previous addresses it should not back fill past another street as this is highly unlikely to be the same building
blockers_df = add_backfill_blockers(has_columns_df)
backfilled_df = backfill_address_labels(blockers_df)

df = final_parsed_addresses(
    backfilled_df,
    initial_df,
    multi_property,
    multi_unit_id,
    all_multi_ids,
    expand_addresses=False,
)


Added missing columns: ['street_number', 'unit_type', 'city']


  out.loc[:, column_name] = out.loc[:, column_name].astype(str)


In [24]:
temp_df

Unnamed: 0,index,datapoint_id,building_name,number_filter,postcode,street_name,unit_id,text,street_number,unit_type,city
0,0,0,block,even,se45 6pq,chapel street,36,"Flats 36 - 40 (even), 42-44, climb house, Flat...",block,,
1,0,0,block,even,se45 6pq,chapel street,38,"Flats 36 - 40 (even), 42-44, climb house, Flat...",block,,
2,0,0,block,even,se45 6pq,chapel street,40,"Flats 36 - 40 (even), 42-44, climb house, Flat...",block,,


In [37]:
def spread_address_labelsx(df, all_multi_ids):
    """
    Spreads address labels into columns while preserving multiple units
    """
    # Take only rows which contain multiple properties
    temp_df = df[df.datapoint_id.isin(all_multi_ids)].copy()
    
    # Create a unique row identifier to prevent collapsing during pivot
    temp_df['row_id'] = temp_df.groupby('datapoint_id').cumcount()
    
    # Pivot with both datapoint_id AND row_id as index
    pivoted_df = temp_df.pivot_table(
        index=["datapoint_id", "row_id"], 
        columns="label", 
        values="label_text",
        aggfunc='first'
    ).reset_index()
    
    # Add back the text column
    pivoted_df = pivoted_df.merge(
        temp_df[["datapoint_id", "row_id", "text"]],
        on=["datapoint_id", "row_id"],
        how="left"
    )
    
    return pivoted_df

In [38]:
spread_address_labelsx(initial_df, all_multi_ids)

Unnamed: 0,datapoint_id,row_id,building_name,number_filter,postcode,street_name,unit_id,text
0,0,0,,,,,36 - 40,"Flats 36 - 40 (even), 42-44, climb house, Flat..."
1,0,1,,even,,,,"Flats 36 - 40 (even), 42-44, climb house, Flat..."
2,0,2,,,,,42-44,"Flats 36 - 40 (even), 42-44, climb house, Flat..."
3,0,3,climb house,,,,,"Flats 36 - 40 (even), 42-44, climb house, Flat..."
4,0,4,,,,,1-5,"Flats 36 - 40 (even), 42-44, climb house, Flat..."
5,0,5,down buildings,,,,,"Flats 36 - 40 (even), 42-44, climb house, Flat..."
6,0,6,,,,chapel street,,"Flats 36 - 40 (even), 42-44, climb house, Flat..."
7,0,7,,,se45 6pq,,,"Flats 36 - 40 (even), 42-44, climb house, Flat..."


In [22]:
spread_labels_df

Unnamed: 0,datapoint_id,building_name,number_filter,postcode,street_name,unit_id,text,street_number,unit_type,city
0,0,block,even,se45 6pq,chapel street,36 - 40,"Flats 36 - 40 (even), 42-44, climb house, Flat...",block,,


In [32]:
initial_df

Unnamed: 0,datapoint_id,label,start,end,text,label_text,label_id_count
0,0,unit_id,5,13,"Flats 36 - 40 (even), 42-44, climb house, Flat...",36 - 40,0
1,0,number_filter,15,19,"Flats 36 - 40 (even), 42-44, climb house, Flat...",even,0
2,0,unit_id,21,27,"Flats 36 - 40 (even), 42-44, climb house, Flat...",42-44,1
3,0,building_name,28,40,"Flats 36 - 40 (even), 42-44, climb house, Flat...",climb house,0
4,0,unit_id,47,51,"Flats 36 - 40 (even), 42-44, climb house, Flat...",1-5,2
5,0,building_name,51,66,"Flats 36 - 40 (even), 42-44, climb house, Flat...",down buildings,1
6,0,street_name,67,81,"Flats 36 - 40 (even), 42-44, climb house, Flat...",chapel street,0
7,0,postcode,90,99,"Flats 36 - 40 (even), 42-44, climb house, Flat...",se45 6pq,0


In [34]:
pd.DataFrame({'unit_id':['36-40', '42-44', '1-5'], 'number_filter':['even', np.nan, np.nan], 
'building_name':['climb_house', 'climb house', 'down buildings'], 
'street_name':['chapel street', 'chapel street', 'chapel street'],
'postocde':['se45 6pq', 'se45 6pq', 'se45 6pq']})

Unnamed: 0,unit_id,number_filter,building_name,street_name,postocde
0,36-40,even,climb_house,chapel street,se45 6pq
1,42-44,,climb house,chapel street,se45 6pq
2,1-5,,down buildings,chapel street,se45 6pq


In [12]:
all_entities = initial_df.copy()

xx_to_yy_regex = r"^\d+\s?(?:-|to)\s?\d+$"

multi_check_df = all_entities[
    [
        "datapoint_id",
        "text",
    ]
].drop_duplicates()


multi_check_df["comma_count"] = multi_check_df["text"].str.count(",")

multi_check_df["land"] = multi_check_df["text"].str.contains(
    r"^(?:land|plot|airspace|car|parking)", case=False
)

multi_check_df["business"] = multi_check_df["text"].str.contains(
    r"cinema|hotel|office|centre|\bpub|holiday\s?inn|travel\s?lodge|business|cafe|^shop| shop|restaurant|home|^stores?\b|^storage\b|company|ltd|limited|plc|retail|leisure|industrial|hall of|trading|commercial|works",
    case=False,
)

temp_df = (
    all_entities[["datapoint_id", "label"]]
    .groupby(["datapoint_id", "label"])
    .value_counts()
    .to_frame(name="counts")
    .reset_index()
    .pivot(index="datapoint_id", columns="label", values="counts")
    .fillna(0)
)

xx_to_yy_street_counts = (
    all_entities["datapoint_id"][
        all_entities["label_text"].str.contains(xx_to_yy_regex)
        & (all_entities["label"] == "street_number")
    ]
    .to_frame(name="datapoint_id")
    .groupby("datapoint_id")
    .size()
    .to_frame(name="xx_to_yy_street_counts")
)

xx_to_yy_unit_counts = (
    all_entities["datapoint_id"][
        all_entities["label_text"].str.contains(xx_to_yy_regex)
        & (all_entities["label"] == "unit_id")
    ]
    .to_frame(name="datapoint_id")
    .groupby("datapoint_id")
    .size()
    .to_frame(name="xx_to_yy_unit_counts")
)

multi_check_df = (
    multi_check_df.merge(
        temp_df, how="left", left_on="datapoint_id", right_index=True
    )
    .merge(
        xx_to_yy_street_counts, how="left", left_on="datapoint_id", right_index=True
    )
    .merge(
        xx_to_yy_unit_counts, how="left", left_on="datapoint_id", right_index=True
    )
    .fillna(0)
)

# Ensures the neccesary columns are present
required_columns = ["building_name", "unit_id", "street_number"]
for col in required_columns:
    if col not in multi_check_df.columns:
        multi_check_df[col] = 0

del xx_to_yy_street_counts
del xx_to_yy_unit_counts

# separate the classes using logical rules
multi_check_df["class"] = np.select(
    [
        multi_check_df["land"],  # Land/plot addresses are single properties
        multi_check_df[
            "business"
        ],  # Business addresses are typically single properties
        (multi_check_df["building_name"] == 1)
        & (
            multi_check_df["unit_id"] == 0
        ),  # Single building name without units = single property
        (multi_check_df["xx_to_yy_unit_counts"] > 0)
        | (
            multi_check_df["xx_to_yy_street_counts"] > 0
        ),  # Range patterns in unit IDs = multiple units
        multi_check_df["street_number"]
        > 1,  # Multiple street numbers = multiple properties
        multi_check_df["unit_id"] > 1,  # Multiple unit IDs = multiple units
        (multi_check_df["street_number"] <= 1)
        & (multi_check_df["xx_to_yy_street_counts"] == 0)
        & (
            multi_check_df["unit_id"] <= 1
        ),  # Single street number, no ranges, single/no unit = single property
    ],
    [
        "single",
        "single",
        "single",
        "multi",
        "multi",
        "multi",
        "single",
    ],
    default="unknown",  # Fallback for edge cases
)
# With the multiaddress dataframe created the required vectors can now be produced

multi_unit_id = set(
    multi_check_df["datapoint_id"][
        (multi_check_df["class"] == "multi") & (multi_check_df["unit_id"] > 0)
    ].tolist()
)
multi_property = set(
    multi_check_df["datapoint_id"][
        (multi_check_df["class"] == "multi") & (multi_check_df["unit_id"] == 0)
    ].tolist()
)
all_multi_ids = list(multi_unit_id) + list(multi_property)

In [13]:
multi_check_df

Unnamed: 0,datapoint_id,text,comma_count,land,business,building_name,number_filter,postcode,street_name,unit_id,xx_to_yy_street_counts,xx_to_yy_unit_counts,street_number,class
0,0,"Flats 36 - 40 (even), 42-44, climb house, Flat...",6,False,False,2,1,1,1,3,0.0,3,0,multi


In [2]:
test_results = {'test':1,
'results':[{'row_index': 51352,
 'datapoint_id': 51352,
 'original_address': 'Ground to ninth Floor Flats being 101-114, 201-214, 301-314, 401-414, 501-514, 601-613 and 701-704 Alaska Building, 101-114, 201-214,301-314, 401-412, 501-506 and 601-605 Arizona Building, 101-114, 201-214, 301-314, 401-414, 501-514, 601-614, 701-708, 801-804, 901-903 California Building, 101-108,     201-208, 301-307, 401-408, 501-508, 601-608, 701-708, 801-808 and 901-903 Colorado Building, 1-4, 101-109, 201-210, 301-310, 401-410, 501-510 and 601-605 Dakota Building, 1-7, 101-108, 201-208, 301-308, 401-408, 501-506 and 601-604 Idaho Building, 102-112, 201-212, 301-312, 401-412, 501-508 and 601-604 Indiana Building, 1-15, 101-116, 201-216, 301-315, 401-416, 501-510 Montana Building, 101-108, 201-208, 301-308, 401-408, 501-506 and 601-604 Nebraska Building, 1-10, 101-110, 201-210, 301-310 and 402-403 Utah Building, 1-10 and 101-110 Boston Building, 1-6, 101-106, 201-206, 301-306, 401-408 and 501-507 Madison Building, Deals Gateway, London',
 'entities': [{'type': 'unit_id',
   'text': ' 101-114',
   'start': 33,
   'end': 41,
   'confidence': 0.9991774},
  {'type': 'unit_id',
   'text': ' 201-214',
   'start': 42,
   'end': 50,
   'confidence': 0.9986568},
  {'type': 'unit_id',
   'text': ' 301-314',
   'start': 51,
   'end': 59,
   'confidence': 0.99845725},
  {'type': 'unit_id',
   'text': ' 401-414',
   'start': 60,
   'end': 68,
   'confidence': 0.9988417},
  {'type': 'unit_id',
   'text': ' 501-514',
   'start': 69,
   'end': 77,
   'confidence': 0.99846214},
  {'type': 'unit_id',
   'text': ' 601-613',
   'start': 78,
   'end': 86,
   'confidence': 0.9977751},
  {'type': 'unit_id',
   'text': ' 701-704',
   'start': 90,
   'end': 98,
   'confidence': 0.99753064},
  {'type': 'building_name',
   'text': ' Alaska Building',
   'start': 98,
   'end': 114,
   'confidence': 0.9985297},
  {'type': 'unit_id',
   'text': ' 101-114',
   'start': 115,
   'end': 123,
   'confidence': 0.9983933},
  {'type': 'unit_id',
   'text': ' 201-214',
   'start': 124,
   'end': 132,
   'confidence': 0.9979008},
  {'type': 'unit_id',
   'text': '301-314',
   'start': 133,
   'end': 140,
   'confidence': 0.9941249},
  {'type': 'unit_id',
   'text': ' 401-412',
   'start': 141,
   'end': 149,
   'confidence': 0.9984674},
  {'type': 'unit_id',
   'text': ' 501-506',
   'start': 150,
   'end': 158,
   'confidence': 0.99758714},
  {'type': 'unit_id',
   'text': ' 601-605',
   'start': 162,
   'end': 170,
   'confidence': 0.9971976},
  {'type': 'building_name',
   'text': ' Arizona Building',
   'start': 170,
   'end': 187,
   'confidence': 0.99408555},
  {'type': 'unit_id',
   'text': ' 101-114',
   'start': 188,
   'end': 196,
   'confidence': 0.9980994},
  {'type': 'unit_id',
   'text': ' 201-214',
   'start': 197,
   'end': 205,
   'confidence': 0.9982705},
  {'type': 'unit_id',
   'text': ' 301-314',
   'start': 206,
   'end': 214,
   'confidence': 0.9963898},
  {'type': 'unit_id',
   'text': ' 401-414',
   'start': 215,
   'end': 223,
   'confidence': 0.9972661},
  {'type': 'unit_id',
   'text': ' 501-514',
   'start': 224,
   'end': 232,
   'confidence': 0.9966486},
  {'type': 'unit_id',
   'text': ' 601-614',
   'start': 233,
   'end': 241,
   'confidence': 0.9970266},
  {'type': 'unit_id',
   'text': ' 701-708',
   'start': 242,
   'end': 250,
   'confidence': 0.99756354},
  {'type': 'unit_id',
   'text': ' 801-804',
   'start': 251,
   'end': 259,
   'confidence': 0.9975583},
  {'type': 'unit_id',
   'text': ' 901-903',
   'start': 260,
   'end': 268,
   'confidence': 0.98705745},
  {'type': 'building_name',
   'text': ' California Building',
   'start': 268,
   'end': 288,
   'confidence': 0.99044013},
  {'type': 'unit_id',
   'text': ' 101-108',
   'start': 289,
   'end': 297,
   'confidence': 0.9976041},
  {'type': 'unit_id',
   'text': '201-208',
   'start': 303,
   'end': 310,
   'confidence': 0.9965289},
  {'type': 'unit_id',
   'text': ' 301-307',
   'start': 311,
   'end': 319,
   'confidence': 0.9975515},
  {'type': 'unit_id',
   'text': ' 401-408',
   'start': 320,
   'end': 328,
   'confidence': 0.9982498},
  {'type': 'unit_id',
   'text': ' 501-508',
   'start': 329,
   'end': 337,
   'confidence': 0.99775153},
  {'type': 'unit_id',
   'text': ' 601-608',
   'start': 338,
   'end': 346,
   'confidence': 0.9981995},
  {'type': 'unit_id',
   'text': ' 701-708',
   'start': 347,
   'end': 355,
   'confidence': 0.9981017},
  {'type': 'unit_id',
   'text': ' 801-808',
   'start': 356,
   'end': 364,
   'confidence': 0.9964859},
  {'type': 'unit_id',
   'text': ' 901-903',
   'start': 368,
   'end': 376,
   'confidence': 0.9841796},
  {'type': 'building_name',
   'text': ' Colorado Building',
   'start': 376,
   'end': 394,
   'confidence': 0.99584436},
  {'type': 'unit_id',
   'text': ' 1-4',
   'start': 395,
   'end': 399,
   'confidence': 0.9979697},
  {'type': 'unit_id',
   'text': ' 101-109',
   'start': 400,
   'end': 408,
   'confidence': 0.99773353},
  {'type': 'unit_id',
   'text': ' 201-210',
   'start': 409,
   'end': 417,
   'confidence': 0.99771434},
  {'type': 'unit_id',
   'text': ' 301-310',
   'start': 418,
   'end': 426,
   'confidence': 0.99711686},
  {'type': 'unit_id',
   'text': ' 401-410',
   'start': 427,
   'end': 435,
   'confidence': 0.9976558},
  {'type': 'unit_id',
   'text': ' 501-510',
   'start': 436,
   'end': 444,
   'confidence': 0.99646354},
  {'type': 'unit_id',
   'text': ' 601-605',
   'start': 448,
   'end': 456,
   'confidence': 0.99705046},
  {'type': 'building_name',
   'text': ' Dakota Building',
   'start': 456,
   'end': 472,
   'confidence': 0.997578},
  {'type': 'unit_id',
   'text': ' 1-7',
   'start': 473,
   'end': 477,
   'confidence': 0.9981144},
  {'type': 'unit_id',
   'text': ' 101-108',
   'start': 478,
   'end': 486,
   'confidence': 0.9977694},
  {'type': 'unit_id',
   'text': ' 201-208',
   'start': 487,
   'end': 495,
   'confidence': 0.99808764},
  {'type': 'unit_id',
   'text': ' 301-308',
   'start': 496,
   'end': 504,
   'confidence': 0.9975129},
  {'type': 'unit_id',
   'text': ' 401-408',
   'start': 505,
   'end': 513,
   'confidence': 0.99805266},
  {'type': 'unit_id',
   'text': ' 501-506',
   'start': 514,
   'end': 522,
   'confidence': 0.99704987},
  {'type': 'unit_id',
   'text': ' 601-604',
   'start': 526,
   'end': 534,
   'confidence': 0.9968517},
  {'type': 'building_name',
   'text': ' Idaho Building',
   'start': 534,
   'end': 549,
   'confidence': 0.99742293},
  {'type': 'unit_id',
   'text': ' 102-112',
   'start': 550,
   'end': 558,
   'confidence': 0.997581},
  {'type': 'unit_id',
   'text': ' 201-212',
   'start': 559,
   'end': 567,
   'confidence': 0.9977842},
  {'type': 'unit_id',
   'text': ' 301-312',
   'start': 568,
   'end': 576,
   'confidence': 0.99773747},
  {'type': 'unit_id',
   'text': ' 401-412',
   'start': 577,
   'end': 585,
   'confidence': 0.9975605},
  {'type': 'unit_id',
   'text': ' 501-508',
   'start': 586,
   'end': 594,
   'confidence': 0.9962113},
  {'type': 'unit_id',
   'text': ' 601-604',
   'start': 598,
   'end': 606,
   'confidence': 0.99632424},
  {'type': 'building_name',
   'text': ' Indiana Building',
   'start': 606,
   'end': 623,
   'confidence': 0.9981582},
  {'type': 'unit_id',
   'text': ' 1-15',
   'start': 624,
   'end': 629,
   'confidence': 0.9979341},
  {'type': 'unit_id',
   'text': ' 101-116',
   'start': 630,
   'end': 638,
   'confidence': 0.9975262},
  {'type': 'unit_id',
   'text': ' 201-216',
   'start': 639,
   'end': 647,
   'confidence': 0.99808216},
  {'type': 'unit_id',
   'text': ' 301-315',
   'start': 648,
   'end': 656,
   'confidence': 0.9971941},
  {'type': 'unit_id',
   'text': ' 401-416',
   'start': 657,
   'end': 665,
   'confidence': 0.9978809},
  {'type': 'unit_id',
   'text': ' 501-510',
   'start': 666,
   'end': 674,
   'confidence': 0.99565387},
  {'type': 'building_name',
   'text': ' Montana Building',
   'start': 674,
   'end': 691,
   'confidence': 0.9937879},
  {'type': 'unit_id',
   'text': ' 101-108',
   'start': 692,
   'end': 700,
   'confidence': 0.99743795},
  {'type': 'unit_id',
   'text': ' 201-208',
   'start': 701,
   'end': 709,
   'confidence': 0.99765307},
  {'type': 'unit_id',
   'text': ' 301-308',
   'start': 710,
   'end': 718,
   'confidence': 0.99723357},
  {'type': 'unit_id',
   'text': ' 401-408',
   'start': 719,
   'end': 727,
   'confidence': 0.9979413},
  {'type': 'unit_id',
   'text': ' 501-506',
   'start': 728,
   'end': 736,
   'confidence': 0.9964202},
  {'type': 'unit_id',
   'text': ' 601-604',
   'start': 740,
   'end': 748,
   'confidence': 0.9958934},
  {'type': 'building_name',
   'text': ' Nebraska Building',
   'start': 748,
   'end': 766,
   'confidence': 0.9981447},
  {'type': 'unit_id',
   'text': ' 1-10',
   'start': 767,
   'end': 772,
   'confidence': 0.99691874},
  {'type': 'unit_id',
   'text': ' 101-110',
   'start': 773,
   'end': 781,
   'confidence': 0.9970587},
  {'type': 'unit_id',
   'text': ' 201-210',
   'start': 782,
   'end': 790,
   'confidence': 0.9973461},
  {'type': 'unit_id',
   'text': ' 301-310',
   'start': 791,
   'end': 799,
   'confidence': 0.99471074},
  {'type': 'unit_id',
   'text': ' 402-403',
   'start': 803,
   'end': 811,
   'confidence': 0.9956574},
  {'type': 'building_name',
   'text': ' Utah Building',
   'start': 811,
   'end': 825,
   'confidence': 0.9984679},
  {'type': 'unit_id',
   'text': ' 1-10',
   'start': 826,
   'end': 831,
   'confidence': 0.9936445},
  {'type': 'unit_id',
   'text': ' 101-110',
   'start': 835,
   'end': 843,
   'confidence': 0.99575347},
  {'type': 'building_name',
   'text': ' Boston Building',
   'start': 843,
   'end': 859,
   'confidence': 0.9987799},
  {'type': 'unit_id',
   'text': ' 1-6',
   'start': 860,
   'end': 864,
   'confidence': 0.99722934},
  {'type': 'unit_id',
   'text': ' 101-106',
   'start': 865,
   'end': 873,
   'confidence': 0.9975369},
  {'type': 'unit_id',
   'text': ' 201-206',
   'start': 874,
   'end': 882,
   'confidence': 0.9980733},
  {'type': 'unit_id',
   'text': ' 301-306',
   'start': 883,
   'end': 891,
   'confidence': 0.99768084},
  {'type': 'unit_id',
   'text': ' 401-408',
   'start': 892,
   'end': 900,
   'confidence': 0.99634665},
  {'type': 'unit_id',
   'text': ' 501-507',
   'start': 904,
   'end': 912,
   'confidence': 0.9965609},
  {'type': 'building_name',
   'text': ' Madison Building',
   'start': 912,
   'end': 929,
   'confidence': 0.9980187},
  {'type': 'street_name',
   'text': ' Deals Gateway',
   'start': 930,
   'end': 944,
   'confidence': 0.99961597},
  {'type': 'city',
   'text': ' London',
   'start': 945,
   'end': 952,
   'confidence': 0.84333366}],
 'parsed_components': {'unit_id': [' 101-114',
   ' 201-214',
   ' 301-314',
   ' 401-414',
   ' 501-514',
   ' 601-613',
   ' 701-704',
   ' 101-114',
   ' 201-214',
   '301-314',
   ' 401-412',
   ' 501-506',
   ' 601-605',
   ' 101-114',
   ' 201-214',
   ' 301-314',
   ' 401-414',
   ' 501-514',
   ' 601-614',
   ' 701-708',
   ' 801-804',
   ' 901-903',
   ' 101-108',
   '201-208',
   ' 301-307',
   ' 401-408',
   ' 501-508',
   ' 601-608',
   ' 701-708',
   ' 801-808',
   ' 901-903',
   ' 1-4',
   ' 101-109',
   ' 201-210',
   ' 301-310',
   ' 401-410',
   ' 501-510',
   ' 601-605',
   ' 1-7',
   ' 101-108',
   ' 201-208',
   ' 301-308',
   ' 401-408',
   ' 501-506',
   ' 601-604',
   ' 102-112',
   ' 201-212',
   ' 301-312',
   ' 401-412',
   ' 501-508',
   ' 601-604',
   ' 1-15',
   ' 101-116',
   ' 201-216',
   ' 301-315',
   ' 401-416',
   ' 501-510',
   ' 101-108',
   ' 201-208',
   ' 301-308',
   ' 401-408',
   ' 501-506',
   ' 601-604',
   ' 1-10',
   ' 101-110',
   ' 201-210',
   ' 301-310',
   ' 402-403',
   ' 1-10',
   ' 101-110',
   ' 1-6',
   ' 101-106',
   ' 201-206',
   ' 301-306',
   ' 401-408',
   ' 501-507'],
  'building_name': [' Alaska Building',
   ' Arizona Building',
   ' California Building',
   ' Colorado Building',
   ' Dakota Building',
   ' Idaho Building',
   ' Indiana Building',
   ' Montana Building',
   ' Nebraska Building',
   ' Utah Building',
   ' Boston Building',
   ' Madison Building'],
  'street_name': [' Deals Gateway'],
  'city': [' London']}}]
}

In [10]:
initial_df = convert_to_entity_dataframe(test_results)

expanded_df = parsing_and_expansion_process(all_entities=initial_df)


Processing 90 entities into DataFrame...
Computing label counts...
✓ Named Entity Recognition processing complete
Total entities extracted: 90
Added missing columns: ['street_number', 'number_filter', 'unit_type', 'postcode']


In [11]:
expanded_df

Unnamed: 0,index,datapoint_id,building_name,city,street_name,unit_id,text,street_number,number_filter,unit_type,postcode
0,0.0,51352,block,London,Deals Gateway,101-114,"Ground to ninth Floor Flats being 101-114, 201...",block,,,


In [None]:
ocod_data = post_process_expanded_data(expanded_df, ocod_data)

In [6]:
test

Unnamed: 0,datapoint_id,label,start,end,text,label_text,label_id_count
0,51352,unit_id,33,41,"Ground to ninth Floor Flats being 101-114, 201...",101-114,0
1,51352,unit_id,42,50,"Ground to ninth Floor Flats being 101-114, 201...",201-214,1
2,51352,unit_id,51,59,"Ground to ninth Floor Flats being 101-114, 201...",301-314,2
3,51352,unit_id,60,68,"Ground to ninth Floor Flats being 101-114, 201...",401-414,3
4,51352,unit_id,69,77,"Ground to ninth Floor Flats being 101-114, 201...",501-514,4
...,...,...,...,...,...,...,...
85,51352,unit_id,892,900,"Ground to ninth Floor Flats being 101-114, 201...",401-408,74
86,51352,unit_id,904,912,"Ground to ninth Floor Flats being 101-114, 201...",501-507,75
87,51352,building_name,912,929,"Ground to ninth Floor Flats being 101-114, 201...",Madison Building,11
88,51352,street_name,930,944,"Ground to ninth Floor Flats being 101-114, 201...",Deals Gateway,0
