In [None]:
import en_plumbing_ner
import spacy
import json

In [None]:
# Load the NER model used to parse the plumbing data

In [None]:
# Load the 'en_plumbing_ner' pipeline by its package name. The resulting
# `nlp_ner` object is reused below for tokenizer customisation and for
# tagging each input line.
nlp_ner = spacy.load('en_plumbing_ner')

In [None]:
# Custom tokenizer rules
from spacy.util import compile_prefix_regex, compile_infix_regex

# Extra infix patterns, placed ahead of the pipeline's default infixes:
#   1. a dash with whitespace before it and a digit after it
#   2. a comma with a letter before it and a digit after it
custom_infixes = [r'(?<=\s)-(?=\d)', r'(?<=[a-zA-Z]),(?=\d)']
default_infixes = list(nlp_ner.Defaults.infixes)
all_infixes = [*custom_infixes, *default_infixes]

# Extra prefix pattern, placed ahead of the default prefixes.
# Intended for a standalone dash in front of a digit; note that the pattern
# itself is a bare '-', so it matches any leading dash.
custom_prefixes = [r'-']
default_prefixes = list(nlp_ner.Defaults.prefixes)
all_prefixes = [*custom_prefixes, *default_prefixes]

# Compile the merged rule lists into single regexes
infix_regex = compile_infix_regex(all_infixes)
prefix_regex = compile_prefix_regex(all_prefixes)

# Install the compiled matchers on the loaded pipeline's tokenizer
nlp_ner.tokenizer.infix_finditer = infix_regex.finditer
nlp_ner.tokenizer.prefix_search = prefix_regex.search

In [None]:
# Define a function that collects the recognized entities into a dict (serialized to JSON later)

In [None]:
# Every entity label seen so far, accumulated across all json_output() calls.
properties = set()


def json_output(doc):
    """Map each entity label in ``doc`` to the text of that entity.

    Args:
        doc: An object exposing ``ents``, an iterable of entities that each
            have ``text`` and ``label_`` attributes.

    Returns:
        dict: ``{label: text}``. When a label occurs more than once in
        ``doc``, the last occurrence wins. Empty when ``doc`` has no
        entities. This is a plain dict, not a JSON string.

    Side effects:
        Adds every label found to the module-level ``properties`` set.
    """
    # Later entities overwrite earlier ones that share the same label.
    label_to_text = {span.label_: span.text for span in doc.ents}
    # The dict keys are exactly the labels encountered in this doc.
    properties.update(label_to_text)
    return label_to_text


In [None]:
from Components.disaggregate_units import disaggregate_units

# Read the plumbing descriptions, one candidate item per line
with open("plumbing-unique.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Run every non-blank line through the NER model.
#   results          : line text -> entity dict after disaggregate_units()
#   untagged_results : lines for which no entity was recognized
results = {}
untagged_results = []
for line in lines:
    line = line.strip()  # Remove leading/trailing whitespace
    if not line:
        # A blank line has nothing to tag; skipping it keeps empty entries
        # out of the untagged list (and out of plumbing-untagged.txt).
        continue
    doc = nlp_ner(line)
    result = json_output(doc)
    if result:
        results[line] = disaggregate_units(result)
    else:
        untagged_results.append(line)

# Persist the tagged lines; ensure_ascii=False keeps non-ASCII characters readable
with open("plumbing-results.json", "w", encoding="utf-8") as file:
    json.dump(results, file, ensure_ascii=False, indent=2)

In [None]:
# Save the lines the model could not tag, newline-separated (no trailing newline).
with open("plumbing-untagged.txt", "w", encoding="utf-8") as untagged_file:
    untagged_text = '\n'.join(untagged_results)
    untagged_file.write(untagged_text)

In [None]:
# Show every entity label encountered during tagging, in sorted order.
sorted_properties = sorted(properties)
print(sorted_properties)