In [1]:
#! pip install -U spacy -q 

In [2]:
#!python -m spacy download en_core_web_lg

In [3]:
# Print the installed spaCy version, install location, platform, Python
# version and the pipelines available in this environment.
# NOTE(review): `!python` runs whichever interpreter is first on PATH, which is
# not necessarily the kernel's interpreter — confirm they are the same.
!python -m spacy info

[1m

spaCy version    3.7.2                         
Location         C:\Python310\lib\site-packages\spacy
Platform         Windows-10-10.0.19045-SP0     
Python version   3.10.10                       
Pipelines        en_core_web_lg (3.7.0), en_core_web_sm (3.7.0), en_core_web_trf (3.7.2)



In [4]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

In [5]:
import json

In [6]:
# Load the saved spaCy pipeline from disk; later cells use it for
# tokenization and entity recognition.
model_path = "Components/03HVAC/cpu_acc_new/model-best"
nlp_ner = spacy.load(model_path)

In [7]:
# Sample component descriptions used to probe the tokenizer.
# First pair: a dash followed by a space (' - 7.5') vs. a dash glued to the
# number (' -3.5').
test_text1, test_text2 = (
    'Heat Pump Package Evaporator/Air Handler - 7.5 TN',
    'Heat Pump Package Evaporator/Air Handler -3.5 TN',
)
# Second pair: a comma followed by a space (', 100') vs. a comma glued to the
# number (',126').
test_text3, test_text4 = (
    'Storage Tank - Galvanized steel, 400 gallon, 36" diameter, 100" L.O.A.',
    'Storage Tank - Galvanized steel, 500 gallon, 36" diameter,126" L.O.A.',
)

In [8]:
# List (rule, token) pairs for the sample with a spaced dash; the output
# shows '-' and '7.5' as separate tokens.
explained_tokens = nlp_ner.tokenizer.explain(test_text1)
print(explained_tokens)

[('TOKEN', 'Heat'), ('TOKEN', 'Pump'), ('TOKEN', 'Package'), ('TOKEN', 'Evaporator'), ('INFIX', '/'), ('TOKEN', 'Air'), ('TOKEN', 'Handler'), ('TOKEN', '-'), ('TOKEN', '7.5'), ('TOKEN', 'TN')]


In [9]:
# Same check for the sample with the dash glued to the number; with the
# pipeline's stock tokenizer rules the output shows '-3.5' kept as one token.
explained_tokens = nlp_ner.tokenizer.explain(test_text2)
print(explained_tokens)

[('TOKEN', 'Heat'), ('TOKEN', 'Pump'), ('TOKEN', 'Package'), ('TOKEN', 'Evaporator'), ('INFIX', '/'), ('TOKEN', 'Air'), ('TOKEN', 'Handler'), ('TOKEN', '-3.5'), ('TOKEN', 'TN')]


In [10]:
# List (rule, token) pairs for the sample with a spaced comma; the output
# shows 'diameter', ',' and '100' as separate tokens.
explained_tokens = nlp_ner.tokenizer.explain(test_text3)
print(explained_tokens)

[('TOKEN', 'Storage'), ('TOKEN', 'Tank'), ('TOKEN', '-'), ('TOKEN', 'Galvanized'), ('TOKEN', 'steel'), ('SUFFIX', ','), ('TOKEN', '400'), ('TOKEN', 'gallon'), ('SUFFIX', ','), ('TOKEN', '36'), ('SUFFIX', '"'), ('TOKEN', 'diameter'), ('SUFFIX', ','), ('TOKEN', '100'), ('SUFFIX', '"'), ('TOKEN', 'L.O.A.')]


In [11]:
# Same check for the sample with the comma glued to the number; with the
# pipeline's stock tokenizer rules the output shows 'diameter,126' kept as
# one token.
explained_tokens = nlp_ner.tokenizer.explain(test_text4)
print(explained_tokens)

[('TOKEN', 'Storage'), ('TOKEN', 'Tank'), ('TOKEN', '-'), ('TOKEN', 'Galvanized'), ('TOKEN', 'steel'), ('SUFFIX', ','), ('TOKEN', '500'), ('TOKEN', 'gallon'), ('SUFFIX', ','), ('TOKEN', '36'), ('SUFFIX', '"'), ('TOKEN', 'diameter,126'), ('SUFFIX', '"'), ('TOKEN', 'L.O.A.')]


In [12]:
from spacy.util import compile_prefix_regex, compile_infix_regex

# Custom infix: split a comma that sits between a letter and a digit,
# e.g. 'diameter,126' -> 'diameter', ',', '126'.
# No infix is defined for "dash preceded by whitespace": the tokenizer splits
# the text on whitespace before it searches for infixes, so a lookbehind such
# as (?<=\s) can never match inside a chunk. The ' -3.5' case is handled by
# the prefix rule below instead.
custom_infixes = [r'(?<=[a-zA-Z]),(?=\d)']
default_infixes = list(nlp_ner.Defaults.infixes)
all_infixes = custom_infixes + default_infixes

# Custom prefix: split a leading hyphen off a chunk, e.g. '-3.5' -> '-', '3.5'.
# The pattern matches a hyphen in front of anything, not only in front of a
# digit (so '-<22' is split as well).
custom_prefixes = [r'-']
default_prefixes = list(nlp_ner.Defaults.prefixes)
all_prefixes = custom_prefixes + default_prefixes

# Compile the combined rule lists.
# NOTE(review): the rules are rebuilt from the language defaults, not from the
# rules the loaded tokenizer currently uses — confirm the saved pipeline did
# not ship customised prefixes/infixes that this would discard.
prefix_regex = compile_prefix_regex(all_prefixes)
infix_regex = compile_infix_regex(all_infixes)

# Install the new rules on the loaded pipeline's tokenizer.
# NOTE(review): this changes tokenization after the pipeline was trained —
# verify the entity predictions are still acceptable with the new splits.
nlp_ner.tokenizer.prefix_search = prefix_regex.search
nlp_ner.tokenizer.infix_finditer = infix_regex.finditer



In [13]:
# Smoke-test the customised tokenizer on the glued-dash and glued-comma cases
test_sentences = [
    "Evaporator/Air Handler -3.5 TN",
    "diameter,126",
    "Evaporator/Air Handler -13.5 TN",
    "blah,326",
]
for sentence in test_sentences:
    # Run the full pipeline and keep only the token texts for display
    tokens = [token.text for token in nlp_ner(sentence)]
    print(f"'{sentence}' => {tokens}")

'Evaporator/Air Handler -3.5 TN' => ['Evaporator', '/', 'Air', 'Handler', '-', '3.5', 'TN']
'diameter,126' => ['diameter', ',', '126']
'Evaporator/Air Handler -13.5 TN' => ['Evaporator', '/', 'Air', 'Handler', '-', '13.5', 'TN']
'blah,326' => ['blah', ',', '326']


In [14]:
# Open and read the validation text file
with open("Components/hvac-validate-spaces.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# For each line: tokenize it, cross-check the tokens against
# tokenizer.explain(), list the rule behind every token, and render entities.
for line in lines:
    line = line.strip()  # Remove leading/trailing whitespace
    doc = nlp_ner(line)
    print([w.text for w in doc])
    tok_exp = nlp_ner.tokenizer.explain(line)
    # explain() reports no whitespace tokens, so drop them from the doc
    # before comparing the two token sequences.
    assert [t.text for t in doc if not t.is_space] == [t[1] for t in tok_exp]
    for t in tok_exp:
        # Separate token text and rule name with a real tab character
        # ("\\t" would print a literal backslash followed by 't').
        print(t[1], "\t", t[0])
    spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

['Heat', 'Pump', 'Package', 'Condenser', '/', 'Evaporator', '/', 'Air', 'Handler', '-', '5', 'TN']
Heat \t TOKEN
Pump \t TOKEN
Package \t TOKEN
Condenser \t TOKEN
/ \t INFIX
Evaporator \t TOKEN
/ \t INFIX
Air \t TOKEN
Handler \t TOKEN
- \t PREFIX
5 \t TOKEN
TN \t TOKEN


['Heat', 'Pump', 'Package', 'Condenser', '/', 'Evaporator', '/', 'Air', 'Handler', '-', '7.5', 'TN']
Heat \t TOKEN
Pump \t TOKEN
Package \t TOKEN
Condenser \t TOKEN
/ \t INFIX
Evaporator \t TOKEN
/ \t INFIX
Air \t TOKEN
Handler \t TOKEN
- \t PREFIX
7.5 \t TOKEN
TN \t TOKEN


['Heat', 'Pump', 'Package', 'Condenser', '/', 'Evaporator', '/', 'Air', 'Handler', '-', '3.5', 'TN']
Heat \t TOKEN
Pump \t TOKEN
Package \t TOKEN
Condenser \t TOKEN
/ \t INFIX
Evaporator \t TOKEN
/ \t INFIX
Air \t TOKEN
Handler \t TOKEN
- \t PREFIX
3.5 \t TOKEN
TN \t TOKEN


['Dehumidifier', '-', '1.5', 'lb./Hr', '.', ',', '50', 'CFM']
Dehumidifier \t TOKEN
- \t PREFIX
1.5 \t TOKEN
lb./Hr \t TOKEN
. \t SUFFIX
, \t SUFFIX
50 \t TOKEN
CFM \t TOKEN


['Electric', ',', 'Hot', 'Water', '-', '   ', '<', '22', 'KW', ',', '<', '78', 'MBH']
Electric \t TOKEN
, \t SUFFIX
Hot \t TOKEN
Water \t TOKEN
- \t PREFIX
< \t PREFIX
22 \t TOKEN
KW \t TOKEN
, \t SUFFIX
< \t PREFIX
78 \t TOKEN
MBH \t TOKEN


['Fan', 'System', ',', 'Wall', 'Exhaust', '-', '800', 'CFM']
Fan \t TOKEN
System \t TOKEN
, \t SUFFIX
Wall \t TOKEN
Exhaust \t TOKEN
- \t PREFIX
800 \t TOKEN
CFM \t TOKEN


['Fume', 'Hood', 'Exhaust', 'System', '-', '4', "'", ',', '2000', 'CFM']
Fume \t TOKEN
Hood \t TOKEN
Exhaust \t TOKEN
System \t TOKEN
- \t PREFIX
4 \t TOKEN
' \t SPECIAL-1
, \t SUFFIX
2000 \t TOKEN
CFM \t TOKEN


['Gas', '/', 'Oil', ',', 'Hot', 'Water', '-', ' ', '10,000', '-', '12,500', 'MBH']
Gas \t TOKEN
/ \t INFIX
Oil \t TOKEN
, \t SUFFIX
Hot \t TOKEN
Water \t TOKEN
- \t PREFIX
10,000 \t TOKEN
- \t INFIX
12,500 \t TOKEN
MBH \t TOKEN


['Gas', '/', 'Oil', ',', 'Hot', 'Water', '-', '>', '12,500', 'MBH']
Gas \t TOKEN
/ \t INFIX
Oil \t TOKEN
, \t SUFFIX
Hot \t TOKEN
Water \t TOKEN
- \t PREFIX
> \t PREFIX
12,500 \t TOKEN
MBH \t TOKEN


['Gas', '/', 'Oil', ',', 'Steam', '-', ' ', '<', '1000', 'MBH']
Gas \t TOKEN
/ \t INFIX
Oil \t TOKEN
, \t SUFFIX
Steam \t TOKEN
- \t PREFIX
< \t PREFIX
1000 \t TOKEN
MBH \t TOKEN


In [15]:
# Read every line of the reduced HVAC text file
with open("Components/03HVAC_reduced.txt", "r", encoding="utf-8") as file:
    lines = list(file)

# Run the pipeline on each stripped line and render its entities in Jupyter
for raw_line in lines:
    line = raw_line.strip()
    doc = nlp_ner(line)
    spacy.displacy.render(doc, style="ent", jupyter=True)

In [None]:
# One {label: text} mapping per input line, in file order
output_list = []

# Read every line of the reduced HVAC text file
with open("Components/03HVAC_reduced.txt", "r", encoding="utf-8") as file:
    lines = list(file)

for raw_line in lines:
    line = raw_line.strip()
    doc = nlp_ner(line)

    # Map each entity label to its text. When a line contains several
    # entities with the same label, only the last one is kept.
    entity_dict = {ent.label_: ent.text for ent in doc.ents}

    # Lines without entities contribute an empty mapping
    output_list.append(entity_dict)

# Serialise the collected mappings as indented JSON
output_json = json.dumps(output_list, indent=4)

# Write the JSON text to disk
with open("Components/03HVAC_reduced_cpu_acc_new_tokenizer.json", "w", encoding="utf-8") as output_file:
    output_file.write(output_json)

# Confirm where the result was written
print("JSON data saved to 'Components/03HVAC_reduced_cpu_acc_new_tokenizer.json'")

In [None]:
import re

def add_space_after_comma_or_dash(text):
    """Put exactly one space after a dash or comma that joins letters to digits.

    Looks for a run of ASCII letters, then '-' or ',', then at most one
    space, then digits, and rewrites the match with a single space after the
    separator (e.g. 'diameter,126' becomes 'diameter, 126'). Separators that
    are not directly preceded by a letter are left alone.

    Returns the rewritten string; text without such a pattern comes back
    unchanged.
    """
    # Groups: 1 = letters, 2 = separator, 3 = digits
    letters_sep_digits = re.compile(r'([a-zA-Z]+)([-,]) ?(\d+)')
    return letters_sep_digits.sub(r'\1\2 \3', text)