###########################################################
### Goal: parse a block of text and extract categorical data
###       e.g. "Address 123 Lester, Suite 567" => {'Address': "123 Lester", 'Suite': "567"}
###########################################################

In [1]:
# Sample input: category keywords followed by their values, then free-form prose.
# Adjacent string literals are concatenated, so the value is one single line of text.
text = (
    "Suite 08 Condominium Plot aBGH-1234 Lot 123 Leslie St. Size 45km^2, Unit 512; "
    "High elevation on walls and damp asphalt around building makes cleanup "
    "difficult during times of disturbance in the nature"
)

In [2]:
import spacy
from spacy.matcher import PhraseMatcher
# Requires the small English model; install it once with:
#   python -m spacy download en_core_web_sm

In [3]:
# Load the small English pipeline and run it over the sample text.
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)

# Show where the tokenizer split the text: every token is followed by ' | '
# and no trailing newline is written.
print(''.join(f"{token} | " for token in doc), end='')

Suite | 08 | Condominium | Plot | aBGH-1234 | Lot | 123 | Leslie | St. | Size | 45km^2 | , | Unit | 512 | ; | High | elevation | on | walls | and | damp | asphalt | around | building | makes | cleanup | difficult | during | times | of | disturbance | in | the | nature | 

In [4]:
# Match category keywords case-insensitively by comparing each token's LOWER attribute.
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
categories = ['Address', 'Suite', 'Condominium Unit', 'Unit', 'Plot', 'Lot', 'Condominium Plot', 'Size']

# nlp.make_doc only tokenizes. Matching on LOWER needs no tagger/parser/NER output,
# so running the full pipeline on every pattern would be wasted work.
patterns = [nlp.make_doc(category) for category in categories]
matcher.add('Categories', patterns)

# Each match is a (match_id, start, end) tuple; start/end are token indices into doc.
# Overlapping phrases are all reported (e.g. both 'Condominium Plot' and 'Plot').
matches = matcher(doc)
print(matches)

[(6837309499179236792, 0, 1), (6837309499179236792, 2, 4), (6837309499179236792, 3, 4), (6837309499179236792, 5, 6), (6837309499179236792, 9, 10), (6837309499179236792, 12, 13)]


In [5]:
# Report every match as "<rule name>: <matched text>", one line per match.
# Overlapping hits (e.g. 'Condominium Plot' and 'Plot') each get their own line.
for match_id, start, end in matches:
    rule_name = nlp.vocab.strings[match_id]  # look the match id up in the vocab's string store
    matched_span = doc[start:end]
    print(f"{rule_name}: {matched_span}")

Categories: Suite
Categories: Condominium Plot
Categories: Plot
Categories: Lot
Categories: Size
Categories: Unit
