In [2]:
# Import spaCy and load the pre-trained English model.
# `nlp` is shared by the later cells: its vocab backs both matchers and it is
# called directly to turn raw text into Doc objects.
import spacy

nlp = spacy.load("en_core_web_sm")

In [None]:
from spacy.matcher import Matcher

# Token-pattern matcher built on the pipeline's vocabulary
matcher = Matcher(nlp.vocab)

In [18]:
# Two token-level spellings of the same term.
# pattern1: a single token whose lowercase form is "solarpower"
# (hits "SolarPower" in the demo text).
pattern1 = [
    {"LOWER": "solarpower"},
]
# pattern2: "solar", then one punctuation token, then "power"
# (hits "Solar-Power" in the demo text).
pattern2 = [
    {"LOWER": "solar"},
    {"IS_PUNCT": True},
    {"LOWER": "power"},
]

In [19]:
# Register both spellings under one rule name so either form reports "SolarPower"
matcher.add("SolarPower", [pattern1, pattern2])

In [17]:
# Demo text: three sentences containing fused compounds (e.g. "SolarPower")
# and hyphenated variants (e.g. "Solar-Power"); some terms occur more than once
text = (
    "SolarPower and WindEnergy are leading renewable energy companies. "
    "Solar-Power products use SmartGrid technology. "
    "Wind-Energy also uses SmartGrid for efficient PowerManagement and EcoFriendly solutions."
)
doc = nlp(text)

In [21]:
# Run the token matcher over the parsed text. Each result is a 3-tuple that the
# next cell unpacks as (match_id, start, end), with start/end used to slice `doc`.
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 0, 1), (8656102463236116519, 9, 12)]


In [29]:
# Report each hit as: numeric match ID, rule name, token offsets, matched text
for hit in found_matches:
    match_id, start, end = hit
    # Slice the doc by token offsets to recover the matched span
    span = doc[start:end]
    # Look the numeric ID up in the vocab's string table to get the rule name
    string_id = nlp.vocab.strings[match_id]
    print(f"{match_id} {string_id} {start:<2} {end:>2} {span}")

8656102463236116519 SolarPower 0   1 SolarPower
8656102463236116519 SolarPower 9  12 Solar-Power


### Phrase Matching with spaCy

In this example, we will demonstrate how to use spaCy's `PhraseMatcher` to find specific phrases in a text.

In [33]:
from spacy.matcher import PhraseMatcher

In [34]:
# Create phrase matcher on the same vocabulary as the pipeline.
# NOTE(review): this rebinds `matcher`, which previously held the token-based
# Matcher; a distinct name (e.g. `phrase_matcher`) would avoid surprises when
# cells are re-run out of order.
matcher = PhraseMatcher(nlp.vocab)

In [54]:
# Read the text file.
# The encoding is passed explicitly so the decoded text does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes sample_text.txt is UTF-8 encoded — confirm.
with open("sample_text.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Parse the file contents into a Doc for the phrase matcher to scan
sample_text_doc = nlp(text)

In [43]:
# Define phrase patterns: the literal phrases to look for in the sample text.
# Each string is converted to a Doc in a later cell before being registered.
phrase_list = [
    "Natural Language Processing",
    "Machine Learning",
    "NLP",
    "Deep Learning",
    "Text Preprocessing",
    "Named Entity Recognition",
    "Sentiment Analysis",
]

In [51]:
# Convert phrases to Doc patterns.
# The phrase matcher here is created without an `attr` argument, so it compares
# token text only; tokenizing with nlp.make_doc is therefore sufficient and
# skips the tagger/parser/NER work that a full nlp(...) call would do per phrase.
# (If the matcher is ever switched to an attribute such as LEMMA or POS, the
# patterns must go through the full pipeline again.)
phrase_patterns = [nlp.make_doc(phrase) for phrase in phrase_list]
phrase_patterns

[Natural Language Processing,
 Machine Learning,
 NLP,
 Deep Learning,
 Text Preprocessing,
 Named Entity Recognition,
 Sentiment Analysis]

In [52]:
# Check the type of a phrase pattern: each entry is a spaCy Doc, not a plain string
type(phrase_patterns[0])

spacy.tokens.doc.Doc

In [53]:
# Register every phrase Doc under a single rule key
matcher.add("PhraseMatcher", phrase_patterns)

In [57]:
# Find every occurrence of the registered phrases in the sample document.
# NOTE(review): the recorded output lists each span twice, under two different
# match IDs. That suggests the matcher still held patterns added under other
# keys during an earlier cell run; restart the kernel and run all cells to
# confirm a single hit per span.
found_matches = matcher(sample_text_doc)
found_matches

[(12217749092145805733, 0, 3),
 (13348250359072121662, 0, 3),
 (15832915187156881108, 11, 12),
 (13348250359072121662, 11, 12),
 (16378066519788692076, 18, 20),
 (13348250359072121662, 18, 20),
 (13348250359072121662, 21, 23),
 (17857678330435779591, 21, 23),
 (15832915187156881108, 27, 28),
 (13348250359072121662, 27, 28),
 (11488481882475214979, 29, 31),
 (13348250359072121662, 29, 31),
 (15832915187156881108, 34, 35),
 (13348250359072121662, 34, 35),
 (16295637461494666337, 37, 40),
 (13348250359072121662, 37, 40),
 (5041873368324210433, 41, 43),
 (13348250359072121662, 41, 43),
 (15832915187156881108, 45, 46),
 (13348250359072121662, 45, 46),
 (16378066519788692076, 48, 50),
 (13348250359072121662, 48, 50)]

In [59]:
# Turn each (match_id, start, end) hit into its span, then print the matched text
matched_spans = (sample_text_doc[start:end] for _, start, end in found_matches)
for span in matched_spans:
    print(f"Found: {span.text}")

Found: Natural Language Processing
Found: Natural Language Processing
Found: NLP
Found: NLP
Found: Machine Learning
Found: Machine Learning
Found: Deep Learning
Found: Deep Learning
Found: NLP
Found: NLP
Found: Text Preprocessing
Found: Text Preprocessing
Found: NLP
Found: NLP
Found: Named Entity Recognition
Found: Named Entity Recognition
Found: Sentiment Analysis
Found: Sentiment Analysis
Found: NLP
Found: NLP
Found: Machine Learning
Found: Machine Learning
