# Text Extraction using Regular Expression

spaCy is used as the key library.

# 1)- Importing key Modules

In [1]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function
import warnings
warnings.filterwarnings('ignore')

In [2]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [3]:
# Load the 'en_core_web_sm' pipeline; its vocab is what the Matcher instances below are built on.
nlp = spacy.load('en_core_web_sm')

In [4]:
# Sample sentence containing both the standalone token "Google" and the phrase "Google I/O".
text = "Google announced a new Pixel at Google I/O Google I/O is a great place to get all updates from Google."

In [5]:
# Echo the sample sentence as the cell output.
text

'Google announced a new Pixel at Google I/O Google I/O is a great place to get all updates from Google.'

**We only need "Google I/O", not the other instances where "Google" is used, so the pattern is very specific.**

In [6]:
# One dict per token, matched on exact token text.
# "Google I/O" is tokenized as four tokens: Google, I, /, O.
pattern = [
    {'TEXT': 'Google'},
    {'TEXT': 'I'},
    {'TEXT': '/'},
    {'TEXT': 'O'},
]

**Defining callback**

In [7]:
def callback_method(matcher, doc, i, matches):
    """Print the text of the match that triggered this callback.

    Args:
        matcher: The matcher that produced the match (unused here).
        doc: The document being matched; sliced by token index.
        i: Position of the current match inside ``matches``.
        matches: Sequence of ``(match_id, start, end)`` tuples.
    """
    _, first_token, stop_token = matches[i]
    print(doc[first_token:stop_token].text)

In [8]:
# Build a matcher on the pipeline's vocab and register the pattern under the key 'Google'.
# Uses the add(key, [patterns], on_match=callback) signature (spaCy >= 2.2.2 and 3.x);
# the older positional form add(key, on_match, *patterns) was removed in spaCy 3.
matcher = Matcher(nlp.vocab)
matcher.add('Google', [pattern], on_match=callback_method)

In [9]:
# Run the pipeline over the sample sentence to get the doc the matcher will scan.
doc=nlp(text)

In [10]:
# Apply the matcher: the callback prints each matched span, and the cell value is
# the list of (match_id, start, end) tuples.
matcher(doc)

Google I/O
Google I/O


[(11578853341595296054, 6, 10), (11578853341595296054, 10, 14)]

### Find word Google

In all instances

In [11]:
# "Google" is required; each of the following tokens carries 'OP': '?',
# so it may be present or absent in a match.
pattern = [
    {'TEXT': 'Google'},
    {'TEXT': 'I', 'OP': '?'},
    {'TEXT': '/', 'OP': '?'},
    {'TEXT': 'O', 'OP': '?'},
]

**`'OP': '?'` makes a token optional: it may appear zero or one time in the match.**

In [12]:
def callback_method(matcher, doc, i, matches):
    """Print the text of the match that triggered this callback.

    Args:
        matcher: The matcher that produced the match (unused here).
        doc: The document being matched; sliced by token index.
        i: Position of the current match inside ``matches``.
        matches: Sequence of ``(match_id, start, end)`` tuples.
    """
    _, first_token, stop_token = matches[i]
    print(doc[first_token:stop_token].text)

In [13]:
# Fresh matcher so only the optional-token pattern is registered under 'Google'.
# Uses the add(key, [patterns], on_match=callback) signature (spaCy >= 2.2.2 and 3.x);
# the older positional form add(key, on_match, *patterns) was removed in spaCy 3.
matcher = Matcher(nlp.vocab)
matcher.add('Google', [pattern], on_match=callback_method)

In [14]:
# Parse the sample sentence so this section has a doc to match against.
doc = nlp(text)

In [15]:
# Apply the matcher: because the trailing tokens are optional, partial spans such as
# "Google I" and "Google I/" are reported alongside "Google" and "Google I/O".
matcher(doc)

Google
Google
Google I
Google I/
Google I/O
Google
Google I
Google I/
Google I/O
Google


[(11578853341595296054, 0, 1),
 (11578853341595296054, 6, 7),
 (11578853341595296054, 6, 8),
 (11578853341595296054, 6, 9),
 (11578853341595296054, 6, 10),
 (11578853341595296054, 10, 11),
 (11578853341595296054, 10, 12),
 (11578853341595296054, 10, 13),
 (11578853341595296054, 10, 14),
 (11578853341595296054, 23, 24)]

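The matcher returns every instance of "Google" together with its token span (start, end). Because the trailing tokens are optional, each "Google I/O" also yields the shorter matches "Google", "Google I" and "Google I/".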

### Another example

In [16]:
# Second sample text: mentions "Python 3.0", "Python 2.0" and bare "Python".
text="Python 3.0 is not backward-compatible on purpose. Thanks to that, you can benefit from a whole new set of features. ... Python 3.0, compared to Python 2.0. Python 3.0, also known as “Python 3000” or “Py3K”, is the first ever intentionally backwards incompatible Python release"

In [17]:
# Print the sample text so it is shown without the surrounding quotes of a repr.
print(text)

Python 3.0 is not backward-compatible on purpose. Thanks to that, you can benefit from a whole new set of features. ... Python 3.0, compared to Python 2.0. Python 3.0, also known as “Python 3000” or “Py3K”, is the first ever intentionally backwards incompatible Python release


**A pattern for only Python 3.0**

In [18]:
# Two consecutive tokens, matched on exact text: "Python" then "3.0".
pattern = [
    {'TEXT': 'Python'},
    {'TEXT': '3.0'},
]

In [19]:
def callback_method(matcher, doc, i, matches):
    """Print the text of the match that triggered this callback.

    Args:
        matcher: The matcher that produced the match (unused here).
        doc: The document being matched; sliced by token index.
        i: Position of the current match inside ``matches``.
        matches: Sequence of ``(match_id, start, end)`` tuples.
    """
    _, first_token, stop_token = matches[i]
    print(doc[first_token:stop_token].text)

In [20]:
# Fresh matcher with the "Python 3.0" pattern registered under the key 'Python'.
# Uses the add(key, [patterns], on_match=callback) signature (spaCy >= 2.2.2 and 3.x);
# the older positional form add(key, on_match, *patterns) was removed in spaCy 3.
matcher = Matcher(nlp.vocab)
matcher.add('Python', [pattern], on_match=callback_method)

In [21]:
# Parse the Python sample text; `text` was reassigned, so a new doc is needed.
doc=nlp(text)

In [22]:
# Apply the matcher: the callback prints each "Python 3.0" span, and the cell value is
# the list of (match_id, start, end) tuples.
matcher(doc)

Python 3.0
Python 3.0
Python 3.0


[(15328717830860514303, 0, 2),
 (15328717830860514303, 26, 28),
 (15328717830860514303, 34, 36)]

**Create a pattern for all "Python" instances, with or without a version number**

In [23]:
# "Python" is required; the version tokens "3.0" and "2.0" each carry 'OP': '?',
# so a bare "Python" matches as well as "Python 3.0" / "Python 2.0".
pattern = [
    {'TEXT': 'Python'},
    {'TEXT': '3.0', 'OP': '?'},
    {'TEXT': '2.0', 'OP': '?'},
]

In [24]:
def callback_method(matcher, doc, i, matches):
    """Print the text of the match that triggered this callback.

    Args:
        matcher: The matcher that produced the match (unused here).
        doc: The document being matched; sliced by token index.
        i: Position of the current match inside ``matches``.
        matches: Sequence of ``(match_id, start, end)`` tuples.
    """
    _, first_token, stop_token = matches[i]
    print(doc[first_token:stop_token].text)

In [25]:
# Fresh matcher with the optional-version pattern registered under the key 'Py'.
# Uses the add(key, [patterns], on_match=callback) signature (spaCy >= 2.2.2 and 3.x);
# the older positional form add(key, on_match, *patterns) was removed in spaCy 3.
matcher = Matcher(nlp.vocab)
matcher.add('Py', [pattern], on_match=callback_method)

In [26]:
# Parse the Python sample text so this section has a doc to match against.
doc=nlp(text)

In [27]:
# Apply the matcher: with optional version tokens, a versioned mention is reported
# twice (bare "Python" and "Python <version>"), as the printed spans show.
matcher(doc)

Python
Python 3.0
Python
Python 3.0
Python
Python 2.0
Python
Python 3.0
Python
Python


[(5518491823875680071, 0, 1),
 (5518491823875680071, 0, 2),
 (5518491823875680071, 26, 27),
 (5518491823875680071, 26, 28),
 (5518491823875680071, 31, 32),
 (5518491823875680071, 31, 33),
 (5518491823875680071, 34, 35),
 (5518491823875680071, 34, 36),
 (5518491823875680071, 41, 42),
 (5518491823875680071, 57, 58)]