In [1]:
# We already know we can search for substrings
# within a larger string with the in operator:
"dog" in "my dog is great"

True

In [2]:
# This has severe limitations: we need to know
# the exact string, and we need to perform additional
# operations to account for capitalization and
# punctuation.

# What if we only know the pattern structure of
# the string we're looking for? Like an email
# or phone number?

In [3]:
# Regular Expressions (regex) allow us to search
# for general patterns in text data!

# For example, a simple email format can be:
# user@email.com

# We know in this case we're looking for a pattern
# "text" + "@" + "text" + ".com"

In [4]:
# The re library allows us to create specialized
# pattern strings and then search for matches
# within text.

# The primary skillset for regex is understanding
# the special syntax for these pattern strings.

In [5]:
# Don't feel like you need to memorize these patterns!
# Regular expressions are notoriously difficult to
# memorize and understand. Focus on understanding how
# to look up the information.

In [6]:
# This series of lectures will first focus on how
# to use the re library to search for patterns
# within text.

# Afterwards we will focus on understanding the
# regex syntax codes.

In [7]:
# searching basic patterns
text = "The agent's phone number is 408-555-1234. Call soon!"

In [8]:
'phone' in text

True

In [9]:
# using regular expressions
import re

In [10]:
pattern = 'phone'

In [11]:
re.search(pattern, text) # returns a match object

<_sre.SRE_Match object; span=(12, 17), match='phone'>

In [12]:
pattern = "NOT IN TEXT"

In [13]:
re.search(pattern, text) # no match found: returns None, so nothing is displayed

In [14]:
# taking a closer look at match object
pattern = 'phone'

In [15]:
match = re.search(pattern, text)

In [16]:
match

<_sre.SRE_Match object; span=(12, 17), match='phone'>

In [17]:
match.span() # returns (start, end) indices of the match; end is exclusive

(12, 17)

In [18]:
match.start()

12

In [19]:
match.end()

17

In [20]:
# if we have multiple matches for the
# pattern string then we will unfortunately
# get only the first match

In [21]:
text = "my phone one, my phone two"

In [22]:
re.search('phone', text)

<_sre.SRE_Match object; span=(3, 8), match='phone'>

In [23]:
match # returns only the first match

<_sre.SRE_Match object; span=(12, 17), match='phone'>

In [24]:
# to get all the matches use the findall function;
# it returns a list of the matched strings (not match objects)
matches = re.findall('phone', text)

In [25]:
matches

['phone', 'phone']

In [26]:
len(matches)

2

In [27]:
# iterating through all the match objects using finditer
for match in re.finditer('phone', text):
    print(match)

<_sre.SRE_Match object; span=(3, 8), match='phone'>
<_sre.SRE_Match object; span=(17, 22), match='phone'>


In [28]:
for match in re.finditer('phone', text):
    print(match.span())

(3, 8)
(17, 22)


In [29]:
for match in re.finditer('phone', text):
    print(match.group()) # group() returns the actual text that was matched

phone
phone


In [30]:
# So far we have covered the critical
# functions (search, findall, finditer) of the re module.

In [31]:
# Next we will learn how to build actual regular expression
# syntax for general patterns.