In [1]:
# Regular Expressions Library is used to search for patterns within text

In [2]:
# A regular expression can match a specific literal string, but it can also use special pattern codes

In [2]:
text = "The agent's phone number is 408-555-1234. Call soon!"

In [3]:
"phone" in text

True

In [8]:
import re

In [8]:
pattern = "phone"

In [9]:
re.search(pattern, text) #span displays where the pattern is found within the text

<re.Match object; span=(12, 17), match='phone'>

In [11]:
new_pattern = "not in text"

In [13]:
re.search(new_pattern, text) # returns None when the pattern is not found, so the notebook displays no output

In [14]:
match = re.search(pattern, text)

In [15]:
match

<re.Match object; span=(12, 17), match='phone'>

In [18]:
match.span() # (start, end) indices of the match within the searched text

(12, 17)

In [17]:
match.start()

12

In [19]:
match.end()

17

In [20]:
new_text = "hello i am saying hello to you hello"

In [21]:
match = re.search("hello", new_text)

In [23]:
match # only returns the first match

<re.Match object; span=(0, 5), match='hello'>

In [24]:
matches = re.findall("hello", new_text) # search for multiple matches

In [25]:
matches

['hello', 'hello', 'hello']

In [26]:
len(matches)

3

In [32]:
# Walk every "hello" occurrence; for each one show the match object,
# its (start, end) span and the matched text, one item per line.
for match in re.finditer("hello", new_text):
    print(match, match.span(), match.group(), sep="\n")

<re.Match object; span=(0, 5), match='hello'>
(0, 5)
hello
<re.Match object; span=(18, 23), match='hello'>
(18, 23)
hello
<re.Match object; span=(31, 36), match='hello'>
(31, 36)
hello


In [12]:
# Regular Expression Patterns & Quantifiers

In [13]:
# Patterns

# \d -- Digit               -- file_\d\d -- file_25
# \w -- Alphanumeric        -- \w-\w\w\w -- A-b_1
# \s -- White space         -- a\sb\sc   -- a b c
# \D -- Non-digit           -- \D\D\D    -- ABC
# \W -- Non-alphanumeric    -- \W\W\W\W  -- *-+=
# \S -- Non-whitespace      -- \S\S\S\S  -- Yoyo


# Quantifiers

# +     -- Occurs one or more times     -- Version \w-\w+  --  Version A-b1_1 
# {3}   -- Occurs exactly 3 times       -- \D{3}           --  abc
# {2,4} -- Occurs 2-4 times             -- \d{2,4}         --  123
# {3,}  -- Occurs 3 or more times       -- \w{3,}          --  anycharacters
# *     -- Occurs zero or more times    -- A*B*C*          --  AAACC
# ?     -- Occurs once or never         -- plurals?        --  plural


In [6]:
phone_number = "My phone number is 408-555-1234"

In [9]:
phone = re.search(r"\d\d\d-\d\d\d-\d\d\d\d", phone_number)

In [10]:
phone

<re.Match object; span=(19, 31), match='408-555-1234'>

In [11]:
phone.group()

'408-555-1234'

In [15]:
phone = re.search(r"\d{3}-\d{3}-\d{4}", phone_number)

In [16]:
phone

<re.Match object; span=(19, 31), match='408-555-1234'>

In [17]:
phone_pattern = re.compile(r"(\d{3})-(\d{3})-(\d{4})") # compile the pattern once for reuse; each (...) pair is a capture group that can be retrieved with .group(n)

In [18]:
# phone_pattern is already compiled, so call its own search() method
# instead of routing it back through the module-level re.search().
results = phone_pattern.search(phone_number)

In [19]:
results.group()

'408-555-1234'

In [20]:
results.group(1)

'408'

In [74]:
# Additional Regex Syntax

# IMPORTANT regarding "^":

# [^abc] -> not a, b or c
# [ab^cd] -> a, b, ^ (character), c or d
# \^ -> a ^ character
# Anywhere else (unescaped, outside [...]) -> anchors the match at the start of the string (or of each line with re.MULTILINE).

In [25]:
re.search(r"cat|dog", "The cat is here") # "or" operator

<re.Match object; span=(4, 7), match='cat'>

In [24]:
re.search(r"cat|dog", "The dog is here")

<re.Match object; span=(4, 7), match='dog'>

In [31]:
re.findall(r".at", "The cat in the hat just sat there.") # "." wildcard operator

['cat', 'hat', 'sat']

In [33]:
re.findall(r"^\d", "1 is a number.") # "^" anchors the pattern to the very start of the string

# If the string does NOT start with a match for the pattern, findall returns an empty list

['1']

In [35]:
re.findall(r"\d$", "The number is 2") # "$" anchors the pattern to the end of the string

['2']

In [58]:
# Group for EXCLUSION

In [37]:
phrase = "there are 3 numbers 34 inside 5 this sentence"

In [59]:
pattern = r"[^\d]+"  # [^...] matches any character NOT listed inside the brackets; with "+", each run of one or more non-digit characters

In [52]:
re.findall(pattern,phrase)

['there are ', ' numbers ', ' inside ', ' this sentence']

In [42]:
test_phrase = "This is a string! But it has punctuation. How can we remove it?"

In [67]:
clean = re.findall(r"[^!.? ]+", test_phrase) 
# match each run of one or more ("+") characters that are not "!", ".", "?" or a space, i.e. the individual words without punctuation

In [68]:
clean

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuation',
 'How',
 'can',
 'we',
 'remove',
 'it']

In [55]:
" ".join(clean).lower()
# clean out the punctuation and then join the sentence

'this is a string but it has punctuation how can we remove it'

In [57]:
# Group for INCLUSION

In [60]:
text = "Only find the hyphen-words in this sentence. But you do not know how long-ish they are"

In [65]:
pattern = r"[\w]+-[\w]+"

In [66]:
re.findall(pattern,text)

['hyphen-words', 'long-ish']

In [69]:
# Find words that start with cat and end with one of these options: 'fish','nap', or 'claw'
text = 'Hello, would you like some catfish?'
texttwo = "Hello, would you like to take a catnap?"
textthree = "Hello, have you seen this caterpillar?"

In [70]:
re.search(r"cat(fish|nap|claw)", text)

<re.Match object; span=(27, 34), match='catfish'>

In [71]:
re.search(r"cat(fish|nap|claw)", texttwo)

<re.Match object; span=(32, 38), match='catnap'>

In [73]:
re.search(r"cat(fish|nap|erpillar)", textthree)

<re.Match object; span=(26, 37), match='caterpillar'>