In [1]:
# Part One
text = 'The agents phone number is 408-555-1234. Call soon!'

In [2]:
'phone' in text

True

In [3]:
import re

In [4]:
pattern = 'phone'

In [5]:
re.search(pattern,text)

<re.Match object; span=(11, 16), match='phone'>

In [6]:
pattern = 'Not in text'

In [8]:
re.search(pattern,text)

In [9]:
# re.search returns None when there is no match, so nothing is displayed.
pattern = 'phone'

In [10]:
match = re.search(pattern,text)

In [11]:
match

<re.Match object; span=(11, 16), match='phone'>

In [13]:
match.span()

(11, 16)

In [14]:
match.start()

11

In [15]:
match.end()

16

In [16]:
text = 'My phone once, my phone twice.'

In [17]:
match = re.search('phone',text)

In [18]:
match

<re.Match object; span=(3, 8), match='phone'>

In [19]:
# re.search only returns the first match.
# Use re.findall to find all of the matches.
matches = re.findall('phone',text)

In [20]:
matches

['phone', 'phone']

In [21]:
len(matches)

2

In [23]:
# re.findall returns plain strings; use re.finditer to get the actual match objects.
for match in re.finditer('phone',text): # Iterates over the text, yielding a match object for each occurrence found.
    print(match.group()) # .group() returns the text that was matched.

phone
phone


In [24]:
# So far we have only been searching for literal strings.

In [34]:
# Part Two (The Key Part)
text = "My telephone number is 408-555-7777"

In [36]:
phone = re.search(r'\d\d\d-\d\d\d-\d\d\d\d',text)

In [37]:
phone

<re.Match object; span=(23, 35), match='408-555-7777'>

In [38]:
# Notice: phone is the same kind of match object we got from the earlier searches.
phone.group()

'408-555-7777'

In [39]:
# .group() is how you actually grab the phone number itself.
# Quantifiers such as {3} repeat the preceding token, so \d{3} means the same as \d\d\d.
phone = re.search(r'\d{3}-\d{3}-\d{4}',text)

In [40]:
phone

<re.Match object; span=(23, 35), match='408-555-7777'>

In [41]:
phone.group()

'408-555-7777'

In [42]:
# Now we want to match the whole number and also pull out its individual parts, so each part gets its own capture group ().
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})') # re.compile() turns the pattern string into a reusable regular expression object.

In [44]:
# Call search() on the compiled pattern itself instead of passing it back through re.search().
result = phone_pattern.search(text)

In [45]:
result.group()

'408-555-7777'

In [49]:
# Capture groups are numbered starting at 1 (group(0) is the entire match), unlike 0-based indexing.
result.group(1)

'408'

In [51]:
# Part Three (additional syntax)
re.search(r"man|woman","This man was here.")

<re.Match object; span=(5, 8), match='man'>

In [55]:
re.findall(r"at","The cat in the hat sat here.")

['at', 'at', 'at']

In [56]:
# To grab the letter in front of "at" as well
re.findall(r".at","The cat in the hat sat here.") # "." is a wildcard that matches any single character (except a newline).

['cat', 'hat', 'sat']

In [57]:
re.findall(r"...at","The bat went splat")

['e bat', 'splat']

In [58]:
# Starts with a number
re.findall(r'^\d','1 is the loneliest number.') # ^

['1']

In [59]:
# Keep in mind that ^ anchors the match to the start of the entire text; it will not find a number that merely appears somewhere inside it.

In [60]:
# Ends with a number
re.findall(r'\d$','This ends with a number 2') # $

['2']

In [61]:
phrase = "there are 3 numbers 34 inside 5 this sentence."

In [62]:
# Get back every character that isn't a digit.
re.findall(r'[^\d]',phrase)

['t',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e',
 '.']

In [63]:
re.findall(r'[^\d]+',phrase)

['there are ', ' numbers ', ' inside ', ' this sentence.']

In [64]:
test_phrase = 'This is a string! But it has punctuation. How can we remove it?'

In [65]:
# Effectively splits the text at every punctuation mark (!, ., ?) and space.
re.findall(r'[^!.? ]+',test_phrase)

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuation',
 'How',
 'can',
 'we',
 'remove',
 'it']

In [66]:
clean = re.findall(r'[^!.? ]+',test_phrase) # [^...] is a negated character set: it matches any character NOT listed inside the brackets.

In [67]:
clean

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuation',
 'How',
 'can',
 'we',
 'remove',
 'it']

In [68]:
' '.join(clean)

'This is a string But it has punctuation How can we remove it'

In [69]:
text = 'Only find the hypen-words in this sentence. But you do not know how long-ish they are'

In [73]:
pattern = r'[\w]+-[\w]+' # [] defines a character set; around a single \w the brackets are optional and are kept here only for readability.

In [74]:
re.findall(pattern,text)

['hypen-words', 'long-ish']

In [75]:
pattern = r'\w+-\w+' # Same pattern with the [] removed; \w+ on its own already means "one or more word characters".

In [76]:
# still get same result, but some coders will tell you that it looks hard to read.
re.findall(pattern,text)

['hypen-words', 'long-ish']

In [77]:
# Find words that start with 'cat' followed by one of these options: 'fish', 'nap', or 'erpillar'
text = 'Hello, would you like some catfish?'
texttwo = "Hello, would you like to take a catnap?"
textthree = "Hello, have you seen this caterpillar?"

In [82]:
# Combine an "or" (|) group with other pieces of literal text.
re.search(r'cat(fish|nap|erpillar)',textthree)

<re.Match object; span=(26, 37), match='caterpillar'>