# Overview of Regular Expressions

### Searching for Basic Patterns

In [1]:
text = "The person's phone number is 408-555-1234. Call soon!"

In [2]:
'phone' in text

True

In [3]:
import re

In [4]:
pattern = 'phone'

In [5]:
re.search(pattern,text)

<re.Match object; span=(13, 18), match='phone'>

In [6]:
pattern = 'NOT IN TEXT'

In [7]:
re.search(pattern, text)

In [8]:
pattern = 'phone'

In [9]:
match = re.search(pattern, text)

In [10]:
match

<re.Match object; span=(13, 18), match='phone'>

In [11]:
match.span()

(13, 18)

In [12]:
match.start()

13

In [13]:
match.end()

18

In [14]:
text = "my phone is a new phone"

In [15]:
match = re.search('phone',text)

In [16]:
match.span()

(3, 8)

In [17]:
matches = re.findall('phone',text)

In [18]:
matches

['phone', 'phone']

In [19]:
len(matches)

2

In [20]:
# finditer yields one Match object per occurrence, so every span can be inspected
for match in re.finditer('phone',text):
    print(match.span())

(3, 8)
(18, 23)


In [21]:
match.group()

'phone'

# Patterns

### Identifiers for Characters in Patterns

In [22]:
text = 'My telephone number is 408-555-1234'

In [23]:
phone = re.search(r'\d\d\d-\d\d\d-\d\d\d\d',text)

In [24]:
phone.group()

'408-555-1234'

### Quantifiers

In [25]:
re.search(r'\d{3}-\d{3}-\d{4}',text)

<re.Match object; span=(23, 35), match='408-555-1234'>

### Groups

In [26]:
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')

In [27]:
# phone_pattern is already compiled, so call its own search() method directly
# instead of routing the compiled object back through re.search().
results = phone_pattern.search(text)

In [29]:
# The entire result
results.group()

'408-555-1234'

In [31]:
# Individual groups can also be retrieved by position.
# Each pair of parentheses () in the pattern defines one group.
# Group numbering starts at 1; passing 0 returns the entire match.
results.group(1)

'408'

In [32]:
results.group(2)

'555'

In [33]:
results.group(3)

'1234'

In [34]:
# The pattern defines only three parenthesized groups, so asking for a fourth raises IndexError
results.group(4)

IndexError: no such group

### Additional Regex Syntax

##### Or operator |

In [35]:
re.search(r"man|woman","This man was here.")

<re.Match object; span=(5, 8), match='man'>

In [36]:
re.search(r"man|woman","This woman was here.")

<re.Match object; span=(5, 10), match='woman'>

### The Wildcard Character

In [37]:
re.findall(r".at","The cat in the hat sat here.")

['cat', 'hat', 'sat']

In [38]:
re.findall(r".at","The bat went splat")

['bat', 'lat']

In [39]:
re.findall(r"...at","The bat went splat")

['e bat', 'splat']

In [40]:
# One or more non-whitespace characters followed by 'at'
re.findall(r"\S+at","The bat went splat")

['bat', 'splat']

### Starts With and Ends With

In [41]:
# Ends with a number
re.findall(r'\d$','This ends with a number 2')

['2']

In [42]:
# Starts with a number
re.findall(r'^\d','1 is the loneliest number.')

['1']

### Exclusion

In [43]:
phrase = "there are 3 numbers 34 inside 5 this sentence."

In [44]:
re.findall(r'[^\d]',phrase)

['t',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e',
 '.']

In [45]:
re.findall(r'[^\d]+',phrase)

['there are ', ' numbers ', ' inside ', ' this sentence.']

In [46]:
test_phrase = "This is a string! But it has punctuation. How can we remove it?"

In [47]:
re.findall('[^!.? ]+',test_phrase)

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuation',
 'How',
 'can',
 'we',
 'remove',
 'it']

In [48]:
clean = ' '.join(re.findall('[^!.? ]+',test_phrase))

In [49]:
clean

'This is a string But it has punctuation How can we remove it'

### Brackets for Character Sets

In [50]:
text = 'Only find the hypen-words in this sentence. But you do not know how long-ish they are'

In [51]:
re.findall(r'[\w]+-[\w]+',text)

['hypen-words', 'long-ish']

### Parentheses for Multiple Options

In [52]:
# Find words that start with cat and end with one of these options: 'fish','nap', or 'claw'
text = 'Hello, would you like some catfish?'
texttwo = 'Hello, would you like to take a catnap?'
textthree = 'Hello, have you seen this caterpillar?'

In [53]:
re.search(r'cat(fish|nap|claw)',text)

<re.Match object; span=(27, 34), match='catfish'>

In [54]:
re.search(r'cat(fish|nap|claw)',texttwo)

<re.Match object; span=(32, 38), match='catnap'>

In [55]:
re.search(r'cat(fish|nap|claw)',textthree)