# Regular Expressions in Python







In [35]:
import re
# Sample sentence that contains a phone number.
text= "The phone number of the agent is 409-444-1234. Call soon!"
# A fixed substring can be found with the plain `in` operator; no regex is required.
"409-444-1234" in text

True

In [36]:
# A literal pattern: matches the exact characters "phone".
pattern = "phone"

In [37]:
# re.search scans the string and returns a Match object for the first occurrence of the pattern.
re.search(pattern,text)

<re.Match object; span=(4, 9), match='phone'>

In [38]:
# Keep the Match object so its position can be inspected in the following cells.
my_match = re.search(pattern,text)

### We can see at what position "phone" appears in the text

In [39]:
# (start, end) indices of the match within text; the end index is exclusive.
my_match.span()

(4, 9)

In [40]:
# Index of the first matched character.
my_match.start()

4

In [41]:
# Index just past the last matched character.
my_match.end()

9

In [42]:
# New sample in which the pattern occurs twice.
text = "my phone is a new phone"

In [43]:
# re.search stops at the first occurrence, even when the pattern appears several times.
match = re.search(pattern,text)

In [44]:
# Only the position of the first "phone" is reported.
match.span()

(3, 8)

In [45]:
# findall returns an empty list when nothing matches.
re.findall("pepe",text)

[]

In [46]:
# findall returns every non-overlapping match as a list of strings.
re.findall("phone",text)

['phone', 'phone']

In [47]:
# finditer yields one Match object per occurrence, so each position can be read.
# A dedicated loop variable is used so that the notebook-level `match` object
# created by the earlier re.search cell is not overwritten by the last iteration.
for occurrence in re.finditer("phone", text):
    print(occurrence.span())

(3, 8)
(18, 23)


In [48]:
# Sample containing a phone number in ddd-ddd-dddd form.
text = "My telephone number is 777-555-1234"

In [49]:
# Display the current sample string.
text

'My telephone number is 777-555-1234'

### Identifiers

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >\d</span></td><td>A digit</td><td>file_\d\d</td><td>file_25</td></tr>

<tr ><td><span >\w</span></td><td>Alphanumeric (letters, digits and underscore)</td><td>\w-\w\w\w</td><td>A-b_1</td></tr>



<tr ><td><span >\s</span></td><td>White space</td><td>a\sb\sc</td><td>a b c</td></tr>



<tr ><td><span >\D</span></td><td>A non digit</td><td>\D\D\D</td><td>ABC</td></tr>

<tr ><td><span >\W</span></td><td>Non-alphanumeric</td><td>\W\W\W\W\W</td><td>*-+=)</td></tr>

<tr ><td><span >\S</span></td><td>Non-whitespace</td><td>\S\S\S\S</td><td>Yoyo</td></tr></table>


In [50]:
# Raw string so the backslashes reach the regex engine unchanged; each \d matches one digit.
pattern = r"\d\d\d-\d\d\d-\d\d\d\d"


In [51]:
# Search the sample for the first ddd-ddd-dddd sequence.
phone_number = re.search(pattern,text)

In [52]:
# The Match repr shows both the span and the matched text.
phone_number

<re.Match object; span=(23, 35), match='777-555-1234'>

In [53]:
# group() with no arguments returns the full matched string.
phone_number.group()

'777-555-1234'

### Quantifiers

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >+</span></td><td>Occurs one or more times</td><td>	Version \w-\w+</td><td>Version A-b1_1</td></tr>

<tr ><td><span >{3}</span></td><td>Occurs exactly 3 times</td><td>\D{3}</td><td>abc</td></tr>



<tr ><td><span >{2,4}</span></td><td>Occurs 2 to 4 times</td><td>\d{2,4}</td><td>123</td></tr>



<tr ><td><span >{3,}</span></td><td>Occurs 3 or more</td><td>\w{3,}</td><td>anycharacters</td></tr>

<tr ><td><span >\*</span></td><td>Occurs zero or more times</td><td>A\*B\*C*</td><td>AAACC</td></tr>

<tr ><td><span >?</span></td><td>Once or none</td><td>plurals?</td><td>plural</td></tr></table>

A more elegant way to find a telephone number using quantifiers

In [54]:
# Same phone-number pattern, written with {n} repetition counts instead of repeating \d.
pattern = r"\d{3}-\d{3}-\d{4}"
mymatch = re.search(pattern,text)
# The whole matched string.
mymatch.group()

'777-555-1234'

### We can use parentheses to access certain groups.

In [58]:
# Each pair of parentheses captures one part of the phone number as a numbered group.
pattern = r"(\d{3})-(\d{3})-(\d{4})"
mymatch = re.search(pattern, text)
# Numbered groups start at 1, in the order their opening parenthesis appears.
for group_number in (1, 2, 3):
    print(f"This is group {group_number}", mymatch.group(group_number))


This is group 1 777
This is group 2 555
This is group 3 1234


### Or operator: one vertical line (|) between alternatives

In [59]:
# `|` is alternation: the pattern matches either "man" or "woman".
# The leftmost match wins, so "woman" at index 5 is found rather than the "man" inside it.
re.search(r"man|woman","This woman was here")

<re.Match object; span=(5, 10), match='woman'>

### Wild card: `.` matches any single character

In [61]:
# `.` matches any single character (except a newline), so this finds any character followed by "at".
re.findall(r".at","The cat in the hat sat splat")


['cat', 'hat', 'sat', 'lat']

In [62]:
# Two wildcards: any two characters followed by "at" (a space counts as a character).
re.findall(r"..at","The cat in the hat sat splat")

[' cat', ' hat', ' sat', 'plat']

In [63]:
# `$` anchors the match to the end of the string: a digit counts only if it is the last character.
re.findall(r"\d$","This ends with a number 2")

['2']

In [64]:
# Sample mixing words and numbers.
phrase = "there are 3 numbers 34 inside 5 this sentence"

### Exclude any digits

In [65]:
# Inside [], a leading ^ negates the set: match any single character that is not a digit.
re.findall(r"[^\d]",phrase)

['t',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e']

### If you want to return consecutive non-digit characters together, use +

In [66]:
# Adding + groups each run of consecutive non-digit characters into one match.
re.findall(r"[^\d]+",phrase)

['there are ', ' numbers ', ' inside ', ' this sentence']

In [67]:
# Sample with punctuation to strip out.
test_phrase = "This is a string! but it has punctuation. How to remove it?"

In [70]:
# Runs of characters other than !, . and ?; inside [] these characters need no escaping.
re.findall(r"[^!.?]+",test_phrase)

['This is a string', ' but it has punctuation', ' How to remove it']

In [73]:
# Also exclude the space so that each match is a single word.
mylist = re.findall(r"[^!.? ]+",test_phrase)
mylist

['This',
 'is',
 'a',
 'string',
 'but',
 'it',
 'has',
 'punctuation',
 'How',
 'to',
 'remove',
 'it']

In [74]:
# Rebuild the sentence from the words, now without punctuation.
" ".join(mylist)

'This is a string but it has punctuation How to remove it'

In [78]:
# Sample with two hyphenated words.
text = "Only find the hyphen-words. Were are the long-ish dash words?"

In [80]:
# One or more word characters, a hyphen, then one or more word characters.
# \w is already a character class, so wrapping it in [] is unnecessary.
re.findall(r"\w+-\w+", text)

['hyphen-words', 'long-ish']