In [1]:
# The re module contains all of the regex functions
import re

In [2]:
# re.compile() turns the pattern text into a reusable Regex object.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
# search() returns a Match object describing the first occurrence in the text.
mo = phoneNumRegex.search("My mobile number is 454-555-6654")
print(f"Mobile Number Found: {mo.group(0)}")

Mobile Number Found: 454-555-6654


In [3]:
# Grouping with parentheses (): each (...) pair captures one part of the match
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search("My mobile number is 454-555-6654")
# Group 1 is the first capture, group 2 the second, group 0 the whole match.
print(*(mo.group(index) for index in (1, 2, 0)), sep="\n")
# groups() returns every capture at once as a tuple ...
print(mo.groups())
# ... which makes it convenient to unpack into separate names.
areaCode, mainNumber = mo.group(1, 2)
print(f"The area code is {areaCode}")
print(f"The main number is {mainNumber}")

454
555-6654
454-555-6654
('454', '555-6654')
The area code is 454
The main number is 555-6654


In [4]:
# Escaping parentheses: \( and \) match literal parenthesis characters,
# while the unescaped pair around them still forms a capture group.
phoneNumRegex = re.compile(r'(\(\d\d\d\))-(\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search("My mobile number is (454)-555-6654")
print(*mo.groups(), sep="\n")

(454)
555-6654


In [5]:
# Matching one of several alternatives with the pipe (|).
# When both alternatives occur in the text, search() returns whichever
# occurrence comes first in the searched string.
heroRegex = re.compile(r'Batman|Tina Frey')
mo1 = heroRegex.search("Batman and Tina Frey")
mo2 = heroRegex.search("Tina Frey and Batman")
print(mo1.group(0), mo2.group(0), sep="\n")

Batman
Tina Frey


In [6]:
# Alternatives inside parentheses: the shared prefix 'Bat' is written once and
# the group captures whichever suffix actually matched.
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search("Batmobile lost a wheel")
print(mo.group(0))  # the whole match
print(mo[1])        # only the text captured by the group

Batmobile
mobile


In [7]:
# Optional matching with the question mark (?): the preceding group may
# appear once or not at all.
batRegex = re.compile(r'Bat(wo)?man')
mo1 = batRegex.search("The Adventures of Batman")
mo2 = batRegex.search("The Adventures of Batwoman")
print(mo1.group(0), mo2.group(0), sep="\n")

# Same idea applied to a phone number whose area code is optional.
phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
mo1 = phoneRegex.search("My number is 343-464-6343")
mo2 = phoneRegex.search("My number is 443-6546")
print(mo1.group(0), mo2.group(0), sep="\n")

Batman
Batwoman
343-464-6343
443-6546


In [8]:
# Matching zero or more repetitions with the star (*)
batRegex = re.compile(r'Bat(wo)*man')
# The group may be absent, appear once, or repeat any number of times.
mo1, mo2, mo3 = (
    batRegex.search(title)
    for title in (
        "The Adventures of Batman",
        "The Adventures of Batwoman",
        "The Adventures of Batwowowoman",
    )
)
print(mo1.group(), mo2.group(), mo3.group(), sep="\n")

Batman
Batwoman
Batwowowoman


In [9]:
# Matching one or more repetitions with the plus (+)
batRegex = re.compile(r'Bat(wo)+man')
# 'Batman' has no 'wo' at all, so search() finds nothing and returns None.
mo1 = batRegex.search("The Adventures of Batman")
# Compare against None with `is`, the idiomatic identity test for singletons.
print(mo1 is None)
mo2 = batRegex.search("The Adventures of Batwoman")
print(mo2.group())
mo3 = batRegex.search("The Adventures of Batwowowoman")
print(mo3.group())

True
Batwoman
Batwowowoman


In [10]:
# Matching a specific number of repetitions with braces {}

# {3}: exactly three repetitions
haRegex = re.compile(r'(Ha){3}')
mo = haRegex.search("HaHaHa")
print(mo[0])

# {3,}: three or more repetitions
haRegex = re.compile(r'(Ha){3,}')
mo = haRegex.search("HaHaHaHaHa")
print(mo[0])

# {,3}: anywhere from zero to three repetitions
haRegex = re.compile(r'(Ha){,3}')
mo = haRegex.search("HaHa")
print(mo[0])

# {3,5}: between three and five repetitions; the text has seven, so the
# match stops after the fifth
haRegex = re.compile(r'(Ha){3,5}')
mo = haRegex.search("HaHaHaHaHaHaHa")
print(mo[0])

HaHaHa
HaHaHaHaHa
HaHa
HaHaHaHaHa


In [11]:
# Greedy versus non-greedy matching
# A quantifier is greedy by default and takes as many repetitions as allowed;
# adding a trailing ? makes it non-greedy so it takes as few as allowed.
greedyHaRegrex = re.compile(r'(Ha){3,5}')
nongreedyHaRegex = re.compile(r'(Ha){3,5}?')

mo1 = greedyHaRegrex.search("HaHaHaHaHa")
print(mo1.group(0))

mo2 = nongreedyHaRegex.search("HaHaHaHaHa")
print(mo2.group(0))

# The ? that marks a quantifier as non-greedy and the ? that marks a group as
# optional are two unrelated meanings of the same symbol.

HaHaHaHaHa
HaHaHa


In [12]:
# findall(): search() reports only the first match, findall() reports them all
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
mo = phoneNumRegex.search("Cell: 415-544-6545 Work: 546-654-4636")
print(mo.group(0))
# The pattern has no groups, so findall() gives a list of matched strings.
print(phoneNumRegex.findall("Cell: 415-544-6545 Work: 546-654-4636"))

# The pattern has groups, so findall() gives a list of tuples instead,
# with one string per group.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)')
print(phoneNumRegex.findall("Cell: 415-544-6545 Work: 546-654-4636"))




415-544-6545
['415-544-6545', '546-654-4636']
[('415', '544', '6545'), ('546', '654', '4636')]


In [13]:
# ------------------ Character Classes --------------------

# Shorthand character classes
#   \d  a digit 0 to 9                       \D  anything that is not a digit
#   \w  a letter, digit, or the underscore   \W  anything that \w does not match
#   \s  a space, tab, or newline             \S  anything that \s does not match

# One or more digits, one whitespace character, then one or more word characters.
xmasRegex = re.compile(r'\d+\s\w+')
print(xmasRegex.findall(
    '12 drummers, 11 pipers, 10 lords, 9 ladies, '
    '8 maids, 7 swans, 6 geese, 5 rings, '
    '4 birds, 3 hens, 2 doves, 1 partridge'
))

# User-defined character classes []: match any one of the listed characters
vowelRegex = re.compile(r'[aeiouAEIOU]')
print(vowelRegex.findall("RoboCop eats all baby food. BABY FOOD."))

# Ranges with a hyphen (-): e.g. [a-zA-Z0-9] covers all letters and digits

# Negation: a leading ^ inside the brackets matches every character NOT listed,
# so the result also contains spaces and periods, not only consonants.
consonantRegex = re.compile(r'[^aeiouAEIOU]')
print(consonantRegex.findall("RoboCop eats all baby food. BABY FOOD."))



['12 drummers', '11 pipers', '10 lords', '9 ladies', '8 maids', '7 swans', '6 geese', '5 rings', '4 birds', '3 hens', '2 doves', '1 partridge']
['o', 'o', 'o', 'e', 'a', 'a', 'a', 'o', 'o', 'A', 'O', 'O']
['R', 'b', 'C', 'p', ' ', 't', 's', ' ', 'l', 'l', ' ', 'b', 'b', 'y', ' ', 'f', 'd', '.', ' ', 'B', 'B', 'Y', ' ', 'F', 'D', '.']


In [14]:
# Caret (^) and dollar ($) sign characters - match at the beginning and at the end
beginsWithHello = re.compile(r'^Hello')
mo1 = beginsWithHello.search("Hello World")
print(mo1.group())
# No match: the text does not start with 'Hello' (and 'hello' differs in case).
mo2 = beginsWithHello.search("Hi said hello")
# Compare against None with `is`, the idiomatic identity test for singletons.
print(mo2 is None)

# \d$ asks for a single digit right at the end, so only the last digit matches.
endsWithNumber = re.compile(r'\d$')
mo1 = endsWithNumber.search("Your number is 43")
print(mo1.group())
mo2 = endsWithNumber.search("Your number is Forty Three")
print(mo2 is None)

# ^ and $ together force the entire string to consist of digits.
wholeStringIsNum = re.compile(r'^\d+$')
mo1 = wholeStringIsNum.search('1234567890')
print(mo1.group())
mo2 = wholeStringIsNum.search('123abc890')
print(mo2 is None)
mo3 = wholeStringIsNum.search('1234  7890')
print(mo3 is None)

Hello
True
3
True
1234567890
True
True


In [15]:
# Wildcard character (.): stands for any single character except a newline.
# Because the dot covers exactly one character, 'flat' contributes only 'lat'.
atRegex = re.compile(r'.at')
print(atRegex.findall("The cat in the hat sat on the flat mat"))

# Dot-star (.*): "match everything" - zero or more of any non-newline character
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mo = nameRegex.search("First Name: Gorno Last Name: Jelovce")
print(mo.group(1, 2))

# Greedy versus non-greedy dot-star: .* runs to the last '>' in the text,
# while .*? stops at the first one.
greedyRegex = re.compile(r'<.*>')
nongreedyRegex = re.compile(r'<.*?>')

mo1 = greedyRegex.search("<To serve man> for dinner>")
print(mo1.group(0))

mo2 = nongreedyRegex.search("<To serve man> for dinner>")
print(mo2.group(0))

# Matching newlines with the dot: by default .* stops at the first newline;
# with re.DOTALL the dot matches newlines too, so .* spans all three lines.
noNewlineRegex = re.compile('.*')
newlineRegex = re.compile('.*', re.DOTALL)

mo1 = noNewlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.')
print(mo1.group(0))

mo2 = newlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.')
print(mo2.group(0))

['cat', 'hat', 'sat', 'lat', 'mat']
('Gorno', 'Jelovce')
<To serve man> for dinner>
<To serve man>
Serve the public trust.
Serve the public trust.
Protect the innocent.
Uphold the law.


In [16]:
# Case-insensitive matching: re.IGNORECASE (short form re.I) makes the pattern
# match regardless of letter case; the match keeps the casing found in the text.
robocop = re.compile(r'robocop', re.IGNORECASE)
mo1, mo2, mo3 = (
    robocop.search(sentence)
    for sentence in (
        "RoboCop is part man, part machine, all cop.",
        "ROBOCOP protects the innocent.",
        "Al, why does your programming book talk about robocop so much?",
    )
)
print(mo1.group(), mo2.group(), mo3.group(), sep="\n")

RoboCop
ROBOCOP
robocop


In [17]:
# Substituting text with the sub() method: the first argument is the
# replacement, the second is the text to process.
namesRegex = re.compile(r'Agent \w+')
print(namesRegex.sub(
    'CENSORED',
    'Agent Alice gave the secret document to Agent Bob',
))

# \1 in the replacement inserts the text captured by group 1 (here the first
# letter of the name), so only the initial survives.
agentNamesRegex = re.compile(r'Agent (\w)\w*')
print(agentNamesRegex.sub(
    r'\1****',
    'Agent Alice told Agent Carol that Agent Eve knew '
    'that Agent Bob was a double agent.',
))

CENSORED gave the secret document to CENSORED
A**** told C**** that E**** knew that B**** was a double agent.


In [18]:
# ----------------- Managing Complex Regexes -----------------
# The same phone-number pattern written twice: first on one dense line, then
# spread over several lines with re.VERBOSE, which ignores whitespace and
# lets each part carry a # comment.
#
# The extension marker is written (ext\.?|x): the dot is escaped so it matches
# only a literal period. An unescaped `ext.` would accept ANY character after
# 'ext' (e.g. 'extA123' would be treated as an extension).

# without newline and comments
phoneRegex = re.compile(r'((\d{3}|\(\d{3}\))?(\s|-|\.)?\d{3}(\s|-|\.)\d{4}(\s*(ext\.?|x)\s*\d{2,5})?)')
# with newline and comments (this assignment replaces the one above)
phoneRegex = re.compile(r'''
    (\d{3}|\(\d{3}\))?            # area code
    (\s|-|\.)?                    # separator
    \d{3}                         # first 3 digits
    (\s|-|\.)                     # separator
    \d{4}                         # last 4 digits
    (\s*(ext\.?|x)\s*\d{2,5})?    # extension
''', re.VERBOSE)

In [19]:
# Several flags can be passed to re.compile() at once by joining them with
# the bitwise OR operator (|).
someRegex = re.compile(
    'foo',
    re.IGNORECASE    # match regardless of letter case
    | re.DOTALL      # let the dot match newlines as well
    | re.VERBOSE,    # allow whitespace and comments inside the pattern
)

In [45]:
# Exercise: write a regex that matches a number with a comma after every
# three digits.
# Pattern: one to three leading digits, then any number of ',ddd' groups,
# anchored so the whole input has to fit.
numRegex = re.compile(r'^\d{1,3}(,\d{3})*$')
# Read lines interactively until an empty line is entered, echoing each
# input that matches the pattern.
while True:
    num = input()
    if not num:
        break
    mo = numRegex.search(num)
    if mo is not None:
        print(mo.group())

In [47]:
# Exercise: write a regex that matches the full name of someone whose last
# name is Watanabe. The first name that comes before it can be assumed to be
# a single word that begins with a capital letter.
nameRegex = re.compile(
    r'[A-Z][A-Za-z]* Watanabe'  # capital letter, more letters, space, surname
)

In [57]:
# How would you write a regex that matches a sentence where the first word
# is either Alice, Bob, or Carol; the second word is either eats, pets, or throws;
# the third word is apples, cats, or baseballs; and the sentence ends with a
# period? This regex should be case-insensitive.
#
# The third alternative list spells out 'apples' in full, and the final period
# is escaped (\.) so that only a literal '.' ends the sentence; an unescaped
# dot would accept any character there.
sentenceRegex = re.compile(r'(Alice|Bob|Carol) (eats|pets|throws) (apples|cats|baseballs)\.', re.IGNORECASE)