## Understanding the Usage of Regular Expressions (RegEx)

In [237]:
import re # in-built regular expression module

### Basic RegEx

In [238]:
# find substring 'aza':
print(re.search(r"aza", "plaza"))
print(re.search(r"aza", "bazaar"))

<re.Match object; span=(2, 5), match='aza'>
<re.Match object; span=(1, 4), match='aza'>


In [239]:
# find string that starts with 'x':
print(re.search(r"^x", "xenon"))
print(re.search(r"^x", "hi xenon"))

<re.Match object; span=(0, 1), match='x'>
None


In [240]:
# find string that ends with 'n':
print(re.search(r"n$", "xenon"))
print(re.search(r"n$", "xenon byeeee"))

<re.Match object; span=(4, 5), match='n'>
None


In [241]:
# find sub-string with 'p' followed by 'any character' followed by 'ng':
print(re.search(r"p.ng", "penguin"))
print(re.search(r"p.ng", "clapping"))

<re.Match object; span=(0, 4), match='peng'>
<re.Match object; span=(4, 8), match='ping'>


### Using Square Brackets '[]' as Character Class

In [242]:
# search for sub-string 'Python' or 'python':
print(re.search(r"[Pp]ython", "Python"))
print(re.search(r"[Pp]ython", "python"))
print(re.search(r"[Pp]ython", "ython"))

<re.Match object; span=(0, 6), match='Python'>
<re.Match object; span=(0, 6), match='python'>
None


In [243]:
# search for a sub-string that starts with a lowercase letter, followed by 'way':
print(re.search(r"[a-z]way", "The end of the highway"))
print(re.search(r"[a-z]way", "What a way to go"))

<re.Match object; span=(18, 22), match='hway'>
None


In [244]:
# search for 'cloud' sub-string followed by a (case-insensitive) letter or number:
print(re.search("cloud[a-zA-Z0-9]", "cloudy"))
print(re.search("cloud[a-zA-Z0-9]", "cloud9"))

<re.Match object; span=(0, 6), match='cloudy'>
<re.Match object; span=(0, 6), match='cloud9'>


### Using Caret '^' as NOT Operator (Negation)

In [245]:
# find non-alphabetic characters in a string:
print(re.search(r"[^a-zA-Z]", "This is a sentence with spaces."))
print(re.search(r"[^a-zA-Z ]", "This is a sentence with spaces.")) # also exclude spaces

# note: '^' means negation only as the first character inside a character class '[]'; outside of it, '^' anchors the match to the start of the string

<re.Match object; span=(4, 5), match=' '>
<re.Match object; span=(30, 31), match='.'>


### Using Pipe '|' as OR Operator

In [246]:
# find substring 'cat' or 'dog':
print(re.search(r"cat|dog", "I like cats."))
print(re.search(r"cat|dog", "I love dogs!"))
print(re.search(r"cat|dog", "I like both dogs and cats."))

# note: returns the leftmost match in the string (regardless of the order of the alternatives in the pattern)

<re.Match object; span=(7, 10), match='cat'>
<re.Match object; span=(7, 10), match='dog'>
<re.Match object; span=(12, 15), match='dog'>


In [247]:
print(re.findall(r"cat|dog", "I like both dogs and cats."))
print(re.findall(r"cat|dog", "I like both cats and dog but I prefer cats."))

# note: returns all (repeated) matches in the order they appear

['dog', 'cat']
['cat', 'dog', 'cat']


### Repetition Qualifiers (*, +, ?)

In [248]:
# find sub-string with 'Py' followed by any number of characters followed by 'n':
print(re.search(r"Py.*n", "Pygmalion"))
print(re.search(r"Py.*n", "Python Programming"))

# note: '.*' matches any character (except newline) zero or more times

<re.Match object; span=(0, 9), match='Pygmalion'>
<re.Match object; span=(0, 17), match='Python Programmin'>


In [249]:
# find sub-string with 'Py' followed by any number of lowercase letters followed by 'n':
print(re.search(r"Py[a-z]*n", "Python Programming"))
print(re.search(r"Py[a-z]*n", "Pyn"))

# note: '[a-z]*' matches any lowercase letter zero or more times

<re.Match object; span=(0, 6), match='Python'>
<re.Match object; span=(0, 3), match='Pyn'>


In [250]:
# find sub-string with 'o' one or more times followed by 'l' one or more times:
print(re.search(r"o+l+", "goldfish"))
print(re.search(r"o+l+", "woolly"))
print(re.search(r"o+l+", "boil"))

# note: '+' matches one or more occurrences of the preceding character

<re.Match object; span=(1, 3), match='ol'>
<re.Match object; span=(1, 5), match='ooll'>
None


In [251]:
# find sub-string 'each' or 'peach':
print(re.search(r"p?each", "To each their own"))
print(re.search(r"p?each", "I like peaches"))

# note: '?' matches zero or exactly one occurrence of the preceding character

<re.Match object; span=(3, 7), match='each'>
<re.Match object; span=(7, 12), match='peach'>


### Escaping characters

In [252]:
# an unescaped '.' matches any character, so '.com' also matches 'lcom' inside 'welcome':
print(re.search(r".com", "welcome"))
# escaped as '\.', it matches only a literal dot:
print(re.search(r"\.com", "welcome"))
print(re.search(r"\.com", "mydomain.com"))

# note: '\' is used to escape special characters

<re.Match object; span=(2, 6), match='lcom'>
None
<re.Match object; span=(8, 12), match='.com'>


In [253]:
# match the run of 'word' characters starting at position 0 (stops at the first space; '_' counts as a word character):
print(re.search(r"\w*", "This is an example")) # \w -> [a-zA-Z0-9_] for ASCII text (str patterns also accept Unicode letters/digits)
print(re.search(r"\w*", "And_this_is_another"))

# some other escape sequences: \d (digits), \s (whitespace), \b (word boundary)

<re.Match object; span=(0, 4), match='This'>
<re.Match object; span=(0, 19), match='And_this_is_another'>


### Understand When to Avoid 'Greedy Match'

In [254]:
# find sub-string starting with 'A' and ending with 'a' (greedy match):
print(re.search(r"A.*a", "Argentina"))
print(re.search(r"A.*a", "Azerbaijan"))

<re.Match object; span=(0, 9), match='Argentina'>
<re.Match object; span=(0, 9), match='Azerbaija'>


In [255]:
# find complete string starting with 'A' and ending with 'a' (anchored with '^' and '$'; '.*' is still greedy here - the non-greedy form is '.*?'):
print(re.search(r"^A.*a$", "Australia"))

<re.Match object; span=(0, 9), match='Australia'>


### Capturing Groups

In [256]:
# extract first and last names (as captured groups):
result = re.search(r"^(\w*), (\w*)$", "Lovelace, Ada")

print(result) # returns the match object
print(result.groups(), '\n') # returns a tuple of all captured groups

print(result[0]) # returns the entire match string
print(result[1]) # returns the first captured group
print(result[2]) # returns the second captured group (and so on, if more groups are present)

<re.Match object; span=(0, 13), match='Lovelace, Ada'>
('Lovelace', 'Ada') 

Lovelace, Ada
Lovelace
Ada


In [257]:
# rearrange names by reversing the order of extracted names (use-case):
def rearrange_name(name, regex):
    """Swap the two name parts captured by `regex` ("Last, First" -> "First Last").

    `regex` is expected to define two capturing groups. When it does not
    match `name`, the name is returned unchanged.
    """
    match = re.search(regex, name)
    if match is not None:
        # group 1 is the part before the comma, group 2 the part after it
        last, first = match.group(1), match.group(2)
        return f"{first} {last}"
    return name # no match: leave the input as it is

def test_cases(regex):
    """Print the rearranged form of each sample name using the given regex."""
    samples = ("Lovelace, Ada", "Ritchie, Dennis", "Hopper, Grace M.")
    for sample in samples:
        print(rearrange_name(sample, regex))

# simple grouping:
test_cases(r"^(\w*), (\w*)$")
print()

# more flexible grouping:
test_cases(r"^([\w \.-]*), ([\w \.-]*)$")

# note: using more flexible grouping, we were able to work with 3rd test-case as well

Ada Lovelace
Dennis Ritchie
Hopper, Grace M.

Ada Lovelace
Dennis Ritchie
Grace M. Hopper


### More on Repetition Qualifiers

In [258]:
print(re.search(r"[a-zA-Z]{5}", "a ghost"))
print(re.search(r"[a-zA-Z]{5}", "a scary ghost appeared"))

# note: '{n}' matches exactly 'n' occurrences of the preceding character

<re.Match object; span=(2, 7), match='ghost'>
<re.Match object; span=(2, 7), match='scary'>


In [259]:
# without word boundaries, any 5 consecutive letters match (even a slice of a longer word):
print(re.findall(r"[a-zA-Z]{5}", "a scary ghost appeared"))
# with '\b' on both sides, only whole words of exactly 5 letters match:
print(re.findall(r"\b[a-zA-Z]{5}\b", "A scary ghost appeared"))

['scary', 'ghost', 'appea']
['scary', 'ghost']


In [260]:
print(re.findall(r"\w{5,10}", "I really like strawberries"))
print(re.findall(r"\w{5,}", "I really like strawberries"))

['really', 'strawberri']
['really', 'strawberries']


In [261]:
print(re.findall(r"\w{,5}", "I really like strawberries")) # this also includes length '0' words
print(re.findall(r"\w{1,5}", "I really like strawberries")) # ignore length '0' words
print(re.findall(r"\w{1,20}", "I really like strawberries"))

# observation:
# if you keep the upper limit high enough, then it will not slice any token (substring)

['I', '', 'reall', 'y', '', 'like', '', 'straw', 'berri', 'es', '']
['I', 'reall', 'y', 'like', 'straw', 'berri', 'es']
['I', 'really', 'like', 'strawberries']


In [262]:
print(re.search(r"\w{,20}", "I really like strawberries"))
print(re.search(r"s\w{,20}", "I really like strawberries")) # force 'match' to start with 's'

<re.Match object; span=(0, 1), match='I'>
<re.Match object; span=(14, 26), match='strawberries'>


### Some Important Use Cases

In [263]:
# validate a variable name (a letter or underscore first, then only word characters up to the end):
pattern = r"^[a-zA-Z_]\w*$" # \w -> [a-zA-Z0-9_] for ASCII text (str patterns also accept Unicode letters/digits)

print(re.search(pattern, "_this_is_a_valid_variable_name"))
print(re.search(pattern, "this isn't a valid variable")) # spaces and an apostrophe are not word characters
print(re.search(pattern, "my_variable1"))
print(re.search(pattern, "2my_variable1")) # a digit is not allowed as the first character

<re.Match object; span=(0, 30), match='_this_is_a_valid_variable_name'>
None
<re.Match object; span=(0, 12), match='my_variable1'>
None


In [264]:
# validate a US phone number:
# the pattern is anchored with '^' and '$' so that the whole string must be the number;
# without anchors, a longer string that merely contains a phone-shaped part would pass
phone_pattern = r"^\d{3}-\d{3}-\d{4}$" # \d{3} -> three digits, e.g. [0-9][0-9][0-9]

print(re.search(phone_pattern, "123-456-7890"))
print(re.search(phone_pattern, "0333-4455666")) # wrong grouping of digits -> no match

<re.Match object; span=(0, 12), match='123-456-7890'>
None


In [265]:
# validate a floating point number (aka float):
# digits are required either before the optional fraction ('123', '123.456')
# or directly after the dot ('.456'), so the empty string and a lone '-' are rejected
pattern = r"^-?(\d+(\.\d+)?|\.\d+)$"

print(re.search(pattern, "-123.456"))
print(re.search(pattern, "123.456"))
print(re.search(pattern, "-123"))
print(re.search(pattern, "123"))
print(re.search(pattern, "-.456"))
print(re.search(pattern, "123.")) # invalid

# note: 'float' is any positive or negative number, with or without decimal places

<re.Match object; span=(0, 8), match='-123.456'>
<re.Match object; span=(0, 7), match='123.456'>
<re.Match object; span=(0, 4), match='-123'>
<re.Match object; span=(0, 3), match='123'>
<re.Match object; span=(0, 5), match='-.456'>
None


### Extract Process ID from Log Entry (Practical Use Case)

In [266]:
# sample log entries: each assignment overwrites the previous one,
# so only the last uncommented entry is actually searched
log = "July 31 07:51:48 mycomputer bad_process[12345]: ERROR Performing package upgrade"
log = "A completely different string that also has numbers [34567]"
# log = "99 elephants in a [cage]"

regex = r"\[(\d+)\]" # find one or more digits enclosed in square brackets (the digits are captured as group 1)
result = re.search(regex, log) # search for regex in log
print(result[1]) # print the first group of regex match

# note: here, 3rd log entry will raise an error (if we uncomment it)
# reason: '[cage]' contains no digits, so re.search returns None and 'result[1]' fails (no safe-guard against 'None')

34567


In [267]:
logs = [
    "July 31 07:51:48 mycomputer bad_process[12345]: ERROR Performing package upgrade",
    "A completely different string that also has numbers [34567]",
    "99 elephants in a [cage]"
]

# correct implementation:
def extract_pid(log):
    """Return the digits of the first bracketed number in `log`, e.g. '[12345]' -> '12345'.

    Returns the string "not found!" when the log line has no such number.
    """
    match = re.search(r"\[(\d+)\]", log)
    if match is None:
        return "not found!" # safe-guard: indexing None would raise an error
    return match[1]

# testing all log entries:
for log in logs:
    print(extract_pid(log))

12345
34567
not found!


### Splitting and Replacing

In [268]:
print(re.split(r"[.?!]", "One sentence. Another one? And the last one!"))

print(re.split(r"([.?!])", "One sentence. Another one? And the last one!")) # include delimiters too

# note: inside character class '[]', you don't necessarily need to escape 'special characters'
# some exceptions: '^', '-', ']', '\', '['

['One sentence', ' Another one', ' And the last one', '']
['One sentence', '.', ' Another one', '?', ' And the last one', '!', '']


In [276]:
# re.sub(pattern, replacement, string) returns the string with every match of the pattern replaced:
print(re.sub(r"[\w.%+-]+@[\w.-]+", "[REDACTED]", "Received an email for go_nuts95@my.example.com")) # censor email

print(re.sub(r"^([\w .-]*), ([\w .-]*)$", r"\2 \1", "Lovelace, Ada")) # rearrange names

print(re.sub(r"([A-Z])\.\s+(\w+)", r"Ms. \2", "A. Weber and B. Bellmas have joined the team.")) # replace initials (group 1, the initial, is captured but not reused)

# note: '\1, \2, ...' are used to refer to 'captured groups' in the replacement string (aka back-references)

Received an email for [REDACTED]
Ada Lovelace
Ms. Weber and Ms. Bellmas have joined the team.


### Look-around Assertions

In [291]:
# look-ahead assertion:
regex = r"(Test\d)-(?=Passed)"
tests = "Test1-Passed, Test2-Passed, Test3-Failed, Test4-Passed, Test5-Failed"

print(re.findall(regex, tests))

# note: look-ahead assertion is used to match a pattern only if it is followed by another pattern

['Test1', 'Test2', 'Test4']


In [299]:
# look-behind assertion:
regex = r"(?<=\$)\d+\.\d+"
prices = "The iPhone X costs $999.99 and the iPhone 8 costs $699.99"

print(re.findall(regex, prices))

# note: look-behind assertion is used to match a pattern only if it is preceded by another pattern

['999.99', '699.99']


In [311]:
# using both look-ahead and look-behind assertions:
regex = r"(?<=\$)\d+\.\d+(?= per share)"
stocks = "AMZN is trading at €987.65, AAPL is trading at $123.45 per share, GOOG is trading at $678.90 per share"

print(re.findall(regex, stocks))

# note: works only if both assertions are satisfied (i.e. both conditions are met)

['123.45', '678.90']
