# Understanding Usage of Regular Expressions (RegEx)

In [109]:
import re # in-built regular expression module

#### Practical Use Case in IT

In [110]:
# extract process id from log entry:
log = "July 31 07:51:48 mycomputer bad_process[12345]: ERROR Performing package upgrade"

# one or more digits enclosed in square brackets; the brackets are escaped because
# they are special characters, and the parentheses capture only the digits
regex = r"\[(\d+)\]"

result = re.search(regex, log) # search for regex in log (None when nothing matches)

# print the first captured group; guard against None so that a log line
# without a "[digits]" part prints None instead of raising TypeError
print(result[1] if result else None)

12345


#### Basic RegEx

In [111]:
# locate the literal sub-string "aza" anywhere inside each word:
for word in ("plaza", "bazaar"):
    print(re.search(r"aza", word))

<re.Match object; span=(2, 5), match='aza'>
<re.Match object; span=(1, 4), match='aza'>


In [112]:
# "^" anchors the pattern to the beginning of the string, so "x" must be the first character:
starts_with_x = r"^x"

print(re.search(starts_with_x, "xenon"))
print(re.search(starts_with_x, "hi xenon"), '\n') # "x" is not at position 0 here

<re.Match object; span=(0, 1), match='x'>
None 



In [113]:
# "$" anchors the pattern to the end of the string, so "n" must be the last character:
for text in ("xenon", "xenon byeeee"):
    print(re.search(r"n$", text))

<re.Match object; span=(4, 5), match='n'>
None


In [114]:
# "." stands for exactly one character: look for "p", any one character, then "ng":
for word in ("penguin", "clapping"):
    print(re.search(r"p.ng", word))

<re.Match object; span=(0, 4), match='peng'>
<re.Match object; span=(4, 8), match='ping'>


In [115]:
# the character class "[Pp]" accepts an uppercase or a lowercase "p" in front of "ython":
for candidate in ("Python", "python", "ython"):
    print(re.search(r"[Pp]ython", candidate))

<re.Match object; span=(0, 6), match='Python'>
<re.Match object; span=(0, 6), match='python'>
None


In [116]:
# "[a-z]" is a range: exactly one lowercase letter must come directly before "way":
for sentence in ("The end of the highway", "What a way to go"):
    print(re.search(r"[a-z]way", sentence))

<re.Match object; span=(18, 22), match='hway'>
None


In [117]:
# several ranges can share one character class:
# "cloud" followed by one letter (upper or lower case) or one digit
cloud_pattern = "cloud[a-zA-Z0-9]"

for word in ("cloudy", "cloud9"):
    print(re.search(cloud_pattern, word))

<re.Match object; span=(0, 6), match='cloudy'>
<re.Match object; span=(0, 6), match='cloud9'>


#### Circumflex/Caret Operator (^) as a Negation

In [118]:
# a leading "^" inside square brackets inverts the class: match any character NOT listed
sentence = "This is a sentence with spaces."

for negated_class in (r"[^a-zA-Z]", r"[^a-zA-Z ]"): # the second class also excludes spaces
    print(re.search(negated_class, sentence))

# note: "^" acts as a negation only as the first character inside square brackets

<re.Match object; span=(4, 5), match=' '>
<re.Match object; span=(30, 31), match='.'>


#### Using Pipe (|) as OR Operator

In [119]:
# "|" separates alternatives: match either "cat" or "dog"
animal = r"cat|dog"

for sentence in ("I like cats.", "I love dogs!", "I like both dogs and cats."):
    print(re.search(animal, sentence))

# note: re.search returns only the first match found in the string,
# whichever alternative it is (the order inside the pattern does not matter here)

<re.Match object; span=(7, 10), match='cat'>
<re.Match object; span=(7, 10), match='dog'>
<re.Match object; span=(12, 15), match='dog'>


In [120]:
# collect every "cat" / "dog" occurrence instead of just the first one:
for sentence in (
    "I like both dogs and cats.",
    "I like both cats and dog but I prefer cats.",
):
    print(re.findall(r"cat|dog", sentence))

# note: re.findall returns a list with all matches (repeats included) in the order they appear

['dog', 'cat']
['cat', 'dog', 'cat']


#### Repetition Qualifiers (*, +, ?)

In [121]:
# "Py", then any run of characters, then "n":
for text in ("Pygmalion", "Python Programming"):
    print(re.search(r"Py.*n", text))

# note: ".*" matches any character (except newline) zero or more times;
# it takes as much as it can, so the match extends to the last "n" in the string

<re.Match object; span=(0, 9), match='Pygmalion'>
<re.Match object; span=(0, 17), match='Python Programmin'>


In [122]:
# "Py", then any run of lowercase letters, then "n":
for text in ("Python Programming", "Pyn"):
    print(re.search(r"Py[a-z]*n", text))

# note: "[a-z]*" matches any lowercase letter zero or more times,
# so the space in "Python Programming" ends the match after "Python"

<re.Match object; span=(0, 6), match='Python'>
<re.Match object; span=(0, 3), match='Pyn'>


In [123]:
# at least one "o" immediately followed by at least one "l":
for word in ("goldfish", "woolly", "boil"):
    print(re.search(r"o+l+", word))

# note: "+" matches one or more occurrences of the preceding character;
# "boil" fails because the "i" sits between the "o" and the "l"

<re.Match object; span=(1, 3), match='ol'>
<re.Match object; span=(1, 5), match='ooll'>
None


In [124]:
# "each" with an optional leading "p", i.e. "each" or "peach":
for sentence in ("To each their own", "I like peaches"):
    print(re.search(r"p?each", sentence))

# note: "?" matches zero or exactly one occurrence of the preceding character

<re.Match object; span=(3, 7), match='each'>
<re.Match object; span=(7, 12), match='peach'>


#### Escaping characters

In [125]:
# an unescaped "." matches any character, while "\." matches only a literal dot:
for dot_pattern, text in (
    (r".com", "welcome"),
    (r"\.com", "welcome"),
    (r"\.com", "mydomain.com"),
):
    print(re.search(dot_pattern, text))

# note: "\" is used to escape special characters

<re.Match object; span=(2, 6), match='lcom'>
None
<re.Match object; span=(8, 12), match='.com'>


In [126]:
# take the leading run of word characters from each string:
for text in ("This is an example", "And_this_is_another"):
    print(re.search(r"\w*", text))

# note: "\w" matches any alphanumeric character (including underscore),
# so the first match stops at the space while the second covers the whole string
# some other important escape sequences: \d (digits), \s (whitespace), \b (word boundary)

<re.Match object; span=(0, 4), match='This'>
<re.Match object; span=(0, 19), match='And_this_is_another'>


#### Understand When to Avoid 'Greedy Match'

In [127]:
# greedy match: from the first "A" up to the last "a" that can be reached
for country in ("Argentina", "Azerbaijan"):
    print(re.search(r"A.*a", country))

<re.Match object; span=(0, 9), match='Argentina'>
<re.Match object; span=(0, 9), match='Azerbaija'>


In [128]:
# match the complete string only if it starts with "A" and ends with "a" (anchored match):
# note: ".*" is still greedy here; it is the "^" and "$" anchors that force the pattern
# to cover the whole string, so a string such as "Azerbaijan" is rejected.
# a non-greedy quantifier would be written ".*?" and matches as few characters as possible
anchored_pattern = r"^A.*a$"

print(re.search(anchored_pattern, "Australia"))

<re.Match object; span=(0, 9), match='Australia'>


#### Some Other Practical Use Cases

In [129]:
# check whether a whole string is a valid variable name:
# the first character is a letter or an underscore, every following one is a word character
pattern = r"^[a-zA-Z_]\w*$" # \w -> [a-zA-Z0-9_]

candidates = (
    "_this_is_a_valid_variable_name",
    "this isn't a valid variable",
    "my_variable1",
    "2my_variable1",
)
for candidate in candidates:
    print(re.search(pattern, candidate))

<re.Match object; span=(0, 30), match='_this_is_a_valid_variable_name'>
None
<re.Match object; span=(0, 12), match='my_variable1'>
None


In [130]:
# validate a US phone number (format: 3 digits - 3 digits - 4 digits):
# "^" and "$" anchor the pattern so that the whole string has to be a phone number;
# without them re.search would also accept a longer string that merely contains one
# (e.g. "9123-456-78901")
phone_pattern = r"^\d{3}-\d{3}-\d{4}$"

print(re.search(phone_pattern, "123-456-7890"))
print(re.search(phone_pattern, "0333-4455666"))

# note: "{n}" matches exactly 'n' occurrences of the preceding character

<re.Match object; span=(0, 12), match='123-456-7890'>
None


In [131]:
# validate a floating point number (aka float):
# optional "-" sign, then either
#   digits with an optional fractional part  -> \d+(\.\d+)?
#   or a fractional part on its own          -> \.\d+
# at least one digit is always required, so the empty string and a lone "-" are rejected
pattern = r"^-?(\d+(\.\d+)?|\.\d+)$"

print(re.search(pattern, "-123.456"))
print(re.search(pattern, "123.456"))
print(re.search(pattern, "-123"))
print(re.search(pattern, "123"))
print(re.search(pattern, "-.456"))
print(re.search(pattern, "123.")) # invalid

# note: 'float' is any positive or negative number, with or without decimal places

<re.Match object; span=(0, 8), match='-123.456'>
<re.Match object; span=(0, 7), match='123.456'>
<re.Match object; span=(0, 4), match='-123'>
<re.Match object; span=(0, 3), match='123'>
<re.Match object; span=(0, 5), match='-.456'>
None
