# Capturing Groups

In [1]:
import re

In [2]:
# Group 1 captures the word before ", " and group 2 captures the word after it.
result = re.search(r"^(\w*), (\w*)$", "Lovelace, Ada")

In [3]:
print(result)

<re.Match object; span=(0, 13), match='Lovelace, Ada'>


In [4]:
print(result.groups())

('Lovelace', 'Ada')


In [5]:
print(result[0])

Lovelace, Ada


In [6]:
print(result[1])

Lovelace


In [7]:
print(result[2])

Ada


In [8]:
# The pattern defines only two groups, so asking for group 3 raises IndexError.
print(result[3])

IndexError: no such group

In [9]:
'{} {}'.format(result[2], result[1])

'Ada Lovelace'

In [10]:
def rearrange_name(name):
    """Turn a "Last, First" style name into "First Last".

    The input must consist of two parts separated by a comma and a space,
    each part starting with a word character. Any input that does not have
    this shape is returned unchanged.
    """
    match = re.search(r"^(\w.*), (\w.*)$", name)
    if match:
        # Group 1 is the part before the separator, group 2 the part after it.
        last_name, first_name = match.group(1, 2)
        return f"{first_name} {last_name}"
    return name

In [11]:
rearrange_name("Lovelace, Ada")

'Ada Lovelace'

In [12]:
rearrange_name("Richie, Denis")

'Denis Richie'

In [13]:
rearrange_name("Hopper, Grace M.")

'Grace M. Hopper'

# More on Repetition Qualifiers

In [14]:
# To match exactly five consecutive letters, use the {5} repetition qualifier

print(re.search(r"[a-zA-Z]{5}", "a ghost"))

<re.Match object; span=(2, 7), match='ghost'>


In [15]:
print(re.search(r"[a-zA-Z]{3}", "an apple is in the garden"))

<re.Match object; span=(3, 6), match='app'>


In [16]:
print(re.search(r"[a-zA-Z]{5}", "a scary ghost appeared"))

<re.Match object; span=(2, 7), match='scary'>


In [17]:
print(re.findall(r"[a-zA-Z]{5}", "a scary ghost appeared"))

['scary', 'ghost', 'appea']


#### To match only words of exactly five letters, we use the word boundary `\b` in our regular expression

In [18]:
print(re.findall(r"\b[a-zA-Z]{5}\b", "A scary ghost appeared"))

['scary', 'ghost']


In [19]:
print(re.findall(r"\w{5,10}", "I really like strawberries"))

['really', 'strawberri']


In [20]:
# Omitting the number after the comma means there is no upper limit on the number of repetitions
print(re.findall(r"\w{5,}", "I really like strawberries"))

['really', 'strawberries']


In [21]:
# Omitting the first number in the curly braces sets the lower limit to zero, so {,20} matches from 0 to 20 repetitions.
print(re.search(r"s\w{,20}", "I really like strawberries"))

<re.Match object; span=(14, 26), match='strawberries'>


# Extracting a PID Using Regexes

In [33]:
log = "July 31 07:51:48 my computer bad_process[12345]: ERROR performing package upgrade"
# \[ and \] match literal square brackets; (\d+) captures the digits between them.
regex = r"\[(\d+)\]"
result = re.search(regex, log)
# Index 1 is the first capturing group, i.e. the bracketed number.
print(result[1])

12345


In [34]:
result = re.search(regex, "A completely different string that also has numbers [34567]")
print(result[1])

34567


In [35]:
# "[cage]" contains no digits, so re.search returns None and indexing None raises TypeError.
result = re.search(regex, "99 elephants in a [cage]")
print(result[1])

TypeError: 'NoneType' object is not subscriptable

###### To handle lines that contain no match without raising an error, we define a function `extract_pid`

In [36]:
def extract_pid(log_line):
    """Return the first run of digits enclosed in square brackets.

    The digits are returned as a string. If log_line contains no
    "[<digits>]" section, an empty string is returned instead.
    """
    pid_match = re.search(r"\[(\d+)\]", log_line)
    # A failed search yields None, which is falsy; fall back to "".
    return pid_match.group(1) if pid_match else ""

In [38]:
print(extract_pid(log))

12345


In [39]:
print(extract_pid("99 elephants in a [cage]"))




In [31]:
# Alternatively, use \w (which matches letters, digits and underscores) so that bracketed non-numeric text also matches
regex = r"\[(\w*)\]"
result = re.search(regex, "99 elephants in a [cage]")
print(result[1])

cage


# The `split` function of the `re` module

In [41]:
re.split(r"[.?!]", "One Sentence. Another One? and the last One!")

['One Sentence', ' Another One', ' and the last One', '']

In [42]:
# With a capturing group in the pattern, the matched separators are also included in the result list
re.split(r"([.?!])", "One Sentence. Another One? and the last One!")

['One Sentence', '.', ' Another One', '?', ' and the last One', '!', '']

# The `sub` function of the `re` module replaces matched substrings

In [43]:
re.sub(r"[\w.%+-]+@[\w.-]+", "[REDACTED]", "Recieved an email for go_nuts95@my.example.com")

'Recieved an email for [REDACTED]'

In [44]:
# Another example: swap the two captured groups using the backreferences \2 and \1

re.sub(r"^([\w .-]*), ([\w .-]*)$", r"\2 \1", "Lovelace, Ada")

'Ada Lovelace'