In [1]:
import re

In [2]:
# Sample sentence to scan for doubled characters.
title = "And now for something completely different"
# One word character followed by one or more copies of itself (backreference
# to group 1). Written as a raw string so "\w" is not treated as an (invalid)
# string escape; the resulting pattern text is unchanged.
pattern = r"(\w)\1+"

In [3]:
print(re.search(pattern, title))

<re.Match object; span=(35, 37), match='ff'>


In [4]:
# Prefix every occurrence of "parrot" with "ex-" by re-inserting the captured
# group in the replacement text.
description = (
    "The Norwegian Blue is a wonderful parrot. "
    "This parrot is notable for its exquisite plumage."
)
pattern = "(parrot)"
rep = r"ex-\1"  # \1 stands for whatever group 1 matched
print(re.compile(pattern).sub(rep, description))

The Norwegian Blue is a wonderful ex-parrot. This ex-parrot is notable for its exquisite plumage.


In [2]:
import re
names = [
    'Xander Harris',
    'Jennifer Smith',
    'Timothy Jones',
    'Amy Alexandrescu',
    'Peter Price',
    'Weifung Xu.',
]
pattern = "[Xx]"
# Keep, in their original order, the names containing an upper- or lower-case
# "x" anywhere in the string.
b = list(filter(re.compile(pattern).search, names))
print(b)

['Xander Harris', 'Amy Alexandrescu', 'Weifung Xu.']


In [11]:
# Sample sentence (a pangram).
word = "The quick brown fox jumps over the lazy dog"
# NOTE(review): the surrounding slashes are ordinary characters of this string
# (Python's re module has no /.../ delimiter syntax), and this value is not
# used by any visible cell before `match` is rebound -- confirm intent.
match = "/fox/"

In [12]:
pattern = re.compile(r"\w+")
pattern.findall("hello word")

['hello', 'word']

# Regular Expressions
> An explanation of reg exp

|Element |Type |Description| Example problem | Expected result |
|--------|-----|----------|-----------------|-----------------|
|+  | Quantifier| - previous character repeated one or more times |
|*  | Quantifier| - previous character repeated zero or more times |
|?  | Quantifier| - previous character repeated zero or one time - Optional (0 or 1 repetitions) | /cars?/ | Matches the characters c,a,r and s - optionally matches the character s if present| 
|{n}  | Quantifier | - previous character is repeated exactly n times | 
|{n,} | Quantifier | - previous character is repeated at least n times | 
|{,n} | Quantifier | - previous character is repeated at most n times | 
|{n,m}| Quantifier | - previous character is repeated between n and m times (both inclusive) | /\d{1,3}[-\s]?\d{3}[-\s]?\d{3}/ | Matches the format 555-555-555, 12 555 555, or 555555555 |
|\|| Alternation | - matches either the pattern on its left or the pattern on its right | /yes\|no/| Matches "yes" or "no"|
|[ ]| Character set | - matches any single character from the set enclosed between "[" and "]" | /licen[cs]e/ | Matches "licence" and "license"|
|^ | Character set | - negation: when it is the first character inside "[ ]", matches any character that is not in the set|
|^ | Boundary Matcher | - Matches at the beginning of the line | /^Name/ | Finds "Name" at the beginning of a line |
|\$ | Boundary Matcher | - Matches at the end of the line | /^Name: [\sa-zA-Z]+$/ | Matches "Name:" at the beginning of the line followed by a whitespace & alphabets, repeated one or more times until the end of the line |
|\b| Boundary Matcher | - Matches a word boundary |
|\B| Boundary Matcher | - Matches the opposite of \b. Anything that is not a word boundary|
|\A| Boundary Matcher | - Matches the beginning of the input |
|\Z| Boundary Matcher | - Matches the end of the input |
|//| Delimiter | - encloses a pattern; the characters between the slashes are matched literally | /fox/| Matches "fox" exactly|
|\ | Escape | - escapes metacharacters by preceding them with a backslash| /\(cat\)/| Matches "(cat)"|
|- | Character set | - designates a range of characters when placed between two characters inside "[ ]" | [a-z] | Matches all characters from small letter 'a' to small letter 'z'|
|- | Character set | - matches a hyphen|
|0-9 | Character set | - Matches anything between 0 and 9 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)|
|a-z | Character set | - Matches anything between a and z (a, b, c, d, ..., z)|
|A-Z | Character set | - Matches anything between A and Z (A, B, C, D, ..., Z)|
|.  | Predefined character set | Matches any character except newline \n |
|\d | Predefined character set | Matches any decimal digit; equivalent to [0-9] |
|\D | Predefined character set | Matches any non-digit character; equivalent to [^0-9] |
|\s | Predefined character set | Matches any whitespace character; equivalent to [ \t\n\r\f\v] |
|\S | Predefined character set | Matches any non-whitespace character; equivalent to [^ \t\n\r\f\v]  |
|\w | Predefined character set | Matches any alphanumeric character; equivalent to [a-zA-Z0-9_] |
|\W | Predefined character set | Matches any non-alphanumeric character; equivalent to [^a-zA-Z0-9_] |
|[-\s]| Character set| - Matches a hyphen or whitespace character |

In [14]:
#Keep in mind that empty matches are a part of the result:
p = re.compile(r'a*')
p.findall("aba")

['a', '', 'a', '']

In [15]:
p = re.compile(r'a?')
p.findall("aba")

['a', '', 'a', '']

In [16]:
p = re.compile(r"(\w+) (\w+)")
p.findall("Hello world hola mundo")

[('Hello', 'world'), ('hola', 'mundo')]

In [17]:
p = re.compile(r"(\w+ \w+)")
p.findall("Hello world hola mundo")

['Hello world', 'hola mundo']

In [19]:
re.split(r"\n", "Beautiful⇢is better⇢than⇢ugly.\nExplicit⇢is⇢be tter⇢than⇢implicit.")

['Beautiful⇢is better⇢than⇢ugly.', 'Explicit⇢is⇢be tter⇢than⇢implicit.']

In [22]:
p = re.compile(r"\W")
p.split("Hello world")

['Hello', 'world']

In [23]:
p = re.compile(r"\W")
p.findall("Hello world")

[' ']

In [24]:
# Split on non-word characters, but stop after the first two splits so the
# remainder of the sentence stays in one piece.
p = re.compile(r"\W")
p.split("Beautiful is better than ugly", maxsplit=2)

['Beautiful', 'is', 'better than ugly']

In [25]:
p = re.compile(r"(-)")
p.split("hello-word")

['hello', '-', 'word']

In [26]:
pattern = re.compile(r"(\W)")
pattern.split("⇢hello⇢word")

['', '⇢', 'hello', '⇢', 'word']

In [47]:
pattern = re.compile(r"(\W)")
t = pattern.sub(r"*\g<1>*","⇢hello⇢word")
t

'*⇢*hello*⇢*word'

In [30]:
p = re.compile(r"[0-9]+")
p.sub("*", "order0⇢order1⇢order13")

'order*⇢order*⇢order*'

In [31]:
p = re.compile(r"[0-9]")
p.sub("*", "order0⇢order1⇢order13")

'order*⇢order*⇢order**'

In [34]:
# Matches are consumed left to right without overlapping, so five zeros give
# two "00" replacements and leave the final "0" untouched.
re.sub('00', '-', 'order00000')

'order--0'

In [35]:
def normalize_orders(m):
    """Return the replacement text for one match object.

    Gives "A" when the text captured by group 1 is a dash and "B" for
    anything else.
    """
    return "A" if m.group(1) == '-' else "B"

In [36]:
# Replace each dash or upper-case letter with the value chosen by
# normalize_orders. Inside a character set "|" is a literal pipe, not
# alternation, so it is left out of the set.
re.sub('([-A-Z])', normalize_orders, '-1234⇢A193⇢ B123')

'A1234⇢B193⇢ B123'

## Backreferences
In the replacement string, backreferences are replaced with the text captured by the corresponding groups. For example, let's say you want to transform Markdown to HTML; for the sake of keeping the example short, we will just bold the text:

In [38]:
text = "imagine⇢a⇢new⇢*world*,⇢a⇢magic⇢*world*"
# Lazily capture whatever sits between a pair of asterisks.
p = re.compile(r'\*(.*?)\*')
# Wrap the captured text in bold tags; an HTML end tag uses a forward slash.
p.sub(r"<b>\g<1></b>", text)

'imagine⇢a⇢new⇢<b>world</b>,⇢a⇢magic⇢<b>world</b>'

In [39]:
text = "imagine⇢a⇢new⇢*world*,⇢a⇢magic⇢*world*"
# Lazily capture whatever sits between a pair of asterisks.
p = re.compile(r'\*(.*?)\*')
# \g<1> (rather than \1) keeps the group reference separate from the literal
# "1" that follows it; an HTML end tag uses a forward slash.
p.sub(r"<b>\g<1>1</b>", text)

'imagine⇢a⇢new⇢<b>world1</b>,⇢a⇢magic⇢<b>world1</b>'

In [48]:
text = "imagine⇢a⇢new⇢*world*,⇢a⇢magic⇢*world*"
# Lazily capture whatever sits between a pair of asterisks.
p = re.compile(r'\*(.*?)\*')
# subn works like sub but returns (new_string, number_of_substitutions);
# an HTML end tag uses a forward slash.
p.subn(r"<b>\g<1>1</b>", text)

('imagine⇢a⇢new⇢<b>world1</b>,⇢a⇢magic⇢<b>world1</b>', 2)

## MatchObject
This object represents the matched pattern; you will get one every time you execute one of these operations:
* match
* search 
* finditer

In [7]:
p = re.compile(r"(\w+) (\w+)")
match = p.search("Hello world")

#### group([group1, ...])
The group operation gives you the subgroups of the match. If it's invoked with no arguments or zero, it will return the entire match; while if one or more group identifiers are passed, the corresponding groups' matches will be returned.

In [6]:
help(re)

Help on module re:

NAME
    re - Support for regular expressions (RE).

MODULE REFERENCE
    https://docs.python.org/3.7/library/re
    
    The following documentation is automatically generated from the Python
    source files.  It may be incomplete, incorrect or include features that
    are considered implementation detail and may vary between Python
    implementations.  When in doubt, consult the module reference at the
    location listed above.

DESCRIPTION
    This module provides regular expression matching operations similar to
    those found in Perl.  It supports both 8-bit and Unicode strings; both
    the pattern and the strings being processed can contain null bytes and
    characters outside the US ASCII range.
    
    Regular expressions can contain both special and ordinary characters.
    Most ordinary characters, like "A", "a", or "0", are the simplest
    regular expressions; they simply match themselves.  You can
    concatenate ordinary characters, so last mat

In [8]:
type(match)

re.Match

In [9]:
match.group()

'Hello world'

In [10]:
match.group(0)

'Hello world'

In [11]:
match.group(1)

'Hello'

In [12]:
match.group(2)

'world'

In [13]:
match.group(0,2)

('Hello world', 'world')

In [14]:
match.group(0,1,2)

('Hello world', 'Hello', 'world')

> Groups can be named. If the pattern has named groups, they can be accessed using the names or the index:

In [15]:
p = re.compile(r"(?P<first>\w+) (?P<second>\w+)")
match = p.search("Hello world")

In [16]:
match.group('first')

'Hello'

In [17]:
match.group('second')

'world'

In [18]:
match.groups()

('Hello', 'world')

In [19]:
# Note that if there aren't named groups, 
# then it returns an empty dictionary.
match.groupdict()

{'first': 'Hello', 'second': 'world'}

#### start([group])
Sometimes, it is useful to know the index where the pattern matched. As with all the operations related to groups, if the argument group is zero, then the operation works with the whole string matched:

In [20]:
match.start(1)

0

In [21]:
match.start(2)

6

In [22]:
match.start(10)

IndexError: no such group

In [23]:
match.end(1)

5

In [24]:
match.end(2)

11

In [25]:
# It's an operation that gives you a tuple with the values from start and end.
match.span(1)

(0, 5)

In [26]:
match.span(2)

(6, 11)

In [27]:
[item.upper() for item in ['python', 'is', 'fun'] if item != 'is']

['PYTHON', 'FUN']

matrix = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]

Output:

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [28]:
matrix = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
[n for m in matrix for n in m]

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [29]:
from collections import defaultdict
# Any key that is missing when looked up is created with the value 10.
y = defaultdict(lambda: 10)
y['start'] = 5
# 'end' was never assigned, so this lookup inserts it with the default: 5 + 10.
print(y['start'] + y['end'])

15


# Grouping
You could use parentheses whenever you want to group meaningful subpatterns inside the main pattern. \
Grouping is accomplished through two metacharacters, the parentheses ().
> The simplest example of the use of parentheses would be building a subexpression.

In [33]:
"""
For example, let's say we would like to write an expression to match if someone is from Spain.
"""
re.search("Espana|ol", "Espanol")

<re.Match object; span=(5, 7), match='ol'>

In [34]:
re.search("Espana|ol", "Espana")

<re.Match object; span=(0, 6), match='Espana'>

In [35]:
#The problem is that this also matches ol: 
re.search("Espana|ol", "ol")

<re.Match object; span=(0, 2), match='ol'>

In [36]:
#The solution here is to use parentheses:
re.search("Espan(a|ol)", "Espana")

<re.Match object; span=(0, 6), match='Espana'>

In [37]:
re.search("Espan(a|ol)", "ol")

## Capturing
Let's see another key feature of grouping, capturing. Groups also capture the matched pattern, so you can use them later in several operations, such as sub or in the regex itself.

In [57]:
# Digits (captured as group 1), a dash, then one or more word characters.
p = re.compile(r"(\d+)-\w+")
# NOTE(review): the r prefix on the subject string keeps each "\n" as two
# literal characters (backslash + n) rather than a newline, which shifts the
# match offsets -- confirm this is intended.
it = p.finditer(r"1-a\n20-baer\n34-afcr")

In [59]:
# Report where each match starts, followed by the digits captured by group 1.
for match in it:
    print(f"{match.start()}: {match.group(1)}")

0: 1
5: 20
14: 34


### Backreferences


In [60]:
p = re.compile(r"(\w+) \1")
m = p.search(r"hello hello world")

In [61]:
m.groups()

('hello',)


Now, let's try to change the order of the ID, so we have
the ID in the DB, a dash, and the country code

In [67]:
pattern = re.compile(r"(\d+)-(\w+)")

In [68]:
# Replace what you've matched with the second group, a dash, and the first group
pattern.sub(r"\2-\1", "1-a\n20-baer\n34-afcr")

'a-1\nbaer-20\nafcr-34'

In [69]:
p = re.compile(r"(\w+) (\w+)")
m = p.search("Hello world")

In [70]:
m.group(1)

'Hello'

In [71]:
m.group(2)

'world'

### Named groups
Using numbers to refer to groups can be tedious and confusing, and the worst thing is that it doesn't allow you to give meaning or context to the group. That's why we have named groups.
> In order to use them, we have to use the syntax `(?P<name>pattern)`.

In [73]:
# Two whitespace-separated words, each captured under its own group name.
p = re.compile(r"(?P<first>\w+) (?P<second>\w+)")
# Call search on the compiled pattern itself, as the other cells do, instead of
# passing the compiled object back through the module-level re.search.
m = p.search("Hello world")

In [74]:
m.group("first")

'Hello'

In [75]:
m.group("second")

'world'

**In order to reference a group by name in the sub operation, we have to use `\g<name>`.**

In [77]:
p = re.compile(r"(?P<country>\d+)-(?P<id>\w+)")
p.sub(r"\g<id>-\g<country>", "1-a\n20-baer\n34-afcr")

'a-1\nbaer-20\nafcr-34'


We can also use named groups inside the pattern itself, as seen in the following example:

In [78]:
p = re.compile(r"(?P<word>\w+) (?P=word)")
m = p.search(r"hello hello world")
m.groups()

('hello',)

In [79]:
m.groupdict()

{'word': 'hello'}

### yes-pattern|no-pattern
It tries to match one pattern if a previous group was matched, and a different pattern if that group was not matched. In short, it's like an if-else statement. The syntax for this operation is as follows:
    `(?(id/name)yes-pattern|no-pattern)`
    
This expression means: if the group with this ID has already been matched, then at this point of the string, the yes-pattern pattern has to match. If the group hasn't been matched, then the no-pattern pattern has to match.

In [80]:
# Optional "NN-" prefix (group 1), a 3-4 character word (group 2), a dash, and
# then: two digits (group 3) if group 1 matched, otherwise 3-4 lowercase
# letters. The trailing $ anchors the match at the end of the string.
pattern = re.compile(r"(\d\d-)?(\w{3,4})-(?(1)(\d\d)|[a-z]{3,4})$") 
pattern.match("34-erte-22")

<re.Match object; span=(0, 10), match='34-erte-22'>

In [81]:
pattern.match("34-erte")

In [82]:
pattern.match("erte-abcd")

<re.Match object; span=(0, 9), match='erte-abcd'>