# Regex

What is a regular expression?
- A meta language for describing regular text

When are regular expressions useful?
- When you need to parse regular text
- Helps in turning unstructured data into structured data

In [1]:
import pandas as pd
import re

In [2]:
log_file_lines = '''
76.185.131.226 - - [11/May/2020:14:25:53 +0000] "GET / HTTP/1.1" 200 42 "-" "python-requests/2.23.0"
76.185.131.226 - - [11/May/2020:16:25:46 +0000] "GET / HTTP/1.1" 200 42 "-" "python-requests/2.23.0"
76.185.131.226 - - [11/May/2020:16:25:58 +0000] "GET / HTTP/1.1" 200 42 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
76.185.131.226 - - [11/May/2020:16:25:58 +0000] "GET /favicon.ico HTTP/1.1" 200 162 "https://python.zach.lol/" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
104.5.217.57 - - [11/May/2020:16:26:27 +0000] "GET / HTTP/1.1" 200 42 "-" "python-requests/2.23.0"
76.185.131.226 - - [11/May/2020:16:26:46 +0000] "GET /documentation HTTP/1.1" 200 348 "-" "python-requests/2.23.0"
76.185.131.226 - - [11/May/2020:16:26:54 +0000] "GET /documentation HTTP/1.1" 200 348 "-" "python-requests/2.23.0"
104.5.217.57 - - [11/May/2020:16:27:04 +0000] "GET /documentation HTTP/1.1" 200 348 "-" "python-requests/2.23.0"
76.185.131.226 - - [11/May/2020:16:27:05 +0000] "GET /documentation HTTP/1.1" 200 348 "-" "python-requests/2.23.0"
76.185.131.226 - - [11/May/2020:16:27:10 +0000] "GET /documentation HTTP/1.1" 200 348 "-" "python-requests/2.23.0"
'''

In [3]:
import re # part of the python stdlib

- search: shows a single match for a regex
- findall: shows *all* the matches for a regex in a subject

### Literals

In [4]:
# r stands for raw string
regexp = r'a'
subject = 'abc'

re.search(regexp, subject)

<re.Match object; span=(0, 1), match='a'>

In [5]:
print('This string has a backslash: \n')

This string has a backslash: 



In [6]:
print(r'This string has a backslash: \n')

This string has a backslash: \n


<div style="background-color: rgba(0, 100, 200, .1); padding: 1em 3em; border-radius: 5px; border: 1px solid black">
    <div style="font-weight: bold; font-size: 1.2em; border-bottom: 1px dashed black; padding-bottom: .5em;">
        Mini Exercise
    </div>
    <ol>
        <li>Change your regular expression to match the literal character "b". What do you notice?</li>
        <li>Change your regular expression to match the literal string "ab". What do you notice?</li>
        <li>Change your regular expression to match the literal "d". What do you notice?</li>
        <li>Use <code>re.findall</code> instead of <code>re.search</code>. How do the results differ?</li>
        <li>Change your regular expression to just the "." character. What are the results?</li>
    </ol>
</div>

In [7]:
regexp = r'b'
subject = 'abc'

re.search(regexp, subject)

<re.Match object; span=(1, 2), match='b'>

In [8]:
regexp = r'ab'
subject = 'abc'

re.search(regexp, subject)

<re.Match object; span=(0, 2), match='ab'>

In [9]:
regexp = r'd'
subject = 'abc'

re.search(regexp, subject)

In [10]:
regexp = r'a'
subject = 'abc'

re.findall(regexp, subject)

['a']

In [11]:
regexp = r'b'
subject = 'abc'

re.findall(regexp, subject)

['b']

In [12]:
regexp = r'ab'
subject = 'abc'

re.findall(regexp, subject)

['ab']

In [13]:
regexp = r'd'
subject = 'abc'

re.findall(regexp, subject)

[]

In [14]:
regexp = r'.'
subject = 'abc'

re.search(regexp, subject)

<re.Match object; span=(0, 1), match='a'>

In [15]:
regexp = r'.'
subject = 'abc'

re.findall(regexp, subject)

['a', 'b', 'c']

In [16]:
regexp = r'ac'
subject = 'abc'

re.search(regexp, subject)

In [17]:
regexp = r'ac'
subject = 'abc'

re.findall(regexp, subject)

[]

In [18]:
regexp = r'..'
subject = 'abc'

re.findall(regexp, subject)

['ab']

In [19]:
regexp = r'...'
subject = 'abc'

re.findall(regexp, subject)

['abc']

In [20]:
regexp = r'....'
subject = 'abc'

re.findall(regexp, subject)

[]

In [21]:
regexp = r'..'
subject = 'abc'

re.search(regexp, subject)

<re.Match object; span=(0, 2), match='ab'>

In [22]:
regexp = r''
subject = 'abc'

re.findall(regexp, subject)

['', '', '', '']

---
### Metacharacters

- `.`: any character except newline
- `\w`: any alphanumeric (unicode) character or underscore
- `\s`: any whitespace character
- `\d`: any digit
- Capital variants (`\W`, `\S`, `\D`): match the opposite of the lowercase metacharacter

In [23]:
regexp = r'.\s\d'
subject = 'abc 123'

re.search(regexp, subject)

<re.Match object; span=(2, 5), match='c 1'>

In [24]:
regexp = r'.\s\d'
subject = 'abc 123'

re.findall(regexp, subject)

['c 1']

---

In [25]:
regexp = r'\d'
subject = 'abc 123'

re.findall(regexp, subject)

['1', '2', '3']

In [26]:
regexp = r'\d'
subject = 'abc 123'

re.search(regexp, subject)

<re.Match object; span=(4, 5), match='1'>

---

In [27]:
regexp = r'\w\w'
subject = 'abc 123'

re.findall(regexp, subject)

['ab', '12']

In [28]:
regexp = r'\w\w'
subject = 'abc 123'

re.search(regexp, subject)

<re.Match object; span=(0, 2), match='ab'>

---

In [29]:
regexp = r'\w\w'
subject = 'ab cf 123e d'

re.findall(regexp, subject)

['ab', 'cf', '12', '3e']

In [30]:
regexp = r'\w\w'
subject = 'ab cf 123e d'

re.search(regexp, subject)

<re.Match object; span=(0, 2), match='ab'>

In [31]:
subject[: 2]

'ab'

---

In [32]:
regexp = r'\s'
subject = 'ab cf 123e d'

re.findall(regexp, subject)

[' ', ' ', ' ']

In [33]:
regexp = r'\s'
subject = 'ab cf 123e d'

re.search(regexp, subject)

<re.Match object; span=(2, 3), match=' '>

In [34]:
subject[2:3]

' '

---

In [35]:
regexp = r'\s\w'
subject = 'ab cf 123e d'

re.findall(regexp, subject)

[' c', ' 1', ' d']

In [36]:
regexp = r'\s\w'
subject = 'ab cf 123e d'

re.search(regexp, subject)

<re.Match object; span=(2, 4), match=' c'>

---

In [37]:
regexp = r'\D'
subject = 'ab cf 123e d'

re.findall(regexp, subject)

['a', 'b', ' ', 'c', 'f', ' ', 'e', ' ', 'd']

In [38]:
regexp = r'\D'
subject = 'ab cf 123e d'

re.search(regexp, subject)

<re.Match object; span=(0, 1), match='a'>

In [39]:
subject[:1]

'a'

---

In [40]:
regexp = r'\d\W'
subject = 'abc 123.'

re.search(regexp, subject)

<re.Match object; span=(6, 8), match='3.'>

In [41]:
regexp = r'\d\W'
subject = 'abc 123.'

re.findall(regexp, subject)

['3.']

---

In [42]:
regexp = r'\w'
subject = 'abc 123'

re.search(regexp, subject)

<re.Match object; span=(0, 1), match='a'>

In [43]:
regexp = r'\w'
subject = 'abc 123'

re.findall(regexp, subject)

['a', 'b', 'c', '1', '2', '3']

---

In [44]:
regexp = r'\S'
subject = 'abc 123.'

re.findall(regexp, subject)

['a', 'b', 'c', '1', '2', '3', '.']

In [45]:
regexp = r'\w'
subject = 'abc 123.'

re.findall(regexp, subject)

['a', 'b', 'c', '1', '2', '3']

In [46]:
regexp = r'\S\S'
subject = 'abc 123.'

re.findall(regexp, subject)

['ab', '12', '3.']

In [47]:
regexp = r'\S\S'
subject = 'abc 123'

re.findall(regexp, subject)

['ab', '12']

<div style="background-color: rgba(0, 100, 200, .1); padding: 1em 3em; border-radius: 5px; border: 1px solid black">
    <div style="font-weight: bold; font-size: 1.2em; border-bottom: 1px dashed black; padding-bottom: .5em;">
        Mini Exercise
    </div>
    <p>Continue to use the same subject variable from above.</p>
    <ol>
        <li>Use all of the above metacharacters with <code>re.findall</code>. What do you notice?</li>
        <li>What does the regular expression <code>\w\w</code> match?</li>
        <li>Use only metacharacters to write a regular expression to match "c 1".</li>
        <li>Use a combination of metacharacters to match 3 digits in a row.</li>
    </ol>
</div>

In [48]:
# 1
regexp = r'.\w\s\d'
subject = 'abc 123'

re.findall(regexp, subject)

['bc 1']

In [49]:
# 1
regexp = r'.\w\s\d\S'
subject = 'abc 123'

re.findall(regexp, subject)

['bc 12']

In [50]:
# 2
regexp = r'\w\w'
subject = 'abc 123'

re.findall(regexp, subject)

['ab', '12']

In [51]:
# 3
regexp = r'\w\s\d'
subject = 'abc 123'

re.findall(regexp, subject)

['c 1']

In [52]:
# 3
regexp = r'\w\s.'
subject = 'abc 123'

re.findall(regexp, subject)

['c 1']

In [53]:
# 4
regexp = r'\d\d\d'
subject = 'abc 123'

re.findall(regexp, subject)

['123']

In [54]:
# 4
regexp = r'\w\w\d'
subject = 'abc 123'

re.findall(regexp, subject)

['123']

In [55]:
# 4
regexp = r'\d\w\S'
subject = 'abc 123'

re.findall(regexp, subject)

['123']

In [56]:
# 4
regexp = r'\d..'
subject = 'abc 123'

re.findall(regexp, subject)

['123']

### Repeating

- `{}`: repeat a given number of times -- `{x,y}` means at least `x` times, at most `y` times
- `*`: 0 or more
- `+`: 1 or more
- `?`: optional
- greedy + non-greedy: `?` after a repetition operator
    - greedy tries to match *as much as possible*
    - non-greedy tries to match *as little as possible*
    - whatever precedes the `?` can optionally be included
    - `?` after a repetition operator (`+`, `*`, `{}`) means non-greedy
    - `?` after anything else means optional

In [57]:
regexp = r'\w'
subject = 'abc 123'

re.search(regexp, subject)

<re.Match object; span=(0, 1), match='a'>

In [58]:
regexp = r'\W+'
subject = 'abc 123'

re.search(regexp, subject)

<re.Match object; span=(3, 4), match=' '>

In [59]:
regexp = r'\W+'
subject = 'abc 123'

re.findall(regexp, subject)

[' ']

In [60]:
regexp = r'\w+'
subject = 'abc 123'

re.findall(regexp, subject)

['abc', '123']

In [61]:
regexp = r'\d+'
subject = 'abc 123'

re.findall(regexp, subject)

['123']

In [62]:
regexp = r'\D+'
subject = 'abc 123'

re.findall(regexp, subject)

['abc ']

In [63]:
regexp = r'\s+'
subject = 'abc 123'

re.findall(regexp, subject)

[' ']

In [64]:
regexp = r'\S+'
subject = 'abc 123'

re.findall(regexp, subject)

['abc', '123']

In [65]:
regexp = r'\d*'
subject = 'abc 123'

re.findall(regexp, subject)

['', '', '', '', '123', '']

In [66]:
regexp = r'\D*'
subject = 'abc 123'

re.findall(regexp, subject)

['abc ', '', '', '', '']

In [67]:
regexp = r'\s*'
subject = 'abc 123'

re.findall(regexp, subject)

['', '', '', ' ', '', '', '', '']

In [68]:
regexp = r'\S*'
subject = 'abc 123'

re.findall(regexp, subject)

['abc', '', '123', '']

In [69]:
regexp = r'\d{1}'
subject = 'abc 123.'

re.findall(regexp, subject)

['1', '2', '3']

In [70]:
regexp = r'\d{2}'
subject = 'abc 123.'

re.findall(regexp, subject)

['12']

In [71]:
# one or more alphanumeric, 
# followed by whitespace, 
# followed by one or more numbers
# followed by a plus character
regexp = r'\w+\s\d+\+'
subject = 'abc 123+'

re.findall(regexp, subject)

['abc 123+']

In [72]:
regexp = r'\W{2}'
subject = 'abc + 123.'

re.search(regexp, subject)

<re.Match object; span=(3, 5), match=' +'>

In [73]:
regexp = r'\W{3}'
subject = 'abc + 123.'

re.findall(regexp, subject)

[' + ']

In [74]:
regexp = r'.{3}'
subject = 'abc + 123.'

re.findall(regexp, subject)

['abc', ' + ', '123']

In [75]:
regexp = r'.{3,6}'
subject = 'abc + 123'

re.findall(regexp, subject)

['abc + ', '123']

In [76]:
regexp = r'.{3,6}'
subject = 'abc + 123'

re.search(regexp, subject)

<re.Match object; span=(0, 6), match='abc + '>

In [77]:
# .+ is greedy: it consumes as much as it can and only backs off far enough
# for \d to match, so the match runs through the LAST digit in the subject
regexp = r'.+\d'
subject = 'abc . 123'

re.search(regexp, subject)

<re.Match object; span=(0, 9), match='abc . 123'>

In [78]:
regexp = r'.+\d'
subject = 'abc 123.'

re.findall(regexp, subject)

['abc 123']

In [79]:
regexp = r'.+?\d'
subject = 'abc 123'

re.search(regexp, subject)

<re.Match object; span=(0, 5), match='abc 1'>

In [80]:
regexp = r'.+?\d'
subject = 'abc 123'

re.findall(regexp, subject)

['abc 1', '23']

In [81]:
regexp = r'\w+\s?\d+'
subject = 'abc 123'

re.search(regexp, subject)

<re.Match object; span=(0, 7), match='abc 123'>

In [82]:
regexp = r'\w+\s?\d+'
subject = 'abc .123'

re.search(regexp, subject)

<re.Match object; span=(5, 8), match='123'>

In [83]:
regexp = r'\w+\s?\d+'
subject = 'abc.123'

re.search(regexp, subject)

<re.Match object; span=(4, 7), match='123'>

<div style="background-color: rgba(0, 100, 200, .1); padding: 1em 3em; border-radius: 5px; border: 1px solid black">
    <div style="font-weight: bold; font-size: 1.2em; border-bottom: 1px dashed black; padding-bottom: .5em;">
        Mini Exercise
    </div>
    <p>Use the string below as your subject for this exercise.</p>
    <pre><code>Codeup, founded in 2014, is located at 600 Navarro St. Suite 350, San Antonio, TX 78230. You can find us online at http://codeup.com and our alumni portal is located at https://alumni.codeup.com.</code></pre>
    <ol>
        <li>Write a regular expression that matches all the numbers.</li>
        <li>Write a regular expression that matches a 5 digit number, but not a number with fewer digits.</li>
        <li>Write a regular expression that matches any urls in the subject.</li>
    </ol>
</div>

- all greedy by default
- the `?` after a repetition operator makes the regex *non-greedy*

In [84]:
subject = 'Codeup, founded in 2014, is located at 600 Navarro St. Suite 350, San Antonio, TX 78230. You can find us online at http://codeup.com and our alumni portal is located at https://alumni.codeup.com.'
regexp = r'.{10}'
re.search(regexp, subject)

<re.Match object; span=(0, 10), match='Codeup, fo'>

In [85]:
regexp = r'https?://.+?\.com'
re.findall(regexp, subject)

['http://codeup.com', 'https://alumni.codeup.com']

In [86]:
regexp = r'https?://.+\.com'
re.findall(regexp, subject)

['http://codeup.com and our alumni portal is located at https://alumni.codeup.com']

In [87]:
regexp = r'https?://.+?\.com'
re.findall(regexp, subject)

['http://codeup.com', 'https://alumni.codeup.com']

In [88]:
subject = 'Codeup, founded in 2014, is located at 600 Navarro St. Suite 350, San Antonio, TX 78230. You can find us online at http://codeup.com and our alumni portal is located at https://alumni.codeup.com.'
regexp = r'\s+\d+\s+'
re.findall(regexp, subject)

[' 600 ']

In [89]:
regexp = r'https?://.+?\.com'
re.findall(regexp, subject)

['http://codeup.com', 'https://alumni.codeup.com']

In [90]:
regexp = r'https?://.+?\.com'
re.findall(regexp, subject)

['http://codeup.com', 'https://alumni.codeup.com']

### Any/None Of

- `[]`: any one of the characters listed inside the brackets
- `[^]`: any one character that is *not* listed after the caret

In [91]:
regexp = r'[a1][b2][c3]'
subject = 'abc 123'

re.match(regexp, subject)

<re.Match object; span=(0, 3), match='abc'>

In [92]:
subject = '123abc'

re.match(regexp, subject)

<re.Match object; span=(0, 3), match='123'>

<div style="background-color: rgba(0, 100, 200, .1); padding: 1em 3em; border-radius: 5px; border: 1px solid black">
    <div style="font-weight: bold; font-size: 1.2em; border-bottom: 1px dashed black; padding-bottom: .5em;">
        Mini Exercise
    </div>
    <p>For this exercise you should make up various subjects and test them with your regular expressions.</p>
    <ol>
        <li>Write a regular expression that matches even numbers.</li>
        <li>Write a regular expression that matches 2 or more odd numbers in a row.</li>
        <li>Write a regular expression that matches any word with a vowel in it.</li>
    </ol>
</div>

### Anchors

- `^`: starts with
- `$`: ends with
- `\b`: word boundary anchor

In [93]:
# start of a string followed by a b-character
regexp = r'^b'
subject = 'abc 123'

re.search(regexp, subject)

In [94]:
# ^ anchors the match to the start of the string, so this matches only
# when the subject begins with an "a" character
regexp = r'^a'
subject = 'abc 123'

re.search(regexp, subject)

<re.Match object; span=(0, 1), match='a'>

In [95]:
regexp = r'\d'
subject = 'abc 123'

re.search(regexp, subject)

<re.Match object; span=(4, 5), match='1'>

In [96]:
regexp = r'\d$'
subject = 'abc 123'

re.search(regexp, subject)

<re.Match object; span=(6, 7), match='3'>

In [97]:
# one or more of anything until a word boundary
regexp = r'^.+?\b'
subject = 'abc123.'

re.search(regexp, subject)

<re.Match object; span=(0, 6), match='abc123'>

In [98]:
subject = 'abc12 3'

re.search(regexp, subject)

<re.Match object; span=(0, 5), match='abc12'>

In [99]:
# a word boundary, then one or more of anything (non-greedy, because of the ?)
# up until the next word boundary
regexp = r'\b.+?\b'
subject = 'abc 123  sd83o . skdjs20P'

re.findall(regexp, subject)

['abc', ' ', '123', '  ', 'sd83o', ' . ', 'skdjs20P']

<div style="background-color: rgba(0, 100, 200, .1); padding: 1em 3em; border-radius: 5px; border: 1px solid black">
    <div style="font-weight: bold; font-size: 1.2em; border-bottom: 1px dashed black; padding-bottom: .5em;">
        Mini Exercise
    </div>
    <p>For this exercise you should make up various subjects and test them with your regular expressions.</p>
    <ol>
        <li>Write a regular expression that matches if a word starts with a vowel.</li>
        <li>Write a regular expression that matches if a word starts with a capital letter.</li>
        <li>Write a regular expression that matches if a word ends with a capital letter.</li>        
        <li>Write a regular expression that matches if a word starts <b>and</b> ends with a capital letter.</li>
    </ol>
</div>

In [100]:
regexp = r'\d$'
subject = 'abc 123'

re.search(regexp, subject)

<re.Match object; span=(6, 7), match='3'>

### Capture Groups

Allows us to group up any pieces of a regular expression

In [101]:
regexp = r'.*?(\d)(\d+)'
subject = 'def456'

match = re.search(regexp, subject)
match

<re.Match object; span=(0, 6), match='def456'>

In [102]:
match.groups()

('4', '56')

In [103]:
regexp = r'.*?(\d+)(\d)'
subject = 'def456'

match = re.search(regexp, subject)
match

<re.Match object; span=(0, 6), match='def456'>

In [104]:
match.groups()

('45', '6')

In [105]:
regexp = r'.*?(\d+?)(\d)'
subject = 'def456'

match = re.search(regexp, subject)
match

<re.Match object; span=(0, 5), match='def45'>

In [106]:
match.groups()

('4', '5')

In [107]:
s = pd.Series([
    "Hello", 
    "Germain", 
    "Happy Monday!"
])
s

0            Hello
1          Germain
2    Happy Monday!
dtype: object

In [108]:
regexp = r'([A-Z])[a-z]+([a-z])\s.+\W$'
s.str.extract(regexp)

Unnamed: 0,0,1
0,,
1,,
2,H,y


In [109]:
regexp = r'([A-Z])[a-z]+([a-z])(\s.+\W$)?'
s.str.extract(regexp)

Unnamed: 0,0,1,2
0,H,o,
1,G,n,
2,H,y,Monday!


In [110]:
# 0 or more of anything, non-greedily, followed by a captured run of digits.
# The pattern is a raw string so that \d reaches the regex engine untouched;
# in a plain string literal "\d" is an invalid escape sequence and triggers a
# DeprecationWarning (a SyntaxWarning on newer Python versions).
regexp = r'.*?(\d+)'
s = pd.Series(['abc', 'abc123', '123'])
# rows with no digits produce a missing value in the extracted column
s.str.extract(regexp)

Unnamed: 0,0
0,
1,123.0
2,123.0


## `re.sub`

- removing
- substitution

In [111]:
# remove all numbers
regexp = r'\d+'
subject = 'abc123'

re.sub(regexp, '', subject)

'abc'

In [112]:
# replace all number patterns with 'X'
regexp = r'\d+'
subject = 'abc123'

re.sub(regexp, 'X', subject)

'abcX'

<div style="background-color: rgba(0, 100, 200, .1); padding: 1em 3em; border-radius: 5px; border: 1px solid black">
    <div style="font-weight: bold; font-size: 1.2em; border-bottom: 1px dashed black; padding-bottom: .5em;">
        Mini Exercise
    </div>
    <p>Use the code below to get started on this exercise.</p>
    <pre><code>dates = pd.Series(['2020-11-12', '2020-07-13', '2021-01-12'])</code></pre>
    <p>Use regular expression substitution to reformat the dates in the format common in the US: m/d/y.</p>
</div>

In [113]:
dates = pd.Series(['2020-11-12', '2020-07-13', '2021-01-12'])
dates

0    2020-11-12
1    2020-07-13
2    2021-01-12
dtype: object

In [114]:
regexp = r'(\d{4})-(\d{2})-(\d{2})'

dates.str.extract(regexp)

Unnamed: 0,0,1,2
0,2020,11,12
1,2020,7,13
2,2021,1,12


In [115]:
# The replacement can reference individual capture groups: \1, \2 and \3
# are the first, second and third groups of regexp, written out as \2/\3/\1.
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, so without it nothing is replaced.
dates.str.replace(regexp, r'\2/\3/\1', regex=True)

0    11/12/2020
1    07/13/2020
2    01/12/2021
dtype: object

In [116]:
# A capture group may be referenced more than once in the replacement.
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, so without it nothing is replaced.
dates.str.replace(regexp, r'\1/\1', regex=True)

0    2020/2020
1    2020/2020
2    2021/2021
dtype: object

In [117]:
# Not every capture group has to be used: the third group is simply dropped.
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, so without it nothing is replaced.
dates.str.replace(regexp, r'\2/\1', regex=True)

0    11/2020
1    07/2020
2    01/2021
dtype: object

## Misc

### Pandas Usage

- `.str`
    - `.extract`
    - `.count`
    - `.contains`
    - `.replace`
- extract + concat
- named groups

In [118]:
# Sample sentences, each containing exactly one URL, held in a single
# "text" column; used by the pandas .str examples that follow.
df = pd.DataFrame({
    'text': [
        'You should go check out https://regex101.com, it is a great website!',
        'My favorite search engine is https://duckduckgo.com',
        'If you have a question, you can get it answered through http://askjeeves.com, it is great!',
    ],
})
df

Unnamed: 0,text
0,"You should go check out https://regex101.com, ..."
1,My favorite search engine is https://duckduckg...
2,"If you have a question, you can get it answere..."


In [119]:
df.text.str.count(r'[aeiou]')

0    19
1    14
2    28
Name: text, dtype: int64

In [120]:
df.text.str.contains(r'https?')

0    True
1    True
2    True
Name: text, dtype: bool

In [121]:
df[df.text.str.contains(r'http://')]

Unnamed: 0,text
2,"If you have a question, you can get it answere..."


In [122]:
df.text.str.count(r'')

0    69
1    52
2    91
Name: text, dtype: int64

In [123]:
df.text.str.extract(r'(https?)://(\w+)\.(\w+)')

Unnamed: 0,0,1,2
0,https,regex101,com
1,https,duckduckgo,com
2,http,askjeeves,com


In [124]:
pd.concat([df, df.text.str.extract(r'(https?)://(\w+)\.(\w+)')], axis=1)

Unnamed: 0,text,0,1,2
0,"You should go check out https://regex101.com, ...",https,regex101,com
1,My favorite search engine is https://duckduckg...,https,duckduckgo,com
2,"If you have a question, you can get it answere...",http,askjeeves,com


In [125]:
# Named capture groups -- (?P<name>...) -- become the column labels of the
# frame returned by .str.extract; that frame is then joined onto df column-wise.
# NOTE(review): "protocal" is a misspelling of "protocol". It is left as-is here
# because renaming the group changes the resulting column name.
pd.concat([
    df, 
    df.text.str.extract(r'(?P<protocal>https?)://(?P<domain>\w+)\.(?P<tld>\w+)')
],axis=1)

Unnamed: 0,text,protocal,domain,tld
0,"You should go check out https://regex101.com, ...",https,regex101,com
1,My favorite search engine is https://duckduckg...,https,duckduckgo,com
2,"If you have a question, you can get it answere...",http,askjeeves,com


### Interactive Regex Tool

To install the `hlre` tool:

```
python -m pip install hlre
```

[For more documentation and the source](https://github.com/zgulde/hlre)

See also [regex101](https://regex101.com) (make sure to select the Python flavor)

### Named capture groups

In [126]:
text = 'You should go check out https://regex101.com, it is a great website!'

match = re.search(r'(?P<protocol>https?)://(?P<base_domain>\w+)\.(?P<tld>\w+)', text)
match.groupdict()

{'protocol': 'https', 'base_domain': 'regex101', 'tld': 'com'}

In [127]:
df.text.str.extract(r'(?P<protocol>https?)://(?P<base_domain>\w+)\.(?P<tld>\w+)')

Unnamed: 0,protocol,base_domain,tld
0,https,regex101,com
1,https,duckduckgo,com
2,http,askjeeves,com


### Verbose regular expressions

- `re.VERBOSE`
- `(?# this is a comment)`

In [128]:
text = 'You should go check out https://regex101.com, it is a great website!'

regexp = r"""
(?P<protocol>https?)
:// (?# ignore the :// that seperates protocol from domain)
(?P<base_domain>\w+)
\.
(?P<tld>\w+)
"""
match = re.search(regexp, text, re.VERBOSE) # whitespace in the regex is ignored
match.groupdict()

{'protocol': 'https', 'base_domain': 'regex101', 'tld': 'com'}

In [129]:
re.findall(r'[a-z]', 'aBcD', re.IGNORECASE)

['a', 'B', 'c', 'D']

In [130]:
re.findall(r'[a-z]', 'aBcD', re.IGNORECASE | re.VERBOSE)

['a', 'B', 'c', 'D']

In [131]:
dir(re)

['A',
 'ASCII',
 'DEBUG',
 'DOTALL',
 'I',
 'IGNORECASE',
 'L',
 'LOCALE',
 'M',
 'MULTILINE',
 'Match',
 'Pattern',
 'RegexFlag',
 'S',
 'Scanner',
 'T',
 'TEMPLATE',
 'U',
 'UNICODE',
 'VERBOSE',
 'X',
 '_MAXCACHE',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '__version__',
 '_cache',
 '_compile',
 '_compile_repl',
 '_expand',
 '_locale',
 '_pickle',
 '_special_chars_map',
 '_subx',
 'compile',
 'copyreg',
 'enum',
 'error',
 'escape',
 'findall',
 'finditer',
 'fullmatch',
 'functools',
 'match',
 'purge',
 'search',
 'split',
 'sre_compile',
 'sre_parse',
 'sub',
 'subn',
 'template']

In [132]:
# Multiline
text = '''
Hello, Germain class!
I like regular expressions.
They are pretty neat!
'''.strip()

re.findall(r'^\w+', text)

['Hello']

In [133]:
re.findall(r'^\w+', text, re.MULTILINE)

['Hello', 'I', 'They']

In [134]:
re.findall(r'\w+\W$', text, re.MULTILINE)

['class!', 'expressions.', 'neat!']