# Regular Expressions

When you call `re.split(r'\s+', text)`, the regular expression is first compiled, and then its
`split` method is called on the passed text. You can compile the regex yourself with `re.compile`,
forming a reusable regex object:

In [8]:
import numpy as np
import pandas as pd
import re

In [5]:
text = "foo    bar\t baz \tqux"

# Raw strings keep the backslash of \s intact for the regex engine and avoid
# Python's "invalid escape sequence" warning.
# Only the last expression of a cell is displayed, so the next two calls are
# evaluated but their results are not shown.
re.split(r'\s+', text)  # split on runs of whitespace
text.split(" ")         # split on single space characters only

# Compile the pattern once so the same regex object can be reused.
rgx = re.compile(r'\s+')
rgx.split(text)

['foo', 'bar', 'baz', 'qux']

In [6]:
text = "foo    bar\t baz \tqux"
# For this text, text.split() with no arguments gives the same tokens.
# The pattern is a raw string so that \s reaches the regex engine without
# triggering Python's "invalid escape sequence" warning.
re.split(r'\s+', text)  # split on runs of whitespace

['foo', 'bar', 'baz', 'qux']

In [7]:
# Splitting on a literal single space: consecutive spaces yield empty strings
# and tab characters stay attached to the neighbouring tokens.
text = "foo    bar\t baz \tqux"
text.split(sep=" ")

['foo', '', '', '', 'bar\t', 'baz', '\tqux']

In [9]:
text = "foo    bar\t baz \tqux"
# Compute the tokens from `text` rather than displaying a hand-typed list;
# str.split() with no arguments splits on runs of whitespace.
text.split()

['foo', 'bar', 'baz', 'qux']

In [11]:
text = "foo    bar\t baz \tqux"

# Compile the pattern once so the same regex object can be reused.
# A raw string avoids Python's "invalid escape sequence" warning for \s.
rgx = re.compile(r'\s+')
rgx.split(text)

['foo', 'bar', 'baz', 'qux']

In [12]:
text = "foo    bar\t baz \tqux"

# Experiment: r'\s-' means "one whitespace character followed by a hyphen".
# The text contains no hyphen, so nothing matches and split returns the
# whole string as a single-element list.
# A raw string avoids Python's "invalid escape sequence" warning for \s.
rgx = re.compile(r'\s-')
rgx.split(text)

['foo    bar\t baz \tqux']

In [13]:
text = "foo    bar\t baz \tqux"

# Experiment: r'\s*' also matches the empty string. On Python 3.7+ split cuts
# at empty matches too, so the text is broken up between every character and
# the whitespace runs are consumed as separators.
# A raw string avoids Python's "invalid escape sequence" warning for \s.
rgx = re.compile(r'\s*')
rgx.split(text)

['',
 'f',
 'o',
 'o',
 '',
 'b',
 'a',
 'r',
 '',
 'b',
 'a',
 'z',
 '',
 'q',
 'u',
 'x',
 '']

In [14]:
text = "foo    bar\t baz \tqux"

# Experiment: r'\s/' means "one whitespace character followed by a slash".
# The text contains no slash, so nothing matches and split returns the
# whole string as a single-element list.
# A raw string avoids Python's "invalid escape sequence" warning for \s.
rgx = re.compile(r'\s/')
rgx.split(text)

['foo    bar\t baz \tqux']

In [15]:
text = "foo    bar\t baz \tqux"

# Experiment: r'\s%' means "one whitespace character followed by a percent
# sign". The text contains no percent sign, so nothing matches and split
# returns the whole string as a single-element list.
# A raw string avoids Python's "invalid escape sequence" warning for \s.
rgx = re.compile(r'\s%')
rgx.split(text)

['foo    bar\t baz \tqux']

# Vectorized String Functions In Pandas

In [20]:
# Sample address strings to run the pattern against.
emails = [
    "simpleEmail@email.com",
    "simple.email@email.com",
    "plus+symbol@email.com",
    "dash-symbol@email.com",
    "q@email.com",
    "“unusual”@email.com",
    "dash-symbol@email-dash.com",
    "test@emailServer",
    "” “@email.com",
    "user@[IPv6:2001:DB8::1]",
    "example@localhost",
    "example@s.solutions",
    "12345@email.com",
]

data = pd.Series(emails)
display(data)

# Three capture groups: local part, domain name, top-level domain.
# The TLD class is [A-Z], not [A-z]: the A-z range also covers the ASCII
# punctuation between 'Z' and 'a' ([ \ ] ^ _ `). re.IGNORECASE below takes
# care of lowercase letters.
# NOTE(review): the local-part class has no '.', so "simple.email@email.com"
# is not matched from its start — confirm whether that is intended.
pattern = r'([A-Z0-9_%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

# Per-address list of captured-group tuples. Kept in a variable for
# inspection, since a cell displays only its last expression.
found_groups = data.str.findall(pattern, flags=re.IGNORECASE)

# str.match anchors at the start of each string, so this keeps the addresses
# that begin with a match of the pattern.
matches = data[data.str.match(pattern, flags=re.IGNORECASE)]
matches

0          simpleEmail@email.com
1         simple.email@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
5            “unusual”@email.com
6     dash-symbol@email-dash.com
7               test@emailServer
8                  ” “@email.com
9        user@[IPv6:2001:DB8::1]
10             example@localhost
11           example@s.solutions
12               12345@email.com
dtype: object

0          simpleEmail@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
6     dash-symbol@email-dash.com
11           example@s.solutions
12               12345@email.com
dtype: object

In [22]:
# Sample address strings; the list gets its own name so that `data` refers
# only to the Series built from it.
emails = [
    "simpleEmail@email.com",
    "simple.email@email.com",
    "plus+symbol@email.com",
    "dash-symbol@email.com",
    "q@email.com",
    "“unusual”@email.com",
    "dash-symbol@email-dash.com",
    "test@emailServer",
    "” “@email.com",
    "user@[IPv6:2001:DB8::1]",
    "example@localhost",
    "example@s.solutions",
    "12345@email.com",
]

data = pd.Series(emails)
display(data)

0          simpleEmail@email.com
1         simple.email@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
5            “unusual”@email.com
6     dash-symbol@email-dash.com
7               test@emailServer
8                  ” “@email.com
9        user@[IPv6:2001:DB8::1]
10             example@localhost
11           example@s.solutions
12               12345@email.com
dtype: object

In [24]:
# Sample address strings; the list gets its own name so that `data` refers
# only to the Series built from it.
emails = [
    "simpleEmail@email.com",
    "simple.email@email.com",
    "plus+symbol@email.com",
    "dash-symbol@email.com",
    "q@email.com",
    "“unusual”@email.com",
    "dash-symbol@email-dash.com",
    "test@emailServer",
    "” “@email.com",
    "user@[IPv6:2001:DB8::1]",
    "example@localhost",
    "example@s.solutions",
    "12345@email.com",
]

data = pd.Series(emails)
display(data)

# Three capture groups: local part, domain name, top-level domain.
# The TLD class is [A-Z], not [A-z]: the A-z range also covers the ASCII
# punctuation between 'Z' and 'a' ([ \ ] ^ _ `). The pattern is written in
# uppercase and is meant to be used with re.IGNORECASE.
# NOTE(review): the local-part class has no '.', so "simple.email@email.com"
# is not matched from its start — confirm whether that is intended.
pattern = r'([A-Z0-9_%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

0          simpleEmail@email.com
1         simple.email@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
5            “unusual”@email.com
6     dash-symbol@email-dash.com
7               test@emailServer
8                  ” “@email.com
9        user@[IPv6:2001:DB8::1]
10             example@localhost
11           example@s.solutions
12               12345@email.com
dtype: object

In [25]:
# Sample address strings to run the pattern against.
emails = [
    "simpleEmail@email.com",
    "simple.email@email.com",
    "plus+symbol@email.com",
    "dash-symbol@email.com",
    "q@email.com",
    "“unusual”@email.com",
    "dash-symbol@email-dash.com",
    "test@emailServer",
    "” “@email.com",
    "user@[IPv6:2001:DB8::1]",
    "example@localhost",
    "example@s.solutions",
    "12345@email.com",
]

data = pd.Series(emails)
display(data)

# Three capture groups: local part, domain name, top-level domain.
# The TLD class is [A-Z], not [A-z]: the A-z range also covers the ASCII
# punctuation between 'Z' and 'a' ([ \ ] ^ _ `). re.IGNORECASE below takes
# care of lowercase letters.
# NOTE(review): the local-part class has no '.', so "simple.email@email.com"
# is not matched from its start — confirm whether that is intended.
pattern = r'([A-Z0-9_%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

# Per-address list of captured-group tuples. Kept in a variable for
# inspection, since a cell displays only its last expression.
found_groups = data.str.findall(pattern, flags=re.IGNORECASE)

# Define `matches` in this cell so the displayed value does not depend on a
# variable left over from a previously executed cell.
matches = data[data.str.match(pattern, flags=re.IGNORECASE)]
matches

0          simpleEmail@email.com
1         simple.email@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
5            “unusual”@email.com
6     dash-symbol@email-dash.com
7               test@emailServer
8                  ” “@email.com
9        user@[IPv6:2001:DB8::1]
10             example@localhost
11           example@s.solutions
12               12345@email.com
dtype: object

0          simpleEmail@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
6     dash-symbol@email-dash.com
11           example@s.solutions
12               12345@email.com
dtype: object

In [26]:
# Sample address strings to run the pattern against.
emails = [
    "simpleEmail@email.com",
    "simple.email@email.com",
    "plus+symbol@email.com",
    "dash-symbol@email.com",
    "q@email.com",
    "“unusual”@email.com",
    "dash-symbol@email-dash.com",
    "test@emailServer",
    "” “@email.com",
    "user@[IPv6:2001:DB8::1]",
    "example@localhost",
    "example@s.solutions",
    "12345@email.com",
]

data = pd.Series(emails)
display(data)

# Three capture groups: local part, domain name, top-level domain.
# The TLD class is [A-Z], not [A-z]: the A-z range also covers the ASCII
# punctuation between 'Z' and 'a' ([ \ ] ^ _ `). re.IGNORECASE below takes
# care of lowercase letters.
# NOTE(review): the local-part class has no '.', so "simple.email@email.com"
# is not matched from its start — confirm whether that is intended.
pattern = r'([A-Z0-9_%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

# str.match anchors at the start of each string, so this keeps the addresses
# that begin with a match of the pattern.
matches = data[data.str.match(pattern, flags=re.IGNORECASE)]
matches

0          simpleEmail@email.com
1         simple.email@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
5            “unusual”@email.com
6     dash-symbol@email-dash.com
7               test@emailServer
8                  ” “@email.com
9        user@[IPv6:2001:DB8::1]
10             example@localhost
11           example@s.solutions
12               12345@email.com
dtype: object

0          simpleEmail@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
6     dash-symbol@email-dash.com
11           example@s.solutions
12               12345@email.com
dtype: object