# Regular Expressions

When you call re.split('\s+',text), the regular expression is first compiled, and then its split
method is called on the passed text. You can compile the regex yourself with re.compile, forming
a reusable regex object:

In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
text = "foo    bar\t baz \tqux"
# normal split = text.split()qux
re.split('\s+', text) # spliting basedon whitespaces
text.split(" ")
['foo', 'bar', 'baz', 'qux']
# compile once to use again and again and saved time
rgx = re.compile('\s+')
rgx.split(text)

['foo', 'bar', 'baz', 'qux']

In [3]:
text = "foo    bar\t baz \tqux"
# normal split = text.split()qux
re.split('\s+', text) # spliting basedon whitespaces

['foo', 'bar', 'baz', 'qux']

In [4]:
text = "foo    bar\t baz \tqux"
text.split(" ")

['foo', '', '', '', 'bar\t', 'baz', '\tqux']

In [5]:
text = "foo    bar\t baz \tqux"
['foo', 'bar', 'baz', 'qux']

['foo', 'bar', 'baz', 'qux']

In [6]:
text = "foo    bar\t baz \tqux"

# compile once to use again and again and saved time
rgx = re.compile('\s+')
rgx.split(text)

['foo', 'bar', 'baz', 'qux']

In [7]:
text = "foo    bar\t baz \tqux"

# compile once to use again and again and saved time
rgx = re.compile('\s-')
rgx.split(text)

['foo    bar\t baz \tqux']

In [8]:
text = "foo    bar\t baz \tqux"

# compile once to use again and again and saved time
rgx = re.compile('\s*')
rgx.split(text)

['',
 'f',
 'o',
 'o',
 '',
 'b',
 'a',
 'r',
 '',
 'b',
 'a',
 'z',
 '',
 'q',
 'u',
 'x',
 '']

In [9]:
text = "foo    bar\t baz \tqux"

# compile once to use again and again and saved time
rgx = re.compile('\s/')
rgx.split(text)

['foo    bar\t baz \tqux']

In [10]:
text = "foo    bar\t baz \tqux"

# compile once to use again and again and saved time
rgx = re.compile('\s%')
rgx.split(text)

['foo    bar\t baz \tqux']

# Vectorized String Functions In Pandas

In [11]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = ["simpleEmail@email.com",   "simple.email@email.com",  
        "plus+symbol@email.com",   "dash-symbol@email.com",  
     "q@email.com",   
    "“unusual”@email.com",   "dash-symbol@email-dash.com",   "test@emailServer",  
  "” “@email.com",   "user@[IPv6:2001:DB8::1]",   
  "example@localhost",   "example@s.solutions",   
  "12345@email.com"]   

data = pd.Series(data)
display(data)
pattern = '([A-Z0-9_%+-]+)@([A-Z0-9.-]+)\\.([A-z]{2,4})'
data.str.findall(pattern, flags = re.IGNORECASE)
matches = data[data.str.match(pattern, flags = re.IGNORECASE)]
matches

0          simpleEmail@email.com
1         simple.email@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
5            “unusual”@email.com
6     dash-symbol@email-dash.com
7               test@emailServer
8                  ” “@email.com
9        user@[IPv6:2001:DB8::1]
10             example@localhost
11           example@s.solutions
12               12345@email.com
dtype: object

0          simpleEmail@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
6     dash-symbol@email-dash.com
11           example@s.solutions
12               12345@email.com
dtype: object

In [12]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = ["simpleEmail@email.com",   "simple.email@email.com",  
        "plus+symbol@email.com",   "dash-symbol@email.com",  
     "q@email.com",   
    "“unusual”@email.com",   "dash-symbol@email-dash.com",   "test@emailServer",  
  "” “@email.com",   "user@[IPv6:2001:DB8::1]",   
  "example@localhost",   "example@s.solutions",   
  "12345@email.com"]   

data = pd.Series(data)
display(data)

0          simpleEmail@email.com
1         simple.email@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
5            “unusual”@email.com
6     dash-symbol@email-dash.com
7               test@emailServer
8                  ” “@email.com
9        user@[IPv6:2001:DB8::1]
10             example@localhost
11           example@s.solutions
12               12345@email.com
dtype: object

In [13]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = ["simpleEmail@email.com",   "simple.email@email.com",  
        "plus+symbol@email.com",   "dash-symbol@email.com",  
     "q@email.com",   
    "“unusual”@email.com",   "dash-symbol@email-dash.com",   "test@emailServer",  
  "” “@email.com",   "user@[IPv6:2001:DB8::1]",   
  "example@localhost",   "example@s.solutions",   
  "12345@email.com"]   

data = pd.Series(data)
display(data)
pattern = '([A-Z0-9_%+-]+)@([A-Z0-9.-]+)\\.([A-z]{2,4})'

0          simpleEmail@email.com
1         simple.email@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
5            “unusual”@email.com
6     dash-symbol@email-dash.com
7               test@emailServer
8                  ” “@email.com
9        user@[IPv6:2001:DB8::1]
10             example@localhost
11           example@s.solutions
12               12345@email.com
dtype: object

In [14]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = ["simpleEmail@email.com",   "simple.email@email.com",  
        "plus+symbol@email.com",   "dash-symbol@email.com",  
     "q@email.com",   
    "“unusual”@email.com",   "dash-symbol@email-dash.com",   "test@emailServer",  
  "” “@email.com",   "user@[IPv6:2001:DB8::1]",   
  "example@localhost",   "example@s.solutions",   
  "12345@email.com"]   

data = pd.Series(data)
display(data)
pattern = '([A-Z0-9_%+-]+)@([A-Z0-9.-]+)\\.([A-z]{2,4})'
data.str.findall(pattern, flags = re.IGNORECASE)
matches

0          simpleEmail@email.com
1         simple.email@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
5            “unusual”@email.com
6     dash-symbol@email-dash.com
7               test@emailServer
8                  ” “@email.com
9        user@[IPv6:2001:DB8::1]
10             example@localhost
11           example@s.solutions
12               12345@email.com
dtype: object

0          simpleEmail@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
6     dash-symbol@email-dash.com
11           example@s.solutions
12               12345@email.com
dtype: object

In [15]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = ["simpleEmail@email.com",   "simple.email@email.com",  
        "plus+symbol@email.com",   "dash-symbol@email.com",  
     "q@email.com",   
    "“unusual”@email.com",   "dash-symbol@email-dash.com",   "test@emailServer",  
  "” “@email.com",   "user@[IPv6:2001:DB8::1]",   
  "example@localhost",   "example@s.solutions",   
  "12345@email.com"]   

data = pd.Series(data)
display(data)
pattern = '([A-Z0-9_%+-]+)@([A-Z0-9.-]+)\\.([A-z]{2,4})'
matches = data[data.str.match(pattern, flags = re.IGNORECASE)]
matches

0          simpleEmail@email.com
1         simple.email@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
5            “unusual”@email.com
6     dash-symbol@email-dash.com
7               test@emailServer
8                  ” “@email.com
9        user@[IPv6:2001:DB8::1]
10             example@localhost
11           example@s.solutions
12               12345@email.com
dtype: object

0          simpleEmail@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
6     dash-symbol@email-dash.com
11           example@s.solutions
12               12345@email.com
dtype: object

In [16]:
data = pd.Series(np.random.randn(10),
        index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd', 'a'],
        [1, 2, 3, 1, 3, 1, 2, 2, 3,1] ] )
display(data)
#hierarchically indexed object partial indexing is possible

a  1    2.035043
   2    0.280113
   3    0.292083
b  1   -1.280208
   3    0.732803
c  1    0.485788
   2    1.181040
d  2    1.256277
   3   -0.081624
a  1   -0.862150
dtype: float64

In [17]:
display( data['b'] )
#data['b':'c']
#data.loc[['b', 'd']]

#Selection is even possible from an “inner” level:
#data.loc[:, 2]

1   -1.280208
3    0.732803
dtype: float64

In [18]:
data.loc[['b','d']]

b  1   -1.280208
   3    0.732803
d  2    1.256277
   3   -0.081624
dtype: float64

In [19]:
data.loc[['b','c']]

b  1   -1.280208
   3    0.732803
c  1    0.485788
   2    1.181040
dtype: float64

In [20]:
data.loc[['a','d']]

a  1    2.035043
   2    0.280113
   3    0.292083
d  2    1.256277
   3   -0.081624
a  1   -0.862150
dtype: float64

In [21]:
data.loc[['b','c']]

b  1   -1.280208
   3    0.732803
c  1    0.485788
   2    1.181040
dtype: float64

In [22]:
data.loc[:, 2]

a    0.280113
c    1.181040
d    1.256277
dtype: float64

In [23]:
data.loc[:, 3]

a    0.292083
b    0.732803
d   -0.081624
dtype: float64

In [30]:
#Hierarchical indexing plays an important role in reshaping data and group-based
#operations like forming a pivot table. For example, 
#you could rearrange the data into
#a DataFrame using its unstack method:
display(data)
#df = data.unstack()
#display(df)
#df.stack()

a  1    2.035043
   2    0.280113
   3    0.292083
b  1   -1.280208
   3    0.732803
c  1    0.485788
   2    1.181040
d  2    1.256277
   3   -0.081624
a  1   -0.862150
dtype: float64

In [31]:
data.unstack().stack()

ValueError: Index contains duplicate entries, cannot reshape

In [32]:
#With a DataFrame, either axis can have a hierarchical index
# [18]
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
        index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
        columns=[['Ohio', 'Ohio', 'Colorado'],
        ['Green', 'Red', 'Green']])
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
display(frame)
frame['Ohio']

#Be careful to distinguish the index names 'state' and 'color'
#from the row labels

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [34]:
#With a DataFrame, either axis can have a hierarchical index
# [18]
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
        index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
        columns=[['Ohio', 'Ohio', 'Colorado'],
        ['Green', 'Red', 'Green']])
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
display(frame)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [36]:
#With a DataFrame, either axis can have a hierarchical index
# [18]
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
        index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
        columns=[['Ohio', 'Ohio', 'Colorado'],
        ['Green', 'Red', 'Green']])

frame['Ohio']

Unnamed: 0,Unnamed: 1,Green,Red
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [38]:
# Reording and sorting levels
frame.swaplevel('key1', 'key2')
#can save in the same frame variables 
# or another variables like
frame = frame.swaplevel('key1', 'key2')

KeyError: 'Level key1 not found'

In [39]:
#sort_index , on the other hand, sorts the data using only the values in a single level.
#When swapping levels, it’s not uncommon to also use sort_index so that the result is
#lexicographically sorted by the indicated level:
frame.sort_index(level = 1)

# compare the above statement output with following statement output
frame.swaplevel(0, 1).sort_index(level = 0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [45]:
#sort_index , on the other hand, sorts the data using only the values in a single level.
#When swapping levels, it’s not uncommon to also use sort_index so that the result is
#lexicographically sorted by the indicated level:
frame.sort_index(level = 1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [46]:
# compare the above statement output with following statement output
frame.swaplevel(0, 1).sort_index(level = 0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11
