In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Split a comma-separated string on ','. Whitespace around the pieces is
# kept as-is (note the leading space in the last piece of the output).
val = 'a,b, guido'
val.split(',')

['a', 'b', ' guido']

In [3]:
# Split on commas, then trim the whitespace around every piece.
pieces = [piece.strip() for piece in val.split(',')]
pieces

['a', 'b', 'guido']

In [4]:
# Unpack the three pieces and concatenate them by hand with a '::' delimiter.
first, second, third = pieces
first + '::' + second + '::' + third

'a::b::guido'

In [5]:
# Calling join on the delimiter produces the same string as the manual
# concatenation, without naming each piece.
'::'.join(pieces)

'a::b::guido'

In [6]:
# The `in` operator tests whether a substring occurs in the string.
'guido' in val

True

In [7]:
# Return the index at which the given substring is first found in the string;
# raises an exception (ValueError) if it is not found.
val.index(',')

1

In [8]:
# Return the index at which the given substring is first found in the string;
# returns -1 if it is not found.
val.find(':')

-1

In [9]:
# Return the number of times the given substring occurs in the string.
val.count(',')

2

In [10]:
# Replace occurrences of the given substring with another string.
val.replace(',', '::')

'a::b:: guido'

In [11]:
# Also commonly used to delete a particular substring, by replacing it with
# the empty string.
val.replace(',', '')

'ab guido'

In [12]:
# Split on runs of one or more whitespace characters (spaces and tabs here).
# The pattern is a raw string so that `\s` reaches the regex engine verbatim
# instead of being parsed as an (invalid) string escape sequence.
text = 'foo    bar\t baz  \tqux'
re.split(r'\s+', text)

['foo', 'bar', 'baz', 'qux']

In [13]:
# Compile the whitespace pattern once so the resulting regex object can be
# reused. Raw string: `\s` must be passed through to the regex engine rather
# than being parsed as an (invalid) string escape sequence.
regex = re.compile(r'\s+')
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [14]:
# findall returns every substring that matched the pattern -- here, the
# whitespace runs themselves rather than the words between them.
regex.findall(text)

['    ', '\t ', '  \t']

In [15]:
# Sample text (one "name address" pair per line) for trying out a regular
# expression that matches e-mail addresses.
text = (
    "Dave dave@google.com\n"
    "Steve steve@gmail.com\n"
    "Rob rob@gmail.com\n"
    "Ryan ryan@yahoo.com\n"
)
# Adjacent raw-string literals are concatenated into one pattern string.
pattern = (
    r'[A-Z0-9._%+-]+'  # part before the @
    r'@'
    r'[A-Z0-9.-]+'     # domain
    r'\.'
    r'[A-Z]{2,4}'      # suffix: 2 to 4 letters
)
# re.IGNORECASE builds a regex that does not distinguish upper and lower case.
regex = re.compile(pattern, re.IGNORECASE)
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [16]:
# search returns a match object for the first address found in the text only.
m = regex.search(text)
m

<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>

In [17]:
# The match object's start/end offsets slice the matched address out of text.
text[m.start():m.end()]

'dave@google.com'

In [18]:
# group() with no arguments returns the whole matched substring.
m.group()

'dave@google.com'

In [19]:
# sub returns a new string in which every match is replaced by the given text.
regex.sub('REDACTED', text)

'Dave REDACTED\nSteve REDACTED\nRob REDACTED\nRyan REDACTED\n'

In [20]:
# Same e-mail pattern, with parentheses capturing its three parts.
# Adjacent raw-string literals are concatenated into one pattern string.
pattern = (
    r'([A-Z0-9._%+-]+)'  # group 1: username
    r'@'
    r'([A-Z0-9.-]+)'     # group 2: domain
    r'\.'
    r'([A-Z]{2,4})'      # group 3: suffix
)
regex = re.compile(pattern, re.IGNORECASE)
# match anchors at the start of the string; groups() returns the captures.
m = regex.match('wesm@bright.net')
m.groups()

('wesm', 'bright', 'net')

In [21]:
# With capture groups in the pattern, findall returns one tuple of group
# values per match instead of the whole matched string.
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [22]:
# \1, \2 and \3 in the replacement string are filled with the first, second
# and third captured groups of each match.
regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text)

'Dave Username: dave, Domain: google, Suffix: com\nSteve Username: steve, Domain: gmail, Suffix: com\nRob Username: rob, Domain: gmail, Suffix: com\nRyan Username: ryan, Domain: yahoo, Suffix: com\n'

In [23]:
# Verbose-mode version of the e-mail pattern with named groups, so the match
# can be read back as a dict via groupdict(). re.VERBOSE lets the pattern be
# laid out over several lines (whitespace outside character classes is
# ignored). The domain class is `[A-Z0-9.-]`, the same as in the non-verbose
# patterns used elsewhere in this notebook, so hyphenated domains match too.
regex = re.compile(r"""
    (?P<username>[A-Z0-9._%+-]+)
    @
    (?P<domain>[A-Z0-9.-]+)
    \.
    (?P<suffix>[A-Z]{2,4})
""", flags=re.IGNORECASE|re.VERBOSE)
m = regex.match('wesm@bright.net')
m.groupdict()

{'domain': 'bright', 'suffix': 'net', 'username': 'wesm'}

In [24]:
# Series of e-mail addresses keyed by name; 'Wes' has a missing value (NaN).
data = pd.Series({
    'Dave': 'dave@google.com',
    'Steve': 'steve@gmail.com',
    'Rob': 'rob@gmail.com',
    'Wes': np.nan,
})
data

Dave     dave@google.com
Rob        rob@gmail.com
Steve    steve@gmail.com
Wes                  NaN
dtype: object

In [25]:
# Boolean mask marking which entries are missing.
data.isnull()

Dave     False
Rob      False
Steve    False
Wes       True
dtype: bool

In [26]:
# Per-element substring test through the .str accessor; the missing entry
# stays NaN in the result instead of becoming True/False.
data.str.contains('gmail')

Dave     False
Rob       True
Steve     True
Wes        NaN
dtype: object

In [27]:
# Display the grouped e-mail pattern (defined in an earlier cell) before
# passing it to the Series string methods below.
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [28]:
# Apply the regex to each element: every non-missing entry yields a list of
# group tuples, while the missing entry stays NaN.
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Rob        [(rob, gmail, com)]
Steve    [(steve, gmail, com)]
Wes                        NaN
dtype: object

In [29]:
# extractall returns a DataFrame with one row per match and one column per
# capture group; the row index carries the original label plus a `match`
# level. The missing entry produces no row.
matches = data.str.extractall(pattern, flags=re.IGNORECASE)
matches

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dave,0,dave,google,com
Rob,0,rob,gmail,com
Steve,0,steve,gmail,com
