In [1]:
import numpy as np
import re

In [20]:
# match 方法从字符串的第一个位置开始尝试匹配
# 符合则返回匹配的范围， 否则返回None
pattern = re.compile(r'abc',  re.I)    # 注意！ re.I 等修饰符号要写在 compile 中
strings = ['abcd', 'dabcbd', 'Abcd']

for item in strings:
    print(pattern.match(item))

<re.Match object; span=(0, 3), match='abc'>
None
<re.Match object; span=(0, 3), match='Abc'>


In [22]:
# search 在 string 中搜寻指定的 reg ，返回位置信息
pattern = re.compile(r'abc')
strings = ['abcd', 'babc', 'ddabc']

for item in strings:
    print(pattern.search(item))

<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(1, 4), match='abc'>
<re.Match object; span=(2, 5), match='abc'>


In [27]:
# split方法 按照指定pattern 对字符串进行分割
pattern = re.compile(r'\d+')

string = 'hello23world3423!'
res = pattern.split(string)

print(type(res))
print(res)

<class 'list'>
['hello', 'world', '!']


#### findall(): 在字符串中查找所有符合要求的字符串 查找所有的匹配

In [35]:
pattern = re.compile(r'\d+')
strings = ['3sd', 'sdf23', '4sdf34m32']

for item in strings:
    print(pattern.findall(item))

['3']
['23']
['4', '34', '32']


#### finditer()：在字符串中查找所有符合字串 并生成一个迭代器返回

In [50]:
pattern = re.compile(r'\d+')
string = 'a23b3434fgg445'
res = pattern.finditer(string)
print(type(res))
for item in res:
    print(item)

<class 'callable_iterator'>
<re.Match object; span=(1, 3), match='23'>
<re.Match object; span=(4, 8), match='3434'>
<re.Match object; span=(11, 14), match='445'>


#### sub()：替换字符串中匹配串

In [43]:
pattern = re.compile(r'\d+')
stirngs = ['a2b43c23d', 'ab3n']

for item in strings:
    print(item)
    res = pattern.sub(r'|', item)
    print(res)

3sd
|sd
sdf23
sdf|
4sdf34m32
|sdf|m|


#### ? 可以匹配之前字符 0， 1 次

In [23]:
pattern = re.compile(r'abc?', re.I)
strings = ['abc', 'ab', 'abcc', 'abcd']  

for item in strings:
    print(pattern.search(item))

<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(0, 2), match='ab'>
<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(0, 3), match='abc'>


#### * 表示匹配之前的字符 0，1，n次

In [24]:
pattern = re.compile(r'abc*')
strings = ['abc', 'abcc', 'ab', 'abcccc']

for item in strings:
    print(pattern.search(item))

<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(0, 4), match='abcc'>
<re.Match object; span=(0, 2), match='ab'>
<re.Match object; span=(0, 6), match='abcccc'>


#### + 表示匹配之前字符 1，n次

In [33]:
pattern = re.compile(r'\d+')
strings = ['a23d', '23', 'b2b', 'dd33']

for item in strings:
    print(item)
    print(pattern.search(item))

a23d
<re.Match object; span=(1, 3), match='23'>
23
<re.Match object; span=(0, 2), match='23'>
b2b
<re.Match object; span=(1, 2), match='2'>
dd33
<re.Match object; span=(2, 4), match='33'>


#### .匹配任意字符

In [54]:
pattern = re.compile(r'a.b')
strings = ['adb', 'a8b', 'a_b']

for item in strings:
    print(pattern.search(item))

<re.Match object; span=(0, 3), match='adb'>
<re.Match object; span=(0, 3), match='a8b'>
<re.Match object; span=(0, 3), match='a_b'>


#### 匹配字符串开头、结尾字串

In [55]:
# 只匹配从字符串开头部分符合规则的字串
pattern = re.compile(r'^He')    
strings = ['He is reading', 'talk to He']

for item in strings:
    print(pattern.search(item))

<re.Match object; span=(0, 2), match='He'>
None


In [57]:
# 只匹配字符串结尾部分符合规则的字串
pattern = re.compile(r'dog.$')
strings = ['a dog b', 'a dog.']

for item in strings:
    print(pattern.search(item))

None
<re.Match object; span=(2, 6), match='dog.'>


#### 匹配单词边界处的字串
这里注意 `\b`表示匹配word边界，  `\B`表示不匹配word边界

In [63]:
pattern = re.compile(r'\bdog\b')
pattern2 = re.compile(r'\Bdog\B')
strings = ['dog a', 'a dog', 'adoge', 'a dog d']

for item in strings:
    print(pattern.search(item))
print('-'*40)
for item in strings:
    print(pattern2.search(item))

<re.Match object; span=(0, 3), match='dog'>
<re.Match object; span=(2, 5), match='dog'>
None
<re.Match object; span=(2, 5), match='dog'>
----------------------------------------
None
None
<re.Match object; span=(1, 4), match='dog'>
None


#### `|` Disjunction （or）
注意对于 `|`， 它会将之前与之后的整体作为匹配的子串 
如果只匹配一部分 要使用 `()`进行划分

In [64]:
pattern = re.compile(r'abc|e')
strings = ['abc', 'abe', 'e']

for item in strings:
    print(pattern.search(item))

<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(2, 3), match='e'>
<re.Match object; span=(0, 1), match='e'>


#### 关于 Regular Expression 的匹配方式
Regular Expression的匹配是 greedy的， 总是尽可能地匹配最多字符