# 正则表达式 

In [2]:
# 系统模块，直接导入
import re

## 匹配某个字符

### match方法从前往后（从第一个字符）开始匹配

In [4]:
text = 'python'
# re.match() only tries the pattern at the very start of the string.
match_result = re.match(pattern='py', string=text)

# Printing the result shows a re.Match object, not the matched text.
print(match_result)

<re.Match object; span=(0, 2), match='py'>


### 需配合group方法才能打印内容

In [9]:
# group(0) is the same as group(): the full text that was matched.
output_result = match_result.group(0)
print(output_result)

py


## 匹配任意字符

In [8]:
# An unescaped "." stands for any single character, so the first one matches.
re.match(r'.', text).group(0)

'p'

## 匹配任意的某个数字

In [23]:
# Sample strings: digit in the middle, digits at the start, digits only.
text, text_num, num = 'hello3world', '123python', '123'


In [31]:
# re.match() only looks at the start of the string; when the first character
# is not a digit it returns None, and calling .group() on None raises
# AttributeError.
result = re.match(r'\d', text).group()

AttributeError: 'NoneType' object has no attribute 'group'

In [125]:

# \d matches a single digit; a raw string keeps the backslash intact.
re.match(r'\d', num).group()

'1'

In [37]:
# \d matches a single digit at the start of the string.
re.match(r'\d', num).group()

'1'

## 匹配非数字

\D = [^0-9]

In [39]:
# \D matches a single character that is not a digit.
re.match(r'\D', text).group()

'h'

In [41]:
# When the string starts with a digit, \D cannot match there: re.match()
# returns None and calling .group() raises AttributeError.
re.match(r'\D', text_num).group()

AttributeError: 'NoneType' object has no attribute 'group'

## 匹配空白字符
1. \s 匹配任意一个空白字符
2. 反过来，\S 匹配任意一个非空白字符
3. 空格、\n、\t、\r 都属于空白字符

In [42]:
# Three samples that each begin with whitespace: space, newline, tab.
text1, text2, text3 = ' python', '\n', '\t'

In [44]:
# \s matches one whitespace character at the start of the string.
re.match(r'\s', text1).group()

' '

In [46]:
# A newline also counts as whitespace for \s.
re.match(r'\s', text2).group()

'\n'

## 匹配 大小写英文字母、数字、下划线和汉字！汉字！汉字！

\w ≈ [0-9a-zA-Z_]（包含下划线；在默认的 Unicode 模式下还能匹配汉字等其他语言的文字字符）

In [127]:
# \w matches one word character; an ASCII letter qualifies.
text1 = 'python'
re.match(r'\w', text1).group()

'p'

In [70]:
# For str patterns \w is Unicode-aware, so a Chinese character matches too.
text2 = '中文'
re.match(r'\w', text2).group()

'中'

In [72]:
# "," is not a word character, so re.match() returns None and .group()
# raises AttributeError.
text3 = ',hello'
re.match(r'\w', text3).group()

AttributeError: 'NoneType' object has no attribute 'group'

In [73]:
# The leading dash is not a word character either, so this raises
# AttributeError as well.
text4 = '————hello'
re.match(r'\w', text4).group()

AttributeError: 'NoneType' object has no attribute 'group'

## 反匹配\w=>\W

\W ≈ [^0-9a-zA-Z_]（即 \w 的取反；在默认的 Unicode 模式下同样不匹配汉字等文字字符）

In [77]:
# \W is the complement of \w: it matches one non-word character.
text1 = ',python'
re.match(r'\W', text1).group()

','

In [78]:
# A full-width comma is punctuation, so \W matches it.
text2 = '，中文'
re.match(r'\W', text2).group()

'，'

In [79]:
# Chinese characters are word characters, so \W finds nothing at the start:
# re.match() returns None and .group() raises AttributeError.
text = '中文'
re.match(r'\W', text).group()

AttributeError: 'NoneType' object has no attribute 'group'

## []组合方式，在括号内的内容均可匹配
> 类似于逻辑或

In [82]:
# A character class matches one character that fits any member of the set.
t1 = 'python'
re.match(r'[\d\w]', t1).group()

'p'


# 多字符匹配

## *匹配零个或多个字符

In [84]:
# "*" repeats the preceding item zero or more times, as many as possible.
t = '13435434'
re.match(r'[\d]*', t).group()

'13435434'

In [89]:
# No digit at the start, but "*" allows zero repetitions, so the match
# succeeds with an empty string instead of raising.
t = 'py4534534543'
re.match(r'[\d]*', t).group()

''

In [90]:

# The run of digits stops at the first non-digit character.
t = '4534-534,543'
re.match(r'[\d]*', t).group()

'4534'

## +匹配一个或多个结果

In [95]:
# "+" repeats one or more times; the class accepts "p", "y" and digits.
t = 'py343535-3435'
re.match(r'[py\d]+', t).group()

'py343535'

In [97]:
# "+" needs at least one digit at the start; there is none, so re.match()
# returns None and .group() raises AttributeError.
t = 'py343535-3435'
re.match(r'[\d]+', t).group()

AttributeError: 'NoneType' object has no attribute 'group'

## ? 匹配零个或一个结果（最多只取一个，后面的字符不再匹配）

In [103]:
# "?" matches zero or one occurrence, so at most one character is consumed.
t = '2343545python'
re.match(r'[\s\d\w]?', t).group()

'2'

## {n}, 匹配指定的次数

In [106]:
# {3} requires exactly three repetitions.
t = '123-456-789'
re.match(r'[\d]{3}', t).group()

'123'

In [107]:
# Only three digits precede the "-", so {4} cannot be satisfied: re.match()
# returns None and .group() raises AttributeError.
t = '123-456-789'
re.match(r'[\d]{4}', t).group()

AttributeError: 'NoneType' object has no attribute 'group'

## {m,n}, 匹配 m 到 n 次, 默认贪婪（尽可能匹配最多次）

In [109]:
# {2,4} accepts two to four repetitions and greedily takes as many as it can.
t = '3434-1234|1234'
re.match(r'[\d]{2,4}', t).group()

'3434'

In [111]:
# Only one digit precede the "|", but {2,5} needs at least two, so
# re.match() returns None and .group() raises AttributeError.
t = '3|434-1234|1234'
re.match(r'[\d]{2,5}', t).group()

AttributeError: 'NoneType' object has no attribute 'group'

# 匹配任意一个字符

## 英文点 . 表示任意一个字符（默认不包括换行符），但用方括号写成 [.] 后就只表示英文点本身

In [135]:
# An unescaped "." accepts any single character, here the leading "-".
text = "--123-123-4567"
re.match(r".", text).group(0)

'-'

In [137]:
# Inside brackets "." is a literal dot; the text starts with "-", so
# re.match() returns None and .group() raises AttributeError.
text = "--123-123-4567"
re.match(r"[.]", text).group(0)

AttributeError: 'NoneType' object has no attribute 'group'

### 匹配邮箱
\w+@[0-9a-z]+[.][a-z]+

### 匹配电话
1[3-9][0-9]{9}

### 匹配身份证（非完美：未校验二月等月份的实际天数，也未校验法定的地区码）
[1-9][0-9]{5}(18|19|20)[0-9]{2}(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])[0-9]{3}[0-9xX]

# 特殊匹配进阶

## re.search()方法，在整个字符串中查找第一个匹配，返回 Match 对象（找不到则返回 None）

In [138]:
# Sample where the interesting text is not at the start of the string.
t = "aa_python"

In [140]:
# re.search() scans the whole string and returns the first match it finds.
re.search(r"py", t).group(0)

'py'

## 美元符号$ 匹配结尾（默认是整个字符串的结尾，多行模式 re.M 下是每一行的结尾；写在方括号 [] 里则只是普通字符）

In [141]:
# Multi-line sample: one address or file name per line, each newline-terminated.
t = (
    "342234L@qq.com\n"
    "www.baidu.net\n"
    "hello.png\n"
    "http://hello.com\n"
    "https://someany.cn\n"
)


In [163]:
# "$" must sit outside brackets to act as an anchor ("[com$]" would be a
# character class matching a single c, o, m or $). With re.M it matches at the
# end of every line, so this finds the first line that ends in "com".
re.search(r'(.*)com$', t, re.M).group()

'342234L@qq.com'

In [173]:
# NOTE: "[(png)$]+" is a character class, so it matches a run made of any of
# the characters "(", "p", "n", "g", ")" and "$" -- not the text "png" at the
# end of a line. That is why the result is just a single 'n'. To find a line
# ending in "png", use a pattern such as r'png$' together with re.M.
re.search('[(png)$]+', t)

<re.Match object; span=(25, 26), match='n'>

## [|] 方括号内的 | 只是普通字符，[] 始终只匹配集合中的单个字符

In [174]:
# Inside brackets "|" is an ordinary character, so this class is simply the
# set {h, t, p, s, |}; "+" then takes the longest run of those characters.
t = "https://www.google.com"
re.search(r"[http|https]+", t).group(0)

'https'

## (|) 匹配不同的字符串

In [181]:
# Alternatives are tried left to right and matching stops at the first one
# that succeeds, so "http" wins even though the text starts with "https".
t = "https://www.google.com"
re.match(r"(http|https)", t).group(0)

'http'

# 贪婪模式与非贪婪模式
1. 贪婪模式：尽可能地匹配更多的字符
2. 非贪婪模式：尽可能地匹配更少的字符

In [200]:
# HTML table row used as the sample; note that it begins with a newline.
text = (
    '\n'
    '<tr class="hots">\n'
    '    <td class="1">hot1</td>\n'
    '    <td class="2">hot2</td>\n'
    '    <td class="3">hot3</td>\n'
    '    <td class="4">hot4</td>\n'
    '    <td class="5">hot5</td>\n'
    '    <td class="6">爬虫</td>\n'
    '</tr>\n'
)
    

### 贪婪模式，默认匹配更多次数
#### \s | \n包含空格或换行

In [208]:
# Greedy: "[\d\D]+" (any character, newlines included) grabs as much as it
# can, so the match runs up to the last ">" in the text.
result = re.match(r'\s<tr[\d\D]+>', text)
result.group()


'\n<tr class="hots">\n    <td class="1">hot1</td>\n    <td class="2">hot2</td>\n    <td class="3">hot3</td>\n    <td class="4">hot4</td>\n    <td class="5">hot5</td>\n    <td class="6">爬虫</td>\n</tr>'

### 非贪婪模式，默认匹配更少次数
#### 加了个问号

In [None]:
# Non-greedy: the trailing "?" makes "+" take as little as possible, so the
# match stops at the first ">".
result = re.match(r'\s<tr[\d\D]+?>', text)
result.group()

'\n<tr class="hots">'

# 转义字符

## \ 表示转义，例如 \. 匹配英文点本身，\\ 匹配一个反斜杠

# group()函数

括号内可添加序号提取想要的值

In [210]:
# Sentence containing two e-mail addresses to pull out with groups.
t = (
    'my email is 27978798@qq.com'
    ' and python1234@qq.com'
)

In [213]:
# Each parenthesised part is a capturing group; group(1) returns the text
# captured by the first one (group(2) would return the second address).
re.match(r'[\w\s]+\s(\w+@[0-9a-z]+\.com)[\w\s]+\s(\w+@[0-9a-z]+\.com)', t).group(1)

'27978798@qq.com'

# 正则表达式常见函数
1. re.match() 从字符串开头开始匹配，开头不满足就匹配失败；成功返回 Match 对象，失败返回 None
2. re.search() 在整个字符串中查找内容，只返回第一个匹配结果；成功返回 Match 对象，失败返回 None
3. re.findall() 在整个字符串中查找所有满足条件的结果，返回列表

In [215]:
# Print the built-in help text for re.match.
help(re.match)

Help on function match in module re:

match(pattern, string, flags=0)
    Try to apply the pattern at the start of the string, returning
    a Match object, or None if no match was found.



In [216]:
# Print the built-in help text for re.search.
help(re.search)

Help on function search in module re:

search(pattern, string, flags=0)
    Scan through string looking for a match to the pattern, returning
    a Match object, or None if no match was found.



In [217]:
# Print the built-in help text for re.findall.
help(re.findall)

Help on function findall in module re:

findall(pattern, string, flags=0)
    Return a list of all non-overlapping matches in the string.
    
    If one or more capturing groups are present in the pattern, return
    a list of groups; this will be a list of tuples if the pattern
    has more than one group.
    
    Empty matches are included in the result.



In [218]:
# Print the built-in help text for re.sub.
help(re.sub)

Help on function sub in module re:

sub(pattern, repl, string, count=0, flags=0)
    Return the string obtained by replacing the leftmost
    non-overlapping occurrences of the pattern in string by the
    replacement repl.  repl can be either a string or a callable;
    if a string, backslash escapes in it are processed.  If it is
    a callable, it's passed the Match object and must return
    a replacement string to be used.



In [219]:
# Print the built-in help text for re.split.
help(re.split)

Help on function split in module re:

split(pattern, string, maxsplit=0, flags=0)
    Split the source string by the occurrences of the pattern,
    returning a list containing the resulting substrings.  If
    capturing parentheses are used in pattern, then the text of all
    groups in the pattern are also returned as part of the resulting
    list.  If maxsplit is nonzero, at most maxsplit splits occur,
    and the remainder of the string is returned as the final element
    of the list.

