In [3]:
# 你需要将一个字符串分割为多个字段，但是分隔符(还有周围的空格)并不是固定的
line = 'asdf fjdk; afed, fjek,asdf, foo'
import re
# 分隔符可以是逗号，分号或者是空格
print(re.split(r'[;,\s]\s*', line))
fields = re.split(r'(;|,|\s)\s*', line)
print(fields)
# 如果你不想保留分割字符串到结果列表中去
# 但仍然需要使用到括号来分组正则表达式的话， 确保你的分组是非捕获分组
re.split(r'(?:,|;|\s)\s*', line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']


['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

In [5]:
# 字符串开头或结尾匹配
# 你需要通过指定的文本模式去检查字符串的开头或者结尾，比如文件名后缀，URL Scheme等等
# 检查字符串开头或结尾的一个简单方法是使用 str.startswith() 或者是 str.endswith() 方法
filename = 'spam.txt'
filename.endswith('.txt')
url = 'http://www.python.org'
url.startswith('http:')

True

In [8]:
import os
filenames = os.listdir('.')
print(filenames)
l = [name for name in filenames if name.endswith(('.c', '.h')) ]
print(l)
any(name.endswith('.py') for name in filenames)

['.ipynb_checkpoints', '字符串和文本.ipynb', '数据结构和算法.ipynb']
[]


False

In [9]:
from urllib.request import urlopen

def read_data(name):
    """Return the contents of ``name``, given as a URL or a local file path.

    Args:
        name: A string. If it starts with 'http:', 'https:' or 'ftp:' it is
            fetched with urlopen(); otherwise it is opened as a local file
            in text mode.

    Returns:
        Whatever ``read()`` yields for the chosen source: the raw payload of
        the urlopen() response for URLs, the file's text for local paths.
    """
    if name.startswith(('http:', 'https:', 'ftp:')):
        # Use the response as a context manager so the underlying connection
        # is closed as soon as the payload has been read.
        with urlopen(name) as response:
            return response.read()
    with open(name) as f:
        return f.read()

In [11]:
# 用Shell通配符匹配字符串
# fnmatch 模块提供了两个函数—— fnmatch() 和 fnmatchcase() ，可以用来实现这样的匹配
from fnmatch import fnmatch, fnmatchcase
fnmatch('foo.txt', '*.txt')
names = ['Dat1.csv', 'Dat2.csv', 'config.ini', 'foo.py']
[name for name in names if fnmatch(name, 'Dat*.csv')]
# fnmatch() 函数使用底层操作系统的大小写敏感规则(不同的系统是不一样的)来匹配模式
# On Windows
fnmatch('foo.txt', '*.TXT')
# 如果你对这个区别很在意，可以使用 fnmatchcase() 来代替。它完全使用你的模式大小写匹配
fnmatchcase('foo.txt', '*.TXT')

['Dat1.csv', 'Dat2.csv']

In [12]:
from fnmatch import fnmatchcase

# Sample street addresses used to demonstrate case-sensitive wildcard matching.
addresses = [
    '5412 N CLARK ST',
    '1060 W ADDISON ST',
    '1039 W GRANVILLE AVE',
    '2122 N CLARK ST',
    '4802 N BROADWAY',
]

# a: addresses ending in ' ST'; b: addresses numbered 54xx that mention CLARK.
a, b = (
    [addr for addr in addresses if fnmatchcase(addr, pattern)]
    for pattern in ('* ST', '54[0-9][0-9] *CLARK*')
)
print(a, b)

['5412 N CLARK ST', '1060 W ADDISON ST', '2122 N CLARK ST'] ['5412 N CLARK ST']


In [15]:
# 字符串匹配和搜索
# 你想匹配或者搜索特定模式的文本
text = 'yeah, but no, but yeah, but no, but yeah'
text.find('no')
# 对于复杂的匹配需要使用正则表达式和 re 模块
text1 = '11/27/2012'
import re
if re.match(r'\d+/\d+/\d+', text1):
    print('yes')
else:
    print('no')
# 如果你想使用同一个模式去做多次匹配，你应该先将模式字符串预编译为模式对象
datepat = re.compile(r'\d+/\d+/\d+')
if datepat.match(text1):
    print('yes')
else:
    print('no')

yes
yes


In [20]:
# match() 总是从字符串开始去匹配，
# 如果你想查找字符串任意部分的模式出现位置， 使用 findall() 方法去代替
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
datepat = re.compile(r'\d+/\d+/\d+')
datepat.findall(text)
# 通常会利用括号去捕获分组
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
m = datepat.match('11/27/2012')
print(m.group(0), m.group(1), m.group(2), m.group(3))
month, day, year = m.groups()
# 如果你想以迭代方式返回匹配，可以使用 finditer() 方法来代替
for m in datepat.finditer(text):
    print(m.groups())

11/27/2012 11 27 2012
('11', '27', '2012')
('3', '13', '2013')


In [21]:
# 字符串搜索和替换
# 你想在字符串中搜索和匹配指定的文本模式
text = 'yeah, but no, but yeah, but no, but yeah'
text.replace('yeah', 'yep')

'yep, but no, but yep, but no, but yep'

In [22]:
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
import re
re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2', text)

'Today is 2012-11-27. PyCon starts 2013-3-13.'

In [26]:
# 对于更加复杂的替换，可以传递一个替换回调函数来代替
from calendar import month_abbr
def change_date(m):
    """Substitution callback that rewrites a date match as 'G2 Mon G3'.

    Group 1 of ``m`` is converted to int and used as an index into
    ``calendar.month_abbr``; groups 2 and 3 are copied into the result
    unchanged, separated by single spaces.
    """
    month_number, second, third = m.group(1, 2, 3)
    return f'{second} {month_abbr[int(month_number)]} {third}'
datepat.sub(change_date, text)
# 如果除了替换后的结果外，你还想知道有多少替换发生了
newtext, n = datepat.subn(r'\3-\1-\2', text)
print(newtext, n)

Today is 2012-11-27. PyCon starts 2013-3-13. 2


In [28]:
# 需要以忽略大小写的方式搜索与替换文本字符串
text = 'UPPER PYTHON, lower python, Mixed Python'
# 为了在文本操作时忽略大小写，你需要在使用 re 模块的时候给这些操作提供 re.IGNORECASE 标志参数
s = re.findall('python', text, flags=re.IGNORECASE)
print(s)
v = re.sub('python', 'snake', text, flags=re.IGNORECASE)
print(v)

['PYTHON', 'python', 'Python']
UPPER snake, lower snake, Mixed snake


In [29]:
# 最后的那个例子揭示了一个小缺陷，替换字符串并不会自动跟被匹配字符串的大小写保持一致
def matchcase(word):
    """Build a re.sub() callback that inserts ``word`` in the case of the match.

    The returned function receives a match object and returns:
      * ``word.upper()`` if the matched text is all upper case,
      * ``word.lower()`` if it is all lower case,
      * ``word.capitalize()`` if its first character is upper case,
      * ``word`` unchanged otherwise (including for an empty match).
    """
    def replace(m):
        text = m.group()
        if text.isupper():
            return word.upper()
        elif text.islower():
            return word.lower()
        # Slice instead of indexing: a pattern that can match the empty
        # string would otherwise raise IndexError on text[0].
        elif text[:1].isupper():
            return word.capitalize()
        else:
            return word
    return replace
v = re.sub('python', matchcase('snake'), text, flags=re.IGNORECASE)
print(v)

UPPER SNAKE, lower snake, Mixed Snake


In [32]:
# 最短匹配模式
# 你正在试着用正则表达式匹配某个文本模式，但是它找到的是模式的最长可能匹配
# 而你想修改它变成查找最短的可能匹配
str_pat = re.compile(r'"(.*)"')
text1 = 'Computer says "no."'
str_pat.findall(text1)
# 在正则表达式中*操作符是贪婪的，因此匹配操作会查找最长的可能匹配
text2 = 'Computer says "no." Phone says "yes."'
str_pat.findall(text2)
# 为了修正这个问题，可以在模式中的*操作符后面加上?修饰符
str_pat = re.compile(r'"(.*?)"')
str_pat.findall(text2)

['no.', 'yes.']

In [38]:
# 多行匹配模式
# 你正在试着使用正则表达式去匹配一大块的文本，而你需要跨越多行去匹配
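# The catch: by default the dot (.) does not match a newline, so the first pattern below cannot match a comment that spans several lines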
comment = re.compile(r'/\*(.*?)\*/')
text1 = '/* this is a comment */'
text2 = '''/* this is a
multiline comment */'''
comment.findall(text1)
comment.findall(text2)
# 为了修正这个问题，你可以修改模式字符串，增加对换行的支持
comment = re.compile(r'/\*((?:.|\n)*?)\*/')
comment.findall(text2)
# re.compile() 函数接受一个标志参数叫 re.DOTALL，它可以让正则表达式中的点(.)匹配包括换行符在内的任意字符
comment = re.compile(r'/\*(.*?)\*/', re.DOTALL)
comment.findall(text2)

[' this is a\nmultiline comment ']

In [41]:
# 将Unicode文本标准化
# 你正在处理Unicode字符串，需要确保所有字符串在底层有相同的表示
s1 = 'Spicy Jalape\u00f1o'
s2 = 'Spicy Jalapen\u0303o'
print(s1, s2)
s1 == s2

Spicy Jalapeño Spicy Jalapeño


False

In [43]:
# 可以使用unicodedata模块先将文本标准化
import unicodedata
t1 = unicodedata.normalize('NFC', s1)
t2 = unicodedata.normalize('NFC', s2)
print(t1 == t2)
print(ascii(t1))

True
'Spicy Jalape\xf1o'


In [49]:
# Using Unicode in regular expressions
import re
# Raw string: '\d' in a normal string literal is an invalid escape sequence
# (it triggers a warning and is slated to become an error).
num = re.compile(r'\d+')
# ASCII digits
num.match('123')
# Arabic-Indic digits
num.match('\u0661\u0662\u0663')
# Character class built from explicit code-point ranges; the \u escapes are
# meant to be processed by Python here, so this literal stays non-raw.
arabic = re.compile('[\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff]+')
print(arabic)
# Case-insensitive matching combined with case conversion
pat = re.compile('stra\u00dfe', re.IGNORECASE)
s = 'straße'
pat.match(s)
# s.upper() is 'STRASSE' (see the last line), which has a different length
pat.match(s.upper())
s.upper()

re.compile('[\u0600-ۿݐ-ݿࢠ-ࣿ]+')


'STRASSE'

In [51]:
# Removing unwanted characters from a string
# Goal: drop unwanted characters (e.g. whitespace) from the start, the end
# or the middle of a text string
s = ' hello world \n'
s.strip()
s.lstrip()
s.rstrip()
t = '-----hello====='
# Strip the leading dashes ('_' never occurs in t, so stripping it did nothing)
t.lstrip('-')
t.strip('-=')

'hello'

In [53]:
s = ' hello     world \n'
# strip() only touches the two ends; inner runs of blanks need a regex
s = s.strip()
import re
# Raw string: '\s' in a normal string literal is an invalid escape sequence
re.sub(r'\s+', ' ', s)

'hello world'

In [55]:
# 审查清理文本字符串
s = 'pýtĥöñ\fis\tawesome\r\n'
remap = {ord('\t') : ' ',ord('\f') : ' ',ord('\r') : None}
a = s.translate(remap)
a

'pýtĥöñ is awesome\n'

In [58]:
import unicodedata
import sys
# Translation table mapping every combining character to None (i.e. delete).
# range() excludes its stop value, so use maxunicode + 1 to cover the final
# code point as well.
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode + 1) if unicodedata.combining(chr(c)))
# Decompose first so accents become separate combining characters
b = unicodedata.normalize('NFD', a)
print(b)
b.translate(cmb_chrs)
# Translation table mapping every character of category 'Nd' to the ASCII
# digit with the same numeric value.
digitmap = { c: ord('0') + unicodedata.digit(chr(c)) for c in range(sys.maxunicode + 1) if unicodedata.category(chr(c)) == 'Nd' }
len(digitmap)

pýtĥöñ is awesome



610

In [66]:
# 你想通过某种对齐方式来格式化字符串
# 对于基本的字符串对齐操作，可以使用字符串的 ljust() , rjust() 和 center() 方法
text = 'Hello World'
text.ljust(20)
text.rjust(20)
text.center(20)
text.rjust(20,'=')
text.center(20,'*')
# 函数 format() 同样可以用来很容易的对齐字符串
# 你要做的就是使用 <,> 或者 ^ 字符后面紧跟一个指定的宽度
format(text, '>20')
format(text, '<20')
format(text, '^20')
format(text, '=>20s')
format(text, '*^20s')
'{:>10s} {:>10s}'.format('Hello', 'World')

'     Hello      World'

In [70]:
# 想将几个小的字符串合并为一个大的字符串
parts = ['Is', 'Chicago', 'Not', 'Chicago?']
' '.join(parts)
','.join(parts)
a = 'Is Chicago'
b = 'Not Chicago?'
print(a + ' ' + b)
print('{} {}'.format(a,b))

Is Chicago Not Chicago?
Is Chicago Not Chicago?


In [74]:
# 字符串中插入变量
s = '{name} has {n} messages.'
s.format(name='Guido', n=37)
# format 和 format_map() 的一个缺陷就是它们并不能很好的处理变量缺失的情况

'Guido has 37 messages.'

In [76]:
name = 'Guido'
n = 37
import string
s = string.Template('$name has $n messages.')
s.substitute(vars())

'Guido has 37 messages.'

In [79]:
# 你有一些长字符串，想以指定的列宽将它们重新格式化
s = "Look into my eyes, look into my eyes, the eyes, the eyes, \
the eyes, not around the eyes, don't look around the eyes, \
look into my eyes, you're under."
import textwrap
print(textwrap.fill(s, 70))
print(textwrap.fill(s, 40))
print(textwrap.fill(s, 40, initial_indent='    '))
print(textwrap.fill(s, 40, subsequent_indent='    '))

Look into my eyes, look into my eyes, the eyes, the eyes, the eyes,
not around the eyes, don't look around the eyes, look into my eyes,
you're under.
Look into my eyes, look into my eyes,
the eyes, the eyes, the eyes, not around
the eyes, don't look around the eyes,
look into my eyes, you're under.
    Look into my eyes, look into my
eyes, the eyes, the eyes, the eyes, not
around the eyes, don't look around the
eyes, look into my eyes, you're under.
Look into my eyes, look into my eyes,
    the eyes, the eyes, the eyes, not
    around the eyes, don't look around
    the eyes, look into my eyes, you're
    under.


In [80]:
# textwrap is very handy for printing, especially when the output should
# adapt to the width of the terminal.
# shutil.get_terminal_size() reports that size; unlike os.get_terminal_size()
# it falls back to a default instead of raising OSError when the process is
# not attached to a terminal (e.g. a notebook kernel or redirected output).
import os
import shutil
shutil.get_terminal_size().columns

120

In [81]:
# 在字符串中处理html和xml
# 你想将HTML或者XML实体如 &entity; 或 &#code; 替换为对应的文本
# 再者，你需要转换文本中特定的字符(比如<, >, 或 &)
s = 'Elements are written as "<tag>text</tag>".'
import html
print(s) # Elements are written as "<tag>text</tag>".
print(html.escape(s)) # Elements are written as &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.
# Disable escaping of quotes
print(html.escape(s, quote=False))

Elements are written as "<tag>text</tag>".
Elements are written as &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.
Elements are written as "&lt;tag&gt;text&lt;/tag&gt;".


In [82]:
s = 'Spicy Jalapeño'
s.encode('ascii', errors='xmlcharrefreplace')

b'Spicy Jalape&#241;o'

In [87]:
# 你有一个字符串，想从左至右将其解析为一个令牌流
text = 'foo = 23 + 42 * 10'
tokens = [('NAME', 'foo'), ('EQ','='), ('NUM', '23'), ('PLUS','+'),
          ('NUM', '42'), ('TIMES', '*'), ('NUM', '10')]
import re
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'
EQ = r'(?P<EQ>=)'
WS = r'(?P<WS>\s+)'

master_pat = re.compile('|'.join([NAME, NUM, PLUS, TIMES, EQ, WS]))
scanner = master_pat.scanner('foo = 42')
scanner.match() # <_sre.SRE_Match object at 0x100677738>
# _.lastgroup, _.group() # ('NAME', 'foo')
# scanner.match() # <_sre.SRE_Match object at 0x100677738>
# _.lastgroup, _.group() # ('WS', ' ')
# scanner.match() # <_sre.SRE_Match object at 0x100677738>
# _.lastgroup, _.group() # ('EQ', '=')
# scanner.match() # <_sre.SRE_Match object at 0x100677738>
# _.lastgroup, _.group() # ('WS', ' ')
# scanner.match() # <_sre.SRE_Match object at 0x100677738>
# _.lastgroup, _.group() # ('NUM', '42')
# scanner.match()

<re.Match object; span=(0, 3), match='foo'>

In [91]:
from collections import namedtuple
def generate_tokens(pat, text):
    """Lazily tokenize ``text`` with the compiled pattern ``pat``.

    Yields ``Token(type, value)`` named tuples, where ``type`` is the name of
    the group that matched (``lastgroup``) and ``value`` is the matched text.
    Iteration ends when the scanner returns no further match.
    """
    Token = namedtuple('Token', 'type value')
    next_match = pat.scanner(text).match
    while True:
        found = next_match()
        if found is None:
            return
        yield Token(found.lastgroup, found.group())

# Example use
for tok in generate_tokens(master_pat, 'foo = 42'):
    print(tok)
# Produces output
# Token(type='NAME', value='foo')
# Token(type='WS', value=' ')
# Token(type='EQ', value='=')
# Token(type='WS', value=' ')
# Token(type='NUM', value='42')
print()
# 如果你想过滤令牌流，你可以定义更多的生成器函数或者使用一个生成器表达式
tokens = (tok for tok in generate_tokens(master_pat, text)
          if tok.type != 'WS')
for tok in tokens:
    print(tok)

Token(type='NAME', value='foo')
Token(type='WS', value=' ')
Token(type='EQ', value='=')
Token(type='WS', value=' ')
Token(type='NUM', value='42')

Token(type='NAME', value='foo')
Token(type='EQ', value='=')
Token(type='NUM', value='23')
Token(type='PLUS', value='+')
Token(type='NUM', value='42')
Token(type='TIMES', value='*')
Token(type='NUM', value='10')


In [93]:
LT = r'(?P<LT><)'
LE = r'(?P<LE><=)'
EQ = r'(?P<EQ>=)'

master_pat = re.compile('|'.join([LE, LT, EQ])) # Correct
# master_pat = re.compile('|'.join([LT, LE, EQ])) # Incorrect
PRINT = r'(?P<PRINT>print)'
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'

master_pat = re.compile('|'.join([PRINT, NAME]))

for tok in generate_tokens(master_pat, 'printer'):
    print(tok)

Token(type='PRINT', value='print')
Token(type='NAME', value='er')


In [94]:
# 实现一个简单的递归下降分析器
# 你想根据一组语法规则解析文本并执行命令，或者构造一个代表输入的抽象语法树
# 如果语法非常简单，你可以不去使用一些框架，而是自己写这个解析器
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
Topic: 下降解析器
Desc :
"""
import re
import collections

# Token specification
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
MINUS = r'(?P<MINUS>-)'
TIMES = r'(?P<TIMES>\*)'
DIVIDE = r'(?P<DIVIDE>/)'
LPAREN = r'(?P<LPAREN>\()'
RPAREN = r'(?P<RPAREN>\))'
WS = r'(?P<WS>\s+)'

master_pat = re.compile('|'.join([NUM, PLUS, MINUS, TIMES,
                                  DIVIDE, LPAREN, RPAREN, WS]))
# Tokenizer
Token = collections.namedtuple('Token', ['type', 'value'])


def generate_tokens(text):
    """Yield a Token for each non-whitespace lexeme found in ``text``.

    ``text`` is matched left to right against ``master_pat``. Every match
    becomes ``Token(type, value)`` with the matching group's name and text;
    tokens of type 'WS' are skipped.

    Raises:
        SyntaxError: when the text at some position matches none of the
            token patterns. Because this is a generator, the error surfaces
            when iteration reaches that position. Without this check the
            scan would simply stop there and the rest of the input would be
            dropped without notice.
    """
    pos = 0
    end = len(text)
    while pos < end:
        m = master_pat.match(text, pos)
        # A zero-width match would never advance; treat it like no match.
        if m is None or m.end() == pos:
            raise SyntaxError(
                'Unexpected character %r at position %d' % (text[pos], pos))
        pos = m.end()
        if m.lastgroup != 'WS':
            yield Token(m.lastgroup, m.group())


# Parser
class ExpressionEvaluator:
    '''
    Implementation of a recursive descent parser. Each method
    implements a single grammar rule. Use the ._accept() method
    to test and accept the current lookahead token. Use the ._expect()
    method to exactly match and discard the next token on the input
    (or raise a SyntaxError if it doesn't match).

    State kept on the instance while parsing:
      tokens  -- iterator of tokens produced by generate_tokens()
      tok     -- the token consumed most recently (None before the first)
      nexttok -- the lookahead token (None once the input is exhausted)
    '''

    def parse(self, text):
        '''
        Evaluate the arithmetic expression in text and return its value.

        Raises SyntaxError if the text is not a valid expression, including
        when tokens are left over after a complete expression (for example
        '2 3' or '2)'), which would otherwise be ignored.
        '''
        self.tokens = generate_tokens(text)
        self.tok = None  # Last symbol consumed
        self.nexttok = None  # Next symbol tokenized
        self._advance()  # Load first lookahead token
        result = self.expr()
        # The whole input must have been consumed by the top-level rule.
        if self.nexttok is not None:
            raise SyntaxError('Unexpected token ' + repr(self.nexttok.value))
        return result

    def _advance(self):
        'Advance one token ahead'
        # The old lookahead becomes the current token; None marks end of input.
        self.tok, self.nexttok = self.nexttok, next(self.tokens, None)

    def _accept(self, toktype):
        'Test and consume the next token if it matches toktype'
        if self.nexttok and self.nexttok.type == toktype:
            self._advance()
            return True
        else:
            return False

    def _expect(self, toktype):
        'Consume next token if it matches toktype or raise SyntaxError'
        if not self._accept(toktype):
            raise SyntaxError('Expected ' + toktype)

    # Grammar rules follow
    def expr(self):
        "expression ::= term { ('+'|'-') term }*"
        exprval = self.term()
        while self._accept('PLUS') or self._accept('MINUS'):
            # _accept() consumed the operator, so it is now self.tok
            op = self.tok.type
            right = self.term()
            if op == 'PLUS':
                exprval += right
            elif op == 'MINUS':
                exprval -= right
        return exprval

    def term(self):
        "term ::= factor { ('*'|'/') factor }*"
        termval = self.factor()
        while self._accept('TIMES') or self._accept('DIVIDE'):
            op = self.tok.type
            right = self.factor()
            if op == 'TIMES':
                termval *= right
            elif op == 'DIVIDE':
                # True division: the result is a float
                termval /= right
        return termval

    def factor(self):
        "factor ::= NUM | ( expr )"
        if self._accept('NUM'):
            return int(self.tok.value)
        elif self._accept('LPAREN'):
            exprval = self.expr()
            self._expect('RPAREN')
            return exprval
        else:
            raise SyntaxError('Expected NUMBER or LPAREN')


def descent_parser():
    """Evaluate a handful of sample expressions and print each result.

    A single ExpressionEvaluator instance is reused for all of them.
    """
    evaluator = ExpressionEvaluator()
    samples = ('2', '2 + 3', '2 + 3 * 4', '2 + (3 + 4) * 5')
    for source in samples:
        print(evaluator.parse(source))
    # A malformed input is deliberately left out of the samples:
    #     evaluator.parse('2 + (3 + * 4)')
    # fails inside factor(), reached through expr() -> term() -> factor()
    # -> expr() -> term() -> factor(), with
    #     SyntaxError: Expected NUMBER or LPAREN


if __name__ == '__main__':
    descent_parser()

2
5
14
37


In [98]:
# 你想在字节字符串上执行普通的文本操作(比如移除，搜索和替换)
data = b'Hello World'
data[0:5] # b'Hello'
data.startswith(b'Hello') # True
data.split() # [b'Hello', b'World']
data.replace(b'Hello', b'Hello Cruel')
# 这些操作同样也适用于字节数组
data = bytearray(b'Hello World') 
data[0:5] # bytearray(b'Hello')
data.startswith(b'Hello') # True
data.split() # [bytearray(b'Hello'), bytearray(b'World')]
data.replace(b'Hello', b'Hello Cruel')

bytearray(b'Hello Cruel World')

In [99]:
# 字节字符串的索引操作返回整数而不是单独字符
b = b'Hello World' # Byte string
b[0] # 72
b[1]

101