# Chap7. 정규표현식

7-1 정규표현식 살펴보기

In [1]:
data = """
park 800905-1049118
kim 700905-1059119
"""


def mask_token(token):
    """Return *token* with its tail starred out when it has the 6+1+7 shape.

    A token qualifies when it is exactly 14 characters long, its first six
    characters are digits and everything after the seventh character is
    digits.  Any other token is returned unchanged.
    """
    has_target_shape = (
        len(token) == 14 and token[:6].isdigit() and token[7:].isdigit()
    )
    return (token[:6] + "-" + "*******") if has_target_shape else token


# Rebuild every line word by word, without using regular expressions.
result = [
    " ".join(mask_token(token) for token in row.split(" "))
    for row in data.split("\n")
]
print("\n".join(result))


park 800905-*******
kim 700905-*******



In [2]:
import re

data = """
park 800905-1049118
kim 700905-1059119
"""
# The six leading digits are captured as group 1 so that the replacement can
# reuse them through \g<1>.  Without the parentheses the pattern has no
# groups and sub() fails with "error: invalid group reference 1".
# Raw strings keep \d and \g from being read as (invalid) string escapes.
pat = re.compile(r"(\d{6})[-]\d{7}")
print(pat.sub(r"\g<1>-*******", data))

# Output:
#
# park 800905-*******
# kim 700905-*******

7-2 정규표현식 시작하기

In [4]:
import re

p = re.compile('[a-z]+')

# Searching a string
# match(): does the pattern match starting at the very beginning?
m = p.match("python")    # the pattern fits -> a match object is returned
print(m)

n = p.match("3 python")  # the string starts with "3", which [a-z]+ rejects
print(n)                 # no match -> None

<_sre.SRE_Match object; span=(0, 6), match='python'>
None


In [5]:
# search(): scan the whole string for the first place the pattern matches
m = p.search("python")
print(m)

# Unlike match(), search() may skip leading text that does not match.
n = p.search("3 python")
print(n)

<_sre.SRE_Match object; span=(0, 6), match='python'>
<_sre.SRE_Match object; span=(2, 8), match='python'>


In [6]:
# findall(): gather every matching substring into a list
result = p.findall("life is too short")
print(result)

['life', 'is', 'too', 'short']


In [8]:
# finditer(): hand the matches back through an iterable object instead of a
# list; each item it yields is a match object.
result = p.finditer("life is too short")
print(result)
for r in result:
    print(r)

<callable_iterator object at 0x000001427698B7B8>
<_sre.SRE_Match object; span=(0, 4), match='life'>
<_sre.SRE_Match object; span=(5, 7), match='is'>
<_sre.SRE_Match object; span=(8, 11), match='too'>
<_sre.SRE_Match object; span=(12, 17), match='short'>


In [11]:
# Methods of a match object:
#   group() -> the matched text
#   start() -> index where the match begins
#   end()   -> index just past the end of the match
#   span()  -> (start, end) tuple
import re

p = re.compile('[a-z]+')

m = p.match("python")
for detail in (m.group(), m.start(), m.end(), m.span()):
    print(detail)

n = p.search("3 python")
for detail in (n.group(), n.start(), n.end(), n.span()):
    print(detail)

python
0
6
(0, 6)
python
2
8
(2, 8)


In [12]:
# Compile flags
# DOTALL (S): "." matches every character, the newline included
import re

# Without the flag "." refuses to match "\n", so there is no match.
p = re.compile('a.b')
m = p.match('a\nb')
print(m)

# With the flag the newline between "a" and "b" is accepted.
p = re.compile('a.b', flags=re.DOTALL)
m = p.match('a\nb')
print(m)

None
<_sre.SRE_Match object; span=(0, 3), match='a\nb'>


In [13]:
# IGNORECASE (I): match without distinguishing upper and lower case
p = re.compile('[a-z]', flags=re.IGNORECASE)
m = p.match('PYthon')  # the single-letter class now accepts the capital "P"
print(m)

<_sre.SRE_Match object; span=(0, 1), match='P'>


In [14]:
# MULTILINE (M): apply the ^ (start) and $ (end) metacharacters to every line
# of the string instead of only to the string as a whole.
import re

data = """python one
life is too short
python two
you need python
python three"""

# Raw strings are used so that \s and \w reach the regex engine untouched;
# in a plain literal they are invalid string escapes and trigger a warning.
p = re.compile(r"^python\s\w+")  # ^ anchors at the start of the whole string
print(p.findall(data))

p = re.compile(r"^python\s\w+", re.MULTILINE)  # ^ anchors at each line start
print(p.findall(data))

['python one']
['python one', 'python two', 'python three']


In [15]:
# VERBOSE (X): lets a regular expression be split over several lines (and
# annotated with comments) instead of being written as one dense string.

# The pattern written compactly on a single line.
charref= re.compile(r'&[#](0[0-7]+|[0-9]+|x[0-9a-fA-F]+);')
# The same pieces laid out one per line; re.VERBOSE makes the engine ignore
# the line breaks.  "#" is kept inside a character class ([#]) so that it is
# not taken as the start of a verbose-mode comment.
charref= re.compile(r"""
&[#]
(0[0-7]+
|[0-9]+
|x[0-9a-fA-F]+
)
;
""",re.VERBOSE)

In [16]:
# Matching a literal backslash: the regex itself needs "\\", and in an
# ordinary string literal each of those backslashes must be escaped again,
# giving four backslashes in the source.
p= re.compile('\\\\section')
# A raw string passes backslashes through unchanged, so two are enough.
p= re.compile(r'\\section')

7-3 강력한 정규 표현식의 세계로

In [2]:
# Metacharacters
import re

# | : alternation ("or")
p = re.compile('Crow|Servo')
m = p.match("CrowHello")
print(m)

# ^  : start of the string (start of every line when re.MULTILINE is set)
# \A : start of the string only, regardless of re.MULTILINE
print(re.search('^Life', 'Life is too short'))
print(re.search('^Life', "My Life"))
# Raw string: "\A" is not a valid string escape, so a plain literal would
# trigger an invalid-escape warning.
print(re.search(r'\ALife', 'Life is too short'))

# $  : end of the string (end of every line when re.MULTILINE is set)
# \Z : end of the string only, regardless of re.MULTILINE
print(re.search('short$', "My life is short"))
print(re.search('short$', "short life"))
print(re.search(r'short\Z', 'My life is short'))  # raw string, same reason

# \b : word boundary.  Always write it in a raw string: in a plain string
# literal "\b" is the backspace character.
p = re.compile(r'\bclass\b')
print(p.search('no class at all'))
print(p.search('the declassified algorithm'))

# \B : the opposite of \b -- matches only where there is no word boundary
p = re.compile(r'\Bclass\B')
print(p.search('no class at all'))
print(p.search('the declassified algorithm'))
print(p.search('one subclass is'))

<_sre.SRE_Match object; span=(0, 4), match='Crow'>
<_sre.SRE_Match object; span=(0, 4), match='Life'>
None
<_sre.SRE_Match object; span=(0, 4), match='Life'>
<_sre.SRE_Match object; span=(11, 16), match='short'>
None
<_sre.SRE_Match object; span=(11, 16), match='short'>
<_sre.SRE_Match object; span=(3, 8), match='class'>
None
None
<_sre.SRE_Match object; span=(6, 11), match='class'>
None


In [7]:
# Grouping
import re

# Parentheses make "ABC" a unit, so + repeats the whole unit.
p = re.compile('(ABC)+')
m = p.search('ABCABCABC OK?')
print(m)
print(m.group())

# A word, whitespace, then three hyphen-separated runs of digits; no groups.
p = re.compile(r"\w+\s+\d+[-]\d+[-]\d+")
m = p.search("park 010-1234-5678")

# The same pattern with groups.  Group 0 is the whole match, group 1 the
# word, group 2 the full digit part and group 3 (nested inside group 2) the
# first run of digits.
p = re.compile(r"(\w+)\s+((\d+)[-]\d+[-]\d+)")
m = p.search("park 010-1234-5678")
for index in (0, 1, 3):
    print(m.group(index))

# Referring back to a group: \1 must repeat whatever group 1 matched.
p = re.compile(r'(\b\w+)\s+\1')
m = p.search('Paris in the the spring')
print(m.group())

<_sre.SRE_Match object; span=(0, 9), match='ABCABCABC'>
ABCABCABC
park 010-1234-5678
park
010
the the


In [8]:
# Giving a group a name: (?P<name>...)
p = re.compile(r"(?P<name>\w+)\s+((\d+)[-]\d+[-]\d+)")
m = p.search("park 010-1234-5678")
print(m.group("name"))

# (?P=word) refers back to the text matched by the group named "word".
p = re.compile(r"(?P<word>\b\w+)\s+(?P=word)")
p.search('Paris in the the spring').group()  # displayed as the cell result

park


'the the'

In [17]:
# Lookahead assertions
# Without a lookahead the ":" is part of the matched text.
p = re.compile(".+:")
m = p.search("http://naver.com")
print(m.group())

# Positive lookahead (?=...): the ":" must follow, but is not consumed.
p = re.compile(".+(?=:)")
m = p.search("http://naver.com")
print(m.group())

# Accept a dot only when the three characters after it differ from "bat" in
# at least one position.
p = re.compile(".*[.](?=[^b]..|.[^a].|..[^t])")
m = p.search("autoexec.bat")
print(m)
m = p.search("autoexec.bas")
print(m.group())

# Negative lookahead (?!...): the dot must not be followed by "bat".
p = re.compile(".*[.](?!bat)")
m = p.search("autoexec.bat")
print(m)

http:
http
None
autoexec.
None


In [23]:
# Replacing text: sub(replacement, target_string)
p = re.compile('(blue|white|red)')
print(p.sub('colour', 'blue socks and red shoes'))
# count= caps how many replacements are made
print(p.sub('colour', 'blue socks and red shoes', count=1))

# subn() returns a tuple instead: (new_string, number_of_replacements)
p = re.compile('(blue|white|red)')
print(p.subn('colour', 'blue socks and red shoes'))

colour socks and colour shoes
colour socks and red shoes
('colour socks and colour shoes', 2)


In [25]:
p = re.compile(r"(?P<name>\w+)\s+(?P<phone>(\d+)[-]\d+[-]\d+)")
# \g<group_name> in the replacement refers to a named group.  The replacement
# is a raw string because "\g" is not a valid string escape and a plain
# literal would trigger an invalid-escape warning.
print(p.sub(r"\g<phone> \g<name>", "park 010-1234-1234"))

010-1234-1234 park


In [26]:
# sub() also accepts a function as its first argument: the function is called
# with each match object and its return value is used as the replacement.
def hexrepl(match):
    """Return the decimal digits matched by *match* as a hexadecimal string."""
    return hex(int(match.group()))  # e.g. "65490" -> "0xffd2"


p = re.compile(r'\d+')
p.sub(hexrepl, 'Call 65490 for printing, 49152 for user code.')

'Call 0xffd2 for printing, 0xc000 for user code.'

In [27]:
s = '<html><head><title>Title</title>'
len(s)

# Greedy: ".*" takes as much as it can, so the match runs to the last ">".
greedy = re.match('<.*>', s)
print(greedy.span())
print(greedy.group())

# Non-greedy: ".*?" repeats as few times as possible, stopping at the first ">".
lazy = re.match('<.*?>', s)
print(lazy.group())

(0, 32)
<html><head><title>Title</title>
<html>
