#### 정규 표현식
- 특정한 패턴과 일치하는 문자열을 검색, 치환, 제거 하는 기능을 지원
- re 모듈 사용
    - match(), search(), findall(), finditer(), ...
    - [a-zA-Z] : 모든 알파벳 패턴
    - . : \n 을 제외한 모든 문자
    - * : 0 ~ 무한대
    - + : 1 ~ 무한대
    - ? : 0 ~ 1
    - {} : {2} 최대 2, {2,10} 최소 2 ~ 최대 10

In [1]:
import re

# 검색할 패턴 작성
pattern = re.compile("D.A")

# 원본 문자열
origin = "DAA"

# 원본 문자열과 패턴이 일치하는가 ?
result = pattern.search(origin)
print(result) # <re.Match object; span=(0, 3), match='DAA'>

print("패턴 시작 위치", result.start()) # 패턴 시작 위치 0
print("패턴 끝 위치", result.end()) # 패턴 끝 위치 3
print("re와 일치하는 문자열 반환", result.group()) # re와 일치하는 문자열 반환 DAA
print("패턴 위치", result.span()) # 패턴 위치 (0, 3)

<re.Match object; span=(0, 3), match='DAA'>
패턴 시작 위치 0
패턴 끝 위치 3
re와 일치하는 문자열 반환 DAA
패턴 위치 (0, 3)


In [4]:
origin = "D00A"

# 원본 문자열과 패턴이 일치하는가 ?
result = pattern.search(origin)
print(result) # None

None


In [5]:
origin = "d0A D1A 0111"

# 원본 문자열과 패턴이 일치하는가 ?
result = pattern.search(origin)
print(result) # <re.Match object; span=(4, 7), match='D1A'>

<re.Match object; span=(4, 7), match='D1A'>


In [6]:
re.search(r"D.A", "DAA") # re.search(r"D.A", "DAA")

<re.Match object; span=(0, 3), match='DAA'>

In [11]:
# 개수 : ?, *, +, {}
pattern = re.compile("D?A") # D 가 최소 0, 최대 1개 까지 가능, A 문자가 있어야 함
print(pattern.search("A")) # <re.Match object; span=(0, 1), match='A'>
print(pattern.search("DA")) # <re.Match object; span=(0, 2), match='DA'>
print(pattern.search("AA")) # <re.Match object; span=(0, 1), match='A'>

pattern = re.compile("D*A") # D 가 최소 0, 최대 무한대 가능, A 문자가 있어야 함
print(pattern.search("A")) # <re.Match object; span=(0, 1), match='A'>
print(pattern.search("DA")) # <re.Match object; span=(0, 2), match='DA'>
print(pattern.search("DDDDDDDDDDDDDDDDDDDDDDAA")) # <re.Match object; span=(0, 23), match='DDDDDDDDDDDDDDDDDDDDDDA'>

pattern = re.compile("D+A") # D 가 최소 1, 최대 무한대 가능, A 문자가 있어야 함
print(pattern.search("A")) # None
print(pattern.search("DA")) # <re.Match object; span=(0, 2), match='DA'>
print(pattern.search("DDDDDDDDDDDDDDDDDDDDDDDAA")) # <re.Match object; span=(0, 24), match='DDDDDDDDDDDDDDDDDDDDDDDA'>

pattern = re.compile("AD{2}A") # D 가 최소 2, 최대 2, A 문자가 있어야 함
print(pattern.search("ADA")) # None
print(pattern.search("ADDA")) # <re.Match object; span=(0, 4), match='ADDA'>
print(pattern.search("ADDDDDDDDDDDDDDDDDDDDDDDAA")) # None

pattern = re.compile("AD{2,6}A") # D 가 최소 2, 최대 6, A 문자가 있어야 함
print(pattern.search("ADA")) # None
print(pattern.search("ADDA")) # <re.Match object; span=(0, 4), match='ADDA'>
print(pattern.search("ADDDDDDDDDDDDDDDDDDDDDDDAA")) # None

<re.Match object; span=(0, 1), match='A'>
<re.Match object; span=(0, 2), match='DA'>
<re.Match object; span=(0, 1), match='A'>
<re.Match object; span=(0, 1), match='A'>
<re.Match object; span=(0, 2), match='DA'>
<re.Match object; span=(0, 23), match='DDDDDDDDDDDDDDDDDDDDDDA'>
None
<re.Match object; span=(0, 2), match='DA'>
<re.Match object; span=(0, 24), match='DDDDDDDDDDDDDDDDDDDDDDDA'>
None
<re.Match object; span=(0, 4), match='ADDA'>
None
None
<re.Match object; span=(0, 4), match='ADDA'>
None


In [12]:
pattern = re.compile("[ABCDEFGabcdefg]") # 
print(pattern.search("aa1234")) # <re.Match object; span=(0, 1), match='a'>
print(pattern.search("A4567")) # <re.Match object; span=(0, 1), match='A'>

<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(0, 1), match='A'>


In [13]:
pattern = re.compile("[A-Ga-g]") # 
print(pattern.search("aa1234")) # <re.Match object; span=(0, 1), match='a'>
print(pattern.search("A4567")) # <re.Match object; span=(0, 1), match='A'>

<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(0, 1), match='A'>


In [14]:
pattern = re.compile("[A-Ga-g]+") # 
print(pattern.search("aa1234")) # <re.Match object; span=(0, 2), match='aa'>
print(pattern.search("A4567")) # <re.Match object; span=(0, 1), match='A'>

<re.Match object; span=(0, 2), match='aa'>
<re.Match object; span=(0, 1), match='A'>


In [15]:
pattern = re.compile("[A-Za-z0-9]+")  
print(pattern.search("aa1234")) # <re.Match object; span=(0, 6), match='aa1234'>
print(pattern.search("A4567")) # <re.Match object; span=(0, 5), match='A4567'>

<re.Match object; span=(0, 6), match='aa1234'>
<re.Match object; span=(0, 5), match='A4567'>


In [16]:
# [^찾을패턴] : 찾을 패턴이 아닌(NOT)
pattern = re.compile("[^A-Za-z0-9]+")  
print(pattern.search("aa1234!@#$%^")) # <re.Match object; span=(6, 12), match='!@#$%^'>
print(pattern.search("A4567")) # None

<re.Match object; span=(6, 12), match='!@#$%^'>
None


In [17]:
# 한글
pattern = re.compile("[가-힣]+")  
print(pattern.search("aa1234대한민국")) # <re.Match object; span=(6, 10), match='대한민국'>
print(pattern.search("A백두산4567")) # <re.Match object; span=(1, 4), match='백두산'>

<re.Match object; span=(6, 10), match='대한민국'>
<re.Match object; span=(1, 4), match='백두산'>


In [18]:
# search 와 match : 동작방식은 동일
pattern = re.compile("[a-z]+")  
print(pattern.search("aa1234대한민국")) # <re.Match object; span=(0, 2), match='aa'>
print(pattern.match("aa1234대한민국")) # <re.Match object; span=(0, 2), match='aa'>

<re.Match object; span=(0, 2), match='aa'>
<re.Match object; span=(0, 2), match='aa'>


In [21]:
# sub 원본문자열에 사용
origin = "DDA D1A DDA DA"
# sub(패턴, 바꿀문자열, 원본문자열)
print(re.sub("D.A","Dave", origin)) # Dave Dave Dave DA

# sub 패턴에 사용
pattern = re.compile("D.A")
# sub(바꿀문자열, 원본문자열)
pattern.sub("Dave", origin) # 'Dave Dave Dave DA'

Dave Dave Dave DA


'Dave Dave Dave DA'

In [23]:
# findall() : 표현식과 매칭되는 문자들을 리스트로 반환
pattern = re.compile("[a-z]+")
origin = "Game of Life in Python"

# pattern.findall(origin) # ['ame', 'of', 'ife', 'in', 'ython']
for w in pattern.findall(origin):
    print(w) # ame of ife in ython

ame
of
ife
in
ython


In [25]:
# finditer()
for w in pattern.finditer(origin):
    print(w.group()) # ame of ife in ython

ame
of
ife
in
ython


In [26]:
pattern = re.compile(":")
pattern.split("python:java:javascript") # ['python', 'java', 'javascript']

['python', 'java', 'javascript']

In [32]:
# VS 를 기준으로 문자열 분리
origin = "python VS java"
pattern = re.compile(" VS ")
print(pattern.split(origin)) # ['python', 'java']

# - 기호를 * 로 바꿔서 출력
jumin = "801210-10111323"
pattern = re.compile("-")
print(pattern.sub("*" , jumin)) # 801210*10111323

['python', 'java']
801210*10111323


In [40]:
# data_kr 엑셀 읽기
# 주민번호 뒷자리를 * 로 바꿔서 보여주기
from openpyxl import Workbook

from openpyxl import load_workbook
excel_file = load_workbook("./file/data_kr.xlsx")

# 워크시트 가져오기
work_sheet = excel_file.active

pattern = re.compile(r"[0-9]{7}")

for row in work_sheet.rows:
    print(re.sub(pattern, "*******", row[1].value))

excel_file.close()

주민등록번호
800215-*******
821030-*******
841230-*******
790903-*******
800125-*******
820612-*******


In [42]:
origin = "<b>아이폰</b>"

# +?, *? : 매칭을 최소화
pattern = re.compile("<.*?>")
pattern.search(origin) # <re.Match object; span=(0, 3), match='<b>'>

<re.Match object; span=(0, 3), match='<b>'>

In [47]:
import requests
from bs4 import BeautifulSoup

res = requests.get("https://www.naver.com")
soup = BeautifulSoup(res.text, "lxml")

# h 로 시작하는 모든 태그 요소 찾기
print(soup.find_all(string=re.compile(r"h\d")))

# 이미지 요소 찾기(jpg or png)
print(soup.find_all("img", attrs={"src":re.compile(r".+\.jpg|png")}))

['\nwindow["EAGER-DATA"] = window["EAGER-DATA"] || {};\nwindow["EAGER-DATA"]["PC-FEED-WRAPPER"] = {"@type":"BLOCK","blocks":[{"@type":"BLOCK","blocks":[{"@type":"BLOCK","blocks":null,"materials":null,"excludeInPaging":false,"positionForPaging":0,"realtime":false,"_id":null,"@type":"BLOCK","@code":"PC-FEED-LIVINGHOME-CAS-EDIT","@template":"PC-FEED-CAS-EDIT","@flowId":null,"@flowExecutionId":null,"@provider":null,"@lastModifiedAt":null},{"@type":"BLOCK","blocks":null,"materials":[{"@type":"MATERIAL-PC-FEED","title":"단독주택 작은 거실 인테리어 천장 커튼 웨이브레일 집꾸미기","url":"https://in.naver.com/chewing/contents/internal/687473894307360","image":{"url":"https://s.pstatic.net/dthumb.phinf/?src=%22http%3A%2F%2Fblogfiles.naver.net%2FMjAyNDA0MjZfMjc4%2FMDAxNzE0MTM4NjMyMzkw.y97bG6u9LOkwDT_hfpIHwOXxFzzkdGYlgG00JDQyfMcg.7QF5c1EDUhmIMPhxzUhQhSuUH2G9OF_MvScb1kKS00Ig.JPEG%2F20240418175819%EF%BC%BFIMG%EF%BC%BF6193.jpg%22&type=ff364_236&service=navermain"},"source":{"name":"쥬잉","image":{"url":"https://s.pstatic.net/dt

In [57]:
excel_file = load_workbook("./file/train.xlsx")

# 워크시트 가져오기
work_sheet = excel_file.active

pattern = re.compile(r" Mr.")

for row in work_sheet.rows:
    if len(pattern.findall(row[3].value)) > 0:
        if pattern.findall(row[3].value)[0].strip() == "Mr.":
            print(row[3].value)

excel_file.close()

Braund, Mr. Owen Harris
Allen, Mr. William Henry
Moran, Mr. James
McCarthy, Mr. Timothy J
Saundercock, Mr. William Henry
Andersson, Mr. Anders Johan
Williams, Mr. Charles Eugene
Fynney, Mr. Joseph J
Beesley, Mr. Lawrence
Sloper, Mr. William Thompson
Emir, Mr. Farred Chehab
Fortune, Mr. Charles Alexander
Todoroff, Mr. Lalio
Wheadon, Mr. Edward H
Meyer, Mr. Edgar Joseph
Holverson, Mr. Alexander Oskar
Mamee, Mr. Hanna
Cann, Mr. Ernest Charles
Kraeff, Mr. Theodor
Rogers, Mr. William John
Lennon, Mr. Denis
Samaan, Mr. Youssef
Nosworthy, Mr. Richard Cater
Ostby, Mr. Engelhart Cornelius
Woolner, Mr. Hugh
Novel, Mr. Mansouer
Sirayanian, Mr. Orsen
Harris, Mr. Henry Birkhardt
Stewart, Mr. Albert A
Crease, Mr. Ernest James
Kink, Mr. Vincenz
Jenkin, Mr. Stephen Curnow
Hood, Mr. Ambrose Jr
Chronopoulos, Mr. Apostolos
Bing, Mr. Lee
Moen, Mr. Sigurd Hansen
Staneff, Mr. Ivan
Moutal, Mr. Rahamin Haim
Waelens, Mr. Achille
Sheerlinck, Mr. Jan Baptist
Carrau, Mr. Francisco M
Ford, Mr. William Neal
Slocovs

In [67]:
# Mr. → 남성, Miss. → 미혼여성, Mrs. → 기혼여성, X → 없음
from openpyxl import Workbook

# 엑셀파일 읽어오기
excel_file = load_workbook("./file/train.xlsx")
# 활성시트 가져오기
work_sheet = excel_file.active

# 새 엑셀 파일 작성
wb = Workbook()
work_sheet_man = wb.active
work_sheet_man.column_dimensions["D"].width = 70
work_sheet_man.title = "남성"

work_sheet_women = wb.create_sheet(title="기혼여성")
work_sheet_women.column_dimensions["D"].width = 70

work_sheet_solo_women = wb.create_sheet(title="미혼여성")
work_sheet_solo_women.column_dimensions["D"].width = 70

work_sheet_other = wb.create_sheet(title="기타")
work_sheet_other.column_dimensions["D"].width = 70

pattern = re.compile(r" [A-Za-z]+\.")

list1 = []
for row in work_sheet.rows:
    if pattern.search(row[3].value):
        data = pattern.search(row[3].value).group()
        # print(data)

    # 제목 행 옮기기
    if row[0].row == 1:
        # for title in row:
        #     list1.append(title.value)
            
        # work_sheet_man.append(list1)
        # work_sheet_women.append(list1)
        # work_sheet_solo_women.append(list1)
        # work_sheet_other.append(list1)

        # list 만들지 않고, for 쓰지 않고 쓰는 코드
        work_sheet_man.append([title.value for title in row])
        work_sheet_women.append([title.value for title in row])
        work_sheet_solo_women.append([title.value for title in row])
        work_sheet_other.append([title.value for title in row])
    else:
        if data:
            if data == " Mr.":
                work_sheet_man.append([col.value for col in row])
            elif data == " Mrs.":
                work_sheet_women.append([col.value for col in row])
            elif data ==" Miss.":
                work_sheet_solo_women.append([col.value for col in row])
            else:
                work_sheet_other.append([col.value for col in row])


wb.save("./file/train_gender.xlsx")
wb.close()

excel_file.close()