#### 정규 표현식
- 특정한 패턴과 일치하는 문자열을 검색, 치환, 제거 하는 기능을 지원
- re 모듈 사용
    - match(), search(), findall(), finditer()...
    - [a-zA-Z] : 모든 알파벳 패턴
    - . : \n 을 제외한 모든 문자
    - '*' : 0 ~ 무한대, + : 1 ~ 무한대, ? : 0 ~ 1, {2}, {2, 10}

In [1]:
# 정규식
import re

# 검색할 패턴 작성
pattern = re.compile("D.A")

# 원본 문자열
origin = "DAA"

# 원본 문자열과 패턴이 일치하는가?
result = pattern.search(origin)
print(result)

print("패턴 시작 위치 ", result.start())
print("패턴 끝 위치 ", result.end())
print("re와 일치하는 문자열 반활 ", result.group())
print("패턴 위치 ", result.span())


<re.Match object; span=(0, 3), match='DAA'>
패턴 시작 위치  0
패턴 끝 위치  3
re와 일치하는 문자열 반활  DAA
패턴 위치  (0, 3)


In [3]:
origin = "D00A"

# 원본 문자열과 패턴이 일치하는가?
result = pattern.search(origin)
result

In [4]:
origin = "d0A D1A 0111"

# 원본 문자열과 패턴이 일치하는가?
result = pattern.search(origin)
result

<re.Match object; span=(4, 7), match='D1A'>

In [5]:
re.search(r"D.A", "DAA")

<re.Match object; span=(0, 3), match='DAA'>

In [6]:
pattern = re.compile("D?A") # D 가 최소 0 최대 1 가능, A 문자가 잇어야 함
print(pattern.search("A"))
print(pattern.search("DA"))
print(pattern.search("AA"))

<re.Match object; span=(0, 1), match='A'>
<re.Match object; span=(0, 2), match='DA'>
<re.Match object; span=(0, 1), match='A'>


In [8]:
pattern = re.compile("D*A") # D 가 최소 0 최대 무한대 가능, A 문자가 잇어야 함
print(pattern.search("A"))
print(pattern.search("DA"))
print(pattern.search("DDDDDDDDDDDDAA"))

<re.Match object; span=(0, 1), match='A'>
<re.Match object; span=(0, 2), match='DA'>
<re.Match object; span=(0, 13), match='DDDDDDDDDDDDA'>


In [9]:
pattern = re.compile("D+A") # D 가 최소 1 최대 무한대 가능, A 문자가 잇어야 함
print(pattern.search("A"))
print(pattern.search("DA"))
print(pattern.search("DDDDDDDDDDDDAA"))

None
<re.Match object; span=(0, 2), match='DA'>
<re.Match object; span=(0, 13), match='DDDDDDDDDDDDA'>


In [10]:
pattern = re.compile("AD{2}A") # D 가 최소 2 최대 2 가능, A 문자가 잇어야 함
print(pattern.search("ADA"))
print(pattern.search("ADDA"))
print(pattern.search("ADDDDDDDDDDDDAA"))

None
<re.Match object; span=(0, 4), match='ADDA'>
None


In [11]:
pattern = re.compile("AD{2,6}A") # D 가 최소 2 최대 6 가능, A 문자가 잇어야 함
print(pattern.search("ADA"))
print(pattern.search("ADDA"))
print(pattern.search("ADDDDDDDDDDDDAA"))

None
<re.Match object; span=(0, 4), match='ADDA'>
None


In [12]:
pattern = re.compile("[ABCDEFGabcdefg]")
print(pattern.search("aa1234"))
print(pattern.search("A4567"))

<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(0, 1), match='A'>


In [13]:
pattern = re.compile("[A-Ga-g]")
print(pattern.search("aa1234"))
print(pattern.search("A4567"))

<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(0, 1), match='A'>


In [14]:
pattern = re.compile("[A-Ga-g]+")
print(pattern.search("aa1234"))
print(pattern.search("A4567"))

<re.Match object; span=(0, 2), match='aa'>
<re.Match object; span=(0, 1), match='A'>


In [16]:
pattern = re.compile("[A-Za-z0-9]+")
print(pattern.search("aa1234"))
print(pattern.search("A4567"))

<re.Match object; span=(0, 6), match='aa1234'>
<re.Match object; span=(0, 5), match='A4567'>


In [18]:
pattern = re.compile("[^A-Za-z0-9]+") # [^찾을패턴] : 찾을 패턴이 아닌(not)
print(pattern.search("aa1234!@#$%"))
print(pattern.search("A4567"))

<re.Match object; span=(6, 11), match='!@#$%'>
None


In [19]:
pattern = re.compile("[가-힣]+")
print(pattern.search("aa1234대한민국"))
print(pattern.search("A백두산4567"))

<re.Match object; span=(6, 10), match='대한민국'>
<re.Match object; span=(1, 4), match='백두산'>


In [20]:
pattern = re.compile("[a-z]+")
print(pattern.search("aa1234대한민국"))
print(pattern.match("aa1234대한민국"))

<re.Match object; span=(0, 2), match='aa'>
<re.Match object; span=(0, 2), match='aa'>


In [21]:
origin = "DDA D1A DDA DA"
# sub(패턴, 바꿀문자열, 원본문자열) 
print(re.sub("D.A", "Dave", origin))

Dave Dave Dave DA


In [23]:
pattern = re.compile("D.A")
pattern.sub("Dave", origin)

'Dave Dave Dave DA'

In [25]:
# findall() : 표현식과 매칭되는 문자들을 리스트로 반환

pattern = re.compile("[a-z]+")
origin = "Game of Life in Python"

# pattern.findall(origin)
for w in pattern.findall(origin):
    print(w)

ame
of
ife
in
ython


In [27]:
for w in pattern.finditer(origin):
    print(w.group())

ame
of
ife
in
ython


In [28]:
pattern = re.compile(":")
pattern.split("python:java:javascript")

['python', 'java', 'javascript']

In [34]:
# VS 로 문자열 분리 => python, java
origin = "python VS java"
pattern = re.compile(" VS ")
print(pattern.split(origin))

# print(re.split(" VS ", origin))

# - 기호를 * 로 바꿔서 출력
jumin = "801210-1011323"
pattern = re.compile("-")
print(pattern.sub("*", jumin))

# print(re.sub("-", "*", jumin))

['python', 'java']
801210*1011323


In [2]:
# data_kr.xlsx 파일 읽기
from openpyxl import load_workbook

excel_file = load_workbook("./file/data_kr.xlsx")
work_sheet = excel_file.active

# 주민번호 뒷자리를 * 로 바꿔서 화면출력
pattern = re.compile(r"[0-9]{7}")

for row in work_sheet.rows:
    print(re.sub(pattern, "*******", row[1].value))

excel_file.close()

주민등록번호
800215-*******
821030-*******
841230-*******
790903-*******
800125-*******
820612-*******


In [45]:
origin = "<b>아이폰</b>"

# +?, *? : 매칭을 최소화
pattern = re.compile("<.*?>")
print(pattern.search(origin))

<re.Match object; span=(0, 3), match='<b>'>


In [3]:
import requests
from bs4 import BeautifulSoup

res = requests.get("https://www.naver.com")
soup = BeautifulSoup(res.text, "lxml")

# h 로 시작하는 모든 태그 요소 찾기
print(soup.find_all(string=re.compile(r"h\d")))


# 이미지 요소 찾기(jpg or png)
print(soup.find_all("img", attrs={"src":re.compile(r".+\.jpg|png")}))

['\nwindow["EAGER-DATA"] = window["EAGER-DATA"] || {};\nwindow["EAGER-DATA"]["PC-FEED-WRAPPER"] = {"@type":"BLOCK","blocks":[{"@type":"BLOCK","blocks":[{"@type":"BLOCK","blocks":null,"materials":[{"@type":"MATERIAL-PC-FEED","title":"여름에 더 빛나는 촉촉한 피부를 위해","url":"https://in.naver.com/jji_bro/topic/698634502602816","image":{"url":"https://s.pstatic.net/dthumb.phinf/?src=%22https%3A%2F%2Finfluencer-phinf.pstatic.net%2FMjAyNDA1MjhfMTU5%2FMDAxNzE2ODY1NjY0MjIx.FznkZw8p8JaSddGd50dNeYZp9gBDtTsZ-Fo0yr1W1Fog.aoGdPLexFBtCFyBBzhAA0inZFVVILyCaiJLXgA_2tCMg.PNG%2Fc39f.png%22&type=ff364_236&service=navermain"},"source":{"name":"하이은비","image":{"url":"https://s.pstatic.net/dthumb.phinf/?src=%22https%3A%2F%2Fhomebuilder-phinf.pstatic.net%2FMjAyNDA0MjhfMjM0%2FMDAxNzE0MjMwMDA1MDI1.Vf_pmtfBQDnHhrtJZC8fY-jHrGnO9RtAJJvRqlub5UAg.GC_rEPMnIQEnHp5cfPcRG-LPWmL_xgoJHtE9TEEooygg.JPEG%2F1714230005009_IMG_7548.jpeg%22&type=nf40_40&service=navermain"}},"showLiveBadge":false,"_id":"654ad80598c6dbca70ba6cce"},{"@type":"MA

In [53]:
excel_file = load_workbook("./file/train.xlsx")
work_sheet = excel_file.active

pattern = re.compile(r" Mr.")

for row in work_sheet.rows:
    if len(pattern.findall(row[3].value)) > 0:
        if pattern.findall(row[3].value)[0].strip() == "Mr.":
            print(row[3].value)

excel_file.close()

Braund, Mr. Owen Harris
Allen, Mr. William Henry
Moran, Mr. James
McCarthy, Mr. Timothy J
Saundercock, Mr. William Henry
Andersson, Mr. Anders Johan
Williams, Mr. Charles Eugene
Fynney, Mr. Joseph J
Beesley, Mr. Lawrence
Sloper, Mr. William Thompson
Emir, Mr. Farred Chehab
Fortune, Mr. Charles Alexander
Todoroff, Mr. Lalio
Wheadon, Mr. Edward H
Meyer, Mr. Edgar Joseph
Holverson, Mr. Alexander Oskar
Mamee, Mr. Hanna
Cann, Mr. Ernest Charles
Kraeff, Mr. Theodor
Rogers, Mr. William John
Lennon, Mr. Denis
Samaan, Mr. Youssef
Nosworthy, Mr. Richard Cater
Ostby, Mr. Engelhart Cornelius
Woolner, Mr. Hugh
Novel, Mr. Mansouer
Sirayanian, Mr. Orsen
Harris, Mr. Henry Birkhardt
Stewart, Mr. Albert A
Crease, Mr. Ernest James
Kink, Mr. Vincenz
Jenkin, Mr. Stephen Curnow
Hood, Mr. Ambrose Jr
Chronopoulos, Mr. Apostolos
Bing, Mr. Lee
Moen, Mr. Sigurd Hansen
Staneff, Mr. Ivan
Moutal, Mr. Rahamin Haim
Waelens, Mr. Achille
Sheerlinck, Mr. Jan Baptist
Carrau, Mr. Francisco M
Ford, Mr. William Neal
Slocovs

In [8]:
# Mr. ==> 남성, Miss. ==> 미혼여성, Mrs. ==> 기혼여성, X
from openpyxl import Workbook, load_workbook
import re

# 엑셀 파일 읽어오기
excel_file = load_workbook("./file/train.xlsx")

# 활성시트 가져오기
work_sheet = excel_file.active

# 새 엑셀 파일 작성
wb = Workbook()
work_sheet_man = wb.active
work_sheet_man.column_dimensions["D"].width = 70
work_sheet_man.title = "남성"

work_sheet_women = wb.create_sheet(title="기혼여성")
work_sheet_women.column_dimensions["D"].width = 70

work_sheet_solo_women = wb.create_sheet(title="미혼여성")
work_sheet_solo_women.column_dimensions["D"].width = 70

work_sheet_others = wb.create_sheet(title="기타")
work_sheet_others.column_dimensions["D"].width = 70

work_sheet_report = wb.create_sheet(title="보고서")

# 사망자 수, 생존자 수
man_survived, man_unsurvived = 0, 0
merried_survived, merried_unsurvived = 0, 0
single_survived, single_unsurvived = 0, 0
others_survived, others_unsurvived = 0, 0

pattern = re.compile(r" [A-Za-z]+\.")

list1 = []
for row in work_sheet.rows:
    if pattern.search(row[3].value):
        data = pattern.search(row[3].value).group()
        # print(data)

    # 제목 행 옮기기
    if row[0].row == 1:
        # for title in row:
        #     list1.append(title.value)

        # work_sheet_man.append(list1)
        # work_sheet_women.append(list1)
        # work_sheet_solo_women.append(list1)
        # work_sheet_others.append(list1)

        work_sheet_man.append([title.value for title in row])
        work_sheet_women.append([title.value for title in row])
        work_sheet_solo_women.append([title.value for title in row])
        work_sheet_others.append([title.value for title in row])
    else:
        if data:
            if data == " Mr.":
                work_sheet_man.append([col.value for col in row])
                # surivived 컬럼 값이 1(생존자) or 0(사망자)
                if row[1].value == 1:
                    man_survived += 1
                else:
                    man_unsurvived += 1
            elif data == " Mrs.":
                work_sheet_women.append([col.value for col in row])
                if row[1].value == 1:
                    merried_survived += 1
                else:
                    merried_unsurvived += 1
            elif data == " Miss.":
                work_sheet_solo_women.append([col.value for col in row])
                if row[1].value == 1:
                    single_survived += 1
                else:
                    single_unsurvived += 1
            else: 
                work_sheet_others.append([col.value for col in row])
                if row[1].value == 1:
                    others_survived += 1
                else:
                    others_unsurvived += 1

# 보고서 작성
work_sheet_report.append(["분류","생존자수","사망자수","생존률"])

man_survived_rate = "%.2f%%" % (man_survived / (man_survived + man_unsurvived) * 100)
work_sheet_report.append(["남성", man_survived, man_unsurvived, man_survived_rate])

merried_survived_rate = "%.2f%%" % (merried_survived / (merried_survived + merried_unsurvived) * 100)
work_sheet_report.append(["기혼여성", merried_survived, merried_unsurvived, merried_survived_rate])

single_survived_rate = "%.2f%%" % (single_survived / (single_survived + single_unsurvived) * 100)
work_sheet_report.append(["미혼여성", single_survived, single_unsurvived, single_survived_rate])

others_survived_rate = "%.2f%%" % (others_survived / (others_survived + others_unsurvived) * 100)
work_sheet_report.append(["기타", others_survived, others_unsurvived, others_survived_rate])


wb.save("./file/train_gender.xlsx")
wb.close()

excel_file.close()