<a href="https://colab.research.google.com/github/HyuksuRyu/regex_tutorial/blob/main/Learning_Regular_Expressions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# chapter 2
# 문자 하나 찾기

import re
text = """
sales1.xls
order3.xls
sales2.xls
na1.xls
na2.xls
sa1.xls
ja4.doc
"""

re_pat = re.compile(r'sales.')

print(re_pat.findall(text))

re_pat1 = re.compile(r'.a.\.')
print(re_pat1.findall(text))



['sales1', 'sales2']
['na1.', 'na2.', 'sa1.', 'ja4.']


In [None]:
re_pat2 = re.compile(r'.a.\.xls')
print(re_pat2.findall(text))

['na1.xls', 'na2.xls', 'sa1.xls']


In [5]:
# chapter 3
# 미국/캐나다 우편번호 데이터베이스
import re

post_number = """
11213
A1C2E3
48075
48345
M1B3F2
90046
H1H2H3
"""

re_pat_us = re.compile("\w\d\w\d\w\d")
print(re_pat_us.findall(post_number))

re_pat_ca = re.compile("\d{5}")
print(re_pat_ca.findall(post_number))

['A1C2E3', 'M1B3F2', 'H1H2H3']
['11213', '48075', '48345', '90046']


In [10]:
import re
text = """
Send personal email to ben@forta.com or 
ben.forta@forta.com. For questions about a book use
support@forta.com. If your message is
urgent try ben@urgent.forta.com.
Feel free to send unsolicited email to 
spam@forta.com (simple)
"""

re_pat = re.compile(r'[\w.]+@[\w.]+\.\w+')
print(re_pat.findall(text))
re_pat2 = re.compile(r'[\w\.]+@[\w\.]+\.\w+')
print(re_pat2.findall(text))

['ben@forta.com', 'ben.forta@forta.com', 'support@forta.com', 'ben@urgent.forta.com', 'spam@forta.com']
['ben@forta.com', 'ben.forta@forta.com', 'support@forta.com', 'ben@urgent.forta.com', 'spam@forta.com']


In [13]:
# get url
import re
text = """
The URL is http://www.forta.com, to connect
securely us https://www.forta.com/ instead
"""

re_pat = re.compile(r"https?:\/\/[\w.\/]+")
print(re_pat.findall(text))

['http://www.forta.com', 'https://www.forta.com/']


In [20]:
# 5장 반복찾기
# date
text = """
4/8/17
10-6-2018
2/2/2
01-01-01
"""

re_date = re.compile(r"\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4}")
print(re_date.findall(text))

['4/8/17', '10-6-2018', '01-01-01']


In [21]:
# find minimum period
text = """
1001: $496.80
1002: $1290.69
1003: $26.43
1004: $613.42
1005: $7.61
1006: $414.90
1007: $25.00
"""

re_pat = re.compile(r"\d+: \$\d{3,}\.\d{2}") # find cases over 100 dollars
print(re_pat.findall(text))

['1001: $496.80', '1002: $1290.69', '1004: $613.42', '1006: $414.90']


In [24]:
# to prevent too much fit
text = """
This offer is not available to customers
living in <b>AK</b> and <B>HI</B>.
"""

re_pat = re.compile(r'<[Bb]>.*<\/[Bb]>')
print(re_pat.findall(text))   ## ['<b>AK</b> and <B>'] Greedy search

## Need to make lazy quantifier, not greed
## * -> *?
## + -> +?
## {n,} -> {n,}?

re_pat_lazy = re.compile(r'<[Bb]>.*?<\/[Bb]>')
print(re_pat_lazy.findall(text))  ## ['<b>AK</b>', '<B>HI</B>'] Lazy search 



['<b>AK</b> and <B>HI</B>']
['<b>AK</b>', '<B>HI</B>']


In [31]:
# Chapter 6
# Position matching
# Designate boundary

text = "The cat scattered his food all over the room."
re_cat = re.compile(r'cat')
print(re_cat.findall(text))

# word boundary \b
re_cat_b = re.compile(r'\bcat\b')
print(re_cat_b.findall(text))

# 단어 경계를 일치시키고 싶지 않을 때는 \B를 사용
text = """
Please enter the nine-digit id as it
appears on your color - coded pass-key.
"""

re_pat_B = re.compile(r"(\B-\B)")
re_pat_b = re.compile(r"(\b-\b)")
print(re_pat_B.findall(text))
print(re_pat_B.search(text))
print(re_pat_b.findall(text))
print(re_pat_b.search(text))




['cat', 'cat']
['cat']
['-']
<re.Match object; span=(60, 61), match='-'>
['-', '-']
<re.Match object; span=(22, 23), match='-'>
-


In [36]:
# Chapter 7
# 하위표현식
text = "Regular&nbsp;&nbsp;Expressions, and other subjects."
re_pat1 = re.compile(r'&nbsp;{2,}')
re_pat2 = re.compile(r'(&nbsp;){2,}')

print(re_pat1.findall(text)) #[]
print(re_pat1.search(text)) # None
print(re_pat2.findall(text)) #['&nbsp;']
print(re_pat2.search(text).group()) # &nbsp;&nbsp;

[]
None
['&nbsp;']
&nbsp;&nbsp;


In [37]:
# Find IP
text = "Pinging hog.forta.com [12.159.46.200]"
re_ip = re.compile(r"(\d{1,3}\.){3}\d{1,3}")

print(re_ip.search(text).group()) # 12.159.46.200



12.159.46.200


In [39]:
# find year
import re

text = "DOB: 1967-08-17"
re_year = re.compile(r'19|20\d{2}')  # which means 19 or 20\d{2}
re_year_paren = re.compile(r'(19|20)\d{2}') # which means 19 or 20

print(re_year.search(text).group())  # 19
print(re_year_paren.search(text).group()) # 1967



19
1967


In [53]:
# Nested parenthesis
# Find IP
# Criteria
# 1. every 1-digit or 2-digits number
# 2. every 3-digit numbers which start with 1
# 3. 3-digit number which start with 2 and the second digit b/w 0 and 4
# 4. 25+[0~5]

text = """
Pinging hog.forta.com [12.159.46.200]
Pinging hog.forta.com [260.567.190.892]
"""
re_ip = re.compile(r"(\d{1,3}\.){3}\d{1,3}")

print(re_ip.finditer(text)) # <callable_iterator object at 0x7fa0c985bb10>
print([x.group() for x in re_ip.finditer(text)]) 
#['12.159.46.200', '260.567.190.892']
# invalid ip is also searched

re_ip_nested = re.compile(r"(((25[0-5])|(2[0-4]\d)|(1\d{2})|(\d{1,2}))\.){3}((25[0-5])|(2[0-4]\d)|(1\d{2})|(\d{1,2}))")
print([x.group() for x in re_ip_nested.finditer(text)])





<callable_iterator object at 0x7fa0c985be90>
['12.159.46.200', '260.567.190.892']
['12.159.46.200']


In [54]:
# Chapter 8
# Back Reference

# check repeated word
text = """
this is a block of of text,
serveral words here are are
repeated, and and they
should not be
"""

re_pat = re.compile(r"\s+(\w+)\s+\1")
print([x.group() for x in re_pat.finditer(text)]) #[' of of', ' are are', ' and and']


[' of of', ' are are', ' and and']


In [55]:
# find header tag
# exclude unmatched pair such as <h1> </h2>
text = """
<body>
<h1>Welcome to my homepage</h1>
blah blah blah <br/>
<h2>SQL</h2>
blah blah
<h2>RegEx</h2>
Infromation about Regex
<h2> This is not vaild HTML</h3>
</body>
"""

re_pat = re.compile(r"<([hH][1-6])>.*?<\/\1>")
print([x.group() for x in re_pat.finditer(text)]) 
# ['<h1>Welcome to my homepage</h1>', '<h2>SQL</h2>', '<h2>RegEx</h2>']
# <h2> This is not vaild HTML</h3> is excluded

['<h1>Welcome to my homepage</h1>', '<h2>SQL</h2>', '<h2>RegEx</h2>']


In [60]:
# substitution using back reference
# e-mail -> a href
# ex. ben@forta.com -> <a href="mailto:ben@forta.com">ben@forta.com</a>

import re
text = "Hello, ben@forta.com is my email address"

re_email = re.compile(r"(?P<mail>\w+[\w\.]*@[\w\.]+\.\w+)")
print(re_email.sub(r'<a href="mailto:\1">\1</a>',text))
# Hello, <a href="ben@forta.com">ben@forta.com</a> is my email address

Hello, <a href="mailto:ben@forta.com">ben@forta.com</a> is my email address


In [63]:
# substitution using back reference
# phone number
# 313-555-1234 --> (313) 555-1234
import re

text = """
313-555-1234
248-846-8753
820-784-9687
"""

re_phone = re.compile(r"(?P<n1>\d{3})-(?P<n2>\d{3})-(?P<n3>\d{4})")
print(re_phone.sub(r"(\1) \2-\3", text))
#(313) 555-1234
#(248) 846-8753
#(820) 784-9687
print(re_phone.sub(r"(\g<n1>) \g<n2>-\g<n3>", text))
#(313) 555-1234
#(248) 846-8753
#(820) 784-9687



(313) 555-1234
(248) 846-8753
(820) 784-9687


(313) 555-1234
(248) 846-8753
(820) 784-9687



In [65]:
# Chapter 9
# Lookaround
# Look ahead ?=

# task: extract protocol from url
# ex. http://www.forta.com -> http
# ex. http -> no extraction

import re

text = """
http://www.forta.com
http
https
https://mail.forta.com/
ftp://ftp.forta.com/
"""
re_protocol = re.compile(r".+(?=:)")

print([x.group() for x in re_protocol.finditer(text)]) # ['http', 'https', 'ftp']

['http', 'https', 'ftp']


In [66]:
# Look behind ?<=

text = """
ABC01: $23.45
HGG42: $5.31
CFMX1: $899.00
XTC99: $69.96
Total items found: 4
"""

# Extract dollars without dollar mark
re_dollar = re.compile(r"(?<=\$)\d+\.\d+")
re_dollar.findall(text) # ['23.45', '5.31', '899.00', '69.96']

['23.45', '5.31', '899.00', '69.96']

In [74]:
# use look ahead and look behind together
import re

text = """
<head>
<title>Ben Forta's Homepage</title>
</head>
"""

re_title = re.compile(r"(?<=\<(title|TITLE)>).+(?=\</(title|TITLE)\>)")
print([x.group() for x in re_title.finditer(text)])

["Ben Forta's Homepage"]
[('title', 'title')]


In [79]:
# Negative look ahead & look behind
# Extract price only
import re

text = """
I paid $30 for 100 apples,
50 oranges, and 60 pears.
I saved $5 on this order.
"""

re_price = re.compile(r"(?<=\$)\d+")
print(re_price.findall(text))   # ['30', '5']
print([x.group() for x in re_price.finditer(text)]) # ['30', '5']

# Task 2: Extract quantity
re_quant1 = re.compile(r"(?<!\$)\d+")
print([x.group() for x in re_quant1.finditer(text)]) # ['0', '100', '50', '60']

re_quant2 = re.compile(r"\b(?<!\$)\d+\b")
print([x.group() for x in re_quant2.finditer(text)]) # ['100', '50', '60']




['30', '5']
['30', '5']
['0', '100', '50', '60']
['100', '50', '60']


In [80]:
# Chapter 10
# make condition

text = """
123-456-7890
(123)456-7890
(123)-456-7890
(123-456-7890 
1234567890
123 456 7890
"""
# (123-456-7890, (123)-456-7890, 1234567890 is not correct format
# How to extract phone numbers w/o invalid format

re_phone = re.compile(r"\(?\d{3}\)?-?\d{3}-\d{4}")
print([x.group() for x in re_phone.finditer(text)]) 
# ['123-456-7890', '(123)456-7890', '(123)-456-7890', '(123-456-7890']
# ????

['123-456-7890', '(123)456-7890', '(123)-456-7890', '(123-456-7890']


In [82]:
# Chapter 11
# General problems using Regex

# North America Telephone Number
text = """
J. Doe: 248-555-1234
B. Smith: (313) 555-1234
A. Lee: (810)555-1234
M. Jones: 734.555.9999
"""

re_ne_number = re.compile(r"(?<=: )[\(\.]?[2-9]\d{2}[\)\.]?[\s\-\.]?[2-9]\d{2}[\-\.]\d{4}")
print([x.group() for x in re_ne_number.finditer(text)])

['248-555-1234', '(313) 555-1234', '(810)555-1234', '734.555.9999']


In [83]:
# USA Post number
# 2 types
# type1. {5 digits}
# ex. 11222
# type2. {5 digits}-{4 digits}
# ex. 48034-1234

text = """
999 1st Avenue, Bigtown, NY, 11222
123 High Street, Any City, MI, 48034-1234
"""

re_post = re.compile(r"\d{5}(-\d{4})?")
print([x.group() for x in re_post.finditer(text)])
# ['11222', '48034-1234']


['11222', '48034-1234']


In [84]:
# UK Post Number
# {outcode} {incode}
# outcode: (1/2 characters) + (1/2 digits)
# incode: 1 digit + 2 characters (No C,I,K,M,O,V)

text = """
171 Kyverdale Road, London N16 6PS
33 Main Street, Portsmouth, P01 3AX
18 High Street, London NW11 8AB
18 High Street, London NW1P 8AB
"""

re_post_uk = re.compile(r"(?P<out>[A-Z]{1,2}\d[A-Z\d]?) (?P<in>\d{1}[ABD-HJLNP-UW-Z]{2})")
print([x.group('out') for x in re_post_uk.finditer(text)])
print([x.group('in') for x in re_post_uk.finditer(text)])
print([x.group() for x in re_post_uk.finditer(text)])




['N16', 'P01', 'NW11', 'NW1P']
['6PS', '3AX', '8AB', '8AB']
['N16 6PS', 'P01 3AX', 'NW11 8AB', 'NW1P 8AB']


In [90]:
# URL
text = """
http://www.forta.com/blog
https://www.forta.com:80/blog/index.cfm
http://www.forta.com
http://ben:password@www.forta.com/
http://localhost/index.php?ab=1&c=2
http://localhost:8500/
"""

re_url = re.compile(r"https?:\/\/[\w\.]+(\:\d+)?(\/([\w+\/_.]*)?)?")

print([x.group() for x in re_url.finditer(text)])
# ['http://www.forta.com/blog', 'https://www.forta.com:80/blog/index.cfm', 'http://www.forta.com', 'http://ben', 'http://localhost/index.php', 'http://localhost:8500/']


['http://www.forta.com/blog', 'https://www.forta.com:80/blog/index.cfm', 'http://www.forta.com', 'http://ben', 'http://localhost/index.php', 'http://localhost:8500/']


In [101]:
# Register Number 주민등록번호 

text = """
790814-1234567
135-600
799999-1234567
791231-1234567
080601-3456789
830105-5678123
100406-4567890
"""

#re_reg = re.compile(r"\d{6}-\d{7}")
re_reg = re.compile(r"\d{2}(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])-\d{7}")

print([x.group() for x in re_reg.finditer(text)])
# ['790814-1234567', '791231-1234567', '080601-3456789', '830105-5678123', '100406-4567890']


['790814-1234567', '791231-1234567', '080601-3456789', '830105-5678123', '100406-4567890']


In [98]:
# date change
# mm/dd/yyyy -> yyyy-mm-dd

text = "08/14/1979"

re_date = re.compile(r"(?P<month>\d{2})\/(?P<date>\d{2})\/(?P<year>\d{4})")
print(re_date.sub(r"\g<year>-\g<month>-\g<date>", text))

1979-08-14
