# Searching for a target

findall()    Returns a list containing all matches

search()     Returns a match object if there is a match anywhere in the
             string, None on failure
split()      Returns a list where the string has been split at each match

sub()        Replaces one or many matches with a string

compile()    Returns a RegEx pattern

In [1]:
import re

In [2]:
# re.search() scans the whole string and gives back a match object for the
# first hit, or None when the pattern does not occur anywhere.
text = 'HKU Business School'
findHKU, findMSBA = re.search('HKU', text), re.search('MSBA', text)
# One item per line: the successful match, its type, then the failed search.
print(findHKU, type(findHKU), findMSBA, sep='\n')

<re.Match object; span=(0, 3), match='HKU'>
<class 're.Match'>
None


span()      Returns a tuple containing the start and end positions of the match
start()     Returns the start position of the match
end()       Returns the end position of the match
string      The string that was passed into the method (an attribute, used without parentheses)

In [3]:
# Inspect the match object, one item per line: its type, the (start, end)
# span, the start and end positions on their own, and the searched string.
print(
    type(findHKU),
    findHKU.span(),
    findHKU.start(),
    findHKU.end(),
    findHKU.string,
    sep='\n',
)

<class 're.Match'>
(0, 3)
0
3
HKU Business School


In [4]:
# A match object counts as true and None counts as false, so the result of
# re.search() can be used directly as a condition.
print("Yes, HKU" if findHKU else "No, HKU")
print("Yes, MSBA" if findMSBA else "No, MSBA")


Yes, HKU
No, MSBA


# Compiling a RegEx Object

In [5]:
# Compile the pattern into a reusable object, then call search() on it.
pattern = re.compile('HKU')
text = 'HKU Business School'
if pattern.search(text) is not None:
    print('yes')

yes


In [6]:
# Same check without compiling first: the module-level re.search() takes the
# pattern and the string together.
text = 'HKU Business School'
if re.search('HKU', text) is not None:
    print('yes')

yes


In [7]:
# findall() collects every match into a list. Matching is case-sensitive, so
# the capital 'S' of 'School' is not part of the result.
text = 'HKU Business School'
result = re.findall(pattern='s', string=text)
print(result)

['s', 's', 's']


# Using metacharacters and sets

[]  A set of characters
\   Escape character, used to formulate special characters
.   Any character, except newline character
^   Starts with
$   Ends with
*   Zero or more occurrences
+   One or more occurrences
?   Zero or one occurrence; placed after another quantifier (*?, +?, {m,n}?) it turns greedy matching to non-greedy matching
{}  The specified number of occurrences: {m} exactly m, {m,n} from m to n, {m,} at least m
|   Either or
()  Capture and group

In [8]:
re.findall('.', text[-6:])
# '.' matches any one character, except for a newline,
# so every character of the slice is returned as a separate match

['S', 'c', 'h', 'o', 'o', 'l']

In [9]:
re.findall('.+', text[-6:])
# '.+' matches one or more characters (newline excluded) and takes as many as possible,
# so the whole slice comes back as a single match
# this is greedy matching

['School']

In [10]:
re.findall('.+?', text[-6:])
# a ? placed after + turns off greedy matching: each match stops after a single character

['S', 'c', 'h', 'o', 'o', 'l']

In [11]:
text = 'From <chao.ding@hku.hk> Assignment 1'

# Greedy: '.+' runs on to the last 'k' it can reach.
x = re.findall(pattern='c.+k', string=text)
# Non-greedy: '.+?' stops at the first 'k' that follows the 'c'.
y = re.findall(pattern='c.+?k', string=text)

print(x, y, sep='\n')


['chao.ding@hku.hk']
['chao.ding@hk']


In [12]:
# a triple-quoted string keeps the line breaks that are typed inside it
text2 = '''From chao.ding@hku.hk end
From: eric.wong@hku.hk over
from michael.chau4@hku.hk done
exam@friday'''

text2    # displaying the value shows every line break as \n

'From chao.ding@hku.hk end\nFrom: eric.wong@hku.hk over\nfrom michael.chau4@hku.hk done\nexam@friday'

In [13]:
re.findall('F', text2)
# extract every uppercase "F" from the text; matching is case-sensitive,
# so the lowercase "f" in "from" is not returned

['F', 'F']

In [14]:
re.findall('^F', text2)
# ^ anchors the match to the beginning of the text,
# so only the "F" that starts the whole string is extracted

['F']

In [15]:
re.findall('e', text2)
# extract every "e" from the text, one list item per occurrence

['e', 'e', 'e', 'e', 'e', 'e']

In [16]:
re.findall('e$', text2)
# $ anchors the match to the end of the text;
# the text does not end with "e", so the result is an empty list

[]

In [17]:
re.findall('From|from', text2)
# | means "either or": extract every occurrence of From or from

['From', 'From', 'from']

In [18]:
re.findall('[Ff]rom', text2)
# the set [Ff] matches one character, either F or f,
# so this gives the same result as 'From|from'

['From', 'From', 'from']

# Extracting a Portion of the Match

In [19]:
text = 'From <chao.ding@hku.hk> Assignment 1'

# No group: findall() returns the entire match, angle brackets included.
x = re.findall(pattern='<.+@.+>', string=text)
# With a group: findall() returns only what the parentheses captured.
y = re.findall(pattern='<(.+@.+)>', string=text)

print(x, y, sep='\n')


['<chao.ding@hku.hk>']
['chao.ding@hku.hk']


In [20]:
pattern = re.compile("[a-z.]+@[a-z.]+")
# create a pattern to extract email addresses
# inside a set, a dot matches a literal dot

pattern.findall(text2)
# the set contains no digits, so michael.chau4@hku.hk is not matched
# and it does match exam@friday, which is not desired

['chao.ding@hku.hk', 'eric.wong@hku.hk', 'exam@friday']

In [21]:
pattern = re.compile("[a-z0-9.]+@[a-z]+[.][a-z]+")
# add another range, 0-9, to the set in front of @,
# and require letters, a literal dot, then letters after @

pattern.findall(text2)
# now you have all three email addresses, and exam@friday is no longer matched

['chao.ding@hku.hk', 'eric.wong@hku.hk', 'michael.chau4@hku.hk']

In [23]:
# to extract only the part before @, wrap it in a pair of parentheses

pattern = re.compile("([a-z0-9.]+)@[a-z]+[.][a-z]+")
pattern.findall(text2)

['chao.ding', 'eric.wong', 'michael.chau4']

In [24]:
text3 = 'My 2 favorite numbers are 19 and 42'
re.findall('[0-9]', text3)
# [0-9] matches a single digit, so every digit is returned separately

['2', '1', '9', '4', '2']

In [25]:
re.findall('[0-9]+', text3)
# + takes one or more digits, as many as possible: this is greedy matching

['2', '19', '42']

In [26]:
re.findall('[0-9]+?', text3)
# use ? after + to turn greedy matching into non-greedy: one digit per match

['2', '1', '9', '4', '2']

In [27]:
re.findall('[0-9]{2}', text3)
# use {} to specify the exact number of occurrences

['19', '42']

In [28]:
re.findall('[^0-9 ]{3,5}', text3)
# matches substrings of length 3 to 5 that contain no digits and no spaces
# note there is no space after the comma inside {3,5}
# {} is also greedy matching

['favor', 'ite', 'numbe', 'are', 'and']

In [29]:
re.findall('[^0-9 ]{3,}', text3)
# {3,} has no upper bound: as many as possible, but at least three

['favorite', 'numbers', 'are', 'and']

# In sets [], most special characters lose their special meaning; thus, [+] means a literal + character (exceptions: ^ at the start, - between two characters, ] and \)

\A  Returns a match if the specified characters are at the beginning of the string
\d  Returns a match where the string contains a digit (a character from 0-9)
\D  Returns a match where the string contains a character that is NOT a digit
\s  Returns a match where the string contains a white space character
\S  Returns a match where the string contains a character that is NOT a white space character
\w  Returns a match where the string contains a word character (letters a-z and A-Z, digits 0-9, and the underscore _ character)
\W  Returns a match where the string contains a character that is NOT a word character
\Z  Returns a match if the specified characters are at the end of the string
\t  Returns a match with a tab
\.  Returns a match with a dot
\\  Returns a match with a backslash
\[  Returns a match with a left square bracket

# Using escape characters

In [30]:
text4 = 'From <chao.ding@hku.hk> Assignment 1'
# The r prefix marks a raw string: use it whenever the pattern contains
# backslash escapes such as \S.
pattern = re.compile(r'<(\S+@\S+)>')
# The module-level findall() also accepts an already compiled pattern.
matches = re.findall(pattern, text4)
print(matches)

['chao.ding@hku.hk']


In [31]:
re.findall(r'@(\S+)>', text4)[0]    
# using a list index to pick the first string out of the list that findall returns

'hku.hk'

In [32]:
re.findall(r'\w+', text4)
# \w+ matches runs of word characters:
# letters (a-z and A-Z), digits from 0-9, and the underscore _ character

['From', 'chao', 'ding', 'hku', 'hk', 'Assignment', '1']

In [33]:
re.findall(r'\w+\.\w+', text4)
# \. matches a literal dot
# it is equivalent to [.]

['chao.ding', 'hku.hk']

In [34]:
text5 = 'We just received $10.88 for 20 cookies.'
re.findall(r'\d+', text5)
# \d+ matches runs of digits; the dot in 10.88 is not a digit, so 10 and 88 are separate matches

['10', '88', '20']

In [35]:
re.findall(r'\$\d+', text5)[0]
# \$ matches a literal dollar sign (an unescaped $ would mean "ends with")

'$10'

# Splitting strings & Substituting substrings

In [36]:
text7 = 'The University of Hong Kong (HKU)'
re.split(r'\s', text7, maxsplit=2)
# maxsplit=2: split at no more than 2 matches; the rest of the string
# is returned unsplit as the last item
# maxsplit is passed by keyword because passing it as a positional
# argument is deprecated since Python 3.13

['The', 'University', 'of Hong Kong (HKU)']

In [37]:
re.split(r'\W+', text7)
# split at every run of non-word characters;
# the text ends with ')', which is a separator, so the last item is an empty string

['The', 'University', 'of', 'Hong', 'Kong', 'HKU', '']

In [38]:
re.sub(r'\(|\)', '--', text7)
# replace each parenthesis with --

'The University of Hong Kong --HKU--'

In [39]:
re.sub(r'\s\(.+\)', '', text7)
# remove the parentheses, their content, and the whitespace character in front of them

'The University of Hong Kong'

In [40]:
re.sub(r'\s[(].+[)]', '', text7)
# same removal, writing the literal parentheses as [(] and [)] instead of \( and \)

'The University of Hong Kong'

In [41]:
# another way to remove the content in the parentheses: using string slicing
# find('(') gives the index of '('; the -1 also drops the space in front of it
text7[0 : text7.find('(')-1]

'The University of Hong Kong'

# Special uses

## `\b`: word boundary

In [42]:
# the use of \b needs to be combined with a raw string
# (in a normal string literal, '\b' is the backspace character)

text6 = "Lis1bon is2 an oasis3"
re.findall(r'is[0-9]', text6)
# without \b, "is" plus a digit is matched anywhere, even inside a word

['is1', 'is2', 'is3']

In [43]:
re.findall(r'\bis[0-9]', text6)
# \b in front: "is" must start at a word boundary

['is2']

In [44]:
re.findall(r'is[0-9]\b', text6)
# \b at the end: the digit must be followed by a word boundary

['is2', 'is3']

In [45]:
re.findall(r'\bis[0-9]\b', text6)
# \b on both sides: "is" plus a digit must stand as a whole word

['is2']

## Non-capturing constructs: lookahead (?=...) and (?!...), lookbehind (?<=...), and non-capturing groups (?:...)

In [46]:
re.findall(r'\d+(?=\S)', 
           'Here are 2 numbers: 3.141, 20.5')
# (?=\S) is a lookahead: the digits must be followed by a non-whitespace character,
# which is checked but not included in the match

['3', '141', '20']

In [47]:
re.findall(r'\d+(?!\.)', 
           'Here are 2 numbers: 3.141, 20.5')
# (?!\.) is a negative lookahead: the digits must not be followed by a dot
# \d+ may give back digits to satisfy it, which is why 20 yields only '2'

['2', '141', '2', '5']

In [48]:
re.findall(r'(?<=\.)\d+', 
           'Here are 2 numbers: 3.141, 20.5')
# (?<=\.) is a lookbehind: the digits must be preceded by a dot,
# which is checked but not included in the match

['141', '5']

In [49]:
re.findall(r'\d+(?=\s[A-Z]+)', 
           'I spent 5 USD (approx. 40 HKD) to buy 10 cakes 2 days ago.')
# digits followed by a whitespace character and at least one uppercase letter;
# the lookahead part is not returned

['5', '40']

In [50]:
re.findall(r'\d+(?=\s\w{4}\b)', 
           'I spent 5 USD (approx. 40 HKD) to buy 10 cakes 2 days ago.')
# digits followed by a whitespace character and exactly four word characters
# that end at a word boundary

['2']

In [51]:
re.findall(r'(?:HK|US)D', 
           'I spent 5 USD (approx. 40 HKD) to buy 10 cakes 2 days ago.')
# (?:...) groups HK|US without capturing, so findall returns the whole match


['USD', 'HKD']

In [52]:
re.findall(r'(?:\d{1,3}\.){3}\d{1,3}', 
           'my ip address is 129.66.19.10 and phone number is 321.642')
# (?:\d{1,3}\.){3} repeats "1 to 3 digits plus a dot" three times without capturing,
# then \d{1,3} matches the final group of digits

['129.66.19.10']

# about flags

In [53]:
re.findall(r'[a-z]+', text7, flags = re.I)
# re.I makes matching case-insensitive, so [a-z] also matches uppercase letters

['The', 'University', 'of', 'Hong', 'Kong', 'HKU']

In [54]:
# to add multiple flags, combine them with |
re.findall(r"""[a-z]{2}            # {2} means to match exactly 2 letters
                    \s[a-z]{4}    # \s matches one whitespace character, {4} exactly 4 letters""",
          text7, flags = re.I | re.X)

#  re.X: verbose mode, whitespace and # comments inside the pattern are ignored.

['he Univ', 'of Hong']

In [55]:
s = """Regex
Flags"""

re.findall(r'^\w+', s, re.M)  
# re.M: multiline mode, ^ and $ match at the start and end of every line,
# not only at the start and end of the whole string

['Regex', 'Flags']