#Why use regex
Regex is used for three main reasons:
1. to check whether data exists or not
2. to extract data that follows a complex pattern from source data
3. to clean source data by splitting strings

In [16]:
import re

# Sample sentence that both lookups below run against.
some_text = "It is a good day for everyone"

# re.match only tries the pattern at the very start of the string, whereas
# re.search scans forward and reports the first occurrence anywhere in it.
print(re.match(pattern='It', string=some_text))
print(re.search(pattern='good', string=some_text))

<re.Match object; span=(0, 2), match='It'>
<re.Match object; span=(8, 12), match='good'>


Splitting Function

In [35]:
# Sample paragraph in which the name "Amy" appears three times.
text = "Amy works diligently. Amy gets good grades. Our student Amy is succesful."

# Cut the text at every "Amy " and then list every occurrence of "Amy".
print(re.split(pattern="Amy ", string=text))
print(re.findall(pattern="Amy", string=text))

# Anchors: ^ pins a match to the beginning of the text and $ to its end,
# so this search succeeds only because the text starts with "Amy".
print(re.search(pattern="^Amy", string=text))

['', 'works diligently. ', 'gets good grades. Our student ', 'is succesful.']
['Amy', 'Amy', 'Amy']
<re.Match object; span=(0, 3), match='Amy'>


#Pattern and character class

In [51]:
# A run of letter grades, one character per grade.
grades = "ACAAAABCBCBAA"

# The pattern "AB" would only match an A immediately followed by a B. To match
# every single A *or* B instead, list both characters inside square brackets.
re.findall(pattern="[AB]", string=grades)

# Square brackets form a set. A set may also hold a range of characters in
# alphanumeric order, e.g. [a-z] for all lower case letters. Here the first set
# matches an A and the second a B or a C, so we get every A followed by B or C.
re.findall(pattern="[A][B-C]", string=grades)

# A caret as the first character inside a set negates it: this matches every
# grade that is not an A.
re.findall(pattern="[^A]", string=grades)

# Careful: outside a set the caret is the start-of-string anchor, while inside
# a set it means negation. This pattern combines both uses: it only matches
# when the very first grade is something other than an A. What do you expect?
re.findall(pattern="^[^A]", string=grades)

[]

#Quantifier

In [53]:
# A quantifier states how many times a pattern has to repeat for it to match.
# The basic form is e{m,n}: e is the expression or character being matched,
# m is the minimum number of repetitions and n is the maximum.

# Using the grades from before: how many back-to-back streaks of A's are there?
# The minimum streak length is 2 and the maximum is 10.
re.findall(pattern="A{2,10}", string=grades)

# The {m,n} syntax is strict. With a space after the comma the braces are no
# longer read as a quantifier, so this call returns an empty result.
re.findall(pattern="A{2, 2}", string=grades)

# A single number inside the braces is used as both m and n.
re.findall(pattern="A{2}", string=grades)

[]

In [37]:
#There are three other quantifiers that are used as shorthand: an asterisk * to match 0 or more times,
#a question mark ? to match zero or one time, and a plus sign + to match one or more times. Let's look
#at a more complex example, and load some data scraped from Wikipedia
import re
# Read the whole scraped article into a single string called wiki.
with open(file="resource/ferpa.txt", mode="r") as file:
    wiki = file.read()

# Each section title in the article is followed by the literal text [edit],
# so collect every run of word characters and spaces that ends in that marker.
re.findall(pattern="[\\w ]*\\[edit]", string=wiki)

['Overview[edit]',
 'Access to public records[edit]',
 'Student medical records[edit]']

#Groups

In [72]:
# Ok, this works, but it's a bit of a pain. To this point we have been talking about a regex as a single
# pattern which is matched. But you can actually match different patterns, called groups, at the same time,
# and then refer to the groups you want. To group patterns together you use parentheses, which is actually
# pretty natural. Let's rewrite our findall using groups
import re

# Two capture groups: group 1 is the title text, group 2 is the [edit] marker.
pattern_string = "([\\w ]*)(\\[edit])"
re.findall(pattern=pattern_string, string=wiki)

# re.finditer() yields match objects, so one group can be picked out of each
# match. Here we print only group 1, the title.
for title in re.finditer(pattern=pattern_string, string=wiki):
    print(title[1])

# Groups can be given names with the (?P<name>...) syntax and then looked up
# by that name instead of by position.
pattern_string = "(?P<title>[\\w ]*)(?P<edit_link>\\[edit])"
for title in re.finditer(pattern=pattern_string, string=wiki):
    print(title.group('title'))

# A lookahead (?=...) requires the text that follows to match, but does not
# make that text part of the match. Here the title must be followed by [edit],
# yet [edit] is neither captured nor consumed.
pattern_string = "(?P<title>[\\w ]*)(?=\\[edit])"

Overview
Access to public records
Student medical records
Overview
Access to public records
Student medical records
Overview

Access to public records

Student medical records



#Example Wikipedia Buddhist Data

In [259]:
#Let's find the name, city and state of each Buddhist university
import re

# Read the whole scraped list of Buddhist universities into one string.
with open(file='resource/buddhist.txt') as file:
    buddhist_data = file.read()

# The data holds 10 universities. For each one we extract three named groups:
# university_name, city and state. The name is everything before the
# "located in" phrase, the city is the 3 to 15 characters before the comma,
# and the state is the rest of the line after that comma.
regex_pattern_for_buddhist_university_location = """(?P<university_name>.*)( – located in | is located in )(?P<city>.{3,15})(,)(?P<state>.*)"""

re.findall(pattern=regex_pattern_for_buddhist_university_location, string=buddhist_data)

# Print the named groups of every match as a dictionary.
for data in re.finditer(pattern=regex_pattern_for_buddhist_university_location, string=buddhist_data):
    print(data.groupdict())

{'university_name': 'Dhammakaya Open University', 'city': 'Azusa', 'state': ' California, part of the Thai Wat Phra Dhammakaya[1]'}
{'university_name': 'Dharmakirti College', 'city': 'Tucson', 'state': ' Arizona Now called Awam Tibetan Buddhist Institute (http://awaminstitute.org/)'}
{'university_name': 'Dharma Realm Buddhist University', 'city': 'Ukiah', 'state': ' California'}
{'university_name': 'Ewam Buddhist Institute', 'city': 'Arlee', 'state': ' Montana'}
{'university_name': 'Naropa University', 'city': 'Boulder', 'state': ' Colorado (Accredited by the Higher Learning Commission)'}
{'university_name': 'Institute of Buddhist Studies', 'city': 'Berkeley', 'state': ' California'}
{'university_name': 'Maitripa College', 'city': 'Portland', 'state': ' Oregon'}
{'university_name': 'Soka University of America', 'city': 'Aliso Viejo', 'state': ' California'}
{'university_name': 'University of the West', 'city': 'Rosemead', 'state': ' California'}
{'university_name': 'Won Institute of Gr