# Let's play with some Regex AKA Regular Expressions

In [4]:
import re

In [5]:
# A short sentence used to demonstrate a basic pattern search.
text = 'This is a good day.'

# re.search scans the whole string and returns a match object (truthy) on the
# first hit, or None (falsy) when the pattern does not occur anywhere.
print("Wonderful!" if re.search('good', text) else 'The day sucks.')

Wonderful!


In [6]:
# Toy string reused by the following cells.
test = 'AAABBCAAAA'

# Alternation: collect every non-overlapping occurrence of 'AB' or 'AC'.
re.findall('AB|AC', test)

['AB']

In [7]:
# Character classes: an 'A' followed by one character from the range B-C.
re.findall('[A][B-C]', test)

['AB']

In [8]:
# Bounded quantifier: each match is a run of one to four consecutive 'A's,
# taken greedily (as many as possible, up to four).
re.findall('A{1,4}', test)

['AAA', 'AAAA']

In [9]:
# Negated character class: match every single character that is not 'A'.
re.findall('[^A]', test)

['B', 'B', 'C']

In [10]:
# 'A{0}' matches exactly zero 'A's (i.e. the empty string), so it contributes
# nothing; the pattern behaves like 'A{1,3}': runs of one to three 'A's.
re.findall('A{0}A{1,3}', test)

['AAA', 'AAA', 'A']

# Using Regex on a Dataset Read from a File

In [11]:
# Read the whole FERPA text file into one string. The encoding is given
# explicitly so the result does not depend on the platform's default encoding,
# matching the other open() calls in this notebook.
with open('coursera_resources/dataset/ferpa.txt', encoding='utf-8') as f:
    wiki = f.read()

# Display the raw text as the cell output.
wiki

'Overview[edit]\nFERPA gives parents access to their child\'s education records, an opportunity to seek to have the records amended, and some control over the disclosure of information from the records. With several exceptions, schools must have a student\'s consent prior to the disclosure of education records after that student is 18 years old. The law applies only to educational agencies and institutions that receive funds under a program administered by the U.S. Department of Education.\n\nOther regulations under this act, effective starting January 3, 2012, allow for greater disclosures of personal and directory student identifying information and regulate student IDs and e-mail addresses.[2] For example, schools may provide external companies with a student\'s personally identifiable information without the student\'s consent.[2]\n\nExamples of situations affected by FERPA include school employees divulging information to anyone other than the student about the student\'s grades o

In [12]:
# One to one hundred ASCII letters immediately followed by the literal text
# "[edit]". The class is [a-zA-Z]: the range A-z would also admit the
# punctuation characters that sit between 'Z' and 'a' ([ \ ] ^ _ `).
# A raw string is used so the backslashes reach the regex engine untouched.
re.findall(r'[a-zA-Z]{1,100}\[edit\]', wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [13]:
# Same search using \w (any word character) instead of an explicit letter
# range. Raw string: '\w' and '\[' are not valid Python string escapes.
re.findall(r'[\w]{1,100}\[edit\]', wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [14]:
# Allow spaces as well as word characters so multi-word section titles are
# captured in full. Raw string keeps the regex escapes intact.
re.findall(r'[\w ]*\[edit\]', wiki)

['Overview[edit]',
 'Access to public records[edit]',
 'Student medical records[edit]']

In [15]:
# For every "<title>[edit]" match, show the raw match and then the title with
# the "[edit]" suffix removed. Raw strings are used for both patterns because
# '\w' and '\[' are not valid Python string escapes.
for title in re.findall(r'[\w ]*\[edit\]', wiki):
    print('Edit V', title)
    # Splitting on '[' leaves the title as the first piece.
    print('Edit removed', re.split(r'[\[]', title)[0])

Edit V Overview[edit]
Edit removed Overview
Edit V Access to public records[edit]
Edit removed Access to public records
Edit V Student medical records[edit]
Edit removed Student medical records


# Groups

In [16]:
# With two capturing groups, findall returns one (title, edit_link) tuple per
# match instead of the whole matched string. Raw string keeps the regex
# escapes intact.
re.findall(r'([\w ]*)(\[edit\])', wiki)

[('Overview', '[edit]'),
 ('Access to public records', '[edit]'),
 ('Student medical records', '[edit]')]

In [17]:
# finditer yields match objects; groups() returns the captured groups of each
# match as a tuple. Raw string keeps the regex escapes intact.
for item in re.finditer(r'([\w ]*)(\[edit\])', wiki):
    print(item.groups())

('Overview', '[edit]')
('Access to public records', '[edit]')
('Student medical records', '[edit]')


In [18]:
# (?P<name>...) gives each group a name; groupdict() maps those names to the
# captured text. Raw string keeps the regex escapes intact.
for item in re.finditer(r'(?P<title>[\w ]*)(?P<edit_link>\[edit\])', wiki):
    print(item.groupdict()['title'])

Overview
Access to public records
Student medical records


In [19]:
# `item` is the loop variable left over from the preceding finditer loop, so
# this shows the named groups of the last match that loop produced.
item.groupdict()

{'title': 'Student medical records', 'edit_link': '[edit]'}

# Look Ahead and Look Behind

In [20]:
# (?=...) is a lookahead: "[edit]" must follow the title but is not consumed,
# so it is not part of the match. Raw string keeps the regex escapes intact.
for item in re.finditer(r'(?P<title>[\w ]+)(?=\[edit\])', wiki):
    print(item.groups()[0])

Overview
Access to public records
Student medical records


# Example Wikipedia Data

In [21]:
# Load a second Wikipedia extract: data on Buddhist-based universities in
# the US. The text is stored in `wiki`.
with open("coursera_resources/dataset/buddhist.txt", encoding='utf8') as fh:
    wiki = fh.read()

# Display the raw text as the cell output.
wiki

'Buddhist universities and colleges in the United States\nFrom Wikipedia, the free encyclopedia\nJump to navigationJump to search\n\nThis article needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed.\nFind sources: "Buddhist universities and colleges in the United States" – news · newspapers · books · scholar · JSTOR (December 2009) (Learn how and when to remove this template message)\nThere are several Buddhist universities in the United States. Some of these have existed for decades and are accredited. Others are relatively new and are either in the process of being accredited or else have no formal accreditation. The list includes:\n\nDhammakaya Open University – located in Azusa, California, part of the Thai Wat Phra Dhammakaya[1]\nDharmakirti College – located in Tucson, Arizona Now called Awam Tibetan Buddhist Institute (http://awaminstitute.org/)\nDharma Realm Buddh

In [24]:
# Named groups pull the institution title, city and state out of lines shaped
# like "<title> – located in <city>, <state>". Under re.VERBOSE, unescaped
# whitespace in the pattern is ignored, so every literal space is written as
# "\ ". The pattern is a raw string because "\ " and "\w" are not valid Python
# string escapes. The title group is `.*`, so it keeps the space before the dash.
pattern = r"""(?P<title>.*)(–\ located\ in\ )(?P<city>\w*)(,\ )(?P<state>\w*)"""


for item in re.finditer(pattern, wiki, re.VERBOSE):
    print(item.groupdict())

    

{'title': 'Dhammakaya Open University ', 'city': 'Azusa', 'state': 'California'}
{'title': 'Dharmakirti College ', 'city': 'Tucson', 'state': 'Arizona'}
{'title': 'Dharma Realm Buddhist University ', 'city': 'Ukiah', 'state': 'California'}
{'title': 'Ewam Buddhist Institute ', 'city': 'Arlee', 'state': 'Montana'}
{'title': 'Institute of Buddhist Studies ', 'city': 'Berkeley', 'state': 'California'}
{'title': 'Maitripa College ', 'city': 'Portland', 'state': 'Oregon'}
{'title': 'University of the West ', 'city': 'Rosemead', 'state': 'California'}
{'title': 'Won Institute of Graduate Studies ', 'city': 'Glenside', 'state': 'Pennsylvania'}


In [32]:
# Read the whole nytimeshealth.txt file into a single string.
with open('coursera_resources/dataset/nytimeshealth.txt', encoding='utf-8') as fh:
    ny_hlth = fh.read()

In [42]:
# A hashtag is '#' followed by word characters. The lookahead requires a
# whitespace character right after it without consuming that character.
# \w already covers digits, so a separate \d in the class is unnecessary.
# Raw string: '\w' and '\s' are not valid Python string escapes.
pattern = r"#\w*(?=\s)"

for item in re.findall(pattern, ny_hlth):
    print("{} ".format(item))

#askwell 
#pregnancy 
#Colorado 
#VegetarianThanksgiving 
#FallPrevention 
#Ebola 
#Ebola 
#ebola 
#Ebola 
#Ebola 
#EbolaHysteria 
#AskNYT 
#Ebola 
#Ebola 
#Liberia 
#Excalibur 
#ebola 
#Ebola 
#dallas 
#nobelprize2014 
#ebola 
#ebola 
#monrovia 
#ebola 
#nobelprize2014 
#ebola 
#nobelprize2014 
#Medicine 
#Ebola 
#Monrovia 
#Ebola 
#smell 
#Ebola 
#Ebola 
#Ebola 
#Monrovia 
#Ebola 
#ebola 
#monrovia 
#liberia 
#benzos 
#ClimateChange 
#Whole 
#Wheat 
#Focaccia 
#Tomatoes 
#Olives 
#Recipes 
#Health 
#Ebola 
#Monrovia 
#Liberia 
#Ebola 
#Ebola 
#Liberia 
#Ebola 
#blood 
#Ebola 
#organtrafficking 
#EbolaOutbreak 
#SierraLeone 
#Freetown 
#SierraLeone 
#ebolaoutbreak 
#kenema 
#ebola 
#Ebola 
#ebola 
#ebola 
#Ebola 
#ASMR 
#AIDS2014 
#AIDS 
#MH17 
#benzos 
