# Advanced Web Scraping and Data Gathering

### Using the Requests Library to Get a Response from the Wikipedia Home Page

In [1]:
# import the requests library
import requests

In [2]:
# Assign the home page URL to a variable
wiki_home = "https://en.wikipedia.org/wiki/Main_Page"

In [3]:
# Get a response from this page
response = requests.get(wiki_home)
response

<Response [200]>

### Checking the Status of the Web Request

In [4]:
# create a status_check function
def status_check(r):
    """Report whether a web request succeeded.

    Prints "Success" and returns 1 when ``r.status_code`` equals 200;
    otherwise prints "Failed" and returns -1.
    """
    succeeded = r.status_code == 200
    print("Success" if succeeded else "Failed")
    return 1 if succeeded else -1

In [5]:
# Pass the response object to the status_check function
status_check(response)

Success


1

### Decoding the Contents of a Response and Checking Its Length

In [6]:
# Write a utility function to decode the contents of the response
def encoding_check(r):
    """Return the value of the response object's ``encoding`` attribute."""
    reported_encoding = r.encoding
    return reported_encoding

def decode_content(r, encoding):
    """Decode the raw body of a response into a string.

    Parameters
    ----------
    r : object
        Response-like object whose ``content`` attribute holds the body as bytes.
    encoding : str or None
        Name of the codec to decode with. When it is ``None`` or empty
        (a response may not declare an encoding), UTF-8 is used instead of
        letting ``bytes.decode`` fail with a ``TypeError``.

    Returns
    -------
    str
        The decoded body.
    """
    return r.content.decode(encoding or 'utf-8')

In [7]:
contents = decode_content(response, encoding_check(response))

In [8]:
# Check the type of the decoded object
type(contents)

str

In [9]:
# Check the length of the object
len(contents)

84245

In [10]:
# print the first 10,000 characters of this string
contents[:10000]

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Wikipedia, the free encyclopedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"2a0f3101-89fc-409e-88c1-18e1b21e6623","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Main_Page","wgTitle":"Main Page","wgCurRevisionId":1004593520,"wgRevisionId":1004593520,"wgArticleId":15580374,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":[],"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"Main_Page","wgRelevantArticleId":15580374,"wgIsProbablyEditable":!1,"wgReleva

### Extracting Readable Text from a BeautifulSoup Object

In [11]:
# Import the package and then pass on the whole string (HTML content)
from bs4 import BeautifulSoup

soup = BeautifulSoup(contents, 'html.parser')
soup

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Wikipedia, the free encyclopedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"2a0f3101-89fc-409e-88c1-18e1b21e6623","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Main_Page","wgTitle":"Main Page","wgCurRevisionId":1004593520,"wgRevisionId":1004593520,"wgArticleId":15580374,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":[],"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"Main_Page","wgRelevantArticleId":15580374,"wgIsProbablyEditable":!1,"wgRelevantPag

In [12]:
txt_dump = soup.text
txt_dump

'\n\n\n\nWikipedia, the free encyclopedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nMain Page\n\nFrom Wikipedia, the free encyclopedia\n\n\n\nJump to navigation\nJump to search\n\n\n\n\nWelcome to Wikipedia,\nthe free encyclopedia that anyone can edit.\n6,333,223 articles in English\n\n\nThe arts\nBiography\nGeography\nHistory\nMathematics\nScience\nSociety\nTechnology\nAll portals\n\n\n\n\n\nFrom today\'s featured article\n\n\n\n\nThe red-bellied black snake (Pseudechis porphyriacus) is a species of elapid snake native to Australia. Described by George Shaw in 1794, it is one of eastern Australia\'s most commonly encountered snakes. Averaging around 1.25 metres (4\xa0ft) in length when fully grown, it has glossy black upperparts, bright red or orange flanks and a pink or dull red belly. It generally avoids people, but can attack if provoked.  Although its venom is capable of causing significant illness, containing neurotoxins, myotoxins, coagulants and haemo

In [13]:
# Find the length of txt_dump
len(txt_dump)

9826

### Using Advanced BS4 Techniques to Extract Relevant Text

In [14]:
# Scan all the tags of the HTML page to find and extract the text associated with a particular <div> element
# Collect the text of every <ul> nested inside the <div> whose id is 'mp-otd'
text_list = [
    ul.text
    for div in soup.find_all('div')
    if div.get('id') == 'mp-otd'
    for ul in div.find_all('ul')
]

In [15]:
# Print the elements separated by a marker
for i in text_list:
    print(i)
    print('-'*100)

1640 – The Virginia Governor's Council made John Punch the first legally recognized slave in England's North American colonies.
1790 – Russo-Swedish War: During the Battle of Svensksund in the Baltic Sea, the Swedish Navy captured a third of the Russian fleet.
1958 – An earthquake struck Lituya Bay, Alaska; the subsequent megatsunami, the largest in modern times, reached an elevation of 1,720 ft (524 m).
1981 – Nintendo released the arcade game Donkey Kong (cabinet pictured), which featured the debut of Mario, one of the most famous characters in video-game history.
1995 – Sri Lankan Civil War: After advising civilians to take shelter in places of worship, the Sri Lanka Air Force bombed a church in Navaly, killing at least 147 people.
----------------------------------------------------------------------------------------------------
Anastasius I Dicorus  (d. 518)Sebald Heyden  (d. 1561)Mercedes Sosa  (b. 1935)
---------------------------------------------------------------------------

### Creating a Compact Function to Extract the On this day Text from the Wikipedia Home Page

In [16]:
# Create a function that extracts the text from the On this day section of the Wikipedia home page
def wiki_on_this_day(url='https://en.wikipedia.org/wiki/Main_Page'):
    """Return the text of the first list in the page's "On this day" box.

    The page at ``url`` is downloaded, parsed with BeautifulSoup, and the text
    of the first ``<ul>`` found inside a ``<div id="mp-otd">`` is returned.

    Parameters
    ----------
    url : str
        Address of the page to scrape. Defaults to the English Wikipedia
        home page.

    Returns
    -------
    str or int
        The list text on success. ``-1`` (after printing a message) when the
        server does not answer with status 200 or when the page contains no
        such list.
    """
    import requests
    from bs4 import BeautifulSoup

    # A timeout keeps the call from blocking forever on an unresponsive server.
    response = requests.get(str(url), timeout=10)

    if response.status_code != 200:
        print("Sorry could not reach the web page!")
        return -1

    # The response may not declare an encoding; fall back to UTF-8 in that case.
    contents = response.content.decode(response.encoding or 'utf-8')

    # Create a BeautifulSoup object and read the contents of the web page
    soup = BeautifulSoup(contents, 'html.parser')
    for section in soup.find_all('div', id='mp-otd'):
        first_list = section.find('ul')
        if first_list is not None:
            return first_list.text

    # Reached only when the expected markup is missing from the page.
    print('Sorry could not find the "On this day" section!')
    return -1

In [17]:
# use the function
print(wiki_on_this_day())

1640 – The Virginia Governor's Council made John Punch the first legally recognized slave in England's North American colonies.
1790 – Russo-Swedish War: During the Battle of Svensksund in the Baltic Sea, the Swedish Navy captured a third of the Russian fleet.
1958 – An earthquake struck Lituya Bay, Alaska; the subsequent megatsunami, the largest in modern times, reached an elevation of 1,720 ft (524 m).
1981 – Nintendo released the arcade game Donkey Kong (cabinet pictured), which featured the debut of Mario, one of the most famous characters in video-game history.
1995 – Sri Lankan Civil War: After advising civilians to take shelter in places of worship, the Sri Lanka Air Force bombed a church in Navaly, killing at least 147 people.


### Creating an XML File and Reading XML Element Objects

In [18]:
# Create an XML document as a string
data = '''
<person>
  <name>Dave</name>
  <surname>Piccardo</surname>
  <phone type="intl">
    +1 742 101 4456
  </phone>
  <email hide="yes">
    dave.p@gmail.com</email>
</person>'''

In [19]:
# read it as an Element object using the Python XML parser engine
import xml.etree.ElementTree as ET
tree = ET.fromstring(data)
type(tree)

xml.etree.ElementTree.Element

### Finding Various Elements of Data within a Tree (Element)

In [20]:
# Use the find method to find Name
print('Name:', tree.find('name').text)

Name: Dave


In [21]:
# Use the find method to find Surname
print('Surname:', tree.find('surname').text)

Surname: Piccardo


In [22]:
# Use the find method to find Phone
print('Phone:', tree.find('phone').text.strip())

Phone: +1 742 101 4456


In [23]:
# Use the find method to find email status and actual email
print('Emaill hidden:', tree.find('email').get('hide'))
print('Email:', tree.find('email').text.strip())

Emaill hidden: yes
Email: dave.p@gmail.com


### Traversing the Tree, Finding the Root, and Exploring All the Child Nodes and Their Tags and Attributes

In [24]:
# define the tree
tree2 = ET.parse('datasets/xml1.xml')
type(tree2)

xml.etree.ElementTree.ElementTree

In [25]:
# Explore these tags and attributes
root = tree2.getroot()
for child in root:
    print('Child:', child.tag, '| Child attribute:', child.attrib)

Child: country | Child attribute: {'name': 'Liechtenstein'}
Child: country | Child attribute: {'name': 'Singapore'}
Child: country | Child attribute: {'name': 'Panama'}


### Using the text Attribute to Extract Meaningful Data

In [26]:
# Access the root[0][2] element
root[0][2]

<Element 'gdppc' at 0x00000293AC73FF48>

In [27]:
# Use the text attribute to access the data
root[0][2].text

'141100'

In [28]:
# Use the tag attribute to get the element's tag name (gdppc)
root[0][2].tag

'gdppc'

In [29]:
# Check root[0]
root[0]

<Element 'country' at 0x00000293AB19B8B8>

In [30]:
# check the tag
root[0].tag

'country'

In [31]:
# Use the attrib attribute to access the element's attributes
root[0].attrib

{'name': 'Liechtenstein'}

### A Simple Demo of Using XML Data Obtained by Web Scraping

In [32]:
import urllib.parse

In [33]:
# Read from the Recipe Puppy website
serviceurl = 'http://www.recipepuppy.com/api/?'
item = input('Enter the name of a food item (enter \'quit\' to quit): ').strip()
# Honour the 'quit' option offered by the prompt instead of querying the API for it.
if item.lower() != 'quit':
    # Encode every query parameter in one place so all values are escaped consistently.
    url = serviceurl + urllib.parse.urlencode({'q': item, 'p': 1, 'format': 'xml'})
    # A timeout keeps the cell from hanging if the service does not respond.
    uh = requests.get(url, timeout=10)
    data = uh.text
    print('Retrieved', len(data), 'characters')

Enter the name of a food item (enter 'quit' to quit): rice
Retrieved 4374 characters


### Defining and Testing a Function to Pull Country Data from an API

In [34]:
# import libraries
import urllib.request, urllib.parse
from urllib.error import HTTPError, URLError
import json
import pandas as pd

In [35]:
# Define the service_url variable
serviceurl = 'https://restcountries.eu/rest/v2/name/'

In [36]:
# Build the request URL for a sample country and open it directly
# (the reusable function is defined in a later cell).
# NOTE(review): the response object `uh` is neither read nor closed in this cell.
country_name = 'Switzerland'
url = serviceurl + country_name
uh = urllib.request.urlopen(url)

In [37]:
# Define the get_country_data function
def get_country_data(country):
    """
    Function to get data about country from "https://restcountries.eu" API

    The country name is appended to the module-level ``serviceurl`` and the
    resulting URL is fetched.

    Parameters
    ----------
    country : object
        Name of the country; it is converted with ``str()`` before use.

    Returns
    -------
    str or None
        The decoded response body on success, ``None`` (after printing a
        message) when the request fails with an HTTP or URL error.
    """
    country_name = str(country)
    # Percent-encode the name so that spaces and non-ASCII characters
    # (e.g. "South Africa") still produce a valid URL.
    url = serviceurl + urllib.parse.quote(country_name)
    try:
        # The context manager closes the connection once the body has been read.
        with urllib.request.urlopen(url) as uh:
            data = uh.read().decode()
    except HTTPError:
        # Listed before URLError because HTTPError is a subclass of it.
        print(f"Sorry! Could not retrieve anything on {country_name}")
        return None
    except URLError as e:
        print('Failed to reach a server.')
        print('Reason: ', e.reason)
        return None
    else:
        print(f"Retrieved data on {country_name}. Total {len(data)} characters read.")
        return data

In [38]:
data = get_country_data(country_name)

Retrieved data on Switzerland. Total 1090 characters read.


In [39]:
# Pass a misspelled country name to check the error handling
get_country_data('Swiitzerland')

Sorry! Could not retrieve anything on Swiitzerland


### Testing the Function by Building a Small Database of Country Information

In [40]:
# Load from string data 
x = json.loads(data)

In [41]:
# Load the only element
y = x[0]

In [42]:
# check the type of y
type(y)

dict

In [43]:
# print the keys of y
y.keys()

dict_keys(['name', 'topLevelDomain', 'alpha2Code', 'alpha3Code', 'callingCodes', 'capital', 'altSpellings', 'region', 'subregion', 'population', 'latlng', 'demonym', 'area', 'gini', 'timezones', 'borders', 'nativeName', 'numericCode', 'currencies', 'languages', 'translations', 'flag', 'regionalBlocs', 'cioc'])

In [44]:
# Iterate over the dictionary and print the key/item pairs
for k, v in y.items():
    print(k, ":", v)

name : Switzerland
topLevelDomain : ['.ch']
alpha2Code : CH
alpha3Code : CHE
callingCodes : ['41']
capital : Bern
altSpellings : ['CH', 'Swiss Confederation', 'Schweiz', 'Suisse', 'Svizzera', 'Svizra']
region : Europe
subregion : Western Europe
population : 8341600
latlng : [47.0, 8.0]
demonym : Swiss
area : 41284.0
gini : 33.7
timezones : ['UTC+01:00']
borders : ['AUT', 'FRA', 'ITA', 'LIE', 'DEU']
nativeName : Schweiz
numericCode : 756
currencies : [{'code': 'CHF', 'name': 'Swiss franc', 'symbol': 'Fr'}]
languages : [{'iso639_1': 'de', 'iso639_2': 'deu', 'name': 'German', 'nativeName': 'Deutsch'}, {'iso639_1': 'fr', 'iso639_2': 'fra', 'name': 'French', 'nativeName': 'français'}, {'iso639_1': 'it', 'iso639_2': 'ita', 'name': 'Italian', 'nativeName': 'Italiano'}]
translations : {'de': 'Schweiz', 'es': 'Suiza', 'fr': 'Suisse', 'ja': 'スイス', 'it': 'Svizzera', 'br': 'Suíça', 'pt': 'Suíça', 'nl': 'Zwitserland', 'hr': 'Švicarska', 'fa': 'سوئیس'}
flag : https://restcountries.eu/data/che.svg
re

In [45]:
# Create a loop to extract the languages spoken in Switzerland
for lang in y['languages']:
    print(lang['name'])

German
French
Italian


In [46]:
# Define the build_country_database function
def build_country_database(list_country):
    """
    Takes a list of country names.
    Output a DataFrame with key information about those countries.

    Each name is looked up with ``get_country_data``; names for which that
    call returns ``None`` (or whose response holds no entries) are skipped.
    Only the first entry of each response is used.

    Parameters
    ----------
    list_country : iterable
        Country names to look up.

    Returns
    -------
    pandas.DataFrame
        One row per country found, with the columns Country, Capital, Region,
        Sub-region, Population, Latitude, Longitude, Area, Gini, Timezones,
        Currencies and Languages. Multi-valued fields (time zones, currency
        names, language names) are stored as comma-separated strings.
    """
    def _join_names(entries):
        # Comma-join the 'name' of each dict, skipping entries that have none.
        return ','.join(e['name'] for e in (entries or []) if e.get('name'))

    # Define an empty dictionary with keys
    country_dict={'Country':[],'Capital':[],'Region':[],'Sub-region':[],'Population':[],
                  'Latitude':[],'Longitude':[],'Area':[],'Gini':[],'Timezones':[],
                  'Currencies':[],'Languages':[]}

    for c in list_country:
        data = get_country_data(c)
        if data is None:
            continue
        matches = json.loads(data)
        if not matches:
            continue
        y = matches[0]

        # Pad so that a missing or short coordinate list yields None instead of an IndexError.
        latitude, longitude = (list(y.get('latlng') or []) + [None, None])[:2]

        # Collect the whole row first so that a failure while reading one field
        # cannot leave the per-column lists with different lengths.
        row = {
            'Country': y['name'],
            'Capital': y.get('capital'),
            'Region': y.get('region'),
            'Sub-region': y.get('subregion'),
            'Population': y.get('population'),
            'Latitude': latitude,
            'Longitude': longitude,
            'Area': y.get('area'),
            'Gini': y.get('gini'),
            # join() handles one or many values alike; an empty list gives ''.
            'Timezones': ','.join(y.get('timezones') or []),
            'Currencies': _join_names(y.get('currencies')),
            'Languages': _join_names(y.get('languages')),
        }
        for column, value in row.items():
            country_dict[column].append(value)

    # Return as a Pandas DataFrame
    return pd.DataFrame(country_dict)

In [47]:
# test its robustness
df1 = build_country_database(['Nigeria','Switzerland','France','Turmeric','Russia','Kenya', 'Singapore'])

Retrieved data on Nigeria. Total 1004 characters read.
Retrieved data on Switzerland. Total 1090 characters read.
Retrieved data on France. Total 1047 characters read.
Sorry! Could not retrieve anything on Turmeric
Retrieved data on Russia. Total 1120 characters read.
Retrieved data on Kenya. Total 1052 characters read.
Retrieved data on Singapore. Total 1223 characters read.


In [48]:
# print the dataframe
df1

Unnamed: 0,Country,Capital,Region,Sub-region,Population,Latitude,Longitude,Area,Gini,Timezones,Currencies,Languages
0,Nigeria,Abuja,Africa,Western Africa,186988000,10.0,8.0,923768.0,48.8,UTC+01:00,Nigerian naira,English
1,Switzerland,Bern,Europe,Western Europe,8341600,47.0,8.0,41284.0,33.7,UTC+01:00,Swiss franc,"German,French,Italian"
2,France,Paris,Europe,Western Europe,66710000,46.0,2.0,640679.0,32.7,"UTC-10:00,UTC-09:30,UTC-09:00,UTC-08:00,UTC-04...",Euro,French
3,Russian Federation,Moscow,Europe,Eastern Europe,146599183,60.0,100.0,17124442.0,40.1,"UTC+03:00,UTC+04:00,UTC+06:00,UTC+07:00,UTC+08...",Russian ruble,Russian
4,Kenya,Nairobi,Africa,Eastern Africa,47251000,1.0,38.0,580367.0,47.7,UTC+03:00,Kenyan shilling,"English,Swahili"
5,Singapore,Singapore,Asia,South-Eastern Asia,5535000,1.366667,103.8,710.0,48.1,UTC+08:00,"Brunei dollar,Singapore dollar","English,Malay,Tamil,Chinese"


### Using the match Method to Check Whether a Pattern Matches a String/Sequence

In [49]:
# import regex module
import re

In [50]:
# Define a string and a pattern
string1 = 'Python'
pattern = r"Python"

In [51]:
# Write a conditional expression to check for a match
if re.match(pattern, string1):
    print("matches!")
else:
    print("Doesn't match!")

matches!


In [52]:
# Test this with a string that only differs in the first letter
string2 = 'python'

if re.match(pattern, string2):
    print("matches")
else:
    print("Doesn't match")

Doesn't match


### Compiling Programs to Match Objects

In [53]:
# Use the compile function from the regex module
def print_match(s):
    """Print the first part of ``s`` matched by the module-level ``prog``.

    Prints "No match" when the compiled pattern is not found anywhere in ``s``.
    """
    found = prog.search(s)
    print("No match" if found is None else found.group())

prog = re.compile(pattern)

In [54]:
# Match it with the first string
if prog.match(string1)!= None:
    print("Matches!")
else:
    print("Doesn't match.")

Matches!


In [55]:
# Match it with the second string
if prog.match(string2)!= None:
    print("Matches!")
else:
    print("Doesn't match.")

Doesn't match.


### Using Additional Parameters in the match Method to Check for Positional Matching

In [56]:
# Match y in the second position
prog = re.compile(r'y')
prog.match('Python', pos=1)

<re.Match object; span=(1, 2), match='y'>

In [57]:
# Check for a pattern called thon starting from pos=2
prog = re.compile(r'thon')
prog.match('Python', pos=2)

<re.Match object; span=(2, 6), match='thon'>

### The search Method in RegEx

In [58]:
# Use the compile method to find matching strings
prog = re.compile(r'ing')
if prog.match('Spring') == None:
    print('None')

None


In [59]:
# use search method
prog.search('Spring')

<re.Match object; span=(3, 6), match='ing'>

In [60]:
# Let's use Ringtone as the search parameter
prog.search('Ringtone')

<re.Match object; span=(1, 4), match='ing'>

### Using the span Method of the Match Object to Locate the Position of the Matched Pattern

In [61]:
prog = re.compile(r'ing')
words = ['Spring', 'Cycling', 'Ringtone']

In [62]:
# Create a function that prints the start and end positions of the match in each word
def verifying_pattern(s):
    """Print where the module-level compiled pattern ``prog`` occurs in each word.

    Parameters
    ----------
    s : iterable of str
        The words to search. (The words are taken from this argument rather
        than from the global ``words`` list.)

    For every word one line is printed: the start and end index of the first
    match, or a note that the word contains no match.
    """
    for word in s:
        mt = prog.search(word)
        if mt is None:
            # search() returns None when nothing matches; span() cannot be called on it.
            print(f'The word "{word}" does not contain "ing"')
            continue
        start_pos, end_pos = mt.span()
        print(f'The word "{word}" contains "ing" in the position {start_pos}-{end_pos}')

In [63]:
verifying_pattern(words)

The word "Spring" contains "ing" in the position 3-6
The word "Cycling" contains "ing" in the position 4-7
The word "Ringtone" contains "ing" in the position 1-4


### Examples of Single-Character Pattern Matching with search

In [64]:
# Pass a regex expression with a dot inside the compile method.
# It matches any single character except a newline character
prog = re.compile(r'py.')
print(prog.search('pygmy').group())
print(prog.search('Jupyter').group())

pyg
pyt


In [65]:
# Pass a regex expression with \w (lowercase w) inside the compile method.
# It matches any single letter, digit, or underscore
prog = re.compile(r'c\wm')
print(prog.search('comedy').group())
print(prog.search('camera').group())
print(prog.search('pac_man').group())
print(prog.search('pac2man').group())

com
cam
c_m
c2m


In [66]:
# Pass a regex expression with \W (uppercase W) inside the compile method.
# It matches anything not covered by \w:
prog = re.compile(r'4\W1')
print(prog.search('4/1 was a wonderful day!').group())
print(prog.search('4-1 was a wonderful day!').group())
print(prog.search('4.1 was a wonderful day!').group())
print(prog.search('Remember the wonderful day 04/1?').group())

4/1
4-1
4.1
4/1


In [67]:
# Pass a regex expression with \s (lowercase s) inside the compile method.
# It matches a single whitespace character, such as a space, newline, tab, or return
prog = re.compile(r'Data\swrangling')
print(prog.search("Data wrangling is cool").group())
print("-"*80)
print("Data\twrangling is the full string")
print(prog.search("Data\twrangling is the full string").group())
print("-"*80)
print("Data\nwrangling is the full string")
print(prog.search("Data\nwrangling").group())

Data wrangling
--------------------------------------------------------------------------------
Data	wrangling is the full string
Data	wrangling
--------------------------------------------------------------------------------
Data
wrangling is the full string
Data
wrangling


In [68]:
# Pass a regex expression with \d inside the compile method. It matches numerical digits 0-9
prog = re.compile(r'score was \d\d')
print(prog.search('My score was 67').group())
print(prog.search('Your score was 73').group())

score was 67
score was 73


### Handling Pattern Matching at the Start or End of a String

In [69]:
# Use ^ (caret) to match a pattern at the start of the string
prog = re.compile(r'^India')
print_match("Russia implemented this law")
print_match("India implemented this law")
print_match("This law was implemented by India")

No match
India
No match


In [70]:
# Use $ (dollar sign) to match a pattern at the end of the string
prog = re.compile(r'Apple$')
print_match("Patent no 123456 belongs to Apple")
print_match("Patent no 345672 belongs to Samsung")
print_match("Patent no 987654 belongs to Apple")

Apple
No match
Apple


### Pattern Matching with Multiple Characters

In [71]:
# Use * to match 0 or more repetitions of the preceding regular expression
prog = re.compile(r'ab*')
print_match("a")
print_match("ab")
print_match("abbb")
print_match("b")
print_match("bbab")
print_match("something_abb_something")

a
ab
abbb
No match
ab
abb


In [72]:
# Using + causes the resulting RE to match 1 or more repetitions of the preceding regular expression
prog = re.compile(r'ab+')
print_match("a")
print_match("ab")
print_match("abbb")
print_match("b")
print_match("bbab")
print_match("something_abb_something")

No match
ab
abbb
No match
ab
abb


In [73]:
# ? causes the resulting re string to match precisely 0 or 1 repetitions of the preceding regular expression
prog = re.compile(r'ab?')
print_match("a")
print_match("ab")
print_match("abbb")
print_match("b")
print_match("bbab")
print_match("something_abb_something")

a
ab
ab
No match
ab
ab


### Greedy versus Non-Greedy Matching

In [74]:
# Write the code to check the greedy way of matching a string
prog = re.compile(r'<.*>')
print_match('<a> b <c>')

<a> b <c>


In [75]:
# Use ? by inserting it after any regex expression to make it non-greedy
prog = re.compile(r'<.*?>')
print_match('<a> b <c>')

<a>


### Controlling Repetitions to Match in a Text

In [76]:
# {m} specifies exactly m copies of RE to match.
# Fewer matches cause a non-match and return None
prog = re.compile(r'A{3}')
print_match("ccAAAdd")
print_match("ccAAAAdd")
print_match("ccAAdd")

AAA
AAA
No match


In [77]:
# {m,n} specifies from m to n copies of RE to match (as many as possible)
prog = re.compile(r'A{2,4}B')
print_match("ccAAABdd")
print_match("ccABdd")
print_match("ccAABBBdd")
print_match("ccAAAAAAABdd")

AAAB
No match
AAB
AAAAB


In [78]:
# Omitting m specifies a lower bound of zero
prog = re.compile(r'A{,3}B')
print_match("ccAAABdd")
print_match("ccABdd")
print_match("ccAABBBdd")
print_match("ccAAAAAAABdd")

AAAB
AB
AAB
AAAB


In [79]:
# Omitting n specifies an infinite upper bound
prog = re.compile(r'A{3,}B')
print_match("ccAAABdd")
print_match("ccABdd")
print_match("ccAABBBdd")
print_match("ccAAAAAAABdd")

AAAB
No match
No match
AAAAAAAB


In [80]:
# {m,n}? specifies m to n copies of RE to match in a non-greedy fashion
prog = re.compile(r'A{2,4}')
print_match("AAAAAAA")
prog = re.compile(r'A{2,4}?')
print_match("AAAAAAA")

AAAA
AA


### Sets of Matching Characters

In [81]:
# Letters-only local part, exactly one '@', letters-only domain, then '.com'.
# ('@' carries no '+' quantifier, so repeated '@' characters are not accepted.)
prog = re.compile(r'[a-zA-Z]+@[a-zA-Z]+\.com')
print_match("My email is coolguy@xyz.com")
print_match("My email is coolguy12@xyz.com")

coolguy@xyz.com
No match


In [82]:
# Let's add the digits as well
# Letters or digits in the local part, exactly one '@', letters-only domain, then '.com'.
prog = re.compile(r'[a-zA-Z0-9]+@[a-zA-Z]+\.com')
print_match("My email is coolguy12@xyz.com")
print_match("My email is coolguy12@xyz.org")

coolguy12@xyz.com
No match


In [83]:
# try to address this last email
# Accept any two- or three-letter top-level domain after exactly one '@' and one '.'.
prog = re.compile(r'[a-zA-Z0-9]+@[a-zA-Z]+\.[a-zA-Z]{2,3}')
print_match("My email is coolguy12@xyz.org")
print_match("My email is coolguy12[AT]xyz[DOT]org")

coolguy12@xyz.org
No match


### The Use of OR in RegEx Using the OR Operator

In [84]:
# Let's start with the OR operator
prog = re.compile(r'[0-9]{10}')
print_match("3124567897")
print_match("312-456-7897")

3124567897
No match


In [85]:
# Use multiple smaller regexes and logically combine them
prog = re.compile(r'[0-9]{10}|[0-9]{3}-[0-9]{3}-[0-9]{4}')
print_match("3124567897")
print_match("312-456-7897")

3124567897
312-456-7897


In [86]:
# Create four strings and execute print_match on them
# One sub-pattern per accepted phone-number layout
p1 = r'[0-9]{10}'                      # 3124567897
p2 = r'[0-9]{3}-[0-9]{3}-[0-9]{4}'     # 312-456-7897
p3 = r'\([0-9]{3}\)[0-9]{3}-[0-9]{4}'  # (312)456-7897
p4 = r'[0-9]{3}\.[0-9]{3}\.[0-9]{4}'   # 312.456.7897

# Combine the alternatives with the OR operator and try each sample number
pattern = '|'.join((p1, p2, p3, p4))
prog = re.compile(pattern)
for sample in ("3124567897", "312-456-7897", "(312)456-7897", "312.456.7897"):
    print_match(sample)

3124567897
312-456-7897
(312)456-7897
312.456.7897
