<a href="https://colab.research.google.com/github/M-110/automate-the-boring-stuff/blob/main/07_Pattern_Matching_With_Regex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project: Phone Number and Email Address Extractor

In [None]:
import re

import requests

In [None]:
def get_phone_numbers(text):
  """Returns phone numbers from the text.

  A number is an optional three-digit area code (bare or wrapped in
  parentheses), then three digits, then four digits, with an optional
  '.', '-' or space between the parts. It must be delimited by whitespace
  or by the start/end of the text.

  Args:
    text: String to search.

  Returns:
    A list of the numbers in order of appearance (duplicates kept), each
    normalized to 'AAA-EEE-NNNN', or 'EEE-NNNN' when no area code is present.
  """
  # Lookarounds test the whitespace boundary without consuming it, so two
  # numbers separated by a single space are both found.
  pattern = (r'(?<!\S)(?:\((\d{3})\)|(\d{3}))?(?:\.|-| )?'
             r'(\d{3})(?:\.|-|\s)?(\d{4})(?!\S)')
  numbers = []
  for match in re.finditer(pattern, text):
    paren_area, bare_area, exchange, line = match.groups(default='')
    # At most one of the two area-code groups participates; an absent area
    # code is dropped so the result never starts with a stray '-'.
    parts = [paren_area or bare_area, exchange, line]
    numbers.append('-'.join(part for part in parts if part))
  return numbers

In [None]:
# Download the contact page once; later cells reuse `text` as their input.
text = requests.get('https://nostarch.com/contactus/').text
# The bare final expression is rendered as this cell's output.
get_phone_numbers(text)

['800-420-7240',
 '415-863-9900',
 '415-863-9950',
 '800-420-7240',
 '415-863-9900',
 '415-863-9950',
 '800-420-7240',
 '415-863-9900',
 '415-863-9950',
 '800-420-7240',
 '415-863-9900']

In [None]:
def get_email_addresses(text):
  """Returns email addresses from the text.

  An address is a local part made of word characters and '.', '%', '+', '-',
  followed by '@' and a domain of at least two dot-separated labels.
  Surrounding punctuation or markup (for example '<', '>', ',', 'mailto:' or
  a sentence-ending '.') is not included in the result.

  Args:
    text: String to search.

  Returns:
    A list of the addresses in order of appearance (duplicates kept).
  """
  # The domain must end on a label character, so a trailing '.' is excluded.
  pattern = r'[\w.%+-]+@[\w-]+(?:\.[\w-]+)+'
  return re.findall(pattern, text)


In [None]:
get_email_addresses(text)

['info@nostarch.com',
 'media@nostarch.com',
 'academic@nostarch.com',
 'conferences@nostarch.com',
 'info@nostarch.com',
 'media@nostarch.com',
 'academic@nostarch.com',
 'conferences@nostarch.com',
 'info@nostarch.com']

In [None]:
%%writefile scrape_contacts.py
#!/usr/bin/env python
"""Prints out phone numbers and email adddresses from the given webpage."""
import argparse
import re

import requests

def main():
  """Fetch the page named on the command line and print its contact details.

  Prints every phone number and email address found in the page, or a
  'none found' message for each kind that is missing.
  """
  url = get_args().url
  # A timeout keeps the script from hanging forever on an unresponsive server.
  page_text = requests.get(url, timeout=30).text
  phone_numbers = get_phone_numbers(page_text)
  email_addresses = get_email_addresses(page_text)
  if phone_numbers:
    print('Phone numbers:')
    # Sorted so the output order is stable from run to run.
    for phone_number in sorted(phone_numbers):
      print(phone_number)
  else:
    print('No phone numbers found.')

  if email_addresses:
    print('Email addresses:')
    for email_address in sorted(email_addresses):
      print(email_address)
  else:
    print('No email addresses were found.')
  


def get_args():
  """Parse the command line and return the namespace holding the `url`."""
  arg_parser = argparse.ArgumentParser(
      description='Scrape phone numbers and email addresses from a url',
  )
  # Single required positional argument: the page to scrape.
  arg_parser.add_argument('url', help='URL to scrape')
  parsed_args = arg_parser.parse_args()
  return parsed_args


def get_phone_numbers(text):
  """Returns phone numbers from the text.

  A number is an optional three-digit area code (bare or wrapped in
  parentheses), then three digits, then four digits, with an optional
  '.', '-' or space between the parts. It must be delimited by whitespace
  or by the start/end of the text.

  Args:
    text: String to search.

  Returns:
    A set of the distinct numbers, each normalized to 'AAA-EEE-NNNN', or
    'EEE-NNNN' when no area code is present.
  """
  # Lookarounds test the whitespace boundary without consuming it, so two
  # numbers separated by a single space are both found.
  pattern = (r'(?<!\S)(?:\((\d{3})\)|(\d{3}))?(?:\.|-| )?'
             r'(\d{3})(?:\.|-|\s)?(\d{4})(?!\S)')
  numbers = set()
  for match in re.finditer(pattern, text):
    paren_area, bare_area, exchange, line = match.groups(default='')
    # At most one of the two area-code groups participates; an absent area
    # code is dropped so the result never starts with a stray '-'.
    parts = [paren_area or bare_area, exchange, line]
    numbers.add('-'.join(part for part in parts if part))
  return numbers


def get_email_addresses(text):
  """Returns email addresses from the text.

  An address is a local part made of word characters and '.', '%', '+', '-',
  followed by '@' and a domain of at least two dot-separated labels.
  Surrounding punctuation or markup (for example '<', '>', ',', 'mailto:' or
  a sentence-ending '.') is not included in the result.

  Args:
    text: String to search.

  Returns:
    A set of the distinct addresses found.
  """
  # The domain must end on a label character, so a trailing '.' is excluded.
  pattern = r'[\w.%+-]+@[\w-]+(?:\.[\w-]+)+'
  return set(re.findall(pattern, text))


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
  main()


Overwriting scrape_contacts.py


In [None]:
!python scrape_contacts.py https://nostarch.com/contactus/

Phone numbers:
415-863-9950
800-420-7240
415-863-9900
Email addresses:
academic@nostarch.com
info@nostarch.com
media@nostarch.com
conferences@nostarch.com


# Practice Projects

## Date Detection



In [None]:
def _days_in_month(month, year):
  """Returns the number of days in the given month of the given year."""
  if month == 2:
    is_leap_year = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
    return 29 if is_leap_year else 28
  if month in (4, 6, 9, 11):
    return 30
  return 31


def find_dates(text):
  """Prints the first valid DD/MM/YYYY date found in the text.

  Day and month must be two digits each and the year must be four digits in
  the range 1000-2999. Candidates that cannot exist on the calendar (for
  example 31/04 or 29/02 in a non-leap year) are skipped.

  Args:
    text: String to search.

  Returns:
    None. The result is printed: either the day, month and year of the first
    valid date, or 'No valid date found'.
  """
  # The digit lookarounds stop a date from being cut out of a longer digit
  # run such as '131/12/19545'.
  pattern = (r'(?<!\d)(0[1-9]|[12][0-9]|3[01])/(0[1-9]|1[0-2])/'
             r'([12]\d{3})(?!\d)')
  for match in re.finditer(pattern, text):
    day, month, year = match.groups()
    if int(day) <= _days_in_month(int(month), int(year)):
      print(f'Day: {day}, Month: {month}, Year: {year}')
      return
  print('No valid date found')

In [None]:
find_dates('hello world 31/12/1954')

Day: 31, Month: 12, Year: 1954


In [None]:
find_dates('hello world 31/15/1954')

No valid date found


## Strong Password Detection

In [None]:
def is_strong_password(password):
  """Returns True if password is strong.

  Every check is applied with re.match, so each one is anchored at the start
  of the password. The password passes when it begins with at least eight
  non-whitespace characters, contains both a lowercase and an uppercase
  ASCII letter, and contains at least one digit.
  """
  checks = (
      r'\S{8,}',                                  # length
      r'(.*[a-z].*[A-Z].*)|(.*[A-Z].*[a-z].*)',   # mixed case, either order
      r'.*\d.*',                                  # at least one digit
  )
  for check in checks:
    if re.match(check, password) is None:
      return False
  return True

In [None]:
is_strong_password('X5p3333333')

True

In [None]:
is_strong_password('abcdef543322')

False

In [None]:
is_strong_password('Cale03')

False

In [None]:
is_strong_password('CamErAMaN')

False

## Regex Version of strip() method

In [None]:
def regex_strip(text, replace_char=None):
  """Strip whitespace from edges of the text, or if replace_char is given, strip
  those characters from the edges.

  Args:
    text: String to strip.
    replace_char: Optional string of characters to remove from both ends. As
      with str.strip, every character in it is stripped individually. None
      (the default) strips whitespace.

  Returns:
    The stripped string. It is '' when nothing is left; newlines inside the
    text are preserved.
  """
  if replace_char is None:
    edge = r'\s'
  elif replace_char == '':
    # Nothing to strip; also avoids building an empty character class.
    return text
  else:
    # Escape so characters such as ']', '^', '-' or '\' are taken literally.
    edge = '[' + re.escape(replace_char) + ']'
  # Removing the edges (instead of capturing the middle) also works for
  # empty, single-character and multi-line results.
  return re.sub(fr'\A{edge}+|{edge}+\Z', '', text)

In [None]:
regex_strip('         HELLO WORLD        ')

'HELLO WORLD'

In [None]:
regex_strip('WWWWWWWHELLO WORLDWWWWWWW', replace_char='W')

'HELLO WORLD'