In [14]:
"""
Case 1 - Regular Expressions
"""
# Example - find phone number patterns without regex (manual character checks)
def isPhoneNumber(text):
    """Return True when text is shaped exactly like 415-555-1011.

    The value must be 12 characters long: three decimal characters, a
    hyphen, three decimal characters, a hyphen, then four decimal
    characters. Any other shape returns False.
    """
    if len(text) != 12:
        return False

    # Area code: positions 0-2.
    if not all(text[pos].isdecimal() for pos in range(3)):
        return False

    # Both separators must be hyphens.
    if not (text[3] == '-' and text[7] == '-'):
        return False

    # Remaining digit positions: 4-6 and 8-11.
    return all(text[pos].isdecimal() for pos in (4, 5, 6, 8, 9, 10, 11))

# Quick check on a well-formed number (the returned value is discarded here).
isPhoneNumber('155-155-1555')

# isPhoneNumber('Check')

message = 'Call me at 415-555-1011 tomorrow. 415-555-9999 is my office.'

# Slide a 12-character window over the message, one start index at a time,
# and report every window that is itself a phone number.
for i in range(len(message)):
    chunk = message[i:i + 12]
    if not isPhoneNumber(chunk):
        continue
    print('Phone number found: ' + chunk)

print('done')

Phone number found: 415-555-1011
Phone number found: 415-555-9999
done


In [2]:
# Using regex
import re # library for regular expressions

# Compile the phone-number pattern once so it can be reused.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d\-\d\d\d\d')

# search() returns a Match object for the first hit; group() then extracts
# the matched text, so res ends up holding a plain string.
res = phoneNumRegex.search('My number is 234-123-2944')
res = res.group()

# print(res)

# Exercise (using re library)
# Example - Japanese post code: three digits, a hyphen, four digits
JPN_POST_CODE = re.compile(r'\d\d\d-\d\d\d\d')

address = '125-0041, Tokyo-to Katsushika-ku, Kanamachi 1-chome, 4th building room no. 225. 192-0003'

# search() stops at the first match, so only the leading post code is found.
res = JPN_POST_CODE.search(address)

# group(0) is the entire matched text (same as calling group() with no argument).
print(res.group(0))

# Link - http://regexpal.com/

125-0041


In [64]:
# Parentheses grouping
# Reusing phone number
'''
The first set of parentheses in a regex string will be group 1. The second set will
be group 2. By passing the integer 1 or 2 to the group() match object method,
you can grab different parts of the matched text. Passing 0 or nothing to the
group() method will return the entire matched text.
'''
# Two capture groups: group 1 is the first three digits, group 2 is the rest of the number.
PHONE_NUM_REGEX = re.compile(r'(\d\d\d)-(\d\d\d\-\d\d\d\d)')

res = PHONE_NUM_REGEX.search('My number is 234-123-2944')

print(res.group(1))  # text captured by the first set of parentheses
res.group(2)  # text captured by the second set; the value is discarded because nothing prints it

# Matching alternatives with the pipe (|): either the hyphenated form or five plain digits
POST_CODE_REG = re.compile(r'\d\d\d-\d\d\d\d|\d\d\d\d\d')

post_code_message = '124-9023, 43200'

jpn_post_code = POST_CODE_REG.search(post_code_message)
jpn_post_code.group() # search() only reports the first match found in the string

# findall() returns a list of every match; this pattern has no capture groups,
# so each list item is the full matched string
my_post_code = POST_CODE_REG.findall(post_code_message)
my_post_code

# Matching with ?
'''
Use when a pattern should match only optionally. The regex should find a match whether or not that bit of text is there.
You can think of the ? as saying, “Match zero or one of the group preceding this
question mark.”
''' 
# The (wo) group is followed by ?, so the pattern matches with or without it.
batRegex = re.compile(r'Bat(wo)?man')

bat_message, bat_w_message = 'The Adventures of Batman', 'The Adventures of Batwoman'

# Run the same pattern over both titles.
mo1, mo2 = (batRegex.search(title) for title in (bat_message, bat_w_message))

mo1.group()
mo2.group()


234


'Batwoman'

In [11]:
# Case insensitive matching

batRegex = re.compile(r'Bat(wo)?man', re.IGNORECASE) # re.IGNORECASE (or re.I) as the second argument makes matching ignore letter case

# NOTE(review): bat_message is assigned but never searched in this cell;
# confirm whether the search below was meant to use it.
bat_message = 'Batman, BATMAN, batMan, batman'

res = batRegex.search('Batwoman')

res.group()  # value is discarded: nothing prints it and res is reassigned below

# Several flags are combined with the bitwise OR operator (|).
batRegexTwo = re.compile(r'Bat(wo)?man', re.IGNORECASE | re.DOTALL | re.VERBOSE)
res = batRegexTwo.search('batman, batWoman, batWOMAN').group()  # search() gives the first match; group() extracts its text
res

'batman'

In [24]:
# Project: Phone number and email address extractor
import pyperclip, re


def findPhoneNumberEmailMatch(text=''):
    """Find phone numbers and email addresses in text, print them, and copy them to the clipboard.

    Phone numbers are normalised to hyphen-joined parts (area code when
    present, then the three-digit and four-digit parts) with an optional
    " x<extension>" suffix. Email addresses are reported as matched. Phone
    numbers are listed first, then email addresses, each in order of
    appearance.

    Args:
        text: The string to scan. Defaults to the empty string, which
            yields no matches.

    Returns:
        None. When at least one match is found, the newline-joined results
        are copied to the clipboard with pyperclip and printed. Otherwise a
        "nothing found" message is printed and the clipboard is not touched.
    """
    # Named groups keep the extraction below independent of group positions.
    phoneRegex = re.compile(r'''
        (?P<area>\d{3}|\(\d{3}\))?                  # optional area code, bare or in parentheses
        (?:\s|-|\.)?                                # optional separator
        (?P<prefix>\d{3})                           # first three digits
        (?:\s|-|\.)                                 # separator
        (?P<line>\d{4})                             # last four digits
        (?:\s*(?:ext\.?|x)\s*(?P<ext>\d{2,5}))?     # optional extension; the dot is escaped so it only matches a literal dot
    ''', re.VERBOSE)

    # The top-level domain accepts two or more letters so longer ones are not truncated.
    emailRegex = re.compile(r'''
        [a-zA-Z0-9._%+-]+       # user name
        @
        [a-zA-Z0-9.-]+          # domain name
        \.[a-zA-Z]{2,}          # dot followed by the top-level domain
    ''', re.VERBOSE)

    # The text is scanned directly; routing it through the clipboard first
    # would overwrite whatever the user had copied, even when nothing matches.
    matches = []

    for phoneMatch in phoneRegex.finditer(text):
        parts = [phoneMatch.group(name) for name in ('area', 'prefix', 'line')]
        # Skip a missing area code so the result has no leading hyphen.
        phoneNum = '-'.join(part for part in parts if part)
        extension = phoneMatch.group('ext')
        if extension:
            phoneNum += ' x' + extension
        matches.append(phoneNum)

    matches.extend(emailMatch.group() for emailMatch in emailRegex.finditer(text))

    # Copy results to clipboard
    if matches:
        results = '\n'.join(matches)
        pyperclip.copy(results)
        print('Copied to clipboard')
        print(results)
    else:
        print('No phone numbers or email address found')

In [32]:
# Text with one email address and one phone number: matches are printed and copied to the clipboard.
findPhoneNumberEmailMatch('erre@gmail.com, 122-302-1232')

# No argument: the default text is empty, so the "nothing found" message is printed.
findPhoneNumberEmailMatch()

Copied to clipboard
122-302-1232
erre@gmail.com
No phone numbers or email address found
