## Regular expressions

In [1]:
import re

In [3]:
# Locate a phone number of the form ddd-ddd-dddd inside a sentence.
# search() returns a match object for the first occurrence; group(0) is the matched text.
sentence = 'My number is 415-555-4242.'
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
mo = phoneNumRegex.search(sentence)
print('Phone number found: ' + mo.group(0))

Phone number found: 415-555-4242


In [4]:
# Same search as before, but the repeated \d is written with a {n} quantifier:
# \d{3} means "exactly three digits".
sentence = 'My number is 415-555-4242.'
phoneNumRegex = re.compile(r'\d{3}-\d{3}-\d{4}')
mo = phoneNumRegex.search(sentence)
print('My phone number is ' + mo.group(0))

My phone number is 415-555-4242


In [12]:
# Alternation: `Tina|Kinga` matches either name.
# search() reports the leftmost match in the searched string, so when both names
# are present the one that appears first in the text is returned.
sentence = 'Kinga i Tina to przyjacióki'
text = re.compile(r'Tina|Kinga')
word = text.search(sentence)
result = word.group(0)
print(result)

Kinga


In [38]:
# findall() returns every non-overlapping match.
# The pattern has no capture groups, so each result is the full matched string.
contacts = 'Cell: 415-555-9999 Work: 212-555-0000'
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
phoneNumRegex.findall(contacts)

['415-555-9999', '212-555-0000']

In [18]:
# With capture groups in the pattern, findall() returns one tuple per match,
# holding the text of each group instead of the whole match.
contacts = 'Cell: 415-555-9999 Work: 212-555-0000'
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)')
phoneNumRegex.findall(contacts)

[('415', '555', '9999'), ('212', '555', '0000')]

In [29]:
# `.`   matches any single character except a newline
# `*`   repeats the preceding item zero or more times (greedy: as much as possible)
# `.*?` is the non-greedy form: it consumes as little as possible. Nothing follows it
#       in this pattern, so it matches zero characters and only the literal
#       prefix 'First Name: ' is returned.
full_name_line = 'First Name: Al Last Name: Sweigart'
nameRegex = re.compile(r'First Name: .*?')
mo = nameRegex.search(full_name_line)
mo.group(0)

'First Name: '

In [64]:
# Phone number regex, written in verbose mode: whitespace and `#` comments inside
# the pattern are ignored, so the expression can be spread over several lines
# with one commented piece per line.
phone_pattern = r'''(\((\+|\d)\d{2}\))? # area code; ? means that this part of number is optional
(\s|-)? # separator - optional 
(\d{3}) # first 3 digits 
(\s|-)? # separator - optional 
(\d{3}) # second 3 digits 
(\s|-)? # separator - optional 
(\d{3}) # last 3 digits 

'''
phone = re.compile(phone_pattern, re.VERBOSE)
# The pattern has 8 capture groups, so findall() returns an 8-tuple per match;
# optional groups that did not take part in the match come back as ''.
phone.findall('My phone number is (048)604 588-642 and (+48) 456-987-203')


[('(048)', '0', '', '604', ' ', '588', '-', '642'),
 ('(+48)', '+', ' ', '456', '-', '987', '-', '203')]

In [44]:
# Build the phone number regex (same pattern as the earlier phone-number cell).
# re.X is the short alias of re.VERBOSE: whitespace and `#` comments inside the
# pattern are ignored, which allows the multi-line, commented layout below.
import re
phone = re.compile(
    r'''(\((\+|\d)\d{2}\))?  # area code; ? means that this part of number is optional
(\s|-)? # separator - optional 
(\d{3}) # first 3 digits 
(\s|-)? # separator - optional 
(\d{3}) # second 3 digits 
(\s|-)? # separator - optional 
(\d{3}) # last 3 digits 

''',
    flags=re.X,
)
# One tuple per match, one entry per capture group.
sample_numbers = 'My phone number is (048)604 588-642 and (+48) 456-987-203'
phone.findall(sample_numbers)


[('(048)', '0', '', '604', ' ', '588', '-', '642'),
 ('(+48)', '+', ' ', '456', '-', '987', '-', '203')]

In [45]:
# Minimal e-mail regex: a run of non-whitespace, an '@', another run of non-whitespace.
#   \S  matches any character that is not whitespace
#   +   means "one or more of the preceding item"
# Because \S also accepts punctuation, anything glued to the address is captured too.
sample_sentence = 'My e mail address is: KIinga0kuzmia1k@interia.pl and I like cats'
mail = re.compile(r'\S+@\S+', flags=re.VERBOSE)
mail.findall(sample_sentence)

['KIinga0kuzmia1k@interia.pl']

In [105]:
# Create e-mail regex - another way
# Local part: letters, digits, dot, underscore and hyphen.
# The hyphen is placed last in the character class so that it is a literal '-'.
# Written as `.-_` it is a range from '.' to '_', which accepts characters such
# as '@', '<' and '/' but not '-' itself.
# Domain: letters, a dot, letters (e.g. gmail.com).
# The pattern contains no whitespace or comments, so re.VERBOSE is not needed.
mail = re.compile(r'[a-zA-Z0-9._-]+@[a-zA-Z]+\.[a-zA-Z]+')
mail.findall('My e mail address is: KIinga0kuzmia1k@igmail.com and I like cats')

['KIinga0kuzmia1k@igmail.com']

In [47]:
# Find matches of phone numbers and emails in the text.
# Relies on the `phone` and `mail` regexes compiled in the earlier cells.
text = '''My phone number is (048)604 588-642 and (+48) 456-987-203
My main e-mail address is: Kinga_kuzmia1k@interia.pl.
Sometimes I use another e-mail which is kinga.kuzmiak@gmail.com
I like dogs and I like cats'''
matches = []
for groups in phone.findall(text):
    # groups[0] is the optional parenthesised prefix, groups[3], [5] and [7] are the
    # three digit triplets; the remaining groups (inner prefix character and the
    # separators) are dropped so every number is normalised to '-' separators.
    parts = (groups[0], groups[3], groups[5], groups[7])
    # The prefix group is optional and comes back as '' when absent; skip empty
    # parts so such a number is not rendered with a leading '-'.
    phoneNum = '-'.join(part for part in parts if part)
    matches.append(phoneNum)
# findall() already returns a list of addresses, so add them in one step.
matches.extend(mail.findall(text))
print(matches)

['(048)-604-588-642', '(+48)-456-987-203', 'Kinga_kuzmia1k@interia.pl.', 'kinga.kuzmiak@gmail.com']
