## REGEX

Regular expressions (regex) are a small "language" for describing search patterns in strings.

Test your regexes and see all the possible characters and sequences [here](https://regexr.com/).

In [1]:
import re

- `*`: Matches previous character 0 or more times
- `+`: Matches previous character 1 or more times
- `?`: Matches previous character 0 or 1 times (optional)
- `{}`: Matches the previous character the number of times specified inside the braces:
  - `{n}`: Exactly n times
  - `{n,}`: At least n times
  - `{n,m}`: Between n and m times (inclusive)

In [8]:
text = "Face it, Jared, being too early is the same as being wrong."

# Each assignment below replaces the previous one; only the last pattern is
# searched. Swap the order (or comment lines out) to try the others.
pattern = 'Jared'          # the literal word "Jared"
pattern = r'\w'            # any single word character (raw string: '\w' is not a valid str escape)
pattern = '[A-Z]'          # any single uppercase ASCII letter
pattern = '[Face]'         # any single character out of F, a, c, e
pattern = '[atsdhksdgs]'   # any single listed character; repeats inside [] are redundant

re.findall(pattern, text)

['a',
 't',
 'a',
 'd',
 'g',
 't',
 'a',
 's',
 't',
 'h',
 's',
 'a',
 'a',
 's',
 'g',
 'g']

In [14]:
text = "This is an A and B conversation, so C your way out of it."
# Outside brackets '^' anchors the match at the start of the string; as the
# first character inside brackets ('[^...]') it negates the class instead.
# The class is spelled [A-Za-z] because the range A-z also covers the
# punctuation between 'Z' and 'a' in ASCII: [ \ ] ^ _ and the backtick.
pattern = '^[A-Za-z]'
re.findall(pattern, text)

['T']

In [15]:
# Compare the '*' and '+' quantifiers on the same sentence.
text = 'The complicit caat interacted with the other cats exactly as we expected.'

pattern_1 = 'ca*t'  # 'c', then any number of 'a' (including none), then 't'
pattern_2 = 'ca+t'  # 'c', then at least one 'a', then 't'

# Displayed as a 2-tuple: (matches for pattern_1, matches for pattern_2).
tuple(re.findall(quantified, text) for quantified in (pattern_1, pattern_2))

(['caat', 'ct', 'cat', 'ct', 'ct'], ['caat', 'cat'])

In [17]:
text = "My phone number is 9 982 478 320, my social security number is 320-478-982."
# \d matches one decimal digit (like [0-9]); {1,3} takes between one and three
# of them, as many as possible. Raw string: '\d' is not a valid str escape.
pattern = r"\d{1,3}"
print(re.findall(pattern, text))

['9', '982', '478', '320', '320', '478', '982']


In [35]:
# I want ('9 982 478 320', '320-478-982')
# The leading "digit + space" prefix is a single optional (non-capturing)
# group. With a mandatory space in the pattern, the second number would be
# matched together with the space in front of it.
pattern = r'(?:\d )?\d{3}[- ]\d{3}[- ]\d{3}'
print(re.findall(pattern, text))

['9 982 478 320', '320-478-982']


In [40]:
text = 'Team Name Handicap -1'
# Unescaped parentheses are group syntax: "(Handicap)" captures the word
# "Handicap" rather than matching literal '(' and ')' characters.
pattern = 'Team Name (Handicap) -1'

# Compile once to get a reusable pattern object; as the last expression it is
# also what the cell displays.
re.compile(pattern)

re.compile(r'Team Name (Handicap) -1', re.UNICODE)

In [41]:
# The '?' quantifier makes the preceding 'u' optional, so both spellings match.
text = 'Is the correct spelling color or colour?'
pattern = 'colou?r'
print(re.compile(pattern).findall(text))

['color', 'colour']


In [50]:
# A character class matches exactly one of the characters listed inside it.
text = 'Is it spelled gray (or grey)? grry'
pattern = "gr[ae]y"  # 'a' or 'e' between "gr" and "y", so "grry" is skipped
[found.group() for found in re.finditer(pattern, text)]

['gray', 'grey']

In [43]:
# Alternation: '|' matches either the whole alternative on its left or on its right.
pattern = "gray|grey"
re.compile(pattern).findall(text)

['gray', 'grey']

In [46]:
# '.' stands for any single character (other than a newline).
pattern = "gr.y"
[found.group() for found in re.finditer(pattern, text)]

['gray', 'grey', 'grry']

In [51]:
# Parentheses are group syntax, so literal '(' and ')' have to be escaped.
# Raw string: '\(' and '\)' are not valid str escapes.
pattern = r'\(or gr[ae]y\)'
re.findall(pattern, text)

['(or grey)']

In [80]:
# Activity 1: extract every phone number from the airline directory below.

text = """
Aeromexico 800 - 237 - 6639
Air Canada 888- 247-2262
Air Canada Rouge 888-247-2262
Air Creebec 800-567-6567
Air Inuit 800-361-2965
Air North 800-661-0407
Air Tindi 888-545-6794"""

# First attempt (superseded by the assignment below): it requires a space
# before the first digit group, so that space becomes part of each match.
pattern = r'\d* \d{3}[ -]-* *\d{3}[ -]-* *\d{4}'

# Final pattern: three runs of digits separated by dashes, with an optional
# space on either side of each dash.
pattern = r'\d+ ?- ?\d+ ?- ?\d+'

re.findall(pattern, text)

['800 - 237 - 6639',
 '888- 247-2262',
 '888-247-2262',
 '800-567-6567',
 '800-361-2965',
 '800-661-0407',
 '888-545-6794']

In [87]:
# Two capture groups: (airline name) and (phone number). findall therefore
# returns one (name, number) tuple per match. The greedy '.*' backtracks until
# the remainder of the pattern can match the number.
pattern = r'(.*) (\d+ ?- ?\d+ ?- ?\d+)'

re.findall(pattern, text)

[('Aeromexico', '800 - 237 - 6639'),
 ('Air Canada', '888- 247-2262'),
 ('Air Canada Rouge', '888-247-2262'),
 ('Air Creebec', '800-567-6567'),
 ('Air Inuit', '800-361-2965'),
 ('Air North', '800-661-0407'),
 ('Air Tindi', '888-545-6794')]

In [64]:
# Sample passage from which capitalised names are extracted.
text = """
TKerraPower, A nuclear-energy company founded by Bill Gates,
is unlikely to follow through on building a demonstration reactor in China,
due largely to the Trump administrations crackdown on the country"""

# Left alternative: two capitalised words with at most one space between them.
# Right alternative: a single capitalised word.
pattern = "[A-Z][a-z]+ ?[A-Z][a-z]+|[A-Z][a-z]+"
[hit.group(0) for hit in re.finditer(pattern, text)]

['KerraPower', 'Bill Gates', 'China', 'Trump']

In [75]:
text = "Is it spelled gray (or grey)?"

# Literal "or " followed by one or more word characters.
# Raw string: '\w' is not a valid str escape.
pattern = r"or \w+"
re.findall(pattern, text)

['or grey']

In [76]:
# With one capture group, findall returns only the captured text rather than
# the whole match. Raw string: '\w' is not a valid str escape.
pattern = r"or (\w+)"
re.findall(pattern, text)

['grey']

In [78]:
# With two capture groups, findall returns one (group 1, group 2) tuple per
# match; the optional "(or)" group shows up as '' when it did not take part.
# Raw string: '\w' is not a valid str escape.
pattern = r"(or)? ?(\w+)"
re.findall(pattern, text)

[('', 'Is'),
 ('', 'it'),
 ('', 'spelled'),
 ('', 'gray'),
 ('or', 'grey')]