# Regular Expressions 

In [2]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
import seaborn as sns 
import re

In [3]:
# r is a raw string 
re.findall(r'b', 'abcd')

['b']

In [4]:
#Function to simplify the process of showing many results from regular expressions 
def show_all_matches(regexes, subject, re_length=6):
    print('Sentence:')
    print()
    print('    {}'.format(subject))
    print()
    print(' regexp{} | matches'.format(' ' * (re_length - 6)))
    print(' ------{} | -------'.format(' ' * (re_length - 6)))
    for regexp in regexes:
        fmt = ' {:<%d} | {!r}' % re_length
        matches = re.findall(regexp, subject)
        if len(matches) > 8:
            matches = matches[:8] + ['...']
        print(fmt.format(regexp, matches))

In [5]:
sentence = 'Mary had a little lamb. 1 little lamb. Not 10, not 12, not 22, just one.'

show_all_matches([
    r'a',
    r'm',
    r'M',
    r'Mary',
    r'little',
    r'1',
    r'10',
    r'22'
], sentence)



Sentence:

    Mary had a little lamb. 1 little lamb. Not 10, not 12, not 22, just one.

 regexp | matches
 ------ | -------
 a      | ['a', 'a', 'a', 'a', 'a']
 m      | ['m', 'm']
 M      | ['M']
 Mary   | ['Mary']
 little | ['little', 'little']
 1      | ['1', '1', '1']
 10     | ['10']
 22     | ['22']


In [6]:
res = [
    r'\w',
    r'\d',
    r'\s',
    r'.', # matches every character
    r'\.', # a literal period
]
show_all_matches(res, sentence)



Sentence:

    Mary had a little lamb. 1 little lamb. Not 10, not 12, not 22, just one.

 regexp | matches
 ------ | -------
 \w     | ['M', 'a', 'r', 'y', 'h', 'a', 'd', 'a', '...']
 \d     | ['1', '1', '0', '1', '2', '2', '2']
 \s     | [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '...']
 .      | ['M', 'a', 'r', 'y', ' ', 'h', 'a', 'd', '...']
 \.     | ['.', '.', '.']


In [7]:
show_all_matches([r'l\w\w\w\W', r'\d\d'], sentence, re_length=9)

Sentence:

    Mary had a little lamb. 1 little lamb. Not 10, not 12, not 22, just one.

 regexp    | matches
 ------    | -------
 l\w\w\w\W | ['lamb.', 'lamb.']
 \d\d      | ['10', '12', '22']


In [8]:
show_all_matches([
    r'\d+'
], sentence)

print('\n---\n')

show_all_matches([
    r'a{2,}',
    r'a{2}',
    r'a{3,4}'
], 'aabbaaaa')

Sentence:

    Mary had a little lamb. 1 little lamb. Not 10, not 12, not 22, just one.

 regexp | matches
 ------ | -------
 \d+    | ['1', '10', '12', '22']

---

Sentence:

    aabbaaaa

 regexp | matches
 ------ | -------
 a{2,}  | ['aa', 'aaaa']
 a{2}   | ['aa', 'aa', 'aa']
 a{3,4} | ['aaaa']


In [9]:
show_all_matches([
    r'[lt]',
    r'[lt]+',
    r'[^aeiou\s\.]', # any letter that's not a vowel
    r'[a-d]'
], sentence, re_length=12)

Sentence:

    Mary had a little lamb. 1 little lamb. Not 10, not 12, not 22, just one.

 regexp       | matches
 ------       | -------
 [lt]         | ['l', 't', 't', 'l', 'l', 'l', 't', 't', '...']
 [lt]+        | ['l', 'ttl', 'l', 'l', 'ttl', 'l', 't', 't', '...']
 [^aeiou\s\.] | ['M', 'r', 'y', 'h', 'd', 'l', 't', 't', '...']
 [a-d]        | ['a', 'a', 'd', 'a', 'a', 'b', 'a', 'b']


In [10]:
sentence = '''
You can find us on the web at https://codeup.com. Our ip address is 123.123.123.123 (maybe).
'''.strip()

In [11]:
ip_re = r'\d+(\.\d+){3}'

match = re.search(ip_re, sentence)
match[0]

'123.123.123.123'

In [12]:
# simplified for demonstration, a real url to parse urls would be much more
# complex
url_re = r'(https?)://(\w+)\.(\w+)'

protocol, domain, tld = re.search(url_re, sentence).groups()

print(f'''
protocol: {protocol}
domain:   {domain}
tld:      {tld}
''')


protocol: https
domain:   codeup
tld:      com



In [13]:
url_re = r'(?P<protocol>https?)://(?:\w+)\.(?P<tld>\w+)'

match = re.search(url_re, sentence)

print(f'''
groups: {match.groups()}
referencing a group by name: {match.group('tld')}
group dictionary: {match.groupdict()}
''')


groups: ('https', 'com')
referencing a group by name: com
group dictionary: {'protocol': 'https', 'tld': 'com'}



In [14]:
# remove anything that's not a digit
re.sub(r'\D', '', 'abc 123')

'123'

In [15]:
# remove anything that's not a letter
re.sub(r'[^a-z]', '', 'abc 123')



'abc'

In [16]:
re.sub(r'.(.).', r'\1', 'abc')

'b'

In [17]:
re.sub(r'(.)(.)(.)', r'\3\2\1', 'abc')

'cba'

In [18]:
re.sub(r'.{2}$', 'X', 'abc')

'aX'

In [19]:
regexp = r'''
[aeiou] (?# any vowel)
[^aeiou] (?# followed by a non-vowel)
'''

In [20]:
regexp = r'[aeiou][^aeiou]'

#### Exercises 

_Write a function named is_vowel. It should accept a string as input and use a regular expression to determine if the passed string is a vowel. While not explicity mentioned in the lesson, you can treat the result of re.search as a boolean value that indicates whether or not the regular expression matches the given string._

In [21]:
def is_vowel(string): 
    return bool(re.search(r'^[aeiou]$', string, re.IGNORECASE))

In [23]:
is_vowel("a")

True

In [24]:
is_vowel('ee')

False

_Write a function named is_valid_username that accepts a string as input. A valid username starts with a lowercase letter, and only consists of lowercase letters, numbers, or the _ character. It should also be no longer than 32 characters. The function should return either True or False depending on whether the passed string is a valid username._




In [26]:
def is_valid_username(string): 
    #r for raw string
    #carrot for starts with 
    #list all options for contents of username with no commas 
    #allow 32 digits 
    password_pattern = r"^[a-z][a-z0-9_]{,31}$"
    return bool(re.search(password_pattern, string))

In [28]:
is_valid_username('aa012345678901234567890123456789')

True

_Write a regular expression to capture phone numbers. It should match all of the following:_

- (210) 867 5309
- +1 210.867.5309
- 867-5309
- 210-867-5309

In [29]:
#match seven digits with no other characters 
re.search(r"\d{7}", "8675309")

<re.Match object; span=(0, 7), match='8675309'>

In [30]:
#match 3 digits and then 4 digits with the hyphen in the middle 
re.search(r"\d{3}-\d{4}", "867-5309")

<re.Match object; span=(0, 8), match='867-5309'>

In [32]:
#match 3 digits then a hyphen, dot, or space then four digits 
re.search(r"\d{3}[-. ]\d{4}", "867 5309")

<re.Match object; span=(0, 8), match='867 5309'>

In [33]:
#Another approach on the delimiter could be to use \D for any non-digit 
re.search(r"\d{3}\D?\d{4}", "8675309")

<re.Match object; span=(0, 7), match='8675309'>

In [34]:
#search for 10 digit number 
re.search(r"\(?\d{3}\)?.?\d{3}.?\d{4}", "210-867-5309")

<re.Match object; span=(0, 12), match='210-867-5309'>

In [35]:
#search for digit number separated by periods 
re.search(r"\(?\d{3}\)?.?\d{3}.?\d{4}", "210.867.5309")

<re.Match object; span=(0, 12), match='210.867.5309'>

# Alternative Answer 

In [36]:
#define phone regex
phone_regex = re.compile(
"""
^
(?P<country_code>\+\d+)?
\D*?
(?P<area_code>\d{3})?
\D*?
(?P<exchange_code>\d{3})
\D*?
(?P<line_number>\d{4})
""", re.VERBOSE)

In [37]:
#build dataframe with numbers, input 'number'
df = pd.DataFrame()
df['number'] = [
    '(210) 867 5309',
    '+1 210.867.5309',
    '867-5309',
    '210-867-5309',
    '2108675309',
]
df

Unnamed: 0,number
0,(210) 867 5309
1,+1 210.867.5309
2,867-5309
3,210-867-5309
4,2108675309


In [38]:
#extract string in dataframe 
df.number.str.extract(phone_regex)

Unnamed: 0,country_code,area_code,exchange_code,line_number
0,,210.0,867,5309
1,1.0,210.0,867,5309
2,,,867,5309
3,,210.0,867,5309
4,,210.0,867,5309


_Use regular expressions to convert the dates below to the standardized year-month-day format._

In [39]:
dates = pd.Series([
    '02/04/19',
    '02/05/19',
    '02/06/19',
    '02/07/19',
    '02/08/19',
    '02/09/19',
    '02/10/19',
])
dates.str.replace(r'(\d+)/(\d+)/(\d+)', r'20\3-\1-\2', regex=True)

0    2019-02-04
1    2019-02-05
2    2019-02-06
3    2019-02-07
4    2019-02-08
5    2019-02-09
6    2019-02-10
dtype: object

_Write a regex to extract the various parts of these logfile lines:_




GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58
GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58


In [40]:
#define regex
logfile_re = r'''
^(?P<method>GET|POST)
\s+
(?P<path>.*?)
\s+
\[(?P<timestamp>.*?)\]
\s+
(?P<http_version>.*?)
\s+
\{(?P<status>\d+)\}
\s+
(?P<bytes_sent>\d+)
\s+
"(?P<user_agent>.*)$
'''
#create pd Series with log code 
lines = pd.Series([
    'GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58',
    'POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58',
    'GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58',
])
#use .str.extract to extract lines of code
lines.str.extract(logfile_re, re.VERBOSE)

Unnamed: 0,method,path,timestamp,http_version,status,bytes_sent,user_agent
0,GET,/api/v1/sales?page=86,16/Apr/2019:193452+0000,HTTP/1.1,200,510348,"python-requests/2.21.0"" 97.105.19.58"
1,POST,/users_accounts/file-upload,16/Apr/2019:193452+0000,HTTP/1.1,201,42,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...
2,GET,/api/v1/items?page=3,16/Apr/2019:193453+0000,HTTP/1.1,429,3561,"python-requests/2.21.0"" 97.105.19.58"
