In [42]:
import pandas as pd
import re

### Exercise 1
### Write a function named is_vowel. 
 - It should accept a string as input and use a regular expression to determine if the passed string is a vowel. 
 - While not explicitly mentioned in the lesson, you can treat the result of re.search as a boolean value that indicates whether or not the regular expression matches the given string.

In [3]:
def startswith_vowel(word):
    """Check whether ``word`` begins with a vowel.

    Parameters
    ----------
    word : str
        The text to test.

    Returns
    -------
    re.Match or None
        A match object (truthy) covering the leading vowel and any word
        characters that follow it, or ``None`` when ``word`` does not start
        with one of ``a e i o u`` in either case.
    """
    # \w* rather than \w+: a one-letter word such as "a" or "I" still starts
    # with a vowel, and \w+ would demand a second character.
    regexp = r'^[aeiouAEIOU]\w*'
    return re.search(regexp, word)

words = 'apple banana watermelon orange grapes'

# Alternative that collects the raw results instead of printing them:
# [(word, startswith_vowel(word)) for word in words.split()]

# Report, one line per word, whether it begins with a vowel.
for word in words.split():
    print(
        f'{word} starts with a vowel'
        if startswith_vowel(word)
        else f'{word} does not start with a vowel'
    )

apple starts with a vowel
banana does not start with a vowel
watermelon does not start with a vowel
orange starts with a vowel
grapes does not start with a vowel


In [4]:
# Anchored alternation: a single vowel between ^ and $, compared case-insensitively.
re.search(r"^(a|e|i|o|u)$", "a", re.IGNORECASE)

<re.Match object; span=(0, 1), match='a'>

In [13]:
# "orange" is longer than one character, so the anchored pattern finds nothing
# and the cell displays no output (re.search returned None).
re.search(r"^(a|e|i|o|u)$", "orange", re.IGNORECASE)

In [18]:
def is_vowel(string):
    """Return True when ``string`` is exactly one vowel, in either case.

    Parameters
    ----------
    string : str
        The text to test.

    Returns
    -------
    bool
        True only if the entire string is a single character from
        ``a e i o u`` (case-insensitive); False otherwise.
    """
    # fullmatch instead of ^...$: `$` also matches just before a trailing
    # newline, which would wrongly accept a vowel followed by "\n".
    return bool(re.fullmatch(r"[aeiou]", string, re.IGNORECASE))

assert is_vowel("a") == True
assert is_vowel("E") == True
assert is_vowel("aaa") == False
assert is_vowel("aeiou") == False
assert is_vowel("a\n") == False
assert is_vowel("") == False

### Exercise 2
### Write a function named is_valid_username that accepts a string as input. 
- A valid username starts with a lowercase letter, and only consists of lowercase letters, numbers, or the _ character. 
- It should also be no longer than 32 characters. 
- The function should return either True or False depending on whether the passed string is a valid username.

In [23]:
# Why does the username pattern need a trailing $?
# Without it the regex only has to match a prefix of the string: here it stops
# at "aaa", right before the first capital letter, and nothing forces the rest
# of the string to be lowercase or within the length limit.
re.search(r"^[a-z][a-z0-9_]{,31}", "aaaCODEUPCODEUPaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")

<re.Match object; span=(0, 3), match='aaa'>

In [39]:
# Requirements:
# - must start with a lowercase letter
# - may contain only lowercase letters, numbers, or _
# - should be no longer than 32 characters
# - returns a boolean

def is_valid_username(string):
    """Return True when ``string`` is a valid username.

    A valid username starts with a lowercase ASCII letter, continues with
    lowercase letters, digits, or underscores only, and is at most 32
    characters long in total.

    Parameters
    ----------
    string : str
        The candidate username.

    Returns
    -------
    bool
        True if the whole string satisfies the rules above, else False.
    """
    # One leading letter plus up to 31 more characters gives the 32 limit.
    pattern = r'[a-z][a-z0-9_]{0,31}'
    # fullmatch instead of ^...$: `$` would also accept a trailing newline.
    return bool(re.fullmatch(pattern, string))


In [36]:
# Spot checks: the name with a leading capital letter is rejected, the other two pass.
is_valid_username('keiko9'), is_valid_username('Keiko9'), is_valid_username('yougotthis13')

(True, False, True)

### Exercise 3

Write a regular expression to capture phone numbers. It should match all of the following:

- (210) 867 5309
- +1 210.867.5309
- 867-5309
- 210-867-5309

Problem solving process:

- Put the subject strings in order of increasing complexity
- Solve them one at a time and build an iterative solution
- Add optionality as the pattern increases in parts

In [29]:
# Step 1: match seven consecutive digits (no separators)
re.search(r'\d{7}', '8675309')

<re.Match object; span=(0, 7), match='8675309'>

In [30]:
# Step 2: match 3 digits, then a hyphen, then 4 digits
re.search(r"\d{3}-\d{4}", "867-5309")

<re.Match object; span=(0, 8), match='867-5309'>

In [31]:
# Step 3: match 3 digits, then a hyphen or a dot, then 4 digits
re.search(r"\d{3}[-.]\d{4}", "867-5309")

<re.Match object; span=(0, 8), match='867-5309'>

In [None]:
# Step 4: this regex handles an optional area code and optional parentheses,
# but it loses the international code.
# The bare `.?` accepts any single character (or nothing) as a separator.
# Open question: what about the international code, +1?
re.search(r"(\(?\d{3}\)?)?.?\d{3}.?\d{4}", "+1 210.867.5309")

In [44]:
# Phone-number pattern with named groups, written in VERBOSE mode so each part
# sits on its own line:
#   country_code   optional "+" followed by digits
#   area_code      optional three digits
#   exchange_code  three digits
#   line_number    four digits
# Lazy \D*? runs between the groups absorb separators such as spaces, dots,
# hyphens, or parentheses. The pattern is anchored at the start only, so any
# text after the line number is ignored.
#
# The literal is a raw string: \+, \d and \D are not valid Python string
# escapes, and a non-raw literal triggers invalid-escape warnings.
phone_regex = re.compile(
r"""
^
(?P<country_code>\+\d+)?
\D*?
(?P<area_code>\d{3})?
\D*?
(?P<exchange_code>\d{3})
\D*?
(?P<line_number>\d{4})
""", re.VERBOSE)

In [45]:
# Sample phone numbers in the formats the pattern is expected to handle,
# held in a single 'number' column.
df = pd.DataFrame({
    'number': [
        '(210) 867 5309',
        '+1 210.867.5309',
        '867-5309',
        '210-867-5309',
        '2108675309',
    ],
})
df

Unnamed: 0,number
0,(210) 867 5309
1,+1 210.867.5309
2,867-5309
3,210-867-5309
4,2108675309


In [46]:
# str.extract turns the named capture groups into DataFrame columns;
# a group that took no part in the match (e.g. a missing area code) shows up as NaN.
df.number.str.extract(phone_regex)

Unnamed: 0,country_code,area_code,exchange_code,line_number
0,,210.0,867,5309
1,1.0,210.0,867,5309
2,,,867,5309
3,,210.0,867,5309
4,,210.0,867,5309


In [47]:
# Attach the extracted parts next to the raw numbers. The left side is rebuilt
# from the 'number' column alone so that re-running this cell yields the same
# frame instead of appending a second copy of the extracted columns.
df = pd.concat([df[['number']], df['number'].str.extract(phone_regex)], axis=1)
df


Unnamed: 0,number,country_code,area_code,exchange_code,line_number
0,(210) 867 5309,,210.0,867,5309
1,+1 210.867.5309,1.0,210.0,867,5309
2,867-5309,,,867,5309
3,210-867-5309,,210.0,867,5309
4,2108675309,,210.0,867,5309


In [48]:
# NOTE(review): this concatenates the extraction onto a frame that already
# holds those columns, so every extracted column appears a second time (see the
# duplicated columns in the output). The extraction should be attached only
# once per frame; consider removing this cell.
df = pd.concat([df, df.number.str.extract(phone_regex)], axis=1)
df

Unnamed: 0,number,country_code,area_code,exchange_code,line_number,country_code.1,area_code.1,exchange_code.1,line_number.1
0,(210) 867 5309,,210.0,867,5309,,210.0,867,5309
1,+1 210.867.5309,1.0,210.0,867,5309,1.0,210.0,867,5309
2,867-5309,,,867,5309,,,867,5309
3,210-867-5309,,210.0,867,5309,,210.0,867,5309
4,2108675309,,210.0,867,5309,,210.0,867,5309


### Exercise 4
Use regular expressions to convert the dates below to the standardized year-month-day format.

- 02/04/19
- 02/05/19
- 02/06/19
- 02/07/19
- 02/08/19
- 02/09/19
- 02/10/19

In [49]:
# The raw date strings from the exercise (month/day/two-digit year).
dates = [
    "02/04/19", "02/05/19", "02/06/19", "02/07/19",
    "02/08/19", "02/09/19", "02/10/19",
]

# One row per date, kept in a column named "original".
df = pd.DataFrame(dates, columns=["original"])
df

Unnamed: 0,original
0,02/04/19
1,02/05/19
2,02/06/19
3,02/07/19
4,02/08/19
5,02/09/19
6,02/10/19


In [51]:
# Date pattern with one named group per component: two-digit month, day and
# year separated by slashes. VERBOSE mode lets each component sit on its own line.
pattern = re.compile(
    r"""
(?P<month>\d{2})/
(?P<day>\d{2})/
(?P<year>\d{2})
""",
    flags=re.VERBOSE,
)

In [52]:
# Attach the extracted month/day/year next to the raw strings. The left side is
# rebuilt from the 'original' column alone so that re-running this cell yields
# the same frame instead of appending a second copy of the extracted columns.
df = pd.concat([df[["original"]], df["original"].str.extract(pattern)], axis=1)
df

Unnamed: 0,original,month,day,year
0,02/04/19,2,4,19
1,02/05/19,2,5,19
2,02/06/19,2,6,19
3,02/07/19,2,7,19
4,02/08/19,2,8,19
5,02/09/19,2,9,19
6,02/10/19,2,10,19


In [53]:
# NOTE(review): this concatenates the extraction onto a frame that already
# holds month/day/year, so each of those columns appears a second time (see the
# duplicated columns in the output). The extraction should be attached only
# once per frame; consider removing this cell.
df = pd.concat([df, df.original.str.extract(pattern)], axis=1)
df

Unnamed: 0,original,month,day,year,month.1,day.1,year.1
0,02/04/19,2,4,19,2,4,19
1,02/05/19,2,5,19,2,5,19
2,02/06/19,2,6,19,2,6,19
3,02/07/19,2,7,19,2,7,19
4,02/08/19,2,8,19,2,8,19
5,02/09/19,2,9,19,2,9,19
6,02/10/19,2,10,19,2,10,19
