In [1]:
import pandas as pd 
import numpy as np 
import matplotlib as plt 
import seaborn as sns 
import re

### What are regular expressions?

#### You must memorize them 

#### Symbols representing a text pattern 

Regular Expressions are separated into the _Basic Set_ and the _Extended Set_ 

#### WIldcard Period . 

- foo.bar
- will correctly identify 
- fooabar
- fooxbar
- foocbar

In [72]:
def is_foobar(string):
    return bool(re.search(r'^foo.bar$', string))

In [73]:
is_foobar('fooaabar')

False

In [74]:
is_foobar('fooabar')

True

#### Wildcard Asterisk Combo

- foo.*bar
- will correctly identify
    - fooabcbar
    - foorbar
    - fooxxabar

In [75]:
def is_foobar2(string):
    return bool(re.search(r'^foo.*bar$', string))

In [79]:
is_foobar2('fooabcbar')

True

In [80]:
is_foobar2('foowewbar')

True

In [81]:
#this should be false
is_foobar2('foo  bar')

True

#### Representing Whitespaces 

- Want to correctly identify: 
    - foo bar 
    - foo   bar 
    - foo     bar 

- foo\s*bar

In [82]:
#the \s* will correctly identify a string that has varying numbers of spaces in the middle 
def is_foobar3(string):
    return bool(re.search(r'^foo\s*bar$', string))

In [83]:
is_foobar3('foo bar')

True

In [84]:
is_foobar3('foo   bar')

True

In [85]:
is_foobar3('fooabar')

False

In [86]:
is_foobar3('foobar')

True

#### Character Classes 

In [33]:
#suppose you want to correctly identify 'foo', 'coo', 'loo'
#we must separate for the computer the first letter 'f' from what each character has in common the ending of 'oo'
# [fcl]oo
def foo_coo_loo(string):
    return bool(re.search(r'^[fcl]oo$', string))

In [34]:
foo_coo_loo("loo")

True

In [35]:
foo_coo_loo("zoo")

False

In [36]:
def foo_coo_doo_poo_loo_boo(string):
    return bool(re.search(r'^[fcdplb]oo$', string))

In [37]:
foo_coo_doo_poo_loo_boo('boo')

True

In [38]:
foo_coo_doo_poo_loo_boo('zoo')

False

In [39]:
#what if we want to just exclude two characters 
#the carrot symbol negates the class 
def notmoo_nothoo(string):
    return bool(re.search(r'^[^mh]oo$', string))

In [40]:
notmoo_nothoo('moo')

False

In [42]:
notmoo_nothoo('zoo')

True

In [43]:
notmoo_nothoo('hoo')

False

#### Character Classes with Ranges 

In [44]:
#suppose we want to correctly identify joo, koo, loo, moo
# notice the beginning of these words starts with j, k, l, m 
def j_through_m(string):
    return bool(re.search(r'^[j-m]oo$', string))

In [45]:
j_through_m('boo')

False

In [46]:
j_through_m('koo')

True

In [47]:
#suppose we want to correctly identify joo, koo, loo, moo, zoo 
#all first letters are one after the other in the alphabet with the exception of zoo 
def j_through_m_andZ(string):
    return bool(re.search(r'^[j-mz]oo$', string))

In [48]:
j_through_m_andZ("zoo")

True

In [49]:
j_through_m_andZ("boo")

False

In [50]:
j_through_m_andZ("moo")

True

In [53]:
#joo, Koo, Loo, moo, zoo
#ignoring case sensitivity 
def lower_and_upper(string):
    return bool(re.search(r'^[j-mJ-Mz]oo$', string))

In [54]:
lower_and_upper("Koo")

True

In [55]:
lower_and_upper("roo")

False

In [56]:
lower_and_upper("zoo")

True

#### Escaping with Backslash

In [61]:
# We want: xxx.yy, xx.yyyy, x.yy 
#first section is all x's
#last section is all y's 
#middle section is a period, the period occurs only once 
#backslash is called the 'escape' character in regex 
def x_period_y(string):
    return bool(re.search(r'[x*\.y*]$', string))

In [62]:
x_period_y('xxx.yy')

True

In [63]:
x_period_y('xx.yyyy')

True

In [64]:
x_period_y('x.yy')

True

In [65]:
#should be false
x_period_y('xy')

True

In [66]:
#suppose we have x#y, x:y, x.y
# # and : are not regex symbols but period IS a special regex symbol 
# if a period is inside square brackets it need not be escaped 
# the period is only interpreted as a wildcard outside of square brackets 
# hyphen is used for character class ranges inside square brackets 
#carrot is also used to negate inside square brackets
def x_period_y_withsymbol(string):
    return bool(re.search(r'x[#:.]y$', string))

In [68]:
x_period_y_withsymbol('x#y')

True

In [69]:
x_period_y_withsymbol('x:y')

True

In [70]:
x_period_y_withsymbol('x.y')

True

In [71]:
x_period_y_withsymbol('x&y')

False

In [88]:
#x^y
#each time you have a special character you must use a backslash to negate even within square brackets 
#i.e. x\y or x^y would have to be specified x[\\\^]y
def x_period_y_withcarrot(string):
    return bool(re.search(r'x[\^]y$', string))

In [90]:
x_period_y_withcarrot('x^y')

True

#### Anchors

In [91]:
#we want: foo bar baz, foo baz bar
def foo_b_b(string):
    return bool(re.search(r'^foo.*$', string))

In [92]:
foo_b_b("foo bar baz")

True

In [93]:
foo_b_b("bar foo baz")

False

In [94]:
#baz foo bar 
#foo baz bar 
def something_bar(string):
    return bool(re.search(r'.*bar$', string))

In [95]:
something_bar('baz foo bar')

True

In [96]:
something_bar('foo baz bar')

True

In [97]:
something_bar('foo bar baz')

False

In [99]:
#foo  (only foo)
#not foo bar, not baz foo, not foo bar baz, not baz bar foo 
def only_foo(string):
    return bool(re.search(r'^foo$', string))

In [100]:
only_foo("foo")

True

In [101]:
only_foo("foo boo")

False

#### Curly Braces Repeater

In [109]:
#Curly Braces Repeater
# 834, 519, 645
def three_digit_number(string):
    return bool(re.search('^[0-9]{3}$', string))

In [110]:
three_digit_number('834')

True

In [111]:
three_digit_number('2298')

False

In [112]:
#curly braces repeater for letters 
# words we want: lion, tiger, mouse, cuckoo, deer
# words in symbols: ^[a-z]{4}$, ^[a-z]{5}$, ^[a-z]{5}$, ^[a-z]{6}$, ^[a-z]{4}$
def four_to_six_letterwords(string):
    return bool(re.search('^[a-z]{4,6}$', string))

In [135]:
# hahahahahahahahaha
# hahahahaha
# hahahahahahaha
# ha{5}, ha{4}, ha{6}, ha{8}, ha{9}
def hahahaha(string):
    return bool(re.search('(ha){4,}', string))

In [136]:
hahahaha('hahahaha')

True

In [137]:
hahahaha('haha')

False

In [141]:
#ha 
#haha
def ha(string):
    return bool(re.search('^(ha){,2}$', string))

In [142]:
ha('ha')

True

In [143]:
ha('hahaha')

False

In [145]:
# fooaaaabar, fooabar, fooaabar
#first entry has four as between foo and bar 
#second has one a 
# third has two a's 
def foo_a_bar(string):
    return bool(re.search('fooa+bar', string))

In [146]:
foo_a_bar('fooaaabar')

True

In [148]:
foo_a_bar('foobar')

False

#### The question mark binary

In [150]:
#we want: https://website, http://website
def website(string):
    return bool(re.search('https?://website', string))

In [151]:
website('https://website')

True

In [152]:
website('http://website')

True

#### Making Choices with Pipe 

In [153]:
#want logwood, plywood 
#don't want redwood
def logwood_or_plywood(string):
    return bool(re.search('(log|ply)wood', string))

In [154]:
logwood_or_plywood('logwood')

True

In [155]:
logwood_or_plywood('redwood')

False

In [156]:
logwood_or_plywood('plywood')

True

### Regex - Group capture, find and replace 

In [None]:
#rephrase a string in a new way 
#example is 1280x720 
#format of this ([0-9]+)x([0-9]+)  2 numbers between 0-9 with an unspecified length divided by the character x 
def new_pixel_format(string):
    return bool(re.search('\pix by \2 pix', string))