In [None]:
# A regular expression (regex) is a special sequence of characters that defines a search pattern for finding a string or set of strings
# A pattern can be split into one or more subpatterns
# import re

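# \ = escapes a character, dropping its special meaning (also introduces special sequences such as \d)
# [] = character class: matches any single character listed inside the brackets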
# ^ = matches the beginning of the string
# . = matches any character except \n
# | = OR operator
# ? = matches zero or one occurrences
# * = zero or more occurrences
# + = one or more occurrences
# {} = number of occurrences of the preceding regex, e.g. {3} or {2,3}
# () = encloses group of regex

# Special Sequences:
    # \A = matches if the string begins with the given regex
    # \b(string) = check the beginning of the word; (string)\b = check for the ending of the word
    # \B = opposite of \b: matches only where there is no word boundary
    # \d = any decimal digits
    # \D = non-digit character
    # \s = matches any whitespace character
    # \S = non whitespace character
    # \w = any alphanumeric character or underscore
    # \W = any character that is not alphanumeric or underscore
    # \Z = matches if string ends with given regex
    # \n = matches a newline character
    # r'...' = raw string literal: backslashes reach the regex engine without Python escape processing
    
# Functions in regex:
    # re.findall()
    # re.compile() = compiles the regex into a pattern object
    # re.split()
    # re.sub()
    # re.subn()
    # re.escape()
    # re.search()
    # Match Object
    
# Regex Sets: []
    # [atx]
    # [a-h]
    # [^atx]
    # [0123] = matches any one of the digits 0, 1, 2 or 3
    # [0-9]
    # [0-7][0-9]
    # [a-z][A-Z]

In [2]:
import re

In [3]:
# Text to scan, and the literal substring to look for inside it.
text = 'This is a string with term1, but it does not have the other tern'
pattern = 'term1'

# search() yields a Match object for the first occurrence, or None if there is none.
match = re.compile(pattern).search(text)

match

<re.Match object; span=(22, 27), match='term1'>

In [4]:
# Show the class of the object that re.search returned.
type(match)

re.Match

In [5]:
# Index in the searched text at which the match begins.
match.start()

22

In [6]:
# Index in the searched text just past the last matched character.
match.end()

27

In [7]:
# re.split: cut a phrase apart wherever the separator pattern matches.
phrase = 'what is the domain name of someone with email: hello@gmail.com'
split_term = '@'

re.compile(split_term).split(phrase)

['what is the domain name of someone with email: hello', 'gmail.com']

In [8]:
# re.findall: collect every non-overlapping occurrence of the pattern into a list.
re.compile('match').findall('test phrase match is in middle')

['match']

In [9]:
def multi_re_find(patterns,phrase):
    """Run ``re.findall`` with each pattern against ``phrase`` and print the results.

    For every entry of ``patterns`` three things are printed, in order: a header
    line showing the pattern's repr, the list returned by ``re.findall``, and a
    blank spacer (``print('\\n')``, i.e. two newline characters).

    Args:
        patterns: iterable of regex patterns.
        phrase: the string each pattern is applied to.

    Returns:
        None; the function only prints.
    """
    for regex in patterns:
        print(f'Searching the phrase using recheck: {regex!r}')
        found = re.findall(regex, phrase)
        print(found)
        print('\n')

In [10]:
# Phrase built from runs of 's' and 'd', used to exercise the quantifiers below.
tph = 'sdsd..sssddd...sdddsddd...dsds...dsssss...sdddd'

# Every pattern is an 's' followed by a differently-quantified run of 'd'.
tpt = [
    'sd*',      # zero or more d's after the s
    'sd+',      # at least one d after the s
    'sd?',      # an optional single d after the s
    'sd{3}',    # exactly three d's after the s
    'sd{2,3}',  # two or three d's after the s
]

multi_re_find(patterns=tpt, phrase=tph)

Searching the phrase using recheck: 'sd*'
['sd', 'sd', 's', 's', 'sddd', 'sddd', 'sddd', 'sd', 's', 's', 's', 's', 's', 's', 'sdddd']


Searching the phrase using recheck: 'sd+'
['sd', 'sd', 'sddd', 'sddd', 'sddd', 'sd', 'sdddd']


Searching the phrase using recheck: 'sd?'
['sd', 'sd', 's', 's', 'sd', 'sd', 'sd', 'sd', 's', 's', 's', 's', 's', 's', 'sd']


Searching the phrase using recheck: 'sd{3}'
['sddd', 'sddd', 'sddd', 'sddd']


Searching the phrase using recheck: 'sd{2,3}'
['sddd', 'sddd', 'sddd', 'sddd']




In [11]:
# Character sets: [...] matches any single character listed inside the brackets,
# so it behaves like an OR over those characters.
tph = 'sdsd..sssddd...sdddsddd...dsds...dsssss...sdddd'
tpt = [
    '[sd]',    # one character that is either s or d
    's[sd]+',  # an s followed by one or more characters that are s or d
]

multi_re_find(patterns=tpt, phrase=tph)

Searching the phrase using recheck: '[sd]'
['s', 'd', 's', 'd', 's', 's', 's', 'd', 'd', 'd', 's', 'd', 'd', 'd', 's', 'd', 'd', 'd', 'd', 's', 'd', 's', 'd', 's', 's', 's', 's', 's', 's', 'd', 'd', 'd', 'd']


Searching the phrase using recheck: 's[sd]+'
['sdsd', 'sssddd', 'sdddsddd', 'sds', 'sssss', 'sdddd']




In [12]:
# Exclusion: a leading ^ inside [] matches any single character NOT listed.
# Each run of characters other than '!', '.' and '?' is returned as one item.
tph = 'This is a string! But it has punctuation. How can we remove it?'
re.compile('[^!.?]+').findall(tph)

['This is a string', ' But it has punctuation', ' How can we remove it']

In [13]:
# Character ranges [start-end]: e.g. [a-f] matches any single letter between a and f.
tph = 'This an example sentence. Lets see if we can find some letters.'
tpt = [
    '[a-z]+',       # run of lower-case letters
    '[A-Z]+',       # run of upper-case letters
    '[a-zA-Z]+',    # run of letters of either case
    '[A-Z][a-z]+',  # one upper-case letter followed by one or more lower-case letters
]

multi_re_find(patterns=tpt, phrase=tph)

Searching the phrase using recheck: '[a-z]+'
['his', 'an', 'example', 'sentence', 'ets', 'see', 'if', 'we', 'can', 'find', 'some', 'letters']


Searching the phrase using recheck: '[A-Z]+'
['T', 'L']


Searching the phrase using recheck: '[a-zA-Z]+'
['This', 'an', 'example', 'sentence', 'Lets', 'see', 'if', 'we', 'can', 'find', 'some', 'letters']


Searching the phrase using recheck: '[A-Z][a-z]+'
['This', 'Lets']




In [14]:
# Escape codes select whole classes of characters: digits, non-digits, whitespace, ...
tph = 'This is a string with some numbers 1233 and a symbol #hashtag'
tpt = [
    r'\d+',  # run of digits
    r'\D+',  # run of non-digits
    r'\s',   # a single whitespace character (no quantifier, so one per match)
    r'\S+',  # run of non-whitespace characters
    r'\w+',  # run of alphanumeric (word) characters
    r'\W+',  # run of non-alphanumeric (non-word) characters
]

multi_re_find(patterns=tpt, phrase=tph)

Searching the phrase using recheck: '\\d+'
['1233']


Searching the phrase using recheck: '\\D+'
['This is a string with some numbers ', ' and a symbol #hashtag']


Searching the phrase using recheck: '\\s'
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']


Searching the phrase using recheck: '\\S+'
['This', 'is', 'a', 'string', 'with', 'some', 'numbers', '1233', 'and', 'a', 'symbol', '#hashtag']


Searching the phrase using recheck: '\\w+'
['This', 'is', 'a', 'string', 'with', 'some', 'numbers', '1233', 'and', 'a', 'symbol', 'hashtag']


Searching the phrase using recheck: '\\W+'
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' #']




In [15]:
# Anchored pattern: starts with 'a', ends with 's', exactly three characters in between.
test_string = 'abyss'
pattern = '^a...s$'
result = re.match(pattern, test_string)

# A Match object is truthy; None (no match) is falsy.
print('Search Successful' if result else 'Search Unsuccessful')

Search Successful


In [16]:
# re.findall(): extract every run of digits from a string.
string = 'Hello 12 hi 89. Dear 34'
# Raw string so that \d reaches the regex engine untouched; in a plain literal
# '\d' is an invalid Python escape sequence and triggers a warning.
pattern = r'\d+'
result = re.findall(pattern, string)
print(result)

['12', '89', '34']


In [17]:
# Find every occurrence of "language", ignoring letter case.
txt = '''Python is the most beautiful language that a human being has ever created.
         I recommend Python for a first programming language.'''

matches = re.compile('language', flags=re.IGNORECASE).findall(txt)
print(matches)

['language', 'language']


In [18]:
# Find every occurrence of "Python", ignoring letter case.
txt = '''Python is the most beautiful language that a human being has ever created.
         I recommend Python for a first programming language.'''

matches = re.compile('Python', flags=re.IGNORECASE).findall(txt)
print(matches)

['Python', 'Python']


In [19]:
# Two equivalent ways to accept either capitalisation of the first letter.
txt = '''Python is the most beautiful language that a human being has ever created.
         I recommend python for a first programming language.'''

# 1) alternation between the two complete spellings
matches = re.compile('Python|python').findall(txt)
print(matches)

# 2) a character set for just the first letter
matches = re.compile('[Pp]ython').findall(txt)
print(matches)

['Python', 'python']
['Python', 'python']


In [20]:
# re.split(): break the text apart at every match of the pattern and return
# the resulting pieces as a list of strings.
txt = '''I am a teacher and I love teaching. There is nothing as rewarding as educating and empowering people.
         I found teaching more interesting than any other job. Does this motivate you to be a teacher?'''

print(re.compile('\n').split(txt))

['I am a teacher and I love teaching. There is nothing as rewarding as educating and empowering people.', '         I found teaching more interesting than any other job. Does this motivate you to be a teacher?']


In [21]:
# Split the string at every run of digits; the digits themselves are dropped.
string = 'Twelve:12 Eighty nine: 89.'
# Raw string: '\d' in a plain literal is an invalid Python escape sequence.
pattern = r'\d+'
result = re.split(pattern, string)
print(result)

['Twelve:', ' Eighty nine: ', '.']


In [22]:
# maxsplit=1 caps the number of splits at one; the default maxsplit=0 performs
# every possible split.
string = 'Twelve:12 Eighty nine: 89.'
# Raw string: '\d' in a plain literal is an invalid Python escape sequence.
pattern = r'\d+'
# maxsplit is passed by keyword: passing it positionally is deprecated.
result = re.split(pattern, string, maxsplit=1)
print(result)

['Twelve:', ' Eighty nine: 89.']


In [23]:
# re.sub(pattern, repl, string, count): returns the original string unchanged when
# the pattern is not found; otherwise returns the string with the matches replaced.
# When count is omitted it defaults to 0, which replaces all occurrences.

# '\\' spells the literal backslash explicitly. The value is the same as before,
# but a bare backslash followed by a space is an invalid escape sequence.
string = 'abc 12\\ de 23 \n f45 6'

# matching all whitespace characters (raw string so \s reaches the regex engine as-is)
pattern = r'\s+'

# empty string
replace = ''
new_string = re.sub(pattern, replace, string)
print(new_string)

abc12\de23f456


In [24]:
# Same substitution as before, but limited to the first match via count=1.

# '\\' spells the literal backslash explicitly. The value is the same as before,
# but a bare backslash followed by a space is an invalid escape sequence.
string = 'abc 12\\ de 23 \n f45 6'

# matching all whitespace characters (raw string so \s reaches the regex engine as-is)
pattern = r'\s+'

# empty string
replace = ''

# Use the pattern variable defined above (it was previously defined but unused),
# and pass count by keyword: passing it positionally is deprecated.
new_string = re.sub(pattern, replace, string, count=1)
print(new_string)

abc12\ de 23 
 f45 6


In [25]:
# Replace every spelling of "python" with "Javascript", ignoring letter case.
# This text contains no such word, so it comes back unchanged.
txt = '''I am a teacher and I love teaching. There is nothing as rewarding as educating and empowering people.
         I found teaching more interesting than any other job. Does this motivate you to be a teacher?'''

# The 4th positional parameter of re.sub is `count`, not `flags`. Passing re.I
# there silently set count=2 and applied no flag at all, so pass it by keyword.
match_replaced = re.sub('Python|python', 'Javascript', txt, flags=re.I)
print(match_replaced)

match_replaced = re.sub('[Pp]ython', 'Javascript', txt, flags=re.I)
print(match_replaced)

I am a teacher and I love teaching. There is nothing as rewarding as educating and empowering people.
         I found teaching more interesting than any other job. Does this motivate you to be a teacher?
I am a teacher and I love teaching. There is nothing as rewarding as educating and empowering people.
         I found teaching more interesting than any other job. Does this motivate you to be a teacher?


In [26]:
# Strip every '%' character out of the noisy text by substituting the empty string.
txt = '''%I a%m a te%%a%che%r% a%n%d %% I l%o%ve te%ach%ing. T%he%re i%s n%o%th%ing as r%ewarding a%s e%duc%at%i%ng a%n%d e%m%p%ow%er%ing p%e%o%ple.
         I found te%a%ching m%ore i%n%t%er%es%ting t%h%an any other %job. D%o%es thi%s m%o%t%iv%a%te %y%o%u to b%e a t%e%a%cher?'''

matches = re.compile('%').sub('', txt)
print(matches)

I am a teacher and  I love teaching. There is nothing as rewarding as educating and empowering people.
         I found teaching more interesting than any other job. Does this motivate you to be a teacher?


In [27]:
# re.subn(): like re.sub, but returns a 2-tuple holding the new string and the
# number of substitutions made.
string = 'abc 12\n de 23 \n f45 6'

# matching all whitespace characters (raw string: '\s' in a plain literal is an
# invalid Python escape sequence)
pattern = r'\s+'

# empty string
replace = ''

new_string = re.subn(pattern, replace, string)
print(new_string)

('abc12de23f456', 5)


In [28]:
# re.search(): returns None when there is no match.
# match = re.search(pattern, str)

string = 'Python is fun'

# check if Python is at beginning (raw string: '\A' in a plain literal is an
# invalid Python escape sequence)
match = re.search(r'\APython', string)

if match:
    print('Pattern found inside the string')
else:
    print('Pattern not found')

Pattern found inside the string


In [29]:
# Locate the word "first" (case-insensitively) and slice it back out of the text.
txt = '''Python is the most beautiful language that a human being has ever created.
         I recommend python for a first programming language.'''

# search() returns a Match object carrying both the span and the matched text
match = re.search('first', txt, flags=re.IGNORECASE)
print(match)

# (start, end) offsets of the match inside txt
span = match.span()
print(span)

start, end = match.start(), match.end()
print(start, end)

# slicing the text with those offsets recovers the matched substring
substring = txt[start:end]
print(substring)

<re.Match object; span=(109, 114), match='first'>
(109, 114)
109 114
first


In [30]:
# re.match only succeeds when the pattern matches at the very start of the string.
txt = 'I love to teach python and javascript'

match = re.match('I love to teach', txt, flags=re.IGNORECASE)
print(match)

# (start, end) offsets of the match inside txt
span = match.span()
print(span)

start, end = match.start(), match.end()
print(start, end)

# slicing the text with those offsets recovers the matched prefix
substring = txt[start:end]
print(substring)

<re.Match object; span=(0, 15), match='I love to teach'>
(0, 15)
0 15
I love to teach


In [31]:
# The text starts with "I love", not "I like", so re.match finds nothing and returns None.
txt = 'I love to teach python and javascript'

match = re.compile('I like to teach', flags=re.IGNORECASE).match(txt)
print(match)

None


In [32]:
# Capture groups: three digits, a space, then two digits, each digit run in its own group.
string = '39801 356 2102 1111'

# output: 801 35
# pattern = '[0-9]{3} [0-9]{2}'
# Raw string: '\d' in a plain literal is an invalid Python escape sequence.
pattern = r'(\d{3}) (\d{2})'
match = re.search(pattern, string)

# group() with no argument returns the whole match
print(match.group())

801 35


In [33]:
# Fetch capture groups 1 and 2 together as a tuple.
match.group(1,2)

('801', '35')

In [34]:
# Tuple of all capture groups of the match.
match.groups()

('801', '35')

In [35]:
# Index in the searched string at which the whole match begins.
match.start()

2

In [36]:
# Index in the searched string just past the end of the whole match.
match.end()

8

In [37]:
# (start, end) pair for the whole match.
match.span()

(2, 8)

In [38]:
# The original string that was passed to the search.
match.string

'39801 356 2102 1111'

In [39]:
# The string holds a real newline and a real carriage return; the character
# class [\n\r] picks out exactly those two characters.
string = '\n  and \r are escape sequences'
result = re.compile(r'[\n\r]').findall(string)
print(result)

['\n', '\r']


In [40]:
# A character set for the first letter accepts both "Apple" and "apple".
txt = 'Apple and banana are fruits. An old cliche says that an apple a day keeps the doctor away has been replaced by banana a day keeps the doctor far far away.'
regex_pattern = r'[Aa]pple'

matches = re.compile(regex_pattern).findall(txt)
print(matches)

['Apple', 'apple']


In [41]:
# Alternation between the two fruit names. The spaces on either side of | are
# literal parts of the pattern, so each match carries an adjacent space.
txt = 'Apple and banana are fruits. An old cliche says that an apple a day keeps the doctor away has been replaced by banana a day keeps the doctor far far away.'
regex_pattern = r'[Aa]pple | [Bb]anana'

matches = re.compile(regex_pattern).findall(txt)
print(matches)

['Apple ', ' banana', 'apple ', ' banana']


In [42]:
# \d without a quantifier matches one digit at a time, so each digit is a separate item.
txt = 'Hawking born on 8 January 1942 and died on 14 March 2018 Einstein\'s Birth Anniversary (Pi-Day) and both died at 76.'
regex_pattern = r'\d'
print(txt)
matches = re.compile(regex_pattern).findall(txt)
print(matches)

Hawking born on 8 January 1942 and died on 14 March 2018 Einstein's Birth Anniversary (Pi-Day) and both died at 76.
['8', '1', '9', '4', '2', '1', '4', '2', '0', '1', '8', '7', '6']


In [43]:
# \d+ matches whole runs of digits, so every number comes back in one piece.
txt = 'Hawking born on 8 January 1942 and died on 14 March 2018 Einstein\'s Birth Anniversary (Pi-Day) and both died at 76.'
regex_pattern = r'\d+'

matches = re.compile(regex_pattern).findall(txt)
print(matches)

['8', '1942', '14', '2018', '76']


In [44]:
# . stands for any single character except a newline;
# here: a lower-case 'a' followed by exactly one more character.
txt = '''Apples and Bananas are fruits.'''
regex_pattern = r'[a].'

matches = re.compile(regex_pattern).findall(txt)
print(matches)

['an', 'an', 'an', 'as', 'ar']


In [45]:
# .+ is any character repeated one or more times; it is greedy, so the match
# runs from the first lower-case 'a' to the end of the line.
txt = '''Apples and Bananas are fruits.'''
regex_pattern = r'[a].+'

matches = re.compile(regex_pattern).findall(txt)
print(matches)

['and Bananas are fruits.']


In [46]:
# .* is any character repeated zero or more times, so a lone 'a' would also match.
txt = '''Apples and Bananas are fruits.'''
regex_pattern = r'[a].*'

matches = re.compile(regex_pattern).findall(txt)
print(matches)

['and Bananas are fruits.']


In [47]:
# Accept the different spellings of "email": either case for the first letter,
# with or without a hyphen after it.
txt = '''I am not sure if there is a convention how to write the word email.
         Some people write it as email others may write it as Email or E-mail.'''

# the ? makes the preceding '-' optional
regex_pattern = r'[Ee]-?mail'
matches = re.compile(regex_pattern).findall(txt)
print(matches)

['email', 'email', 'Email', 'E-mail']


In [48]:
# {4} demands exactly four digits in a row, which picks out the years.
txt = 'Hawking born on 8 January 1942 and died on 14 March 2018 Einstein\'s Birth Anniversary (Pi-Day) and both died at 76.'
regex_pattern = r'\d{4}'

matches = re.compile(regex_pattern).findall(txt)
print(matches)

['1942', '2018']


In [49]:
# {1,4} accepts between one and four digits in a row.
txt = 'Hawking born on 8 January 1942 and died on 14 March 2018 Einstein\'s Birth Anniversary (Pi-Day) and both died at 76.'
regex_pattern = r'\d{1,4}'

matches = re.compile(regex_pattern).findall(txt)
print(matches)

['8', '1942', '14', '2018', '76']


In [50]:
# Outside a character set, ^ anchors the pattern to the start of the string.
txt = 'Hawking born on 8 January 1942 and died on 14 March 2018 Einstein\'s Birth Anniversary (Pi-Day) and both died at 76.'
regex_pattern = r'^Hawking'

matches = re.compile(regex_pattern).findall(txt)
print(matches)

['Hawking']


In [51]:
# Inside a character set, a leading ^ negates it: this matches runs of
# characters that are neither ASCII letters nor spaces.
txt = 'Hawking born on 8 January 1942 and died on 14 March 2018 Einstein\'s Birth Anniversary (Pi-Day) and both died at 76.'
regex_pattern = r'[^A-Za-z ]+'

matches = re.compile(regex_pattern).findall(txt)
print(matches)

['8', '1942', '14', '2018', "'", '(', '-', ')', '76.']
