In [None]:
# A regular expression (regex) is a special sequence of characters that defines a search pattern used to find a string or a set of strings
# a pattern can be split into one or more subpatterns
# import re

# \ = escapes a metacharacter (drops its special meaning) or introduces a special sequence
# [] = character class: matches any one character from the enclosed set
# ^ = matches the beginning of the string
# . = matches any character except \n
# | = OR operator
# ? = matches zero or one occurrence
# * = zero or more occurrences
# + = one or more occurrences
# {} = number of occurrences of the preceding regex
# () = encloses a group of regex

# Special Sequences:
    # \A = matches only at the beginning of the string (the string must begin with the given regex)
    # \b(string) = check the beginning of the word; (string)\b = check for the ending of the word
    # \B = opposite of \b: matches only where there is no word boundary (not at the beginning or end of a word)
    # \d = any decimal digit
    # \D = any non-digit character
    # \s = matches any whitespace character
    # \S = any non-whitespace character
    # \w = any alphanumeric character or underscore
    # \W = any character that is not alphanumeric or underscore
    # \Z = matches only at the end of the string (the string must end with the given regex)
    
# Functions in regex:
    # re.findall()
    # re.compile() = compiles a regex into a reusable pattern object
    # re.split()
    # re.sub()
    # re.subn()
    # re.escape()
    # re.search()
    # Match Object
    
# Regex Sets: []
    # [atx]
    # [a-h]
    # [^atx]
    # [0123] = matches any one of the digits 0, 1, 2 or 3
    # [0-9]
    # [0-7][0-9]
    # [a-z][A-Z]

In [1]:
import re

In [2]:
# Plain literal pattern: it contains no regex metacharacters
pattern = 'term1'

# Sentence that will be scanned for the pattern
text = 'This is a string with term1, but it does not have the other tern'

# re.search scans the text and yields a Match object for the first occurrence
match = re.search(pattern=pattern, string=text)

match

<re.Match object; span=(22, 27), match='term1'>

In [3]:
# the object returned by re.search above is an re.Match instance
type(match)

re.Match

In [4]:
# index in the text at which the matched substring begins
match.start()

22

In [5]:
# index just past the last matched character (the end of the span is exclusive)
match.end()

27

In [6]:
# re.split: break a string apart at every occurrence of a pattern

# separator to split on
split_term = '@'

# the part after the '@' is the e-mail domain
phrase = 'what is the domain name of someone with email: hello@gmail.com'
re.split(pattern=split_term, string=phrase)

['what is the domain name of someone with email: hello', 'gmail.com']

In [7]:
# re.findall: collect every non-overlapping occurrence of the pattern into a list

re.findall(pattern='match', string='test phrase match is in middle')

['match']

In [8]:
def multi_re_find(patterns,phrase):
    """Run each regex in ``patterns`` against ``phrase`` and print the results.

    For every pattern, in order, this prints a header line showing the
    pattern's repr, then the list produced by ``re.findall``, then a
    blank separator. Nothing is returned.

    Args:
        patterns: iterable of regex patterns to try.
        phrase: the text that is searched with each pattern.
    """
    for regex in patterns:
        # Header goes out first, before the search itself is attempted.
        header = 'Searchin the phrase using recheck: %r' % regex
        print(header)

        hits = re.findall(regex, phrase)
        print(hits)

        # print() appends its own newline, so this leaves two blank lines.
        print('\n')

In [9]:
# Phrase to scan: runs of 's' and 'd' separated by dots
tph = 'sdsd..sssddd...sdddsddd...dsds...dsssss...sdddd'

# Repetition quantifiers applied to the 'd' that follows an 's'
tpt = [
    'sd*',      # an s followed by zero or more d's
    'sd+',      # an s followed by one or more d's
    'sd?',      # an s followed by zero or one d
    'sd{3}',    # an s followed by exactly three d's
    'sd{2,3}',  # an s followed by two to three d's
]

multi_re_find(patterns=tpt, phrase=tph)

Searchin the phrase using recheck: 'sd*'
['sd', 'sd', 's', 's', 'sddd', 'sddd', 'sddd', 'sd', 's', 's', 's', 's', 's', 's', 'sdddd']


Searchin the phrase using recheck: 'sd+'
['sd', 'sd', 'sddd', 'sddd', 'sddd', 'sd', 'sdddd']


Searchin the phrase using recheck: 'sd?'
['sd', 'sd', 's', 's', 'sd', 'sd', 'sd', 'sd', 's', 's', 's', 's', 's', 's', 'sd']


Searchin the phrase using recheck: 'sd{3}'
['sddd', 'sddd', 'sddd', 'sddd']


Searchin the phrase using recheck: 'sd{2,3}'
['sddd', 'sddd', 'sddd', 'sddd']




In [10]:
# Character sets: [...] matches any ONE of the enclosed characters,
# so it behaves like an OR between them
# test phrase
tph = 'sdsd..sssddd...sdddsddd...dsds...dsssss...sdddd'
tpt = [
    '[sd]',    # a single character that is either s or d
    's[sd]+',  # an s followed by one or more characters that are s or d
]

multi_re_find(patterns=tpt, phrase=tph)

Searchin the phrase using recheck: '[sd]'
['s', 'd', 's', 'd', 's', 's', 's', 'd', 'd', 'd', 's', 'd', 'd', 'd', 's', 'd', 'd', 'd', 'd', 's', 'd', 's', 'd', 's', 's', 's', 's', 's', 's', 'd', 'd', 'd', 'd']


Searchin the phrase using recheck: 's[sd]+'
['sdsd', 'sssddd', 'sdddsddd', 'sds', 'sssss', 'sdddd']




In [11]:
# Exclusion: a leading ^ inside [] matches any single character NOT in the set;
# here the text is cut into runs that contain none of '!', '.' or '?'
tph = 'This is a string! But it has punctuation. How can we remove it?'
re.findall(pattern='[^!.?]+', string=tph)

['This is a string', ' But it has punctuation', ' How can we remove it']

In [12]:
# Character ranges [start-end]: e.g. [a-f] matches any one letter between a and f
tph = 'This an example sentence. Lets see if we can find some letters.'
tpt = [
    '[a-z]+',       # run of lower-case letters
    '[A-Z]+',       # run of upper-case letters
    '[a-zA-Z]+',    # run of letters of either case
    '[A-Z][a-z]+',  # one upper-case letter followed by lower-case letters
]

multi_re_find(patterns=tpt, phrase=tph)

Searchin the phrase using recheck: '[a-z]+'
['his', 'an', 'example', 'sentence', 'ets', 'see', 'if', 'we', 'can', 'find', 'some', 'letters']


Searchin the phrase using recheck: '[A-Z]+'
['T', 'L']


Searchin the phrase using recheck: '[a-zA-Z]+'
['This', 'an', 'example', 'sentence', 'Lets', 'see', 'if', 'we', 'can', 'find', 'some', 'letters']


Searchin the phrase using recheck: '[A-Z][a-z]+'
['This', 'Lets']




In [14]:
# Escape codes match whole classes of characters: digits, non-digits,
# whitespace, non-whitespace, word characters and non-word characters
tph = 'This is a string with some numbers 1233 and a symbol #hashtag'
tpt = [
    r'\d+',  # run of digits
    r'\D+',  # run of non-digits
    r'\s',   # a single whitespace character (no + quantifier here)
    r'\S+',  # run of non-whitespace characters
    r'\w+',  # run of alphanumeric characters
    r'\W+',  # run of non-alphanumeric characters
]

multi_re_find(patterns=tpt, phrase=tph)

Searchin the phrase using recheck: '\\d+'
['1233']


Searchin the phrase using recheck: '\\D+'
['This is a string with some numbers ', ' and a symbol #hashtag']


Searchin the phrase using recheck: '\\s'
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']


Searchin the phrase using recheck: '\\S+'
['This', 'is', 'a', 'string', 'with', 'some', 'numbers', '1233', 'and', 'a', 'symbol', '#hashtag']


Searchin the phrase using recheck: '\\w+'
['This', 'is', 'a', 'string', 'with', 'some', 'numbers', '1233', 'and', 'a', 'symbol', 'hashtag']


Searchin the phrase using recheck: '\\W+'
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' #']


