# Pattern matching

In [1]:
import re
# Collect every non-overlapping occurrence of the literal pattern as a list.
pattern, text = 'abc', 'kasdcfghikabcrjklabccaabcc'
re.findall(pattern, text)

['abc', 'abc', 'abc']

In [3]:
# finditer yields match objects lazily; print the matched text of each one,
# one per line.
iterator = re.finditer('abc', 'kasdcfghikabcrjklabccaabcc')
print(*(match.group() for match in iterator), sep='\n')

abc
abc
abc


## Square brackets [], 
#### Used to pass a range such as [a-z], or a set of characters such as [abcd]; each listed character is matched individually

In [4]:
# A character class matches any ONE of the listed characters at a time.
re.compile('[abc]').findall('kasdcfghikabcrjklabccaabcc')

['a', 'c', 'a', 'b', 'c', 'a', 'b', 'c', 'c', 'a', 'a', 'b', 'c', 'c']

In [5]:
# A range inside a class: every single character between 'a' and 'e'.
[hit.group() for hit in re.finditer('[a-e]', 'kasdcfeghikeabcrjkelabccaabcc')]

['a',
 'd',
 'c',
 'e',
 'e',
 'a',
 'b',
 'c',
 'e',
 'a',
 'b',
 'c',
 'c',
 'a',
 'a',
 'b',
 'c',
 'c']

In [6]:
# Count the digits in the text by tallying one per match.
sum(1 for _ in re.finditer('[0-9]', '1556189342@janhavik$###!'))

10

In [7]:
# Does the text contain a run of four consecutive digits?
# The {4} quantifier repeats the preceding class exactly four times, which
# replaces writing '[0-9]' out by hand four times.
m = re.findall('[0-9]{4}', '15561@janhavik$###!89342')
# An empty list is falsy, so bool(m) is True exactly when a match was found.
print(bool(m))

True


In [8]:
# Inside a character class '.' is a literal dot (not "any character"), and
# repeating it in the class adds nothing: this finds each '.' on its own.
re.compile('[....]').findall('abcd1234***1256e3##>>>....99hjk....poosoow......')

['.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.']

## Caps ^ 
**It represents a complement, as in set theory: `[^123]` matches every character except 1, 2 and 3. The position matters: `^` negates the class only when it is the first character inside the brackets. Anywhere else inside the brackets (for example at the end) it is just a literal `^`, so the class matches only the characters listed, plus `^` itself.**

In [9]:
# Print only the documents that contain none of the listed digits/symbols.
# The '^' here is at the END of the class, so it is a literal caret.
doc = ['abcd:;,./#%*&@1!!3wt2','y2y2oindsd1i92essax=={{]]','..hdwhs282!!???++===','avgdbjukjk']
for d in doc:
    m = re.findall('[#%*0-9#@!&{:;,.?=+^]', d)
    if m:
        # at least one forbidden character was found: skip this document
        continue
    print(d)

avgdbjukjk


In [10]:
# Alternative: the negated class matches every character that is NOT a listed
# digit/symbol. A document is clean when every one of its characters matches.
doc = ['abcd:;,./#%*&@1!!3wt2','y2y2oindsd1i92essax=={{]]','..hdwhs282!!???++===','avgdbjukjk']
for d in doc:
    m = re.findall('[^#%*0-9#@!&{:;,.?=+]', d)
    if len(m) == len(d):
        print(d)

avgdbjukjk


In [11]:
# Alternative: same class as the first version but without the literal '^'.
# A document is printed when the search finds no forbidden character in it.
doc = ['abcd:;,./#%*&@1!!3wt2','y2y2oindsd1i92essax=={{]]','..hdwhs282!!???++===','avgdbjukjk']
for d in doc:
    m = re.findall('[#%*0-9#@!&{:;,.?=+]', d)
    if len(m) == 0:
        print(d)

avgdbjukjk


## Backslash '\\'

In [6]:
# Count backslashes in documents written as ordinary (non-raw) literals.
# The document literals are left non-raw on purpose: this cell demonstrates
# that Python collapses '\\' to a single backslash before the regex sees it.
doc = ['abcd:;,.\#%*&@1!!3wt2\pklm','y2y2oinds\d1i\/\/\/\//92es\\sax=={{]]','..hdwhs\\282!!?\\??++=\==','avgdbjukjk']
# Raw pattern: a class containing one escaped backslash. The earlier
# '[\\\]' produced the same text only via the invalid escape '\]', which
# newer Python versions warn about.
s = r'[\\]'
for d in doc:
    m = re.findall(s, d)
    print(f"There are {len(m)} backslash in {d}")

There are 2 backslash in abcd:;,.\#%*&@1!!3wt2\pklm
There are 6 backslash in y2y2oinds\d1i\/\/\/\//92es\sax=={{]]
There are 3 backslash in ..hdwhs\282!!?\??++=\==
There are 0 backslash in avgdbjukjk


**In an ordinary string literal two consecutive backslashes (`\\`) represent a single backslash, which is why the third string contains only 3 backslashes in total. Similarly `\\\` counts as 2 backslashes. To avoid this while counting backslashes, prefix the string with `r` to make it a raw string: the backslash is then treated as a regular character rather than a special character.**

In [7]:
# Same count, but the documents are raw strings, so every backslash typed in
# the literal is really present in the string.
doc = [r'abcd:;,.\n#%*&@1!!3wt2\pklm',r'y2y2oinds\td1i\/\/\/\//92es\\sax=={{]]',r'..hdwhs\\282!!?\\??++=\==',r'avgdbjukjk']
# Raw pattern for "one backslash" (replaces '[\\\]', which depended on the
# invalid escape '\]').
s = r'[\\]'
for d in doc:
    m = re.findall(s, d)
    print(f"There are {len(m)} backslash in {d}")

There are 2 backslash in abcd:;,.\n#%*&@1!!3wt2\pklm
There are 7 backslash in y2y2oinds\td1i\/\/\/\//92es\\sax=={{]]
There are 5 backslash in ..hdwhs\\282!!?\\??++=\==
There are 0 backslash in avgdbjukjk


### Count Backslash '\' and square bracket '[]' in the document

In [12]:
# Count backslashes AND square brackets in each document.
doc = [r'abcd:;,.\n#%*&[@1!!3wt2\]pklm',r'y2y2[[[[oind]]]]s\td1i\/\/\/\//92es\\sax=={{]]',r'..hdwhs\\282[[]]]!!?\\??++=\==',r'avgdbjukjk']
# Raw pattern: a class holding an escaped backslash, '[' and ']'.
# (Replaces '[\\\[\]]', which relied on the invalid escapes '\[' and '\]'.)
s = r'[\\\[\]]'
for d in doc:
    m = re.findall(s, d)
    # NOTE: the label says "backslash" but the count also includes brackets;
    # the wording is kept so the cell output stays as recorded.
    print(f"There are {len(m)} backslash in {d}")

There are 4 backslash in abcd:;,.\n#%*&[@1!!3wt2\]pklm
There are 17 backslash in y2y2[[[[oind]]]]s\td1i\/\/\/\//92es\\sax=={{]]
There are 10 backslash in ..hdwhs\\282[[]]]!!?\\??++=\==
There are 0 backslash in avgdbjukjk


In [13]:
# Same count with the class members in a different order (brackets first),
# showing that the order inside a character class does not matter.
doc = [r'abcd:;,.\n#%*&[@1!!3wt2\]pklm',r'y2y2[[[[oind]]]]s\td1i\/\/\/\//92es\\sax=={{]]',r'..hdwhs\\282[[]]]!!?\\??++=\==',r'avgdbjukjk']
# Raw pattern (replaces '[\[\]\\\]', which relied on invalid escapes).
s = r'[\[\]\\]'
for d in doc:
    m = re.findall(s, d)
    # NOTE: the label says "backslash" but the count also includes brackets;
    # the wording is kept so the cell output stays as recorded.
    print(f"There are {len(m)} backslash in {d}")

There are 4 backslash in abcd:;,.\n#%*&[@1!!3wt2\]pklm
There are 17 backslash in y2y2[[[[oind]]]]s\td1i\/\/\/\//92es\\sax=={{]]
There are 10 backslash in ..hdwhs\\282[[]]]!!?\\??++=\==
There are 0 backslash in avgdbjukjk


In [22]:
# Count all the occurrences of the literal text \section.
# Raw strings keep every backslash exactly as typed, so nothing depends on
# invalid escape sequences such as '\s' or '\ '. The string values are
# identical to the non-raw originals.
s = r'\\section'  # regex: one literal backslash followed by "section"
doc = [r'I was in a meeting where we discussed this \section from \\\ the document.',
       r'It becomes difficult to handle these \section \section \ all the time \section.']
for d in doc:
    m = len(re.findall(s, d))
    print(rf'Number of occurences of \section are {m}')

Number of occurences of \section are 1
Number of occurences of \section are 3


In [21]:
# Show the text the regex engine actually receives: two backslashes followed
# by "section". A raw string avoids the invalid escape '\s'.
print(r'\\section')

\\section


In [33]:
# Alternative: put the backslash in a character class, then "section".
# Raw strings keep every backslash as typed; the string values are identical
# to the non-raw originals.
s = r'[\\]section'
doc = [r'I was in a meeting where we discussed this \section from \\\ the document.',
       r'It becomes difficult to handle these \section \section \ all the time \section.']
for d in doc:
    m = len(re.findall(s, d))
    print(rf'Number of occurrences of \section are {m}')

Number of occurrences of \section are 1
Number of occurrences of \section are 3


### Count spaces, digits, underscores in the document
- `\d` - Represents the digit class [0-9]
- `\s` - Represents whitespace characters (space, tab, newline, ...)
- `\w` - Represents the word-character class [a-zA-Z0-9_]

In [34]:
# Find every whitespace character, digit or underscore in each document.
# Raw strings are used so '\s' and '\d' reach the regex engine untouched; the
# document values are identical to the non-raw originals.
s = r'[\s\d_]'
doc = [r'I was 1567inameetingw00here_wediscussed_this\section_ from \\\ the document.',
       r'It 098865_becomes _difficult to handle_ these \section \section \ all the time \section.']
for d in doc:
    m = re.findall(s, d)
    print(f'Number of occurrences are {len(m)} and they are {m}')

Number of occurrences are 15 and they are [' ', ' ', '1', '5', '6', '7', '0', '0', '_', '_', '_', ' ', ' ', ' ', ' ']
Number of occurrences are 21 and they are [' ', '0', '9', '8', '8', '6', '5', '_', ' ', '_', ' ', ' ', '_', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']


In [31]:
# Whitespace/digit/underscore class, defined here so this cell does not rely
# on a variable left behind by another cell.
s = r'[\s\d_]'
# '\n' is a real newline here, and a newline counts as whitespace for \s.
d = 'R_m tkl\njk'
m = len(re.findall(s, d))
print(f'Number of occurences are {m}')

Number of occurences are 3


## Meta Character '*'
Used to match repetition: the preceding element may occur zero or more times.

In [37]:
# 'Yahoo' followed by zero or more extra o's, i.e. at least two o's overall.
s = 'Yahooo*'
doc = ['Meena Menaaaaaaaa Menaaaaaaaaaaaaaaaaaaaaaa', 'Yaho Yahoo Yahooooo Yahoooooooooooo']
find_all = re.compile(s).findall
for d in doc:
    m = find_all(d)
    print(f'Number of occurrences are {len(m)} and they are {m}')

Number of occurrences are 0 and they are []
Number of occurrences are 3 and they are ['Yahoo', 'Yahooooo', 'Yahoooooooooooo']


In [46]:
# Find every identifier-like substring (a letter or underscore followed by
# word characters) and report the longest one(s) in each document.
s = r'[_a-zA-Z]\w*'  # raw string: '\w' is passed to the regex engine as-is
doc = ['134256gsjhsbjcbwhdiuhw_1y87y82wedkjasnxkziijoo', '_2hewiuqhiu  quwhiu723y78wqyuha&&87892w8e', '35247615765uyqsgqga9`1wa&*(79W8179)']
for d in doc:
    m = re.findall(s, d)
    # default=0 keeps this working when a document has no match at all
    # (max() of an empty sequence would otherwise raise ValueError).
    k = max((len(i) for i in m), default=0)
    r = [i for i in m if len(i) == k]
    print(f'Longest substring is {r} with length {k} ')
    print(m)

Longest substring is ['gsjhsbjcbwhdiuhw_1y87y82wedkjasnxkziijoo'] with length 40 
['gsjhsbjcbwhdiuhw_1y87y82wedkjasnxkziijoo']
Longest substring is ['quwhiu723y78wqyuha'] with length 18 
['_2hewiuqhiu', 'quwhiu723y78wqyuha', 'w8e']
Longest substring is ['uyqsgqga9'] with length 9 
['uyqsgqga9', 'wa', 'W8179']


In [53]:
# HW: check whether each string is a valid function or variable name.
s = r'[_a-zA-Z]\w*'  # a letter/underscore followed by any word characters


def is_valid_identifier(name, pattern=s):
    """Return True when the WHOLE of `name` matches the identifier pattern.

    re.fullmatch returns None when nothing matches, so strings with no
    identifier-like part at all (for example '123' or '') are reported as
    invalid instead of raising IndexError on an empty findall() result.
    """
    return re.fullmatch(pattern, name) is not None


doc = ['13iijoo', '_2hewiuqhiu', '35&*(79W8179)', 'john', 'k123']
for d in doc:
    if is_valid_identifier(d):
        print('OK')
    else:
        print('syntax error')

syntax error
OK
syntax error
OK
OK


## '+' meta character
Unlike the asterisk, `+` matches the preceding character from 1 to infinitely many times, i.e. the character is required at least once.

In [2]:
# 'Yaho' followed by one or more o's: 'Yaho' alone does not qualify.
s = 'Yahoo+'
d = 'YahooYahooooooooooooYahoYahooooo'
m = re.compile(s).findall(d)
print(m)

['Yahoo', 'Yahoooooooooooo', 'Yahooooo']


## '?' meta character
Find the repeated pattern from 0 to 1 times

In [4]:
# 'h?' makes the h optional; exactly two o's are then required.
s = 'Yah?oo'
d = 'YahooYahooooooooooooYahoYahoooooYaooooYaoYaooooooo'
m = [hit.group() for hit in re.finditer(s, d)]
print(m)

['Yahoo', 'Yahoo', 'Yahoo', 'Yaoo', 'Yaoo']


**Here we can see that each match is truncated: the pattern takes the optional h and then clips the match at exactly 2 o's.**

In [5]:
# To avoid truncating the match, let the final 'o' repeat with '+'.
s = 'Yah?oo+'
d = 'YahooYahooooooooooooYahoYahoooooYaooooYaoYaooooooo'
m = re.compile(s).findall(d)
print(m)

['Yahoo', 'Yahoooooooooooo', 'Yahooooo', 'Yaoooo', 'Yaooooooo']


## '{}' Curly brackets meta character
Used to find the patterns repeated within a specific range say m to n as {m,n}

In [10]:
# {2,5}: runs of 2 to 5 letters, taken greedily (as long as possible first).
s = '[a-zA-Z]{2,5}'
d = 'YahooYahooooooooooooYahoYahoooooYaooooYaoYaooooooo'
m = [hit.group() for hit in re.finditer(s, d)]
print(m)

['Yahoo', 'Yahoo', 'ooooo', 'ooooo', 'YahoY', 'ahooo', 'ooYao', 'oooYa', 'oYaoo', 'ooooo']


In [16]:
# Runs of 2 to 5 digits. The shorthand class \d (written as a raw string,
# r'\d{2,5}') is another way to express the same digit range.
d = '123455hkwhgsjaudkqahwk87238174873664987239edngucatsyi7eugcbmsbdcmbk123yrhjsgcmsbcjqwhr8813218qeuhdsfkxcnb ms&&rr6598789???/ku'
s = '[0-9]{2,5}'
m = re.compile(s).findall(d)
print(m)

['12345', '87238', '17487', '36649', '87239', '123', '88132', '18', '65987', '89']


## Pattern Objects using regular expressions or regex

In [3]:
# Compile once, reuse many times: 'a', any number of digits, then 'b',
# ignoring letter case. Raw string so '\d' is not an invalid escape.
p = re.compile(r'a\d*b', re.IGNORECASE)
p.findall('a234122b23456Abghjgjhjfh')

['a234122b', 'Ab']

## Match and Search
- match() : Determine if re matches at the beginning of the string.
- search() : Scan through the string to find the location where the string matches.
- Both return only the first occurrence of the pattern (or `None` when there is no match)

In [24]:
# Count the whitespace characters at the beginning of the document.
# The document is a normal (non-raw) literal on purpose: '\t' must be a tab.
d = '    \t \t\t\t  \t  gjasjyduyeuyiqwyeoihiu7eiuwh9874695641298uehdas##   .'
# Raw pattern so '\s' is passed to the regex engine untouched. '*' allows
# zero repetitions, so match() always succeeds here, even with no leading
# whitespace.
p = re.compile(r'\s*')
m = p.match(d)
# m.end() is the index just past the leading whitespace, i.e. its length.
print(m.end())
print(type(p))
print(d[m.end():])

14
<class 're.Pattern'>
gjasjyduyeuyiqwyeoihiu7eiuwh9874695641298uehdas##   .


In [30]:
# Locate the first run of whitespace anywhere in the document.
# The document is a normal (non-raw) literal on purpose: '\t' must be a tab.
d = 'wdhewgd    \t \t\t\t  \t  gjasjyduyeuyiqwyeoihiu7eiuwh9874695641298uehdas##   .'
# Raw pattern so '\s' is passed to the regex engine untouched.
p = re.compile(r'\s+')
# search() scans forward from the start; match() would only test index 0.
m = p.search(d)
print(m.start())
print(m.end())
print(type(p))
print(d[m.end():])

7
21
<class 're.Pattern'>
gjasjyduyeuyiqwyeoihiu7eiuwh9874695641298uehdas##   .


In [79]:
# List all the locations where a valid variable name exists.
s = r'[_a-zA-Z]\w*'  # a letter/underscore followed by any word characters


def find_identifier_spans(text, pattern=s):
    """Return the (start, end) span of every identifier-like match in `text`.

    The span is read directly from each match object. Re-compiling the
    matched text and searching for it again would report the FIRST place
    that text occurs, which is wrong whenever the same text also appears
    earlier in the document (for example inside a longer name).
    """
    return [hit.span() for hit in re.finditer(pattern, text)]


doc = ['134256gsjhsbjcbwhdiuhw_1y87y82wedkjasnxkziijoo', '_2hewiuqhiu  quwhiu723y78wqyuha&&87892w8e', '35247615765uyqsgqga9`1wa&*(79W8179)']
for d in doc:
    print(f'In document {d} following are location of the valid variable names')
    for span in find_identifier_spans(d, s):
        print(span)

In document 134256gsjhsbjcbwhdiuhw_1y87y82wedkjasnxkziijoo following are location of the valid variable names
(6, 46)
In document _2hewiuqhiu  quwhiu723y78wqyuha&&87892w8e following are location of the valid variable names
(0, 11)
(13, 31)
(38, 41)
In document 35247615765uyqsgqga9`1wa&*(79W8179) following are location of the valid variable names
(11, 20)
(22, 24)
(29, 34)


In [83]:
s = r'[_a-zA-Z]\w*'  # raw string: '\w' is passed to the regex engine as-is
# The three pieces are joined into ONE string (adjacent literals were
# concatenated implicitly before), so a name may run across the boundary
# between two pieces and spans are offsets into the combined text.
doc = ''.join([
    '134256gsjhsbjcbwhdiuhw_1y87y82wedkjasnxkziijoo',
    '_2hewiuqhiu  quwhiu723y78wqyuha&&87892w8e',
    '35247615765uyqsgqga9`1wa&*(79W8179)',
])
# If s did not list A-Z explicitly, flags=re.IGNORECASE would be needed.
p = re.compile(s)
match = p.finditer(doc)
for m in match:
    print(m.span())

(6, 57)
(59, 77)
(84, 107)
(109, 111)
(116, 121)


## Logical Meta Character or '|'
It is usually used to join regular expressions

In [87]:
# Count square brackets OR backslashes by joining two patterns with '|'.
# In this non-raw literal every '\\' pair is one real backslash; the last run
# is written with 8 backslashes (4 real ones) so it no longer ends in the
# invalid escape '\['. The string value is unchanged.
doc = ('sfaGSYUQWGE12-0312||\\||||||\\\\\\\\\\\\[]]DBSJBDJWEDJHWED[JDGHGDE]GWGEU272Y874682734YRHDBX3233\\\\\\\\[[[]]]wfsahjzxv      ')
s = r'[\[\]]'  # either square bracket
r = r'[\\]'    # a single backslash
p = re.compile(s + '|' + r)
print(len(p.findall(doc)))

22


## The meta character hat i.e '^', 
Inside a character class such as `[^a-z0-9]` the caret acts as a complement. Placed at the start of a pattern instead, it anchors the match, so the regular expression only matches strings, sentences or documents that begin with it.

In [90]:
# '^From' only matches when the text starts with 'From'.
d1 = 'From time to time she was instructed by her mentor to study hard.'
d2 = 'Time to time she was instructed by her mentor to study hard.'
s = '^From'
for text in (d1, d2):
    print(re.search(s, text))

<re.Match object; span=(0, 4), match='From'>
None


In [91]:
# Same anchor with findall: one hit for d1, an empty list for d2.
d1 = 'From time to time she was instructed by her mentor to study hard.'
d2 = 'Time to time she was instructed by her mentor to study hard.'
s = '^From'
for text in (d1, d2):
    print(re.findall(s, text))

['From']
[]


## Dollar Sign as a meta character '$'
It checks for a match at the end of the document or sentence, or just before a newline (`\n`) that ends the string.

In [93]:
# '}$' needs '}' to be the last character, or the last one before a final '\n'.
d1 = '{block} '
d2 = '{block}'
d3 = '{block}\n'
s = '}$'
for text in (d1, d2, d3):
    print(re.search(s, text))

None
<re.Match object; span=(6, 7), match='}'>
<re.Match object; span=(6, 7), match='}'>


## () round brackets/paranthesis Meta character 
- Used to treat a sequence of characters as a single group, e.g. 'the', so that a quantifier applies to the whole group

In [106]:
# One or more back-to-back repetitions of 'the', in any letter case.
s = re.compile('(the)+', re.IGNORECASE)
d = 'Thethe hotel the room was so dirty that they wished thethethethe would have opted for some other place thethethe.'
for m in s.finditer(d):
    # group 0 is the whole match; (start, end) is the same tuple as span()
    print((m.start(), m.end()), m.group(0))

(0, 6) Thethe
(13, 16) the
(40, 43) the
(52, 64) thethethethe
(92, 95) the
(103, 112) thethethe


# Pattern or String replacement or modification

## split()
To divide the string depending upon the given regular expression

In [109]:
# Cut the string at every comma.
d = 'Mango,Orange,Apple,Chikoo,Banana,Pineapple'
m = re.compile(',').split(d)
m

['Mango', 'Orange', 'Apple', 'Chikoo', 'Banana', 'Pineapple']

In [110]:
# Cut the string at every single space.
d = 'Mango Orange Apple Chikoo Banana Pineapple'
m = re.compile(' ').split(d)
m

['Mango', 'Orange', 'Apple', 'Chikoo', 'Banana', 'Pineapple']

In [111]:
# Cut at every 'o' or 'O'; two adjacent separators leave an empty string
# between them.
s = re.compile('o', re.IGNORECASE)
d = 'MangoOrangeAppleChikooBananaPineapple'
m = re.split(s, d)
m

['Mang', '', 'rangeAppleChik', '', 'BananaPineapple']

### \W is used to represent a class without a-zA-Z0-9_

In [114]:
# Split on every run of non-word characters, leaving the words and numbers.
d = 'The girls, were%&^^7928*** so excited ##4$$that they .....wished $to have a party!.'
# Raw pattern so '\W' is passed to the regex engine untouched.
s = re.compile(r'\W+')
# The text ends with separators ('!.'), so the last list item is ''.
m = s.split(d)
print(m)

['The', 'girls', 'were', '7928', 'so', 'excited', '4', 'that', 'they', 'wished', 'to', 'have', 'a', 'party', '']


## sub() i.e substitute

In [121]:
# Replace the whole colour list with a short phrase.
d = 'He had red, blue, orange and Yellow cars.'
# Raw pattern so every '\s' is passed to the regex engine untouched. The two
# alternatives sit next to each other in the text, so '+' consumes them as a
# single match and only one replacement is inserted.
s = re.compile(r'(red,\sblue,\sorange\s|and\syellow)+', re.IGNORECASE)
s.sub('three different color', d)

'He had three different color cars.'

In [139]:
# Collapse runs of whitespace into a single space, then remove the space left
# at the beginning and at the end.
d = '    Reema    is a   sweet girl.    She   loves   music.  '
s = re.compile(r'\s+')     # any run of whitespace (raw string for '\s')
s2 = re.compile(r'^ | $')  # one space at the very start or the very end
# First reduce every run to one space, then strip the two edge spaces.
s2.sub("", s.sub(' ', d))

'Reema is a sweet girl. She loves music.'