# Regular Expression
1. re.search()
2. re.match()
3. re.split()
4. Match.group()
5. Match.groups()
6. re.find() (note: Python's `re` module has no find(); re.search() is the closest equivalent)
7. re.findall()
8. re.finditer()
9. re.sub()

In [1]:
import re

In [2]:
# re.search scans the whole string and returns the first match, or None.
x = re.search(pattern="cat", string="A cat is an animal")
print(x)

<_sre.SRE_Match object; span=(2, 5), match='cat'>


In [3]:
# A character class [...] matches exactly one character out of the listed
# set, so this pattern accepts Mayer, Meyer, Maier and Meier.
line = "He is a German called Mayer."
if re.search(r"M[ae][iy]er", line) is not None:
    print("I found one!")

I found one!


In [4]:
# '^' anchors the pattern to the very start of the string.
s1 = "Mayer is a very common Name"
s2 = "He is called Mayer but he isn't German."
starts_with_name = re.compile(r"^M[ae][iy]er")
print(starts_with_name.search(s1))  # s1 begins with "Mayer", so this matches

print(starts_with_name.search(s2))  # "Mayer" sits mid-sentence, so this is None


<_sre.SRE_Match object; span=(0, 5), match='Mayer'>
None


In [5]:
# Put the sentence that starts with "He" first, so that "Mayer" only begins
# the second line of the combined text.
s = "\n".join((s2, s1))
print(s)
print()
# Without re.MULTILINE, '^' matches only at the very start of the string.
# The string starts with "He", so the search returns None.
print(re.search(r"^M[ae][iy]er", s))

He is called Mayer but he isn't German.
Mayer is a very common Name

None


In [6]:
# With re.MULTILINE (short alias: re.M) '^' also matches right after every
# newline, i.e. at the start of each line -- not at arbitrary positions
# inside a line.
print(re.search(r"^M[ae][iy]er", s, flags=re.MULTILINE))

print(re.search(r"^M[ae][iy]er", s, flags=re.M))

# re.match only ever tries position 0 of the string, even with re.M, so it
# cannot find "Mayer" at the start of the second line.
print(re.match(r"^M[ae][iy]er", s, flags=re.M))


<_sre.SRE_Match object; span=(40, 45), match='Mayer'>
<_sre.SRE_Match object; span=(40, 45), match='Mayer'>
None


In [7]:
# '$' (dollar sign) matches at the end of the string, or just before a
# newline that ends the string. With re.M it also matches before every
# newline, i.e. at the end of each line.
for text, flags in (
    ("I like Python.", 0),
    ("I like Python and Perl.", 0),
    ("I like Python.\nSome prefer Java or Perl.", 0),
    ("I like Python.\nSome prefer Java or Perl.", re.M),
):
    print(re.search(r"Python\.$", text, flags))


<_sre.SRE_Match object; span=(7, 14), match='Python.'>
None
None
<_sre.SRE_Match object; span=(7, 14), match='Python.'>


You might have realized that it can be quite cumbersome to construct certain character classes. A good example is the character class that describes a valid word character: all lowercase and uppercase letters plus all the digits and the underscore, corresponding to the following regular expression: r"[a-zA-Z0-9_]"

The special sequences consist of "\\" and a character from the following list:

\d	Matches any decimal digit; equivalent to the set [0-9].

\D	The complement of \d. It matches any non-digit character; equivalent to the set [^0-9].

\s	Matches any whitespace character; equivalent to [ \t\n\r\f\v].

\S	The complement of \s. It matches any non-whitespace character; equiv. to [^ \t\n\r\f\v].

\w	Matches any alphanumeric character; equivalent to [a-zA-Z0-9_]. With LOCALE, it will match the set [a-zA-Z0-9_] plus characters defined as letters for the current locale.

\W	Matches the complement of \w.

\b	Matches the empty string, but only at the start or end of a word.

\B	Matches the empty string, but not at the start or end of a word.

\\	Matches a literal backslash.

In [8]:
# Matching at the beginning versus anywhere
#
# search() looks for the pattern at every position of the string, while
# match() only tries position 0.
import re

s1 = "Mayer is a very common Name"
s2 = "He is called Meyer but he isn't German."

for finder in (re.search, re.match):
    for sentence in (s1, s2):
        print(finder(r"M[ae][iy]er", sentence))


<_sre.SRE_Match object; span=(0, 5), match='Mayer'>
<_sre.SRE_Match object; span=(13, 18), match='Meyer'>
<_sre.SRE_Match object; span=(0, 5), match='Mayer'>
None


In [9]:
# The '+' quantifier means "one or more", so [0-9]+ grabs the first complete
# run of digits in the text.
mo = re.search("[0-9]+", "Customer number: 232454, Date: February 12, 2011")

# Show the matched text and where it sits inside the string.
for label, value in (
    ("group: ", mo.group()),
    ("Span: ", mo.span()),
    ("Start: ", mo.start()),
    ("End: ", mo.end()),
):
    print(label, value)

# Notebook display value: the span spelled out as its (start, end) pair.
mo.start(), mo.end()


group:  232454
Span:  (17, 23)
Start:  17
End:  23


(17, 23)

In [10]:
t = "A fat cat doesn't eat oat but a rat eats bats."
# Collect every non-overlapping substring made of one of f, o, r, c, e
# followed by "at". 'eat' shows up twice: once as a word and once inside
# "eats"; "bats" is skipped because 'b' is not in the class.
mo = [hit.group() for hit in re.finditer("[force]at", t)]
print(mo)


['fat', 'cat', 'eat', 'oat', 'rat', 'eat']


In [11]:
# search() stops at the first hit, so only the leftmost match is reported.
mo = re.search("[force]at", t)
if mo is not None:
    print(mo.group())

fat


In [12]:
# Search and replace: every "yes" or "Yes" becomes "no".
st = "yes I said yes I will Yes."
yes_any_case = re.compile("[yY]es")
res = yes_any_case.sub("no", st)

print(res)


no I said no I will no.


In [13]:
import re

In [14]:
# Match.group(...) returns one or more captured sub-matches.
# Index 0 is the whole match; 1..3 are the parenthesised groups.
email = 'girrajjangid@hackerrank.com'
m = re.match(r'(\w+)@(\w+)\.(\w+)', email)
for index in range(4):
    print(m.group(index))
# Several indices at once give a tuple.
print(m.group(1, 2, 3))
# A capturing group in the split pattern keeps the separator in the result.
print(re.split('(@+)', 'asd@asd,asd'))
print()

girrajjangid@hackerrank.com
girrajjangid
hackerrank
com
('girrajjangid', 'hackerrank', 'com')
['asd', '@', 'asd,asd']



In [15]:
# Match.groups() returns every captured group as one tuple -- the same tuple
# you get by asking group() for indices 1, 2 and 3 explicitly.
for captured in (m.groups(), m.group(1, 2, 3)):
    print(captured)

('girrajjangid', 'hackerrank', 'com')
('girrajjangid', 'hackerrank', 'com')


In [16]:
# Match.groupdict() returns the named groups as a {name: matched_text} dict.
# The pattern is a raw string so that \w and \. reach the regex engine
# untouched; in a normal string literal they are invalid escape sequences
# and trigger a warning on current Python versions.
n = re.match(r'(?P<user>\w+)@(?P<website>\w+)\.(?P<extension>\w+)', 'myname@hackerrank.com')
print(n.groupdict())


{'user': 'myname', 'website': 'hackerrank', 'extension': 'com'}


In [17]:
# re.split() cuts the string at every match of the pattern; a separator at
# the very end yields a final empty string.
for separator, text in (
    ('-', 'asd-asd-asd-asd-asd-zxc-'),
    ('[- , .]', '100,000,000.000'),
):
    print(re.split(separator, text))

['asd', 'asd', 'asd', 'asd', 'asd', 'zxc', '']
['100', '000', '000', '000']


In [18]:
# Read one line and report whether it looks like a floating point number:
# optional sign, optional integer digits, a mandatory '.', then at least
# one fractional digit.
a = input()
looks_like_float = re.match(r'^[-+]?[0-9]*\.[0-9]+$', a) is not None
print(looks_like_float)

0
False



^ says start of the expression.

[-+]? says it can start with either - or +.

[0-9] says that any single digit from 0 to 9 can follow.

* says that whatever precedes it (in this case [0-9]) can repeat any number of times, including zero times.

An unescaped '.' is a placeholder for any character, so the pattern uses '\.' instead of '.'; the backslash '\' is the escape character, and it makes the dot match a literal '.'.

again[0-9] as explained earlier.

'+' says that whatever precedes it (in this case [0-9]) can repeat any number of times, but at least once.

$ anchors the match to the end of the string: whatever precedes it must come last.

In [19]:
# Print the first alphanumeric character that occurs at least twice in a
# row, or -1 when the input has no such repetition.
# ([a-zA-Z0-9]) captures one character and \1+ requires that same character
# to follow one or more times.
m = re.search(r'([a-zA-Z0-9])\1+', input().strip())
if m:
    print(m.group(1))
else:
    print(-1)
# Example: for 12312331255 the leftmost repeated run is "33", so the
# result is 3 (the later "55" is never reached).

0
-1


In [20]:
# re.findall() returns every non-overlapping match as a list of strings.
url = 'http://www.hackerrank.com/'
print(re.findall(r'[:./]', url))  # each ':', '.' or '/' character
print(re.findall(r'\w', url))  # each single word character, not whole words


[':', '/', '/', '.', '.', '/']
['h', 't', 't', 'p', 'w', 'w', 'w', 'h', 'a', 'c', 'k', 'e', 'r', 'r', 'a', 'n', 'k', 'c', 'o', 'm']


In [21]:
# re.finditer() yields Match objects lazily instead of building a list.
a = re.finditer(r'[@]', 'www.girraj@gmail.com@')
print(a)
# Notebook display value: the text of each match. Note that walking the
# iterator here uses it up.
[found.group() for found in a]


<callable_iterator object at 0x00000181ACAAC748>


['@', '@']

In [22]:
# Find runs of two or more vowels that sit directly between two consonants.
v = 'aeiou'
c = 'qwrtypsdfghjklzxcvbnm'
# (?<=[consonant]) is a lookbehind, so the leading consonant is required but
# not consumed; the captured group is the vowel run itself.
pattern = r'(?<=[{cons}])([{vow}]{{2,}})[{cons}]'.format(cons=c, vow=v)
m = re.findall(pattern, "rabcdeefgyYhFjkIoomnpOeorteeeeet", flags=re.I)
print(m)
# One run per line, or -1 when nothing matched.
if m:
    print('\n'.join(m))
else:
    print('-1')

['ee', 'Ioo', 'Oeo', 'eeeee']
ee
Ioo
Oeo
eeeee


In [23]:
# Match.start() and Match.end() give the indices of the match: start is the
# index of the first matched character, end is the index just past the last.
m = re.search(r'\d+', 'asd98762222')
print(*m.span(), sep='\n')

3
11


In [24]:
# findall() only reports non-overlapping matches: in "aaadaa" the first hit
# consumes indices 0-1, so the overlapping "aa" at index 1 is not found.
z = 'aa'
m = [hit.group(1) for hit in re.finditer(r'(aa)', 'aaadaa')]
print(m)

['aa', 'aa']


In [25]:
# Print the (start, end) index pair of every occurrence of k in s, including
# overlapping ones, or (-1, -1) when k does not occur.
s = "aaadaa"
k = "aa"
# (?=...) is a zero-width lookahead: it matches at each position where k
# begins without consuming it, which is what lets overlapping hits through.
# re.escape keeps k a literal substring even if it contains characters such
# as '+' or '.' that would otherwise be read as regex syntax.
occurrences = [
    (hit.start(), hit.start() + len(k) - 1)
    for hit in re.finditer(r'(?={})'.format(re.escape(k)), s)
]
if occurrences:
    print(*occurrences, sep='\n')
else:
    print('(-1, -1)')

(0, 1)
(1, 2)
(4, 5)


In [26]:
# Same overlapping search with the pattern written out literally. The width
# of each hit is taken from `k`, which must still hold "aa" from In [25].
a = re.finditer(r'(?=aa)', 'aaadaa')
width = len(k)
for i in a:
    begin = i.start()
    print((begin, begin + width - 1))


(0, 1)
(1, 2)
(4, 5)


# Basic patterns
1. a, X, 9, < -- ordinary characters just match themselves exactly. The meta-characters which do not match themselves because they have special meanings are: . ^ $ * + ? { [ ] \ | ( ) (details below)

2. .(dot)(a period) -- matches any single character except newline '\n'
3. \w -- (lowercase w) matches a "word" character: a letter or digit or underbar [a-zA-Z0-9_]. Note that although "word" is the mnemonic for this, it only matches a single word char, not a whole word. \W (upper case W) matches any non-word character.
4. \b -- boundary between word and non-word
5. \s -- (lowercase s) matches a single whitespace character -- space, newline, return, tab, form feed [ \n\r\t\f]. \S (upper case S) matches any non-whitespace character.
6. \t, \n, \r -- tab, newline, return
7. \d -- decimal digit [0-9] (some older regex utilities do not support \d, but they all support \w and \s)
8. ^ = start, $ = end -- match the start or end of the string
9. \ -- inhibit the "specialness" of a character. So, for example, use \. to match a period or \\ to match a backslash. If you are unsure whether a character has special meaning, such as '@', you can put a backslash in front of it, \@, to make sure it is treated just as a character.

also go through

https://www.guru99.com/python-regular-expressions-complete-tutorial.html

In [27]:
# [a-z]+ matches a run of one or more lowercase letters. The capital 'H' is
# skipped, so the first run found in "Hello, world" is "ello".
matchObj = re.search(r"([a-z]+)", "Hello, world")

# Building blocks of a Roman numeral pattern, one piece per decimal place.
thousand = "M{0,3}"
hundred = '(CM|CD|D?C{0,3})'
ten = "(XC|XL|L?X{0,3})"
# '$' must follow the group directly: a space in front of it would make the
# pattern accept only numerals that are followed by a space character.
one = "(IX|IV|V?I{0,3})$"
regex_pattern = r'%s%s%s%s' % (thousand, hundred, ten, one)
print(matchObj.groups())

('ello',)


In [28]:
# Check mobile number
#
# A valid number is exactly 10 digits long and starts with 7, 8 or 9.
# re.match anchors the pattern at position 0 and the trailing '$' anchors it
# at the end, so the whole line has to be digits. Checking only the length
# and the first character would accept input such as "9abcdefghi".
mobile_pattern = r'[789]\d{9}$'

# Batch variant: the first line holds the count, then one number per line.
n = int(input())
for i in range(n):
    no = input()
    if re.match(mobile_pattern, no):
        print("YES")
    else:
        print("NO")

# Single-number variant: validate one more line with the same rule.
if re.match(mobile_pattern, input()):
    print ('YES')
else:
    print ('NO')

0
0
NO


## Email Utils

In [31]:
import email.utils as e

# parseaddr splits a "Name <address>" string into a (name, address) tuple;
# formataddr does the reverse.
parsed = e.parseaddr('GIRRAJ <girraj@gmail.com>')
print(parsed)
print(e.formataddr(('GIRRAJ', 'girraj@gmail.com')))
# `email` here is the address string assigned in In [14], not the stdlib
# package: `import email.utils as e` binds only the name `e`.
re.match(r'(\w+)@(\w+)\.(\w+)', email)

('GIRRAJ', 'girraj@gmail.com')
GIRRAJ <girraj@gmail.com>


<_sre.SRE_Match object; span=(0, 27), match='girrajjangid@hackerrank.com'>

In [35]:
# We want to remove the comments from this markup.
html = """
<head>
<title>HTML</title>
</head>
<object type="application/x-flash" 
  data="your-file.swf" 
  width="0" height="0">
  <!-- <param name="movie"  value="your-file.swf" /> -->
  <param name="quality" value="high"/>
</object>
"""
# The lazy .*? stops at the first closing "-->", so each comment is removed
# on its own. re.DOTALL lets '.' cross newlines; without it a comment that
# spans several lines would be left in place.
comment_pattern = re.compile(r"<!--.*?-->", re.DOTALL)
# sub() returns a new string; `html` itself is not modified.
cleaned = comment_pattern.sub("", html)
print(cleaned)



<head>
<title>HTML</title>
</head>
<object type="application/x-flash" 
  data="your-file.swf" 
  width="0" height="0">
  
  <param name="quality" value="high"/>
</object>



In [36]:
text = """
a = 1;
b = input();

if a + b > 0 && a - b < 0:
    start()
elif a*b > 10 || a/b < 1:
    stop()
print set(list(a)) | set(list(b)) 
#Note do not change &&& or ||| or & or |
#Only change those '&&' which have space on both sides.
#Only change those '|| which have space on both sides."""

# Swap the C-style logical operators for Python keywords, but only where a
# space sits directly on both sides. The lookbehind and lookahead check those
# spaces without consuming them.
keyword_for = {'&&': 'and', '||': 'or'}
print(re.sub(r'(?<= )(&&|\|\|)(?= )', lambda found: keyword_for[found.group()], text))


a = 1;
b = input();

if a + b > 0 and a - b < 0:
    start()
elif a*b > 10 or a/b < 1:
    stop()
print set(list(a)) | set(list(b)) 
#Note do not change &&& or ||| or & or |
#Only change those '&&' which have space on both sides.
#Only change those '|| which have space on both sides.
