In [1]:
# Regular expressions allow you to locate and change
# strings in very powerful ways.
# They work in almost exactly the same way in every
# programming language as well.

# Regular expressions (Regex) are used to
# 1. Search for a specific string in a large amount of data
# 2. Verify that a string has the proper format (Email, Phone #, etc)
# 3. Find a string and replace it with another string
# 4. Format data into a proper form for importing for example

# import the Regex module
import re

# ------------ Was a Match found -------------

# re.search() returns a match object for the first place the pattern
# occurs anywhere in the string, or None when it never occurs
if re.search("ape", 'The ape was at the apex') is not None:
    print("There is an ape")

There is an ape


In [3]:
import re
# ------------ Get All Matches --------------

# findall() returns a list holding every non-overlapping match
# . matches any single character (a space counts) except a newline
allApes = re.findall("ape.", "The ape was at the apex")

# Show each match on its own line
print("\n".join(allApes))

ape 
apex


In [5]:
# finditer() returns an iterator of match objects
# Call span() on a match object to get the location of the match

theStr = "The ape was at the apex"

for apeMatch in re.finditer("ape.", theStr):

    # span() returns a (start, end) tuple; end is one past the last
    # matched character, so the pair can be used directly in a slice
    start, end = apeMatch.span()

    print((start, end))

    # Slice the match out of the string using the two indexes
    print(theStr[start:end])

(4, 8)
ape 
(19, 23)
apex


In [6]:
import re
# ------------- Match 1 of Several Letters ---------------

# Square brackets match any single character listed between them.
# The match is case sensitive: [c] will not match C unless C is
# listed inside the brackets as well

animalStr = "Cat rat mat fat pat"

# "Cat" is skipped because only the lowercase c is listed
allAnimals = re.findall("[crmfp]at", animalStr)

print(*allAnimals, sep="\n")

print()

rat
mat
fat
pat



In [7]:
# A character class can also hold ranges of characters
# Ranges are case sensitive, so list both the lowercase and the
# uppercase range when both cases should match

animalStr = "Cat rat mat fat pat"

# c-m and C-M cover the C, m and f used here; r and p fall outside
someAnimals = re.findall("[c-mC-M]at", animalStr)

for animal in someAnimals:
    print(animal)

print()


Cat
mat
fat



In [8]:
import re
# ------------ Replace All matches ----------------

# Swap every match in a string for a replacement string

owlFood = "rat cat mat pat"

# re.compile() turns a regex into a pattern object, which offers the
# matching functions as methods and can be reused
regex = re.compile("[cr]at")

# sub() returns a copy of the string in which every match of the regex
# is replaced by the repl argument
owlFood = regex.sub(repl="owl", string=owlFood)

print(owlFood)

owl owl mat pat


In [9]:
import re
# ------------ Solving backslash Problems ---------------

# The regex engine uses the backslash to mark special characters and
# Python string literals do the same, so a backslash can be consumed
# twice before the pattern is applied.

# Goal: find \stuff (one backslash followed by stuff) in a string

randStr = "Here is \\stuff"

for pattern in (
    # Python turns this literal into \stuff, which the regex engine reads
    # as \s (a whitespace character) followed by tuff: nothing is found
    "\\stuff",
    # Four backslashes reach the regex engine as \\stuff, a literal
    # backslash followed by stuff. It works, but it is messy
    "\\\\stuff",
    # A raw string does not treat backslashes as special, so the
    # pattern can be written exactly as the regex engine should see it
    r"\\stuff",
):
    print("Find \\stuff :", re.search(pattern, randStr))

Find \stuff : None
Find \stuff : <re.Match object; span=(8, 14), match='\\stuff'>
Find \stuff : <re.Match object; span=(8, 14), match='\\stuff'>


In [10]:
import re
# ------------- Matching Any character ------------
# We saw that . matches any character, but what if we
# want to match a period? Put a backslash in front of the period.
# You do the same with [, ] and the other special characters


randStr = "F.B.I. I.R.S. CIA"

# The pattern is a raw string so that Python hands \. to the regex
# engine untouched. In a normal string \. is an invalid escape sequence,
# which newer Python versions warn about.
# Pattern: any character, a period, any character, a period, any character
acronymMatches = re.findall(r".\..\..", randStr)

print("Matches :", len(acronymMatches))
print("Matches :", acronymMatches)

Matches : 2
Matches : ['F.B.I', 'I.R.S']


In [12]:
import re
# ------------- Matching whitespaces --------------
# We can match many whitespace characters

randStr = """This is a long
string that goes
on for many lines"""

print(randStr)

# Replace every line break with a single space
# \r?\n matches a Unix newline (\n) as well as a Windows line ending
# (\r\n), so no stray carriage return is left behind in text that
# came from Windows
regex = re.compile(r"\r?\n")

randStr = regex.sub(" ", randStr)

print(randStr)

# You can also match
# [\b] : Backspace (outside of square brackets \b means word boundary)
# \f : Form feed
# \r : Carriage Return
# \t : tab
# \v : vertical tab


This is a long
string that goes
on for many lines
This is a long string that goes on for many lines


In [13]:
import re
# ----------- Matching any Single Number ------------
# \d can be used instead of [0-9]
# \D is the same as [^0-9]
# (on Unicode strings \d also matches decimal digits from other scripts
# unless the re.ASCII flag is used)

randStr = "12345"

# Raw string: in a normal string \d is an invalid escape sequence,
# which newer Python versions warn about
digitMatches = re.findall(r"\d", randStr)

print("Matches :", len(digitMatches))
print("Matches :", digitMatches)

Matches : 5
Matches : ['1', '2', '3', '4', '5']


In [15]:
import re
# ------------ Matching Multiple Numbers -------------
# You can match multiple digits by following the \d with {numOfValues}
# The patterns are raw strings so that Python leaves \d untouched

# Match exactly 5 digits and nothing else
# fullmatch() only succeeds when the whole string fits the pattern.
# search() would also accept longer strings such as "1234567" because
# they contain 5 digits somewhere
zipCodeRegex = re.compile(r"\d{5}")

if zipCodeRegex.fullmatch("12345"):
    print("It is a zip code")

# You can also match within a range
# Match values that are between 5 and 7 digits long
numStr = "123 12345 123456 1234567"

digitRunMatches = re.findall(r"\d{5,7}", numStr)

print("Matches :", len(digitRunMatches))
print("Matches :", digitRunMatches)

It is a zip code
Matches : 3
Matches : ['12345', '123456', '1234567']


In [17]:
import re
# ------------ Matching Any Single Letter or Number ---------------
# \w is the same as [a-zA-Z0-9_]
# \W is the same as [^a-zA-Z0-9_]
# (on Unicode strings \w also matches letters and digits from other
# alphabets unless the re.ASCII flag is used)
# The patterns are raw strings so that Python leaves \w untouched

phNum = "412-555-1212"

# Check if it is a phone number: 3, 3 and 4 word characters separated
# by dashes. fullmatch() rejects strings with anything extra before or
# after, which search() would let through.
# \w also accepts letters and the underscore; use \d in its place to
# accept digits only
phNumRegex = re.compile(r"\w{3}-\w{3}-\w{4}")

if phNumRegex.fullmatch(phNum):
    print("It is a phone number")

# Check for a valid first name between 2 and 20 characters
# fullmatch() enforces the upper limit; search() would accept any
# longer string because it contains a 2 to 20 character run
nameRegex = re.compile(r"\w{2,20}")

if nameRegex.fullmatch("Ultraman"):
    print("It is a name")

    

It is a phone number
It is a name


In [21]:
import re
# -------------- Matching Whitespace -------------
# \s is the same as [ \f\n\r\t\v] (note the space at the front)
# \S is the same as [^ \f\n\r\t\v]
# (on Unicode strings \s also matches other Unicode whitespace
# unless the re.ASCII flag is used)

# Check for a valid first and last name separated by one whitespace
# character. The pattern is a raw string so that Python leaves \w and
# \s untouched, and fullmatch() makes sure nothing else surrounds
# the two names
fullNameRegex = re.compile(r"\w{2,20}\s\w{2,20}")

if fullNameRegex.fullmatch("Toshiro Muratamatsu"):
    print("It is a valid full name")

It is a valid full name


In [22]:
import re
# --------------- Matching one or more ------------------
# + matches 1 or more repetitions of whatever comes right before it

# a+ matches every run of one or more consecutive a characters
# (it does not mean "a followed by other characters")
aMatches = re.findall("a+", "a as a ape bug")

print("Matches :", len(aMatches))
print("Matches :", aMatches)

Matches : 4
Matches : ['a', 'a', 'a', 'a']


In [24]:
import re
# Create a Regex that matches email addresses from a list
# 1. 1 to 20 lowercase and uppercase letters, numbers, plus ._%+-
# 2. An @ symbol
# 3. 2 to 20 lowercase and uppercase letters, numbers, plus .-
# 4. A period
# 5. 2 to 3 lowercase and uppercase letters

emailList = "bd@aol.com m@.com @apple.com bd@.com"

# Step 4 has to be written as \. : a bare . matches any character, so
# something like bd@aolXcom would be accepted as an address.
# The pattern is a raw string so that Python leaves \w and \. untouched,
# and it is compiled once so that it is not repeated for every call
emailRegex = re.compile(r"[\w._%+-]{1,20}@[\w.-]{2,20}\.[a-zA-Z]{2,3}")

emailMatches = emailRegex.findall(emailList)

print("Email matches :", len(emailMatches))
print("Email matches :", emailMatches)

Email matches : 1
Email matches : ['bd@aol.com']
