A RegEx, or Regular Expression, is a sequence of characters that forms a search pattern.

RegEx can be used to check if a string contains the specified search pattern.

Python has a built-in package called re, which can be used to work with Regular Expressions.



# Example
Search the string to see if it starts with "The" and ends with "Spain":

In [2]:
import re

#Anchor the pattern at both ends: "The" must open the string and "Spain" must close it.
txt = "The rain in Spain"
x = re.search("^The.*Spain$", txt)

#A successful search yields a truthy Match object; a failed one yields None.
print("YES! We have a match!" if x else "No match")


YES! We have a match!


In [1]:
import re

txt = "The rain in Spain"

#Collect each lower case letter that falls in the range "a" through "m":
x = re.compile("[a-m]").findall(txt)
print(x)


['h', 'e', 'a', 'i', 'i', 'a', 'i']


In [2]:
import re

txt = "TAe rain in Spain"

#Find all upper case characters alphabetically between "A" and "M":
#("T" is outside that range, so only "A" is returned.)

x = re.findall("[A-M]", txt)
print(x)


['A']


In [6]:
import re

txt = "That will be 567 dollars"

#Find all digit characters:
#(raw string so that "\d" reaches the regex engine without an invalid-escape warning)

x = re.findall(r"\d", txt)
print(x)

['5', '6', '7']


In [9]:
import re

txt = "helloworld"

#Search for a sequence that starts with "he", followed by four (any) characters, and then "or":

x = re.findall("he....or", txt)
print(x)

['hellowor']


In [12]:
import re

txt = "hello world"

#"^" pins the match to the very beginning of the string:
x = re.compile("^hello").findall(txt)

#An empty list is falsy, so it selects the "No match" message.
print("Yes, the string starts with 'hello'" if x else "No match")

Yes, the string starts with 'hello'


In [14]:
import re

txt = "hello world"

#"$" pins the match to the very end of the string:
x = re.compile("world$").findall(txt)

#An empty list is falsy, so it selects the "No match" message.
print("Yes, the string ends with 'world'" if x else "No match")

Yes, the string ends with 'world'


In [6]:
import re

txt = "The rain in Spain falls mainly in the plain!"

#"x*" allows zero or more "x" after "ai", so a plain "ai" already counts:
x = re.compile("aix*").findall(txt)
print(x)

#Report whether the list of matches is non-empty.
print("Yes, there is at least one match!" if x else "No match")

['ai', 'ai', 'ai', 'ai']
Yes, there is at least one match!


In [15]:
import re

txt = "The raan in Spaan falls maanly in the plain!"

#Check if the string contains "aa" followed by 0 or more "x" characters:
#("plain" contains "ai", not "aa", so it does not contribute a match.)

x = re.findall("aax*", txt)

print(x)

if x:
  print("Yes, there is at least one match!")
else:
  print("No match")

['aa', 'aa', 'aa']
Yes, there is at least one match!


In [7]:
import re

txt = "The rain in Spain falls mainly in the plain!"

#"x+" demands at least one "x" right after "ai":
x = re.compile("aix+").findall(txt)
print(x)

#Report whether the list of matches is non-empty.
print("Yes, there is at least one match!" if x else "No match")

[]
No match


In [19]:
txt = "The rain in Spain falls mainly in the plain!"

#"l{2}" requires exactly two consecutive "l" characters after the "a":
x = re.compile("al{2}").findall(txt)
print(x)

#Report whether the list of matches is non-empty.
print("Yes, there is at least one match!" if x else "No match")

['all']
Yes, there is at least one match!


In [20]:
import re

txt = "The rain in Spain falls mainly in the plain!"

#Check if the string contains either "falls" or "plain":

x = re.findall("falls|plain", txt)

print(x)

if x:
  print("Yes, there is at least one match!")
else:
  print("No match")

['falls', 'plain']
Yes, there is at least one match!


In [26]:
import re

txt = "The rain in Spain"

#Check if the string starts with "The":
#(raw string so that "\A" reaches the regex engine without an invalid-escape warning)

x = re.findall(r"\AThe", txt)

print(x)

if x:
  print("Yes, there is a match!")
else:
  print("No match")

['The']
Yes, there is a match!


In [27]:
import re

txt = "The rain in Spain"

#Check if "n" is present at the end of a WORD:

x = re.findall(r"n\b", txt)

print(x)

if x:
  print("Yes, there is at least one match!")
else:
  print("No match")

['n', 'n', 'n']
Yes, there is at least one match!


In [28]:
import re

txt = "The rain in Spain"

#Find all white-space characters in the string:
#(raw string so that "\s" reaches the regex engine without an invalid-escape warning)

x = re.findall(r"\s", txt)

print(x)

if x:
  print("Yes, there is at least one match!")
else:
  print("No match")

[' ', ' ', ' ']
Yes, there is at least one match!


In [29]:
import re

txt = "The rain in Spain 231 "

#Find all word characters; here that is every letter and digit, while the spaces are skipped:
#(raw string so that "\w" reaches the regex engine without an invalid-escape warning)

x = re.findall(r"\w", txt)

print(x)

if x:
  print("Yes, there is at least one match!")
else:
  print("No match")

['T', 'h', 'e', 'r', 'a', 'i', 'n', 'i', 'n', 'S', 'p', 'a', 'i', 'n', '2', '3', '1']
Yes, there is at least one match!


In [30]:
import re

txt = "The rain in Spain 231 ! $ "

#Find all NON-word characters; here that is the spaces, "!" and "$":
#(raw string so that "\W" reaches the regex engine without an invalid-escape warning)

x = re.findall(r"\W", txt)

print(x)

if x:
  print("Yes, there is at least one match!")
else:
  print("No match")

[' ', ' ', ' ', ' ', ' ', '!', ' ', '$', ' ']
Yes, there is at least one match!


# The findall() Function
The findall() function returns a list containing all matches.

In [2]:
import re

#Gather every occurrence of "ai" into a list:
txt = "The rain in Spain"
x = re.compile("ai").findall(txt)
print(x)

['ai', 'ai']


# The list contains the matches in the order they are found.

If no matches are found, an empty list is returned:

In [3]:
import re

#"Portugal" does not occur in the text, so the resulting list is empty:
txt = "The rain in Spain"
x = re.compile("Portugal").findall(txt)
print(x)

[]


# The search() Function
The search() function searches the string for a match, and returns a Match object if there is a match.

If there is more than one match, only the first occurrence of the match will be returned:

In [35]:
import re

#Locate the first white-space character; start() gives the index where the match begins.
#(raw string so that "\s" reaches the regex engine without an invalid-escape warning)
txt = "The rain in Spain"
x = re.search(r"\s", txt)

print("The first white-space character is located in position:", x.start())


The first white-space character is located in position: 3


# If no matches are found, the value None is returned:

In [5]:
import re

#"Portugal" does not occur in the text, so the search result is None:
txt = "The rain in Spain"
x = re.compile("Portugal").search(txt)
print(x)

None


In [31]:

import re

#Print the Match object itself to show the span and the matched text.
#(raw string so that "\s" reaches the regex engine without an invalid-escape warning)
txt = "The rain in Spain"
x = re.search(r"\s", txt)
print(x)

<re.Match object; span=(3, 4), match=' '>


# The split() Function
The split() function returns a list where the string has been split at each match:

In [6]:
import re

#Split the string at every white-space character:
#(raw string so that "\s" reaches the regex engine without an invalid-escape warning)
txt = "The rain in Spain"
x = re.split(r"\s", txt)
print(x)

['The', 'rain', 'in', 'Spain']


In [32]:
import re
#Split the string only at the first two occurrences; the rest of the string stays in the last item:
#(maxsplit is passed by keyword; passing it positionally is deprecated in newer Python versions)
txt = "The rain in Spain"
x = re.split(r"\s", txt, maxsplit=2)
print(x)

['The', 'rain', 'in Spain']


# The sub() Function
The sub() function replaces the matches with the text of your choice:

In [8]:
import re

#Replace all white-space characters with the digit "9":
#(raw string so that "\s" reaches the regex engine without an invalid-escape warning)

txt = "The rain in Spain"
x = re.sub(r"\s", "9", txt)
print(x)

The9rain9in9Spain


In [9]:
import re

#Replace only the first two white-space characters with the digit "9":
#(count is passed by keyword; passing it positionally is deprecated in newer Python versions)

txt = "The rain in Spain"
x = re.sub(r"\s", "9", txt, count=2)
print(x)

The9rain9in Spain


# Match Object
A Match Object is an object containing information about the search and the result.

Note: If there is no match, the value None will be returned, instead of the Match Object.

In [10]:
import re

#Search for the first "ai" and show the resulting Match object:
txt = "The rain in Spain"
x = re.compile("ai").search(txt)
print(x) #prints the Match object's representation, not just the matched text

<re.Match object; span=(5, 7), match='ai'>


# The Match object has properties and methods used to retrieve information about the search, and the result:

.span() returns a tuple containing the start and end positions of the match.
.string returns the string passed into the function
.group() returns the part of the string where there was a match


In [11]:
import re

#Find the first word beginning with an upper case "S" and print where it starts and ends:
txt = "The rain in Spain"
x = re.compile(r"\bS\w+").search(txt)
print(x.span())

(12, 17)


In [12]:
import re

#Find the first word beginning with an upper case "S" and print the string that was searched:
txt = "The rain in Spain"
x = re.compile(r"\bS\w+").search(txt)
print(x.string)

The rain in Spain


In [13]:
import re

#The regular expression looks for any word that starts with an upper case "S";
#group() then gives the part of the string that matched.
txt = "The rain in Spain"
x = re.compile(r"\bS\w+").search(txt)
print(x.group())


Spain
