A RegEx, or Regular Expression, is a sequence of characters that forms a search pattern.

RegEx can be used to check if a string contains the specified search pattern.

regex tester link
https://regex101.com/

# **RegEx Module**

In [1]:
import re

# **The findall() Function**

Returns a list containing all matches

In [2]:
import re

# Collect every non-overlapping occurrence of the literal "ai".
txt = "The rain in Spain"
x = re.compile("ai").findall(txt)
print(x)

['ai', 'ai']


# **The search() Function**
The search() function searches the string for a match, and returns a Match object if there is a match.

If there is more than one match, only the first occurrence of the match will be returned.

In [3]:
import re

txt = "The rain in Spain"
# \s matches a single white-space character. The pattern is a raw string so
# the backslash reaches the regex engine instead of being treated as an
# (invalid) string escape.
x = re.search(r"\s", txt)

print(x)
# x.start() is the index in txt where the match begins.
print("The first white-space character is located in position:", x.start())

<re.Match object; span=(3, 4), match=' '>
The first white-space character is located in position: 3


In [4]:
import re

# search() yields None when the pattern occurs nowhere in the string.
txt = "The rain in Spain"
x = re.compile("Portugal").search(txt)
print(x)

None


# **The split() Function**
The split() function returns a list where the string has been split at each match

In [5]:
import re

txt = "The rain in Spain"
# Split at every white-space character (raw string keeps "\s" intact).
x = re.split(r"\s", txt)
print(x)

['The', 'rain', 'in', 'Spain']


In [6]:
import re

txt = "The rain in Spain"
# Split the string only at the first occurrence. maxsplit is passed by
# keyword because passing it positionally is deprecated in newer Pythons.
x = re.split(r"\s", txt, maxsplit=1)
print(x)

['The', 'rain in Spain']


# **The sub() Function**
The sub() function replaces the matches with the text of your choice

In [7]:
import re

txt = "The rain in Spain"
# Replace every white-space character with "9" (raw string keeps "\s" intact).
x = re.sub(r"\s", "9", txt)
print(x)

The9rain9in9Spain


In [8]:
import re

txt = "The rain in Spain"
# Replace only the first 2 occurrences. count is passed by keyword because
# passing it positionally is deprecated in newer Pythons.
x = re.sub(r"\s", "9", txt, count=2)
print(x)

The9rain9in Spain


# **Metacharacters**

**1. []	A set of characters**

In [9]:
import re

txt = "The rain in Spain"

# Gather each lower-case letter that falls alphabetically within "a".."m".
x = re.compile("[a-m]").findall(txt)
print(x)

['h', 'e', 'a', 'i', 'i', 'a', 'i']


2. \	Signals a special sequence (can also be used to escape special characters)

\d	Returns a match where the string contains digits (numbers from 0-9)

In [10]:
import re

txt = "That will be 59 dollars"

# Find all digit characters. The pattern is a raw string so "\d" is handed
# to the regex engine unchanged.
x = re.findall(r"\d", txt)
print(x)

['5', '9']


3.  .	Any character (except newline character)

In [11]:
import re

txt = "hello planet"

# "he", then exactly two arbitrary characters, then "o".
x = re.compile("he..o").findall(txt)
print(x)

['hello']


4. ^	Starts with

In [12]:
import re

txt = "hello planet"

# "^" anchors the pattern to the start of the string, so a non-empty result
# means the string begins with 'hello'.
x = re.compile("^hello").findall(txt)
print("Yes, the string starts with 'hello'" if x else "No match")

Yes, the string starts with 'hello'


5. $	Ends with

In [13]:
import re

txt = "hello planet"

# "$" anchors the pattern to the end of the string, so a non-empty result
# means the string finishes with 'planet'.
x = re.compile("planet$").findall(txt)
print("Yes, the string ends with 'planet'" if x else "No match")


Yes, the string ends with 'planet'


6.   '*'	Zero or more occurrences

In [14]:
import re

txt = "hello planet"

# "he", then zero or more arbitrary characters, then "o".
x = re.compile("he.*o").findall(txt)
print(x)

['hello']


7. '+'	One or more occurrences

In [15]:
import re

txt = "hello planet"

# "he", then one or more arbitrary characters, then "o".
x = re.compile("he.+o").findall(txt)
print(x)

['hello']


8. ?	Zero or one occurrences

In [16]:
import re

txt = "hello planet"

# "he", then at most one arbitrary character, then "o".
x = re.compile("he.?o").findall(txt)
print(x)

# The result is empty: "hello" has two characters between "he" and "o",
# which is more than the zero-or-one that ".?" allows.


[]


9. {}	Exactly the specified number of occurrences

In [17]:
import re

txt = "hello planet"

# "he", followed by exactly 2 arbitrary characters, then "o".
x = re.compile("he.{2}o").findall(txt)
print(x)

['hello']


10. |	Either or

In [18]:
import re

txt = "The rain in Spain falls mainly in the plain!"

# "|" is alternation: report every occurrence of "falls" or of "stays".
x = re.compile("falls|stays").findall(txt)
print(x)

# An empty list is falsy, so this reports whether anything was found.
print("Yes, there is at least one match!" if x else "No match")


['falls']
Yes, there is at least one match!


11. \A	Returns a match if the specified characters are at the beginning of the string

In [19]:
import re

txt = "The rain in Spain"

# Check if the string starts with "The". "\A" matches only at the start of
# the string; the raw string keeps the backslash for the regex engine.
x = re.findall(r"\AThe", txt)

print(x)

if x:
  print("Yes, there is a match!")
else:
  print("No match")


['The']
Yes, there is a match!


12. \b	Returns a match where the specified characters are at the beginning or at the end of a word
(the "r" in the beginning is making sure that the string is being treated as a "raw string")

In [20]:
import re

txt = "The rain in Spain"

# r"\b" is a word boundary, so this looks for "ain" at the beginning of a WORD.
x = re.compile(r"\bain").findall(txt)
print(x)

# An empty list is falsy, so this reports whether anything was found.
print("Yes, there is at least one match!" if x else "No match")


[]
No match


13. \B	Returns a match where the specified characters are present, but NOT at the beginning (or at the end) of a word
(the "r" in the beginning is making sure that the string is being treated as a "raw string")

In [21]:
import re

txt = "The rain in Spain"

# r"\B" is a non-boundary, so this finds "ain" only when it does NOT start a word.
x = re.compile(r"\Bain").findall(txt)
print(x)

# An empty list is falsy, so this reports whether anything was found.
print("Yes, there is at least one match!" if x else "No match")


['ain', 'ain']
Yes, there is at least one match!


14. \D	Returns a match for every character that is NOT a digit

In [22]:
import re

txt = "The rain in Spain"

# Return a match at every non-digit character (raw string keeps "\D" intact).
x = re.findall(r"\D", txt)

print(x)

if x:
  print("Yes, there is at least one match!")
else:
  print("No match")

['T', 'h', 'e', ' ', 'r', 'a', 'i', 'n', ' ', 'i', 'n', ' ', 'S', 'p', 'a', 'i', 'n']
Yes, there is at least one match!


15. \S	Returns a match for every character that is NOT a white space character

In [23]:
import re

txt = "The rain in Spain"

# Return a match at every NON white-space character (raw string keeps "\S"
# intact).
x = re.findall(r"\S", txt)

print(x)

if x:
  print("Yes, there is at least one match!")
else:
  print("No match")


['T', 'h', 'e', 'r', 'a', 'i', 'n', 'i', 'n', 'S', 'p', 'a', 'i', 'n']
Yes, there is at least one match!


16. \w	Returns a match where the string contains any word characters (letters from a to z and A to Z, digits from 0-9, and the underscore _ character)

In [24]:
import re

txt = "The rain in Spain"

# Return a match at every word character (letters, digits, and the
# underscore _ character). Raw string keeps "\w" intact.
x = re.findall(r"\w", txt)

print(x)

if x:
  print("Yes, there is at least one match!")
else:
  print("No match")


['T', 'h', 'e', 'r', 'a', 'i', 'n', 'i', 'n', 'S', 'p', 'a', 'i', 'n']
Yes, there is at least one match!


**17. \W	Returns a match for every character that is NOT a word character**

In [25]:
import re

txt = "The rain in Spain !"

# Return a match at every NON word character (anything that is not a letter,
# digit or underscore, e.g. "!", "?" or white-space). Raw string keeps "\W"
# intact.
x = re.findall(r"\W", txt)

print(x)

if x:
  print("Yes, there is at least one match!")
else:
  print("No match")


[' ', ' ', ' ', ' ', '!']
Yes, there is at least one match!


**18. \Z	Returns a match if the specified characters are at the end of the string**

In [26]:
import re

txt = "The rain in Spain"

# Check if the string ends with "Spain". "\Z" matches only at the end of the
# string; the raw string keeps the backslash for the regex engine.
x = re.findall(r"Spain\Z", txt)

print(x)

if x:
  print("Yes, there is a match!")
else:
  print("No match")


['Spain']
Yes, there is a match!


# **Sets**

**1. [arn]	Returns a match where one of the specified characters (a, r, or n) is present**

In [27]:
import re

txt = "The rain in Spain"

# Pick out every single character that is an "a", an "r" or an "n".
x = re.compile("[arn]").findall(txt)
print(x)

# An empty list is falsy, so this reports whether anything was found.
print("Yes, there is at least one match!" if x else "No match")


['r', 'a', 'n', 'n', 'a', 'n']
Yes, there is at least one match!


**2. [a-n]	Returns a match for any lower case character, alphabetically between a and n**

In [28]:
import re

txt = "The rain in Spain"

# Pick out every lower-case character in the range "a" through "n".
x = re.compile("[a-n]").findall(txt)
print(x)

# An empty list is falsy, so this reports whether anything was found.
print("Yes, there is at least one match!" if x else "No match")

['h', 'e', 'a', 'i', 'n', 'i', 'n', 'a', 'i', 'n']
Yes, there is at least one match!


**3. [^arn]	Returns a match for any character EXCEPT a, r, and n**

In [29]:
import re

txt = "The rain in Spain"

# A leading "^" negates the set: pick out every character other than a, r, n.
x = re.compile("[^arn]").findall(txt)
print(x)

# An empty list is falsy, so this reports whether anything was found.
print("Yes, there is at least one match!" if x else "No match")

['T', 'h', 'e', ' ', 'i', ' ', 'i', ' ', 'S', 'p', 'i']
Yes, there is at least one match!


**4. [0123]	Returns a match where any of the specified digits (0, 1, 2, or 3) are present**

In [30]:
import re

txt = "The rain in Spain"

# Pick out every character that is one of the digits 0, 1, 2 or 3.
x = re.compile("[0123]").findall(txt)
print(x)

# An empty list is falsy, so this reports whether anything was found.
print("Yes, there is at least one match!" if x else "No match")

[]
No match


**5. [0-9]	Returns a match for any digit between 0 and 9**

In [31]:
import re

txt = "8 times before 11:45 AM"

# Pick out every single digit character in the string.
x = re.compile("[0-9]").findall(txt)
print(x)

# An empty list is falsy, so this reports whether anything was found.
print("Yes, there is at least one match!" if x else "No match")

['8', '1', '1', '4', '5']
Yes, there is at least one match!


**6. [0-5][0-9]	Returns a match for any two-digit numbers from 00 to 59**

In [32]:
import re

txt = "8 times before 11:45 AM"

# Two adjacent sets: a digit 0-5 followed by a digit 0-9, i.e. 00 to 59.
x = re.compile("[0-5][0-9]").findall(txt)
print(x)

# An empty list is falsy, so this reports whether anything was found.
print("Yes, there is at least one match!" if x else "No match")

['11', '45']
Yes, there is at least one match!


**7. [a-zA-Z]	Returns a match for any character alphabetically between a and z, lower case OR upper case**

In [33]:
import re

txt = "8 times before 11:45 AM"

# Pick out every character in a-z (lower case) or A-Z (upper case).
x = re.compile("[a-zA-Z]").findall(txt)
print(x)

# An empty list is falsy, so this reports whether anything was found.
print("Yes, there is at least one match!" if x else "No match")


['t', 'i', 'm', 'e', 's', 'b', 'e', 'f', 'o', 'r', 'e', 'A', 'M']
Yes, there is at least one match!


# **Examples**

**1. Extract phone numbers**

In [34]:
text='''
Elon musk's phone number is 9991116666, call him if you have any questions on dodgecoin. Tesla's revenue is 40 billion
Tesla's CFO number (999)-333-7777
'''
# Two accepted formats: "(ddd)-ddd-dddd" or ten consecutive digits.
# A raw string keeps "\(" and "\d" as regex escapes rather than (invalid)
# string escapes.
pattern = r'\(\d{3}\)-\d{3}-\d{4}|\d{10}'

matches = re.findall(pattern, text)
print(matches)

['9991116666', '(999)-333-7777']


**2. Extract Note Titles**

In [41]:
text = '''
Note 1 - Overview
Tesla, Inc. (“Tesla”, the “Company”, “we”, “us” or “our”) was incorporated in the State of Delaware on July 1, 2003. We design, develop, manufacture and sell high-performance fully electric vehicles and design, manufacture, install and sell solar energy generation and energy storage
products. Our Chief Executive Officer, as the chief operating decision maker (“CODM”), organizes our company, manages resource allocations and measures performance among two operating and reportable segments: (i) automotive and (ii) energy generation and storage.
Beginning in the first quarter of 2021, there has been a trend in many parts of the world of increasing availability and administration of vaccines
against COVID-19, as well as an easing of restrictions on social, business, travel and government activities and functions. On the other hand, infection
rates and regulations continue to fluctuate in various regions and there are ongoing global impacts resulting from the pandemic, including challenges
and increases in costs for logistics and supply chains, such as increased port congestion, intermittent supplier delays and a shortfall of semiconductor
supply. We have also previously been affected by temporary manufacturing closures, employment and compensation adjustments and impediments to
administrative activities supporting our product deliveries and deployments.
Note 2 - Summary of Significant Accounting Policies
Unaudited Interim Financial Statements
The consolidated balance sheet as of September 30, 2021, the consolidated statements of operations, the consolidated statements of
comprehensive income, the consolidated statements of redeemable noncontrolling interests and equity for the three and nine months ended September
30, 2021 and 2020 and the consolidated statements of cash flows for the nine months ended September 30, 2021 and 2020, as well as other information
disclosed in the accompanying notes, are unaudited. The consolidated balance sheet as of December 31, 2020 was derived from the audited
consolidated financial statements as of that date. The interim consolidated financial statements and the accompanying notes should be read in
conjunction with the annual consolidated financial statements and the accompanying notes contained in our Annual Report on Form 10-K for the year
ended December 31, 2020.
'''


# "Note <digit> - " followed by a captured group holding the rest of the line
# (everything up to the next newline). With a single group, findall() returns
# just the captured titles. Raw string keeps "\d" and "\n" as regex escapes.
pattern = r'Note \d - ([^\n]*)'
matches = re.findall(pattern, text)
print(matches)

['Overview', 'Summary of Significant Accounting Policies']


**3. Extract financial periods from a company's financial reporting**

In [42]:
text = '''
The gross cost of operating lease vehicles in FY2021 Q1 was $4.85 billion.
In previous quarter i.e. FY2020 Q4 it was $3 billion.
'''
# "FY", a four-digit year, a space, then "Q" and a quarter number 1-4.
# Raw string keeps "\d" as a regex escape.
pattern = r"FY\d{4} Q[1-4]"
matches = re.findall(pattern, text)
print(matches)

['FY2021 Q1', 'FY2020 Q4']


**4. Case insensitive pattern match using flags**

In [45]:
text = '''
The gross cost of operating lease vehicles in FY2021 Q1 was $4.85 billion.
In previous quarter i.e. fy2020 Q4 it was $3 billion.
'''
# Same period pattern as a raw string; re.IGNORECASE lets "fy2020" match too.
pattern = r"FY\d{4} Q[1-4]"
matches = re.findall(pattern, text, flags=re.IGNORECASE)
print(matches)

['FY2021 Q1', 'fy2020 Q4']


**5. Extract only financial numbers**

In [54]:
text = '''
Tesla's gross cost of operating lease vehicles in FY2021 Q1 was $4.85 billion.
In previous quarter i.e. FY2020 Q4 it was $3 billion.
'''
# A literal "$" followed by a captured run of digits and dots. With a single
# group, findall() returns only the captured amount, without the "$".
# Raw string keeps "\$" and "\." as regex escapes.
pattern = r"\$([0-9\.]+)"
matches = re.findall(pattern, text)
print(matches)

['4.85', '3']


**6. Extract periods and financial numbers both**

In [56]:
text = '''
Tesla's gross cost of operating lease vehicles in FY2021 Q1 was $4.85 billion.
In previous quarter i.e. FY2020 Q4 it was $3 billion.
'''
# Group 1: the period after "FY" (year and quarter). Then skip everything that
# is not a "$", match the "$", and capture the amount as group 2. With two
# groups, findall() returns a list of (period, amount) tuples.
# Raw string keeps "\d", "\$" and "\." as regex escapes.
pattern = r"FY(\d{4} Q[1-4])[^\$]+\$([0-9\.]+)"
matches = re.findall(pattern, text)
print(matches)

[('2021 Q1', '4.85'), ('2020 Q4', '3')]


In [35]:
# https://github.com/codebasics/py/blob/master/Advanced/regex/regex_tutorial_python.ipynb

In [36]:
# https://www.youtube.com/watch?v=sHw5hLYFaIw