In [None]:
# Regular Expression(regex)
# A regular expression (regex) is a special sequence of characters that defines a search pattern used to find a string or set of strings.
# It can detect the presence or absence of text by matching it against a particular
# pattern, and it can also split a pattern into one or more sub-patterns.
# Python has a built-in module named "re" that is used for regular expressions.
# Try out patterns interactively at https://regex101.com/

In [None]:
# Common regex functions in Python

#  Function                               Description
# re.match()        Checks for a match only at the beginning of the string.
# re.search()       Searches the entire string for the first match of the pattern.
# re.findall()      Returns a list of all non-overlapping matches in the string.
# re.split()        Splits the string at each match of the pattern and returns a list.
# re.sub()          Replaces all matches with a specified string and returns the new string.


In [None]:
# Regex special sequences
# A special sequence consists of "\" and some selected characters. every special sequence has a unique meaning.

| Character | Description                                                                                                                               |
| --------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
| `\A`      | Returns a match if the specified characters are at the beginning of the string                                                            |
| `\b`      | Returns a match where the specified characters are at the beginning or at the end of a word                                               |
| `\B`      | Returns a match where the specified characters are present, but NOT at the beginning (or at the end) of a word                            |
| `\d`      | Returns a match where the string contains digits (numbers from 0–9)                                                                       |
| `\D`      | Returns a match where the string DOES NOT contain digits                                                                                  |
| `\s`      | Returns a match where the string contains a white space character                                                                         |
| `\S`      | Returns a match where the string DOES NOT contain a white space character                                                                 |
| `\w`      | Returns a match where the string contains any word characters (letters a–z and A–Z, digits 0–9, and the underscore `_` character; for Unicode `str` patterns, letters and digits from other alphabets as well) |
| `\W`      | Returns a match where the string DOES NOT contain any word characters                                                                     |
| `\Z`      | Returns a match if the specified characters are at the end of the string                                                                  |


    

In [None]:
Regex Metacharacters
The special characters used in RegEx are known as metacharacters.

For example, characters like '|', '+', or '*'.

| Character | Description                                                                |
| --------- | -------------------------------------------------------------------------- |
| `[]`      | A set of characters                                                        |
| `\`       | Signals a special sequence (can also be used to escape special characters) |
| `.`       | Any character (except newline character)                                   |
| `^`       | Starts with                                                                |
| `$`       | Ends with                                                                  |
| `*`       | Zero or more occurrences                                                   |
| `+`       | One or more occurrences                                                    |
| `{}`      | Exactly the specified number of occurrences                                |
| `\|`      | Either or                                                                  |

# Regex Metacharacters The special characters used in RegEx are known as metacharacters.

For example, characters like '|', '+', or '*'.


In [3]:
# Importing regular expression re module
import re


In [5]:
# re.search()
# Scans the whole string and returns a re.Match object for the first location where the pattern matches, or None if the pattern does not match anywhere. Because it stops after the first match, it is better suited to testing whether a pattern occurs than to extracting every occurrence.

# match.start() = index of the first character in the match

# match.end() = index just after the last character in the match

import re

# Group 1 captures a run of letters (the month), group 2 a run of digits (the day).
regex = r"([a-zA-Z]+) (\d+)"

match = re.search(regex, "I was born on June 24")
# None is a singleton, so test identity with `is not` rather than `!=`.
if match is not None:
    print(f"Match at index {match.start()}, {match.end()}")
    print(f"Full match: {match.group(0)}")  # group(0) is the entire match
    print(f"Month: {match.group(1)}")
    print(f"Day: {match.group(2)}")
else:
    print("The regex pattern does not match.")


Match at index 14, 21
Full match: June 24
Month: June
Day: 24


In [7]:
import re

# Sentence to scan
text = "The rain in Spain falls mainly in the plain"

# re.search() looks for the literal word anywhere in the sentence
pattern = r"Spain"
match = re.search(pattern, text)

# Handle the "nothing found" case first, then report the matched text.
if match is None:
    print("No Match")
else:
    print("Match found:", match.group())

Match found: Spain


In [9]:
# re.findall()
# Syntax: re.findall(pattern, string)

# Scans the entire string.

# Returns a list of all non-overlapping matching substrings.

# No match object, just a list of strings.

# Example text
text = "The rain in Spain falls mainly in the plain"

# Find all occurrences of the standalone word "in".
# The \b word boundaries matter: a bare r"in" would also match the "in"
# inside "rain", "Spain", "mainly" and "plain".
pattern = r"\bin\b"
matches = re.findall(pattern, text)
print("All Matches:", matches)

All Matches: ['in', 'in']


In [13]:
text = '''Cloudblitz manager phone number is 9873746466, call him if you have any questions regarding sessions management.
Office landline number is (879)-773-83773'''

# Two alternatives joined by `|`:
#   \d{10}                  -> ten consecutive digits
#   \(\d{3}\)-\d{3}-\d{5}   -> a number written as (ddd)-ddd-ddddd
# The pattern is a raw string (r'...') so that \d, \( and \) reach the regex
# engine untouched; in a normal string literal they are invalid escape
# sequences and Python emits a SyntaxWarning.
pattern = r'\d{10}|\(\d{3}\)-\d{3}-\d{5}'

matches = re.findall(pattern, text)
matches

  pattern = '\d{10}|\(\d{3}\)-\d{3}-\d{5}'


['9873746466', '(879)-773-83773']

In [15]:
import re
text = "ababab"

# findall() returns non-overlapping matches: after matching "aba" at index 0
# the scan resumes at index 3, so the "aba" that starts at index 2 is skipped.
non_overlapping = re.findall(r"aba", text)
print(non_overlapping)  # Output: ['aba']


# A lookahead (?=...) is zero-width, so the scan only advances one character
# at a time; the capturing group inside it returns the text found at each
# position, which makes overlapping occurrences visible.
overlapping = re.findall(r"(?=(aba))", text)
print(overlapping)  # Output: ['aba', 'aba']

['aba']
['aba', 'aba']


In [17]:
# re.split()
# Splits the string at every occurrence of the pattern and returns the pieces as a list. If maxsplit is non-zero, at most that many splits are made and the rest of the string is returned as the final element of the list.

# Syntax: re.split(pattern, string, maxsplit=0, flags=0)

from re import split  # the same function as re.split, imported by name

# Patterns are written as raw strings (r'...') so that \W and \d are passed to
# the regex engine as-is instead of triggering an invalid-escape SyntaxWarning.
print(split(r'\W+', 'Words, words , Words'))
print(split(r'\W+', "Word's words Words"))
print(split(r'\W+', 'On 12th Jan 2016, at 11:02 AM'))
print(split(r'\d+', 'On 12th Jan 2016, at 11:02 AM'))

# maxsplit is passed by keyword: passing it positionally is deprecated since Python 3.13.
print(re.split(r'\d+', 'On 12th Jan 2016, at 11:02 AM', maxsplit=1))
# With re.IGNORECASE the set [a-f] also matches A-F, so "A" and "B" act as separators too.
print(re.split('[a-f]+', 'Aey, Boy oh boy, come here', flags=re.IGNORECASE))
print(re.split('[a-f]+', 'Aey, Boy oh boy, come here'))

['Words', 'words', 'Words']
['Word', 's', 'words', 'Words']
['On', '12th', 'Jan', '2016', 'at', '11', '02', 'AM']
['On ', 'th Jan ', ', at ', ':', ' AM']
['On ', 'th Jan 2016, at 11:02 AM']
['', 'y, ', 'oy oh ', 'oy, ', 'om', ' h', 'r', '']
['A', 'y, Boy oh ', 'oy, ', 'om', ' h', 'r', '']


  print(split('\W+', 'Words, words , Words'))
  print(split('\W+', "Word's words Words"))
  print(split('\W+', 'On 12th Jan 2016, at 11:02 AM'))
  print(split('\d+', 'On 12th Jan 2016, at 11:02 AM'))
  print(re.split('\d+', 'On 12th Jan 2016, at 11:02 AM', 1))


In [19]:
# re.sub()
# "sub" is short for substitute: each non-overlapping occurrence of the pattern found in the string (3rd parameter) is replaced by repl (2nd parameter). count limits how many replacements are made; the default 0 means replace every occurrence.

# Syntax: re.sub(pattern, repl, string, count=0, flags=0)

import re

subject = 'Subject has Uber booked already'

# Case-insensitive: both "ub" and "Ub" are replaced.
print(re.sub('ub', '~*', subject, flags=re.IGNORECASE))
# Case-sensitive (the default): only the lowercase "ub" is replaced.
print(re.sub('ub', '~*', subject))
# count=1: only the first occurrence is replaced, even when ignoring case.
print(re.sub('ub', '~*', subject, count=1, flags=re.IGNORECASE))
# \s on both sides makes "And" match only as a whitespace-delimited word.
print(re.sub(r'\sAND\s', ' & ', 'Baked Beans And Spam', flags=re.IGNORECASE))

S~*ject has ~*er booked already
S~*ject has Uber booked already
S~*ject has Uber booked already
Baked Beans & Spam


In [21]:
# re.match()
# Syntax: re.match(pattern, string)

# Only tries to match at the very start of the string.

# Gives back a match object when the pattern matches there, otherwise None.

# At most one match is reported.

import re

# '123abc' begins with digits, so \d+ matches the leading "123".
result = re.match(r'\d+', '123abc')
if result is not None:
    print("Matched:", result.group())

Matched: 123
