# Topics: Regular Expressions

In [1]:
import re

### 1. What is a regular expression in Python?

Regular Expression:
- A sequence of symbols and characters expressing a string or pattern to be searched for within a longer piece of text.

Steps:
- import re

Syntax:
- re.function(pattern, string)

### 2. Explain the difference between re.match() and re.search() in Python.

In [2]:
"""
re.match() vs re.search()
-------------------------
re.match()
    1. Looks for the pattern only at the beginning of the string and
       returns a match object (None when there is no match).
    2. Useful for validating that a string starts with a specific pattern.

re.search()
    1. Looks for the pattern anywhere in the string and returns a match
       object (None when there is no match).
    2. Useful for finding a pattern that may appear at any position.
"""

string_with_newlines = "something\nsomeotherthing"

# Anchored at position 0: succeeds because the text begins with "some".
match_at_start = re.match('some', string_with_newlines)
print(f"{'*'*11} re.match() {'*'*11}")
print(match_at_start, "\n")

# Scans the entire text: the hit lies on the second line.
match_anywhere = re.search('someother', string_with_newlines)
print(f"{'*'*11} re.search() {'*'*10}")
print(match_anywhere)


*********** re.match() ***********
<re.Match object; span=(0, 4), match='some'> 

*********** re.search() **********
<re.Match object; span=(10, 19), match='someother'>


### 3. Explain the purpose of the re.sub() function in Python.

In [3]:
"""
Purpose:
    - re.sub() replaces the parts of a string that match a regular
      expression pattern with a new substring.
    - It is useful in text processing and data cleaning.

Syntax :
    - re.sub(pattern, repl, string, count=0, flags=0)
"""

# E.g.:
pattern, replacement = "apple", "grape"
my_string = "apple orange apple banana"

# count=1 stops after the first substitution, so the second "apple" survives.
result = re.sub(pattern, replacement, my_string, count=1)

banner = f"{'*'*11} Replacing Only One Occurance by Count {'*'*11}"
print(banner)
print(f"Replaced String:\n--> {result}")

*********** Replacing Only One Occurance by Count ***********
Replaced String:
--> grape orange apple banana


### 4. What is the purpose of the re.compile() function in Python?

In [4]:
"""
Purpose:
    - re.compile() compiles a regular expression pattern into a reusable
      pattern object.
    - Giving the compiled pattern a name can improve the readability of code.
    - The compiled object can be used multiple times within a program.

Syntax :
    - re.compile(pattern, flags=0)
"""

# E.g.:
emails = """Email IDs:
viratkohli@gmail.com
viratkohli18@gmail.com
virat.kohli@gmail.com
virat.kohli18@gmail.com
viratkohli@bcci.co.in
viratkohli18@vctcpune.com
virat_kohli@gmail.com
virat_kohli1234@gmail.com
virat_kohli1234@gmail.com'
virat@outlook.com
ViRat@outlook.com
"""

email_pattern = re.compile(r"^[a-zA-Z0-9_.%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$")

valid, invalid = [], []
for token in emails.split():
    # Use the compiled object's own method (the point of re.compile()).
    # fullmatch() requires the entire token to match, so it also rejects a
    # trailing newline, which "$" on its own would tolerate.
    if email_pattern.fullmatch(token):
        valid.append(token)
    else:
        invalid.append(token)

print(f"valid emails: {valid}")
print(f"invalid data: {invalid}")

valid emails: ['viratkohli@gmail.com', 'viratkohli18@gmail.com', 'virat.kohli@gmail.com', 'virat.kohli18@gmail.com', 'viratkohli@bcci.co.in', 'viratkohli18@vctcpune.com', 'virat_kohli@gmail.com', 'virat_kohli1234@gmail.com', 'virat@outlook.com', 'ViRat@outlook.com']
invalid data: ['Email', 'IDs:', "virat_kohli1234@gmail.com'"]


### 5. Explain the significance of all special characters (*, +, ?, ^, $).

In [5]:
"""
Quantifiers (*, +, ?) apply to the element that precedes them;
anchors (^, $) match a position rather than a character.
-----------------------------------------
| # Char|   # Description               |
-----------------------------------------
|   *   |   Zero or more occurrences    |
|   +   |   One or more occurrences     |
|   ?   |   Zero or one occurrence      |
|   ^   |   Starts with                 |
|   $   |   Ends with                   |
-----------------------------------------
"""
pass

### 6. What is the purpose of the \b anchor in regular expressions?

In [6]:
r"""
Purpose:
    - \b matches a word boundary, so wrapping a pattern as \bword\b ensures
      that it matches an entire word rather than just a part of a word.
"""

# E.g.:
pan_cards = """DFGHJ4567V
FGBVJD4562J
HIOPA1234KO"""

# Layout matched here: 5 uppercase letters, 4 digits, 1 uppercase letter.
# The last class is [A-Z]; the range [A-z] would additionally admit the ASCII
# characters between "Z" and "a" ([ \ ] ^ _ `) and every lowercase letter.
pan_pattern = re.compile(r"\b[A-Z]{5}[0-9]{4}[A-Z]\b")
valid_pan = pan_pattern.findall(pan_cards)

print(f"{'*'*11} Strict Pattern Match {'*'*11}")
print(valid_pan)

*********** Strict Pattern Match ***********
['DFGHJ4567V']


### 7. How do you perform a case-insensitive search using regular expressions in Python?

In [7]:
"""
Pass the re.IGNORECASE flag to make a search case-insensitive.
"""

# E.g.:
text = "Python python PyThOn"
pattern = r"python"

# With the flag set, every capitalisation of the word is matched.
matches = re.compile(pattern, flags=re.IGNORECASE).findall(text)
print(matches)

['Python', 'python', 'PyThOn']


### 8. What is the purpose of the re.findall() function in Python?

In [8]:
"""
Purpose:
    - re.findall() finds all non-overlapping occurrences of a regular
      expression pattern within a string and returns them as a list of strings.
    - The matches are returned in the order in which they are found.

Syntax :
    - re.findall(pattern, string, flags=0)
"""

# E.g.:
my_string = "don moron cron prone drone"

# Either one or two arbitrary characters followed by "on".
matches = re.compile(".on|..on").findall(my_string)
matches

['don', 'oron', 'cron', 'pron', 'dron']

### 9. What is the purpose of the re.split() function in Python?

In [9]:
"""
Purpose:
    - re.split() splits a string into a list of substrings wherever a regular
      expression pattern matches.
    - Because the delimiter is a pattern, it can handle multiple delimiters.
Syntax :
    - re.split(pattern, string, maxsplit=0, flags=0)
"""

# E.g.:
data = "DFGHJ4567V"

# Raw string: in a normal string literal "\d" is an invalid escape sequence.
# The capturing group keeps the matched digits (the delimiter) in the result.
parts = re.split(r"(\d+)", data)
print(parts)

['DFGHJ', '4567', 'V']


### 10. How do you use the re.IGNORECASE flag in a regular expression pattern?

In [10]:
string_with_newlines = "Something\nsomeotherthing"

# The lowercase pattern still matches the capitalised "Some" because the
# IGNORECASE flag is passed.
case_insensitive_match = re.match('some', string_with_newlines, flags=re.IGNORECASE)
print(f"{'*'*11} IGNORECASE {'*'*11}")
print(case_insensitive_match)

*********** IGNORECASE ***********
<re.Match object; span=(0, 4), match='Some'>


### 11. How can you escape special characters in a regular expression pattern?

In [11]:
# re.escape() backslash-escapes the characters that carry a special meaning in
# a regular expression, so the text can be used as a literal pattern.
text = "www.google.com"
escaped_text = re.escape(text)
print(escaped_text)

www\.google\.com


### 12. Python Program to check that a string contains only a certain set of characters (in this case a-z, A-Z, and 0-9).

In [12]:
string = "kp@gmail.com INDIA 8752 kpcI8"
pattern = re.compile(r"[a-zA-Z0-9]+")

# Collect every maximal run of ASCII letters/digits found in the string.
# NOTE(review): this extracts the runs of allowed characters; it does not
# report whether the whole string consists only of those characters.
result = pattern.findall(string)

print(result)

['kp', 'gmail', 'com', 'INDIA', '8752', 'kpcI8']


### 13. Python Program that matches a string that has an 'a' followed by one or more 'b's.

In [13]:
string = "abbreviation"
pattern = re.compile(r"ab+")
result = pattern.findall(string)
print(result)

# Two more probes: a long run of b's matches in full, while an 'a' that is not
# directly followed by a 'b' yields no match.
long_run, no_adjacent_b = (pattern.findall(sample) for sample in ("abbbbbb", "a123bb"))
print(long_run, no_adjacent_b)

['abb']
['abbbbbb'] []


### 14. Python Program that matches a string that has an 'a' followed by zero or one 'b'.

In [14]:
string = "abap argb abab acdb"

# "b?" makes the 'b' optional, so a bare 'a' also counts as a match.
pattern = re.compile("ab?")
result = [hit.group() for hit in pattern.finditer(string)]
print(result)

['ab', 'a', 'a', 'ab', 'ab', 'a']


### 15. Python Program that matches a string that has an 'a' followed by anything, ending in 'b'.

In [15]:
string1 = "abap argb prab abab acdb"

# Whole words that start with 'a' and end with 'b'. "\w*" (zero or more) lets
# the "anything" in between be empty, so the two-letter word "ab" is matched
# too; "\w+" would demand at least one character between the 'a' and the 'b'.
a_to_b_pattern = re.compile(r"\ba\w*b\b")
print(a_to_b_pattern.findall(string1))

['argb', 'abab', 'acdb']


### 16. Python Program to find sequences of lowercase letters joined with an underscore.

In [16]:
string = """Abstract_class,
kala_jamun,
test_runner,
Quality_Assurance,
enventory_Management
"""

# A run of lowercase letters followed by one or more "_<lowercase run>" parts.
# The underscore is required explicitly: a character class such as [a-z_]+
# would also accept plain lowercase words that contain no underscore at all.
snake_case_pattern = re.compile(r"\b[a-z]+(?:_[a-z]+)+\b")
print(snake_case_pattern.findall(string))

['kala_jamun', 'test_runner']


### 17. Python Program to find the sequences of one uppercase letter followed by lowercase letters.

In [17]:
string = """Abstract_class,
kala_jamun,
test_runner,
Quality_Assurance,
enventory_Management
"""

# One uppercase letter followed by characters from the class [a-z_].
# NOTE(review): the class accepts "_" as well as lowercase letters, which is
# why "Abstract_class" comes back as a single match.
capitalised_pattern = re.compile(r"\b[A-Z][a-z_]+\b")
print(capitalised_pattern.findall(string))

['Abstract_class']


### 18. Python Program that matches a word at the beginning of a string.

In [18]:
string = "ABCDF2861Z PAN Status demo"

# re.match() is anchored at the start of the string, and "\w+" accepts a
# leading word of any length rather than one fixed letter/digit layout.
first_word = re.match(r"\w+", string)
print(first_word.group())

ABCDF2861Z


### 19. Python Program that matches a word at the end of a string, with optional punctuation.

In [19]:
thought = """
Imagination governs the world!
History is written by the winners.
-Napoleon Bonaparte
"""

# Raw string: in a normal string literal "\w" is an invalid escape sequence.
# The pattern picks up each word that is immediately followed by one of the
# characters . ! _ $
punctuated_words = re.findall(r"\w+[.!_$]", thought)
print(punctuated_words)

['world!', 'winners.']


### 20. Python Program that matches a word containing 'z'.

In [20]:
string = "normalize the zest of enzymes"

# "\w*" on both sides of the 'z' lets it sit at the start, in the middle or at
# the end of the word (e.g. "quiz"); requiring characters after the 'z' would
# miss words that end in it.
z_word_pattern = re.compile(r"\b\w*z\w*\b")
print(z_word_pattern.findall(string))

['normalize', 'zest', 'enzymes']


### 21. Python Program to match a string that contains only upper and lowercase letters, numbers, and underscores.

In [21]:
string = "order id _cwkHfr230925, error code is 0x2A0x80070005"

# Whole tokens made of letters, digits and underscores that end in a digit.
# NOTE(review): this extracts digit-terminated tokens from the text; it does
# not validate that an entire string is limited to those characters.
digit_terminated_pattern = re.compile(r"\b\w+\d+\b")
print(digit_terminated_pattern.findall(string))

['_cwkHfr230925', '0x2A0x80070005']


### 22. Python Program to match a string that starts with a specific number.

In [22]:
string = """batch id:
223-B95685
223-B95795
225-B98873
226-B100123
223-B94985
"""

# "223" is spelled out literally. Inside square brackets it would be a
# character class (the set {2, 3}), so "[223]+" would also accept prefixes
# such as "232" or "3".
batch_pattern = re.compile(r"\b223-\w\d+\b")
print(batch_pattern.findall(string))

['223-B95685', '223-B95795', '223-B94985']


### 23. Python Program to check for a number at the end of a string.

In [23]:
string = "ERROR_DHCP_ADDRESS_CONFLICT 4100 0x1004"

# Raw string: in a normal string literal "\d" is an invalid escape sequence.
# Pattern: a digit, one lowercase letter, then digits up to the end of the
# string.
trailing_code = re.search(r"\d[a-z]\d+$", string)
print(trailing_code.group())

0x1004


### 24. Python Program to validate an email address using a regular expression.

In [24]:
emails = """
abc'def@mail.c abc.def@mail.cc
abc.def@mail#archive.com abc_def@mail-archive.com
abc.def@mail abcdef1236@mail.org
abc.def@mail..com abc65.def@mail.com
"""

# Local part: 3-10 word characters, one of ". _ digit", more word characters.
# Domain: 3-15 lowercase letters, a dot, then word characters.
valid_email_pattern = re.compile(
    r"\b\w{3,10}[._0-9]\w+[@][a-z]{3,15}[.]\w+\b"
)
print(valid_email_pattern.findall(emails))

['abc.def@mail.cc', 'abcdef1236@mail.org', 'abc65.def@mail.com']


### 25. Python Program to extract all phone numbers from a string containing phone numbers.

In [25]:
numbers = """
75658963
8600742471
98255225634
9865421786
8596324756
"""

# Exactly ten digits delimited by word boundaries, so shorter or longer digit
# runs are skipped.
ten_digit_pattern = re.compile(r"\b\d{10}\b")
print(ten_digit_pattern.findall(numbers))

['8600742471', '9865421786', '8596324756']
