## What are regular expressions

In [2]:
print('Hello')

Hello


In [3]:
filenames = ['nov-12.txt', 'november-14.txt', 'Oct-17.txt', 'Nov-22.txt']

## Regular Expression Example

In [1]:
text = 'Hi there you here exa_mple@example.com @blabla some more text here and there another@example.de'

In [10]:
from re import compile

pattern = compile(r"[^ ]+@[^ ]+.[a-z]+")
matches = pattern.findall(text)
print(f"{matches=}")

matches=['exa_mple@example.com', 'another@example.de']


## Meta characters

In [None]:
"""
.        Matches any single character
\        Escapes one of the meta characters to treat it as a regular character
[...]    Matches a single character or a range that is contained within brackets
         _- -_ order does not matter but without brackets order does matter
+        Matches the preceding element one or more times
?        Matches the preceding pattern element zero or one time
*        Matches the preceding element zero or more times
{m,n}    Matches the preceding element at least m and not more than n times
^        Matches the beginning of a line or string
$        Matches the end of a line or string
[^...]   Matches a single character or a range that is not contained within the brackets
?:...|..."Or" operator
()       Matches an optional expression
"""

In [3]:
text = 'Hi there you here exa_mple@example.com @blabla.com some more text here and there another@example.de another@exampl.ne'

import re

pattern = re.compile("[^ ]+@[^ ]+\.(?:com|de)+")
matches = pattern.findall(text)
print(f"{matches=}")


matches=['exa_mple@example.com', 'another@example.de']


## Extract URLs from Text

In [6]:
from pathlib import Path

from p_regex.constants import TEMP_FOLDER_PATH

with open(Path.joinpath(TEMP_FOLDER_PATH,'urls.txt'), mode='r') as file:
    content = file.read()

print(content)

http://google.com
https://example.com
http://www.wikipedia.com
http://pythonhow.com
https://python.org


In [7]:
import re

pattern = re.compile("https?://(?:www.)?[^ \n]+\.com")
matches = pattern.findall(content)
print(f"{matches=}")

matches=['http://google.com', 'https://example.com', 'http://www.wikipedia.com', 'http://pythonhow.com']


## Extract IP addresses using regex

In [12]:
from pathlib import Path

from p_regex.constants import TEMP_FOLDER_PATH

with open(Path.joinpath(TEMP_FOLDER_PATH,'ips.txt'), mode='r') as file:
    content = file.read()

print(content)

912.131.120.111
912.131.134.000
912.131.129.129


In [10]:
import re

# only the IPs complying with XXX.XXX.12X.XXX
pattern = re.compile("[0-9]{3}\.[0-9]{3}\.12[0-9]{1}\.[0-9]{3}")
matches = pattern.findall(content)
matches

['912.131.120.111', '912.131.129.129']

## Filter filenames

In [6]:
from p_regex.constants import FILES_FOLDER_PATH

filenames = FILES_FOLDER_PATH.iterdir()
filenames_str = [filename.name for filename in filenames]
filenames_str

['November-24.txt',
 'Nov-22.txt',
 'Nov-02.txt',
 'Nov-12.txt',
 'Oct-17.txt',
 'nov-20.txt',
 'billy_Nov-13.txt',
 'november-14.txt']

In [8]:
import re

# Only files from Nov-1 to Nov-20
pattern = re.compile("nov[a-z]*-(?:0?[1-9]|1[0-9]|20).txt", re.IGNORECASE)
matches = [filename for filename in filenames_str if pattern.findall(filename)]
matches

['Nov-12.txt', 'nov-20.txt', 'billy_Nov-13.txt', 'november-14.txt']