In [None]:
"""Using re module and regular expressions"""
import re

x = 'My 2 favorite numbers: 1 and 10'

# \d+ matches each maximal run of digits.
y = re.findall(r'\d+', x)
print(y)  # Output: ['2', '1', '10']

# \D+ matches each maximal run of NON-digit characters; spaces and the colon
# are part of those runs.
z = re.findall(r'\D+', x)
print(z)  # Output: ['My ', ' favorite numbers: ', ' and ']

# \s+ matches each run of whitespace; x contains six separate single spaces.
a = re.findall(r'\s+', x)
print(a)  # Output: [' ', ' ', ' ', ' ', ' ', ' ']

# \S+ matches each run of non-whitespace, so the colon stays glued to 'numbers:'.
b = re.findall(r'\S+', x)
print(b)  # Output: ['My', '2', 'favorite', 'numbers:', '1', 'and', '10']

['2', '1', '10']
['y', '2', 'f', 'a', 'v', 'o', 'r', 'i', 't', 'e', 'n', 'u', 'm', 'b', 'e', 'r', 's', '1', 'a', 'n', 'd', '1', '0']
['My ', ' favorite numbers: ', ' and ']
[' ', ' ', ' ', ' ', ' ', ' ']
['My', '2', 'favorite', 'numbers:', '1', 'and', '10']


In [None]:
"""Find all digit sequences in the string"""
c = re.findall(r'[0-9]+', x)
print(c)  # Output: ['2', '1', '10']

# [a-z]+ matches runs of lowercase letters only, so the capital 'M' of 'My'
# is not part of the first match.
d = re.findall(r'[a-z]+', x)
print(d)  # Output: ['y', 'favorite', 'numbers', 'and']

# [A-Z]+ matches runs of uppercase letters.
e = re.findall(r'[A-Z]+', x)
print(e)  # Output: ['M']

# [a-z0-9] has no '+' quantifier, so every lowercase letter or digit comes back
# as its own one-character match. 'M', the spaces and ':' are outside the class.
v = re.findall(r'[a-z0-9]', x)
print(v)  # Output: ['y', '2', 'f', 'a', 'v', 'o', 'r', 'i', 't', 'e', 'n', 'u', 'm', 'b', 'e', 'r', 's', '1', 'a', 'n', 'd', '1', '0']

['2', '1', '10']
['y', 'favorite', 'numbers', 'and']
['M']


In [None]:
fe = 'From: Using the : character'

# Lazy '.+?' stops at the first ':' found after the 'U'.
f = re.compile(r'U.+?:').findall(fe)
print(f)  # Output: ['Using the :']

# Greedy vs. non-greedy: same anchor and same terminator, only the quantifier
# differs. '.+' runs to the LAST ':' on the line, '.+?' stops at the FIRST one.
g, h = (re.findall(pattern, fe) for pattern in (r'^F.+:', r'^F.+?:'))
print(g)  # Output: ['From: Using the :']
print(h)  # Output: ['From:']

['Using the :']
['From: Using the :']
['From:']


In [None]:
email = (
    'This code is made for practice. '
    'From stephen.marquard@uct.ac.za Sat Jan  5 09:14:16 2008'
)

# Two ways to pull the address out of the sentence:
#   i - a run of non-whitespace up to '@', followed by the rest of that token
#   j - a capture group; the greedy 'T.+ ' prefix backtracks to the space that
#       immediately precedes the address, and only the group is returned
i = re.findall(r'\S+?@\S+', email)
print(i)  # Output: ['stephen.marquard@uct.ac.za']

j = re.findall(r'T.+ (\S+@\S+)', email)
print(j)  # Output: ['stephen.marquard@uct.ac.za']

['stephen.marquard@uct.ac.za']
['stephen.marquard@uct.ac.za']


In [72]:
"""Extract domain names from email addresses with string slicing"""
# Slicing approach: find the '@', then the first space after it.
# Assumes email contains an '@'; k would be -1 otherwise.
k = email.find('@')
l = email.find(' ', k)
if l == -1:
    # str.find returns -1 when no space follows the '@' (address at the very
    # end of the string). Without this guard the slice below would stop at
    # index -1 and silently drop the last character of the domain.
    l = len(email)
domain = email[k+1:l]
print(k)
print(l)
print(domain)

# Double-split approach: split on whitespace, then split the token holding '@'.
parts = email.split()
for part in parts:
    if '@' in part:
        domain2 = part.split('@')[1]
        print(domain2)

# Regex approach: four patterns that all capture the text after the '@'.
ex1 = re.findall(r'@([^ ]*)', email)        # anything up to the next space
ex2 = re.findall(r'@(\S+)', email)          # run of non-whitespace
ex3 = re.findall(r'@(\S+)\s', email)        # same, but requires trailing whitespace
ex4 = re.findall(r'From.*@([^ ]+)', email)  # only on text that contains 'From'
print(ex1)  # Output: ['uct.ac.za']
print(ex2)  # Output: ['uct.ac.za']
print(ex3)  # Output: ['uct.ac.za']
print(ex4)  # Output: ['uct.ac.za']

53
63
uct.ac.za
uct.ac.za
['uct.ac.za']
['uct.ac.za']
['uct.ac.za']
['uct.ac.za']


In [None]:
"""Create files to work the assignment for extracting data with regular expressions"""
import os
import requests
from dotenv import load_dotenv

load_dotenv()

# Upper bound (seconds) for each HTTP request so a stalled connection cannot
# hang the notebook indefinitely.
REQUEST_TIMEOUT_SECONDS = 30


def _fetch_if_missing(url, filepath):
    """Make sure ``filepath`` exists, downloading it from ``url`` if necessary.

    Args:
        url: Address of the text file to download.
        filepath: Local path where the file is stored.

    Returns:
        The text of the file: the freshly downloaded text when the file had to
        be created, otherwise the content already on disk.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never saved as if it were the data file.
    """
    if os.path.exists(filepath):
        print(f"The file {filepath} already exists.")
        with open(filepath, 'r', encoding='utf-8') as file:
            return file.read()

    # Only touch the network when the file is actually missing.
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    text = response.text
    with open(filepath, 'w', encoding='utf-8') as file:
        file.write(text)
    print(f"The file {filepath} has been created.")
    return text


# Set up the file directory and URL
url1 = "https://py4e-data.dr-chuck.net/regex_sum_42.txt"
url2 = "https://py4e-data.dr-chuck.net/regex_sum_2326291.txt"
filedir = os.getenv("coursera_mich_dir", ".")

# Define the filename and path
filepath1 = os.path.join(filedir, "regex_sum_42.txt")
filepath2 = os.path.join(filedir, "regex_sum_2326291.txt")

# Save each file if it does not exist yet; data1/data2 hold the file text.
data1 = _fetch_if_missing(url1, filepath1)
data2 = _fetch_if_missing(url2, filepath2)

The file .\regex_sum_42.txt has been created.
The file .\regex_sum_2326291.txt has been created.


In [87]:
"""Calculate the sum of all numbers in the file using regular expressions"""
# Collect every run of digits, line by line, as integers.
numberlist = []
with open(filepath2, 'r', encoding='utf-8') as file:
    for line in file:
        digit_runs = re.findall(r'[0-9]+', line.rstrip())
        numberlist.extend(int(run) for run in digit_runs)

print(numberlist)
print(sum(numberlist))

[3969, 6006, 1522, 8598, 8219, 5979, 5470, 3448, 1629, 6799, 1040, 6181, 5770, 4507, 6022, 7524, 3779, 727, 2150, 9928, 3150, 2638, 1467, 8311, 251, 9540, 3868, 721, 6953, 3117, 376, 1411, 8816, 8362, 3095, 6131, 4399, 2811, 2259, 9375, 1700, 9044, 3044, 2478, 5754, 6653, 7588, 7036, 3882, 7987, 4984, 8741, 7373, 4263, 1846, 758, 8766, 4, 5139, 9877, 7250, 7774, 5904, 1503, 2631, 2578, 4654, 5557, 5135, 8769, 1986, 5502, 4915, 2028, 9227, 8934, 2888, 4, 3, 42]
380519


In [89]:
"""Calculate the sum of all numbers in the file using a single line of code"""

# Read the whole file at once, extract every run of digits and sum them as ints.
with open(filepath2, 'r', encoding='utf-8') as file:
    print(sum(map(int, re.findall(r'[0-9]+', file.read()))))

380519
