<h3>(1) Regex in customer support</h3>

<h3>Retrieve order number</h3>

In [1]:
# Import the 're' module for regular expression operations
import re

In [2]:
# Three sample customer messages; each one mentions the same order number
# after a different lead-in ("order #", "order number", plain "order")
chat1, chat2, chat3 = (
    'customer: Hello, I am having an issue with my order # 412889912',
    'customer: I have a problem with my order number 412889912',
    'customer: My order 412889912 is having an issue, I was charged 300$ when online it says 280$',
)

In [3]:
# Regular expression that captures the order number following the word "order".
# It is written as a raw string (r'...') so that the regex escape \d is handed to
# the `re` module unchanged; in a plain string literal "\d" is an invalid string
# escape sequence, which recent Python versions report with a warning.
pattern = r'order[^\d]*(\d*)'

- **'order'** : Match the word "order"
- **'[^\d]*'** : Match any characters that are not digits (zero or more times) after "order"
- **'(\d*)'** : Capture a sequence of digits (zero or more times) following the non-digit characters

In [4]:
# Run the order-number pattern over every chat message.
# re.findall() returns only the captured group, so each inner list holds
# the order numbers found in one message.
matches = [
    re.findall(pattern, message)
    for message in (chat1, chat2, chat3)
]
matches

[['412889912'], ['412889912'], ['412889912']]

<h3>Retrieve email id and phone</h3>

In [5]:
# Three sample messages mixing phone numbers (plain 10-digit and "(ddd)-ddd-dddd"
# forms) with email addresses and other text
chat1, chat2, chat3 = (
    'codebasics: you ask lot of questions 1235678912, abc@xyz.com, 9998881234',
    'codebasics: here it is: (123)-567-8912, abX_82@xyz.com',
    'codebasics: yes, phone: 1235678912 email: abc@xyz.com',
)

**-----Email id-----**

In [6]:
# Regular expression that matches an email address of the form name@domain.suffix.
# It is written as a raw string (r'...') so that the regex escape \. is handed to
# the `re` module unchanged; in a plain string literal "\." is an invalid string
# escape sequence, which recent Python versions report with a warning.
pattern = r'[a-zA-Z0-9_]*@[a-z]*\.[a-zA-Z0-9]*'

- **'[a-zA-Z0-9_]*'** : Match any combination of letters, digits, or underscores (zero or more times) before the '@'
- **'@'** : Match the '@' symbol
- **'[a-z]*'** : Match lowercase letters (zero or more times) after the '@'
- **'\.'** : Match the '.' symbol (escaped because '.' is a special character in regex)
- **'[a-zA-Z0-9]*'** : Match any combination of letters or digits (zero or more times) after the '.'

In [7]:
# Run the email pattern over every chat message.
# The pattern has no capture group, so re.findall() returns the full matched
# text; each inner list holds the email addresses found in one message.
matches = [
    re.findall(pattern, message)
    for message in (chat1, chat2, chat3)
]
matches

[['abc@xyz.com'], ['abX_82@xyz.com'], ['abc@xyz.com']]

**-----Phone number-----**

In [8]:
# Regular expression that matches a phone number written either as ten
# consecutive digits or in the form "(ddd)-ddd-dddd".
# It is written as a raw string (r'...') so that the regex escapes \d, \( and \)
# are handed to the `re` module unchanged; in a plain string literal they are
# invalid string escape sequences, which recent Python versions report with a warning.
pattern = r'\d{10}|\(\d{3}\)-\d{3}-\d{4}'

- **'\d{10}'** : Match a sequence of exactly 10 digits (e.g., "1235678912")
- **'|'** : Logical OR, to also match another format
- **'\(\d{3}\)-\d{3}-\d{4}'** : Match a phone number in the format "(123)-567-8912"
- **'\('** : Match the opening parenthesis '('
- **'\d{3}'** : Match exactly 3 digits
- **'\)'** : Match the closing parenthesis ')'
- **'-'** : Match the hyphen '-'
- **'\d{3}-\d{4}'** : Match 3 digits, a hyphen, and then 4 digits

In [9]:
# Run the phone-number pattern over every chat message.
# The pattern has no capture group, so re.findall() returns the full matched
# text; each inner list holds the phone numbers found in one message.
matches = [
    re.findall(pattern, message)
    for message in (chat1, chat2, chat3)
]
matches

[['1235678912', '9998881234'], ['(123)-567-8912'], ['1235678912']]

<h3>(2) Regex for Information Extraction</h3>

In [10]:
# Multi-line sample text with biographical details about Elon Musk.
# The extraction cells that follow rely on its line layout: the full name shares
# a line with the "Born" label, the next line holds the birth date followed by
# "(age NN)", and the line after that holds the birth place.
text='''
Born	Elon Reeve Musk
June 28, 1971 (age 50)
Pretoria, Transvaal, South Africa
Citizenship
South Africa (1971–present)
Canada (1971–present)
United States (2002–present)
Education	University of Pennsylvania (BS, BA)
Title
Founder, CEO and Chief Engineer of SpaceX
CEO and product architect of Tesla, Inc.
Founder of The Boring Company and X.com (now part of PayPal)
Co-founder of Neuralink, OpenAI, and Zip2
Spouse(s)
Justine Wilson
​
​(m. 2000; div. 2008)​
Talulah Riley
​
​(m. 2010; div. 2012)​
​
​(m. 2013; div. 2016)
'''

In [11]:
# Pattern for the age: the literal text "age " followed by one or more captured digits.
# Raw string (r'...') so that \d is handed to the `re` module unchanged instead of
# being parsed as an invalid string escape sequence.
pattern = r'age (\d+)'

# re.findall() returns the captured digits for every occurrence in the text
matches = re.findall(pattern, text)
matches

['50']

- **'age '** : Match the literal text "age" followed by a single space
- **'(\d+)'** : Capture one or more digits (the age number)

In [12]:
# Pattern for the full name: everything after "Born" up to the end of that line is captured
pattern = 'Born(.*)\n'

# Collect each captured name and trim the whitespace around it
matches = list(map(str.strip, re.findall(pattern, text)))
matches

['Elon Reeve Musk']

- **'Born'** : Match the literal word "Born"
- **'(.*)'** : Capture any characters (the full name) following "Born"
- **'\n'** : Match the newline character, ensuring separation from the next line

In [13]:
# Pattern for the birth date: skip the rest of the "Born" line, then capture the
# text on the following line up to the literal "(age".
# Raw string (r'...') so that \( is handed to the `re` module unchanged instead of
# being parsed as an invalid string escape sequence; the regex engine itself
# interprets \n as a newline, so the match is the same.
pattern = r'Born.*\n(.*)\(age'

# Collect every captured birth date and strip leading/trailing whitespace from each
matches = [match.strip() for match in re.findall(pattern, text)]
matches

['June 28, 1971']

- **'Born'** : Match the literal word "Born"
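- **'.*\n'** : Match (without capturing) the rest of the "Born" line, including its newline
- **'(.*)'** : Capture any characters (the birth date) between that newline and "(age"
- **'\(age'** : Match the literal string "(age", which marks the end of the birth date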

In [14]:
# Pattern for the birth place: find the literal "(age", skip the rest of that
# line, then capture the whole of the following line.
# Raw string (r'...') so that \( is handed to the `re` module unchanged instead of
# being parsed as an invalid string escape sequence; the regex engine itself
# interprets \n as a newline, so the match is the same.
pattern = r'\(age.*\n(.*)'

# re.findall() returns the captured line for every occurrence in the text
matches = re.findall(pattern, text)
matches

['Pretoria, Transvaal, South Africa']

- **'\(age'** : Match the literal string "(age"
- **'.*\n'** : Match (without capturing) the rest of that line, including its newline
- **'(.*)'** : Capture any characters (the birth place) on the line after the newline

In [15]:
# Extract age, full name, birth date and birth place from `text` in one cell.
# All patterns are raw strings (r'...') so that regex escapes such as \d and \(
# are handed to the `re` module unchanged instead of being parsed as invalid
# string escape sequences; the regex engine itself interprets \n as a newline.

# Age: the digits that follow the literal text "age "
age = r'age (\d+)'
matches_age = re.findall(age, text)

# Full name: the remainder of the "Born" line, with surrounding whitespace trimmed
full_name = r'Born(.*)\n'
matches_full_name = [match.strip() for match in re.findall(full_name, text)]

# Birth date: the text on the line after "Born", up to the literal "(age"
birth_date = r'Born.*\n(.*)\(age'
matches_birth_date = [match.strip() for match in re.findall(birth_date, text)]

# Birth place: the whole line following the one that contains "(age"
birth_place = r'\(age.*\n(.*)'
matches_birth_place = re.findall(birth_place, text)

# Print the extracted information
print('age: ', matches_age)
print('full_name: ', matches_full_name)
print('birth_date: ', matches_birth_date)
print('birth_place: ', matches_birth_place)

age:  ['50']
full_name:  ['Elon Reeve Musk']
birth_date:  ['June 28, 1971']
birth_place:  ['Pretoria, Transvaal, South Africa']


In [16]:
# Multi-line sample text with biographical details about Mukesh Ambani.
# It follows the line layout the extraction patterns rely on: the full name shares
# a line with the "Born" label, the next line holds the birth date followed by
# "(age NN)", and the line after that holds the first line of the birth place.
text = '''
Born	Mukesh Dhirubhai Ambani
19 April 1957 (age 64)
Aden, Colony of Aden
(present-day Yemen)[1][2]
Nationality	Indian
Alma mater
St. Xavier's College, Mumbai
Institute of Chemical Technology (B.E.)
Stanford University (drop-out)
Occupation	Chairman and MD, Reliance Industries
Spouse(s)	Nita Ambani ​(m. 1985)​[3]
Children	3
Parent(s)
Dhirubhai Ambani (father)
Kokilaben Ambani (mother)
Relatives	Anil Ambani (brother)
Tina Ambani (sister-in-law)
'''

In [17]:
# Extract age, full name, birth date and birth place from `text` in one cell.
# All patterns are raw strings (r'...') so that regex escapes such as \d and \(
# are handed to the `re` module unchanged instead of being parsed as invalid
# string escape sequences; the regex engine itself interprets \n as a newline.

# Define a pattern to extract the age: the digits that follow the literal text "age "
age = r'age (\d+)'
matches_age = re.findall(age, text)

# Define a pattern to extract the full name: the remainder of the "Born" line, trimmed
full_name = r'Born(.*)\n'
matches_full_name = [match.strip() for match in re.findall(full_name, text)]

# Define a pattern to extract the birth_date: the line after "Born", up to the literal "(age"
birth_date = r'Born.*\n(.*)\(age'
matches_birth_date = [match.strip() for match in re.findall(birth_date, text)]

# Define a pattern to extract the birth_place: the single line following the one
# that contains "(age" (any continuation on later lines is not captured)
birth_place = r'\(age.*\n(.*)'
matches_birth_place = re.findall(birth_place, text)

# Print the extracted information
print('age: ', matches_age)
print('full_name: ', matches_full_name)
print('birth_date: ', matches_birth_date)
print('birth_place: ', matches_birth_place)

age:  ['64']
full_name:  ['Mukesh Dhirubhai Ambani']
birth_date:  ['19 April 1957']
birth_place:  ['Aden, Colony of Aden']
