# Regular Expressions Tutorial - Pattern Extraction with Pandas

This notebook demonstrates how to use regex to extract patterns from text and store them as lists in DataFrame columns.

## Step 1: Import Libraries and Create Sample Data

In [None]:
import pandas as pd
import re

# Create a sample dataset with 8 rows
data = {
    'page_id': [1, 2, 3, 4, 5, 6, 7, 8],
    'text': [
        "Contact us at support@example.com or call 555-123-4567. Sale ends January 15, 2024!",
        "Email info@company.org for details. Our office is open Mon-Fri.",
        "Buy now for $29.99! Use code SAVE20 for 20% off. Ships free!",
        "Visit our website at https://example.com/products for more info.",
        "Posted on 2024-01-10 by @johndoe. Follow me on Twitter! #tech #AI",
        "Call 800-555-0123 or (555) 987-6543 for assistance. Available 24/7.",
        "SPECIAL OFFER: BUY TWO GET ONE FREE! Limited time only.",
        "For questions, email contact@site.com or visit https://help.site.com"
    ]
}

df = pd.DataFrame(data)

print("Sample Dataset:")
print(df)

: 

## Step 2: Extract Email Addresses

Pattern: `username@domain.extension`

In [2]:
def extract_emails(text):
    """
    Extract all email addresses from text
    Returns: list of email addresses
    """
    # Regex pattern for emails
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    
    # Find all matches
    emails = re.findall(pattern, text)
    
    return emails

# Apply to dataframe - creates a new column with lists
df['emails'] = df['text'].apply(extract_emails)

# Display results
print("Emails Extracted:")
print("="*80)
for idx, row in df.iterrows():
    if len(row['emails']) > 0:
        print(f"Row {row['page_id']}: {row['emails']}")
        print(f"  Text: {row['text'][:60]}...")
        print()

# Show dataframe with new column
print("\nDataFrame with emails column:")
print(df[['page_id', 'emails']])

Emails Extracted:
Row 1: ['support@example.com']
  Text: Contact us at support@example.com or call 555-123-4567. Sale...

Row 2: ['info@company.org']
  Text: Email info@company.org for details. Our office is open Mon-F...

Row 8: ['contact@site.com']
  Text: For questions, email contact@site.com or visit https://help....


DataFrame with emails column:
   page_id                 emails
0        1  [support@example.com]
1        2     [info@company.org]
2        3                     []
3        4                     []
4        5                     []
5        6                     []
6        7                     []
7        8     [contact@site.com]


## Step 3: Extract Phone Numbers

Patterns: `555-123-4567`, `(555) 123-4567`, `5551234567`

In [3]:
def extract_phones(text):
    """
    Extract phone numbers in various formats
    Returns: list of phone numbers
    """
    phones = []
    
    # Pattern 1: 555-123-4567
    pattern1 = r'\b\d{3}-\d{3}-\d{4}\b'
    phones.extend(re.findall(pattern1, text))
    
    # Pattern 2: (555) 123-4567
    pattern2 = r'\(\d{3}\)\s*\d{3}-\d{4}'
    phones.extend(re.findall(pattern2, text))
    
    # Pattern 3: 800-555-0123 (toll-free)
    pattern3 = r'\b[8-9]00-\d{3}-\d{4}\b'
    phones.extend(re.findall(pattern3, text))
    
    return phones

# Apply to dataframe
df['phones'] = df['text'].apply(extract_phones)

# Display results
print("Phone Numbers Extracted:")
print("="*80)
for idx, row in df.iterrows():
    if len(row['phones']) > 0:
        print(f"Row {row['page_id']}: {row['phones']}")
        print(f"  Text: {row['text'][:60]}...")
        print()

# Show dataframe with both columns
print("\nDataFrame with emails and phones:")
print(df[['page_id', 'emails', 'phones']])

Phone Numbers Extracted:
Row 1: ['555-123-4567']
  Text: Contact us at support@example.com or call 555-123-4567. Sale...

Row 6: ['800-555-0123', '(555) 987-6543', '800-555-0123']
  Text: Call 800-555-0123 or (555) 987-6543 for assistance. Availabl...


DataFrame with emails and phones:
   page_id                 emails  \
0        1  [support@example.com]   
1        2     [info@company.org]   
2        3                     []   
3        4                     []   
4        5                     []   
5        6                     []   
6        7                     []   
7        8     [contact@site.com]   

                                         phones  
0                                [555-123-4567]  
1                                            []  
2                                            []  
3                                            []  
4                                            []  
5  [800-555-0123, (555) 987-6543, 800-555-0123]  
6                           

## Step 4: View Complete DataFrame

Now we have two new columns with lists of extracted patterns

In [4]:
# Display the full dataframe
print("Complete DataFrame:")
print("="*80)
print(df[['page_id', 'text', 'emails', 'phones']])

# Display with better formatting
print("\n\nDetailed View:")
print("="*80)
for idx, row in df.iterrows():
    print(f"\nPage ID: {row['page_id']}")
    print(f"Text: {row['text']}")
    print(f"Emails found: {row['emails']}")
    print(f"Phones found: {row['phones']}")
    print("-"*80)

Complete DataFrame:
   page_id                                               text  \
0        1  Contact us at support@example.com or call 555-...   
1        2  Email info@company.org for details. Our office...   
2        3  Buy now for $29.99! Use code SAVE20 for 20% of...   
3        4  Visit our website at https://example.com/produ...   
4        5  Posted on 2024-01-10 by @johndoe. Follow me on...   
5        6  Call 800-555-0123 or (555) 987-6543 for assist...   
6        7  SPECIAL OFFER: BUY TWO GET ONE FREE! Limited t...   
7        8  For questions, email contact@site.com or visit...   

                  emails                                        phones  
0  [support@example.com]                                [555-123-4567]  
1     [info@company.org]                                            []  
2                     []                                            []  
3                     []                                            []  
4                     []     

## Step 5: Additional Pattern Examples

Let's add a few more useful patterns

In [5]:
# Extract dates
def extract_dates(text):
    dates = []
    # ISO format: 2024-01-15
    dates.extend(re.findall(r'\b\d{4}-\d{2}-\d{2}\b', text))
    # Written: January 15, 2024
    dates.extend(re.findall(
        r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
        text
    ))
    return dates

# Extract prices
def extract_prices(text):
    prices = []
    # Dollar amounts: $29.99
    prices.extend(re.findall(r'\$\d+(?:\.\d{2})?', text))
    # Percentages: 20% off
    prices.extend(re.findall(r'\d+%\s*off', text, re.IGNORECASE))
    return prices

# Extract URLs
def extract_urls(text):
    return re.findall(r'https?://[^\s]+', text)

# Extract social media handles
def extract_social(text):
    social = []
    # @mentions
    social.extend(re.findall(r'@[A-Za-z0-9_]+', text))
    # #hashtags
    social.extend(re.findall(r'#[A-Za-z0-9_]+', text))
    return social

# Extract all-caps words
def extract_all_caps(text):
    return re.findall(r'\b[A-Z]{2,}\b', text)

# Apply all extractors
df['dates'] = df['text'].apply(extract_dates)
df['prices'] = df['text'].apply(extract_prices)
df['urls'] = df['text'].apply(extract_urls)
df['social'] = df['text'].apply(extract_social)
df['all_caps'] = df['text'].apply(extract_all_caps)

print("All Patterns Extracted!")
print("\nColumn names:")
print(df.columns.tolist())

All Patterns Extracted!

Column names:
['page_id', 'text', 'emails', 'phones', 'dates', 'prices', 'urls', 'social', 'all_caps']


## Step 6: View Final DataFrame

In [7]:
# Select columns to display
display_df = df[['page_id', 'text', 'emails', 'phones', 'dates', 'prices']]

print("Final DataFrame with Extracted Patterns:")
print("="*80)
print(display_df)

# Show each row in detail
print("\n\nDetailed View of Each Row:")
print("="*80)

for idx, row in df.iterrows():
    print(f"\nðŸ“„ Page {row['page_id']}")
    print(f"Text: {row['text']}")
    print(f"  â””â”€ Emails: {row['emails']}")
    print(f"  â””â”€ Phones: {row['phones']}")
    print(f"  â””â”€ Dates: {row['dates']}")
    print(f"  â””â”€ Prices: {row['prices']}")
    print(f"  â””â”€ URLs: {row['urls']}")
    print(f"  â””â”€ Social: {row['social']}")
    print(f"  â””â”€ All-caps: {row['all_caps']}")

Final DataFrame with Extracted Patterns:
   page_id                                               text  \
0        1  Contact us at support@example.com or call 555-...   
1        2  Email info@company.org for details. Our office...   
2        3  Buy now for $29.99! Use code SAVE20 for 20% of...   
3        4  Visit our website at https://example.com/produ...   
4        5  Posted on 2024-01-10 by @johndoe. Follow me on...   
5        6  Call 800-555-0123 or (555) 987-6543 for assist...   
6        7  SPECIAL OFFER: BUY TWO GET ONE FREE! Limited t...   
7        8  For questions, email contact@site.com or visit...   

                  emails                                        phones  \
0  [support@example.com]                                [555-123-4567]   
1     [info@company.org]                                            []   
2                     []                                            []   
3                     []                                            []   
4  

## Step 7: Save to CSV

In [9]:
# Save the dataframe with extracted patterns
df.to_csv('sample_with_patterns.csv', index=False)

print("âœ“ Saved to: sample_with_patterns.csv")
print("\nNote: When saved to CSV, the lists are stored as strings")
print("Example: ['email@test.com', 'info@site.com']")

âœ“ Saved to: sample_with_patterns.csv

Note: When saved to CSV, the lists are stored as strings
Example: ['email@test.com', 'info@site.com']
