# Regular Expressions Tutorial - Pattern Extraction with Pandas

This notebook demonstrates how to use regex to extract patterns from text and store them as lists in DataFrame columns.

## Step 1: Import Libraries and Create Sample Data

In [82]:
import pandas as pd
import re

# Create a sample dataset with 8 rows
data = {
    'page_id': [1, 2, 3, 4, 5, 6, 7, 8],
    'text': [
        "Contact us at support@example.com or call 555-123-4567. Sale ends January 15, 2024!",
        "Email info@company.org for details. Our office is open Mon-Fri.",
        "Buy now for $29.99! Use code SAVE20 for 20% off. Ships free!",
        "Visit our website at https://example.com/products for more info.",
        "Posted on 2024-01-10 by @johndoe. Follow me on Twitter! #tech #AI",
        "Call 800-555-0123 or (555) 987-6543 for assistance. Available 24/7.",
        "SPECIAL OFFER: BUY TWO GET ONE FREE! Limited time only.",
        "For questions, email contact@site.com or visit https://help.site.com"
    ]
}

data_real = pd.read_csv('../data/dataset_with_assignments.csv')

df = pd.DataFrame(data_real)

print("Data size:", df.shape)
print("\nColumns:", df.columns.tolist())


page_to_write = 348
df.loc[page_to_write,'manual_label'] = 'other'
# print(df.loc[325, 'assigned_to'])
# print(df.iloc[page_to_write])

print(df[df['assigned_to']=='Jaee Oh'])

# df[df['assigned_to']=='Jaee Oh'].to_csv('../data/dataset_OH_label.csv', index=False)


data_real = pd.read_csv('../data/dataset_OH_label.csv')
print(data_real)

Data size: (6314, 15)

Columns: ['page_id', 'url', 'domain', 'tld', 'date', 'word_count', 'text_length', 'sentence_count', 'paragraph_count', 'avg_word_length', 'path_depth', 'text', 'full_text', 'assigned_to', 'manual_label']
     page_id  \
325      464   
326      465   
327      466   
328      467   
329      468   
330      469   
331      471   
332      472   
333      474   
334      475   
335      476   
336      477   
337      478   
338      480   
339      481   
340      482   
341      483   
342      484   
343      485   
344      486   
345      489   
346      490   
347      491   
348      493   
349      495   

                                                                                                                                                                                                                                                                                                                                               url  \
325           

## Step A-1: Extract Email Addresses

Pattern: `username@domain.extension`

In [40]:
def extract_emails(text):
    """
    Extract all email addresses from text
    Returns: list of email addresses
    """
    # Regex pattern for emails
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    
    # Find all matches
    emails = re.findall(pattern, text)
    
    return emails

# Apply to dataframe - creates a new column with lists
df['emails'] = df['text'].apply(extract_emails)

# Display results
print("Emails Extracted:")
print("="*80)
for idx, row in df.iterrows():
    if len(row['emails']) > 0:
        print(f"Row {row['page_id']}: {row['emails']}")
        print(f"  Text: {row['text'][:60]}...")
        print()

# Show dataframe with new column
print("\nDataFrame with emails column:")
print(df[['page_id', 'emails']])

print("Data size:", df.shape)

Emails Extracted:
Row 46: ['sales@cadotech.com']
  Text: Cadotech Solutions Pvt Ltd
Cadotech Solutions Pvt Ltd
Toggle...

Row 60: ['njlimoservicenj@gmail.com']
  Text: njlimoservice08
Einloggen »
Neuer Benutzer? Jetzt anmelden!
...

Row 74: ['info@domain.com']
  Text: Creative DNN Responsive Theme > Shortcodes > Portfolio (Alph...

Row 141: ['info@itriaindustries.com', 'info@itriaindustries.com']
  Text: Home - ITRIA INDUSTRIES
Skip to content
info@itriaindustries...

Row 150: ['info@joinerymacclesfield.co.uk']
  Text: Bespoke Joinery and Carpentry Craftsmanship Macclesfield Dav...

Row 183: ['admin@motrjim.com']
  Text: Course page | Motrjim Academy
+201114998648
admin@motrjim.co...

Row 200: ['orionautocomponents@gmail.com']
  Text: Hello world! – OAC AUTO COMPONENTS
Home
About us
INFRASTRUCT...

Row 204: ['everythingwater8663@gmail.com']
  Text: Find and Infant Swimming Instructor in Your Area
Skip to con...

Row 253: ['Info@tavonconsulting.com']
  Text: East Azarbayjan judiciary
مه

## Step A-2: Extract Phone Numbers

Patterns: `555-123-4567`, `(555) 123-4567`, `5551234567`

In [15]:
def extract_phones(text):
    """
    Extract phone numbers in various formats
    Returns: list of phone numbers
    """
    phones = []
    
    # Pattern 1: 555-123-4567
    pattern1 = r'\b\d{3}-\d{3}-\d{4}\b'
    phones.extend(re.findall(pattern1, text))
    
    # Pattern 2: (555) 123-4567
    pattern2 = r'\(\d{3}\)\s*\d{3}-\d{4}'
    phones.extend(re.findall(pattern2, text))
    
    # Pattern 3: 800-555-0123 (toll-free)
    pattern3 = r'\b8[0-9]{2}-\d{3}-\d{4}\b'
    phones.extend(re.findall(pattern3, text))

    # Pattern 4: 555.123.4567
    pattern4 = r'\b\d{3}\.\d{3}\.\d{4}\b'
    phones.extend(re.findall(pattern4, text))

    # Pattern 5: 555 123 4567
    pattern5 = r'\b\d{3}\s\d{3}\s\d{4}\b'
    phones.extend(re.findall(pattern5, text))

    # Pattern 6: 10 consecutive digits
    pattern6 = r'\b\d{10}\b'
    phones.extend(re.findall(pattern6, text))

    # Pattern 7: +1 (555) 123-4567 or +1-555-123-4567
    pattern7 = r'\+1[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
    phones.extend(re.findall(pattern7, text))

    # Pattern 8: 1-555-123-4567 or 1.555.123.4567 or 1 555 123 4567
    pattern8 = r'\b1[-.\s]\d{3}[-.\s]\d{3}[-.\s]\d{4}\b'
    phones.extend(re.findall(pattern8, text))

    # Pattern 9: (555)123-4567 or (555) 123.4567
    pattern9 = r'\(\d{3}\)\s*\d{3}[.\s]\d{4}'
    phones.extend(re.findall(pattern9, text))

    # Pattern 10: 555-123-4567 ext. 1234 or 555.123.4567 x123
    pattern10 = r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\s*(?:ext\.?|x)\s*\d{1,5}\b'
    phones.extend(re.findall(pattern10, text))

    phones = list(set(phones)) # Remove duplicates
    
    return phones

# Apply to dataframe
df['phones'] = df['text'].apply(extract_phones)

# Display results
print("Phone Numbers Extracted:")
print("="*80)
for idx, row in df.iterrows():
    if len(row['phones']) > 0:
        print(f"Row {row['page_id']}: {row['phones']}")
        print(f"  Text: {row['text'][:60]}...")
        print()

# Show dataframe with both columns
print("\nDataFrame with emails and phones:")
print(df[['page_id', 'emails', 'phones']])

Phone Numbers Extracted:
Row 1: ['555-123-4567']
  Text: Contact us at support@example.com or call 555-123-4567. Sale...

Row 6: ['800-555-0123', '(555) 987-6543']
  Text: Call 800-555-0123 or (555) 987-6543 for assistance. Availabl...


DataFrame with emails and phones:
   page_id                 emails                          phones
0        1  [support@example.com]                  [555-123-4567]
1        2     [info@company.org]                              []
2        3                     []                              []
3        4                     []                              []
4        5                     []                              []
5        6                     []  [800-555-0123, (555) 987-6543]
6        7                     []                              []
7        8     [contact@site.com]                              []


## Step A-3: Extract Dates

In [16]:
def extract_dates(text):
    dates = []
    
    # Pattern 1: MM/DD/YYYY or DD/MM/YYYY
    pattern1 = r'\b\d{1,2}/\d{1,2}/\d{2,4}\b'
    
    # Pattern 2: YYYY-MM-DD (ISO format)
    pattern2 = r'\b\d{4}-\d{2}-\d{2}\b'
    
    # Pattern 3: MM-DD-YYYY or DD-MM-YYYY
    pattern3 = r'\b\d{1,2}-\d{1,2}-\d{2,4}\b'
    
    # Pattern 4: MM.DD.YYYY or DD.MM.YYYY
    pattern4 = r'\b\d{1,2}\.\d{1,2}\.\d{2,4}\b'
    
    # Pattern 5: Month DD, YYYY (January 24, 2026)
    pattern5 = r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b'
    
    # Pattern 6: DD Month YYYY (24 January 2026)
    pattern6 = r'\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b'
    
    # Pattern 7: Mon DD, YYYY (Jan 24, 2026)
    pattern7 = r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},?\s+\d{4}\b'
    
    # Pattern 8: DD Mon YYYY (24 Jan 2026)
    pattern8 = r'\b\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{4}\b'
    
    # Pattern 9: YYYY/MM/DD
    pattern9 = r'\b\d{4}/\d{2}/\d{2}\b'
    
    # Pattern 10: Month YYYY (January 2026)
    pattern10 = r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b'
    
    # Pattern 11: MM/YYYY or MM-YYYY
    pattern11 = r'\b\d{1,2}[-/]\d{4}\b'
    
    # Pattern 12: Ordinal dates (January 24th, 2026 or 24th January 2026)
    pattern12 = r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th),?\s+\d{4}\b'
    pattern13 = r'\b\d{1,2}(?:st|nd|rd|th)\s+(?:January|February|March|April|May|June|July|August|September|October|November|December),?\s+\d{4}\b'
    
    all_patterns = [pattern1, pattern2, pattern3, pattern4, pattern5,
                    pattern6, pattern7, pattern8, pattern9, pattern10,
                    pattern11, pattern12, pattern13]
    
    for pattern in all_patterns:
        dates.extend(re.findall(pattern, text, re.IGNORECASE))
    
    # Remove duplicates while preserving order
    seen = set()
    unique_dates = []
    for date in dates:
        if date.lower() not in seen:
            seen.add(date.lower())
            unique_dates.append(date)
    
    return unique_dates

# Apply to dataframe
df['dates'] = df['text'].apply(extract_dates)

# Display results
print("Dates Extracted:")
print("="*80)
for idx, row in df.iterrows():
    if len(row['dates']) > 0:
        print(f"Row {row['page_id']}: {row['dates']}")
        print(f"  Text: {row['text'][:60]}...")
        print()

# Show dataframe with both columns
print("\nDataFrame with dates:")
print(df[['page_id', 'dates']])

Dates Extracted:
Row 1: ['January 15, 2024']
  Text: Contact us at support@example.com or call 555-123-4567. Sale...

Row 5: ['2024-01-10']
  Text: Posted on 2024-01-10 by @johndoe. Follow me on Twitter! #tec...


DataFrame with dates:
   page_id               dates
0        1  [January 15, 2024]
1        2                  []
2        3                  []
3        4                  []
4        5        [2024-01-10]
5        6                  []
6        7                  []
7        8                  []


## Step A-4: Extract Prices and Monetary Amounts

In [17]:
def extract_prices(text):
    prices = []
    
    # Pattern 1: $19.99, $1,234.56, $1234.56
    pattern1 = r'\$\s*\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
    
    # Pattern 2: €19.99, €1.234,56 (European format)
    pattern2 = r'€\s*\d{1,3}(?:\.\d{3})*(?:,\d{2})?'
    
    # Pattern 3: €19.99, €1,234.56 (Euro with US format)
    pattern3 = r'€\s*\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
    
    # Pattern 4: £19.99, £1,234.56
    pattern4 = r'£\s*\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
    
    # Pattern 5: ¥1234, ¥1,234 (Yen/Yuan - typically no decimals)
    pattern5 = r'¥\s*\d{1,3}(?:,\d{3})*'
    
    # Pattern 6: ₹1,23,456.78 (Indian Rupee format)
    pattern6 = r'₹\s*\d{1,2}(?:,\d{2})*(?:,\d{3})(?:\.\d{2})?'
    
    # Pattern 7: USD 19.99, EUR 1234.56 (currency code prefix)
    pattern7 = r'\b(?:USD|EUR|GBP|JPY|CAD|AUD|CHF|CNY|INR|KRW|BRL|MXN)\s*\d{1,3}(?:[,.\s]\d{3})*(?:[.,]\d{2})?\b'
    
    # Pattern 8: 19.99 USD, 1234.56 EUR (currency code suffix)
    pattern8 = r'\b\d{1,3}(?:[,.\s]\d{3})*(?:[.,]\d{2})?\s*(?:USD|EUR|GBP|JPY|CAD|AUD|CHF|CNY|INR|KRW|BRL|MXN)\b'
    
    # Pattern 9: Negative amounts -$19.99
    pattern9 = r'-\$\s*\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
    
    # Pattern 10: Negative amounts ($19.99) - accounting format
    pattern10 = r'\(\$\s*\d{1,3}(?:,\d{3})*(?:\.\d{2})?\)'
    
    # Pattern 11: $19, $1,234 (whole dollar amounts)
    pattern11 = r'\$\s*\d{1,3}(?:,\d{3})*\b(?!\.\d)'
    
    # Pattern 12: 19 cents, 99¢
    pattern12 = r'\b\d{1,2}\s*(?:cents?|¢)\b'
    
    # Pattern 13: $1.5M, $2.3B, $500K (abbreviated large amounts)
    pattern13 = r'\$\s*\d+(?:\.\d+)?\s*[KMBkmb]\b'
    
    # Pattern 14: $10-$20, $10 - $20 (price ranges)
    pattern14 = r'\$\s*\d{1,3}(?:,\d{3})*(?:\.\d{2})?\s*[-–—]\s*\$?\s*\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
    
    # Pattern 15: Written amounts (10 dollars, 5.99 euros)
    pattern15 = r'\b\d{1,3}(?:,\d{3})*(?:\.\d{2})?\s*(?:dollars?|euros?|pounds?|yen|yuan)\b'
    
    all_patterns = [pattern1, pattern2, pattern3, pattern4, pattern5,
                    pattern6, pattern7, pattern8, pattern9, pattern10,
                    pattern11, pattern12, pattern13, pattern14, pattern15]
    
    for pattern in all_patterns:
        prices.extend(re.findall(pattern, text, re.IGNORECASE))
    
    # Remove duplicates while preserving order
    seen = set()
    unique_prices = []
    for price in prices:
        if price.lower() not in seen:
            seen.add(price.lower())
            unique_prices.append(price)
    
    return unique_prices

# Apply to dataframe
df['prices'] = df['text'].apply(extract_prices)

# Display results
print("Prices Extracted:")
print("="*80)
for idx, row in df.iterrows():
    if len(row['prices']) > 0:
        print(f"Row {row['page_id']}: {row['prices']}")
        print(f"  Text: {row['text'][:60]}...")
        print()

# Show dataframe with both columns
print("\nDataFrame with prices:")
print(df[['page_id', 'prices']])

Prices Extracted:
Row 3: ['$29.99']
  Text: Buy now for $29.99! Use code SAVE20 for 20% off. Ships free!...


DataFrame with prices:
   page_id    prices
0        1        []
1        2        []
2        3  [$29.99]
3        4        []
4        5        []
5        6        []
6        7        []
7        8        []


## Step A-5: Extract URLs

In [18]:
def extract_urls(text):
    urls = []
    
    # Pattern 1: Full HTTP/HTTPS URLs
    pattern1 = r'https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    
    # Pattern 2: URLs starting with www (no protocol)
    pattern2 = r'\bwww\.[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    
    # Pattern 3: FTP URLs
    pattern3 = r'ftp://[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    
    # Pattern 4: File URLs
    pattern4 = r'file:///[-a-zA-Z0-9@:%._\+~#=/\\]+'
    
    # Pattern 5: IP-based URLs
    pattern5 = r'https?://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?::\d{1,5})?(?:/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?'
    
    # Pattern 6: Localhost URLs
    pattern6 = r'https?://localhost(?::\d{1,5})?(?:/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?'
    
    # Pattern 7: Domain without protocol (example.com/path)
    pattern7 = r'\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+(?:com|org|net|edu|gov|mil|io|co|ai|app|dev|info|biz|us|uk|ca|au|de|fr|jp|cn|in|br|ru)\b(?:/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?'
    
    # Pattern 8: URLs with authentication
    pattern8 = r'https?://[a-zA-Z0-9._-]+:[a-zA-Z0-9._-]+@[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    
    # Pattern 9: Mailto links
    pattern9 = r'mailto:[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    
    # Pattern 10: Tel links
    pattern10 = r'tel:[\+]?[\d\s\-()]{7,20}'
    
    all_patterns = [pattern1, pattern2, pattern3, pattern4, pattern5,
                    pattern6, pattern7, pattern8, pattern9, pattern10]
    
    for pattern in all_patterns:
        urls.extend(re.findall(pattern, text, re.IGNORECASE))
    
    # Remove duplicates while preserving order
    seen = set()
    unique_urls = []
    for url in urls:
        if url.lower() not in seen:
            seen.add(url.lower())
            unique_urls.append(url)
    
    return unique_urls

# Apply to dataframe
df['urls'] = df['text'].apply(extract_urls)

# Display results
print("URLs Extracted:")
print("="*80)
for idx, row in df.iterrows():
    if len(row['urls']) > 0:
        print(f"Row {row['page_id']}: {row['urls']}")
        print(f"  Text: {row['text'][:60]}...")
        print()

# Show dataframe with both columns
print("\nDataFrame with URLs:")
print(df[['page_id', 'urls']])

URLs Extracted:
Row 1: ['example.com']
  Text: Contact us at support@example.com or call 555-123-4567. Sale...

Row 2: ['company.org']
  Text: Email info@company.org for details. Our office is open Mon-F...

Row 4: ['https://example.com/products', 'example.com/products']
  Text: Visit our website at https://example.com/products for more i...

Row 8: ['https://help.site.com', 'site.com', 'help.site.com']
  Text: For questions, email contact@site.com or visit https://help....


DataFrame with URLs:
   page_id                                               urls
0        1                                      [example.com]
1        2                                      [company.org]
2        3                                                 []
3        4  [https://example.com/products, example.com/pro...
4        5                                                 []
5        6                                                 []
6        7                                                 []
7 

## Steap A-6: Extract Social Media Handles and Hashtags

In [19]:
def extract_social(text):
    results = {
        'handles': [],
        'hashtags': []
    }
    
    # Pattern 1: @handles
    pattern_handles = r'(?<![\w])@([a-zA-Z_][a-zA-Z0-9_.]{0,29})(?![\w])'
    
    # Pattern 2: #hashtags
    pattern_hashtags = r'(?<![\w])#([a-zA-Z_][a-zA-Z0-9_]{0,138})(?![\w])'
    
    results['handles'] = re.findall(pattern_handles, text)
    results['hashtags'] = re.findall(pattern_hashtags, text)
    
    # Remove duplicates while preserving order
    results['handles'] = list(dict.fromkeys(results['handles']))
    results['hashtags'] = list(dict.fromkeys(results['hashtags']))
    
    return results

# Apply to dataframe
df['social_handles'] = df['text'].apply(extract_social)

# Display results
print("Social Media Handles and Hashtags Extracted:")
print("="*80)
for idx, row in df.iterrows():
    if len(row['social_handles']['handles']) > 0 or len(row['social_handles']['hashtags']) > 0:
        print(f"Row {row['page_id']}: Handles: {row['social_handles']['handles']}, Hashtags: {row['social_handles']['hashtags']}")
        print(f"  Text: {row['text'][:60]}...")
        print()

# Show dataframe with both columns
print("\nDataFrame with social media handles and hashtags:")
print(df[['page_id', 'social_handles']])

Social Media Handles and Hashtags Extracted:
Row 5: Handles: ['johndoe.'], Hashtags: ['tech', 'AI']
  Text: Posted on 2024-01-10 by @johndoe. Follow me on Twitter! #tec...


DataFrame with social media handles and hashtags:
   page_id                                     social_handles
0        1                    {'handles': [], 'hashtags': []}
1        2                    {'handles': [], 'hashtags': []}
2        3                    {'handles': [], 'hashtags': []}
3        4                    {'handles': [], 'hashtags': []}
4        5  {'handles': ['johndoe.'], 'hashtags': ['tech',...
5        6                    {'handles': [], 'hashtags': []}
6        7                    {'handles': [], 'hashtags': []}
7        8                    {'handles': [], 'hashtags': []}


## Step A-7: Extract All-caps words

In [20]:
def extract_all_caps(text):
    # Pattern: 2+ uppercase letters only
    pattern = r'\b([A-Z]{2,})\b'
    
    matches = re.findall(pattern, text)
    
    # Remove duplicates while preserving order
    return list(dict.fromkeys(matches))

# Apply to dataframe
df['all_caps_words'] = df['text'].apply(extract_all_caps)
# Display results
print("All-Caps Words Extracted:")
print("="*80)
for idx, row in df.iterrows():
    if len(row['all_caps_words']) > 0:
        print(f"Row {row['page_id']}: {row['all_caps_words']}")
        print(f"  Text: {row['text'][:60]}...")

All-Caps Words Extracted:
Row 5: ['AI']
  Text: Posted on 2024-01-10 by @johndoe. Follow me on Twitter! #tec...
Row 7: ['SPECIAL', 'OFFER', 'BUY', 'TWO', 'GET', 'ONE', 'FREE']
  Text: SPECIAL OFFER: BUY TWO GET ONE FREE! Limited time only....


## Step A-8: Extract Question Marks

In [21]:
def count_question_marks(text):
    # Pattern: Single question mark
    pattern = r'\?'
    
    matches = re.findall(pattern, text)
    
    return len(matches)

# Apply to dataframe
df['question_count'] = df['text'].apply(count_question_marks)
# Display results
print("Question Mark Counts:")
print("="*80)
for idx, row in df.iterrows():
    if row['question_count'] > 0:
        print(f"Row {row['page_id']}: {row['question_count']} question marks")
        print(f"  Text: {row['text'][:60]}...")

Question Mark Counts:


## Step B 9-16: Classification

In [23]:
def classify_webpage(text, url=""):
    """
    Classify webpage using only regex patterns.
    
    Categories:
    - news
    - blog
    - e-commerce
    - forum/discussion
    - educational
    - technical/documentation
    - government
    - other
    """
    
    text_lower = text.lower()
    url_lower = url.lower()
    
    scores = {
        'news': 0,
        'blog': 0,
        'e-commerce': 0,
        'forum/discussion': 0,
        'educational': 0,
        'technical/documentation': 0,
        'government': 0
    }
    
    # ========== URL PATTERNS ==========
    
    url_patterns = {
        'news': r'(news\.|/news/|\.news|headline|breaking|reuters|bbc|cnn|nytimes|guardian|washingtonpost|foxnews|nbcnews)',
        'blog': r'(blog\.|/blog/|\.blog|medium\.com|wordpress|blogger|substack|tumblr|ghost\.io)',
        'e-commerce': r'(shop\.|/shop/|/product/|/cart/|/checkout/|amazon|ebay|etsy|shopify|store\.|/buy/|/order/|\.store|walmart|alibaba)',
        'forum/discussion': r'(forum\.|/forum/|/thread/|/discussion/|reddit\.com|quora\.com|stackoverflow|discord|/community/|/topics/|discuss\.|boards\.)',
        'educational': r'(\.edu|/learn/|/course/|/tutorial/|coursera|udemy|edx|khanacademy|university|college|school|academy|/lesson/)',
        'technical/documentation': r'(/docs/|/documentation/|/api/|/reference/|github\.com|gitlab|readthedocs|swagger|/sdk/|/guide/|developer\.|devdocs)',
        'government': r'(\.gov|\.mil|government|/agency/|whitehouse|congress|senate|parliament|ministry|federal)'
    }
    
    for category, pattern in url_patterns.items():
        if re.search(pattern, url_lower):
            scores[category] += 5
    
    # ========== CONTENT PATTERNS ==========
    
    content_patterns = {
        'news': [
            (r'\b(breaking\s*news|headline|reporter|journalist)\b', 2),
            (r'\b(press\s*release|news\s*update|latest\s*news)\b', 2),
            (r'\b(according\s*to\s*sources|correspondent|exclusive)\b', 2),
            (r'\b(developing\s*story|associated\s*press|reuters|afp)\b', 2),
            (r'\b(reported|reporting|news\s*desk)\b', 1),
            (r'(breaking:|updated:|exclusive:)', 3),
            (r'\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{1,2},?\s+\d{4}\b', 2)
        ],
        'blog': [
            (r'\b(posted\s*by|written\s*by|author:)\b', 2),
            (r'\b(about\s*the\s*author|read\s*more|continue\s*reading)\b', 2),
            (r'\b(blog\s*post|my\s*thoughts|in\s*this\s*post)\b', 3),
            (r'\b(subscribe|newsletter|personal\s*blog)\b', 2),
            (r'\b(i\s*think|i\s*believe|my\s*experience|my\s*journey)\b', 2),
            (r'\b(leave\s*a\s*comment|share\s*this\s*post|comments\s*section)\b', 2),
            (r'\b(opinion|reflections|musings)\b', 1)
        ],
        'e-commerce': [
            (r'\b(add\s*to\s*cart|buy\s*now|shop\s*now|order\s*now)\b', 3),
            (r'\b(price:|checkout|shipping|free\s*delivery)\b', 2),
            (r'\b(in\s*stock|out\s*of\s*stock|quantity|inventory)\b', 2),
            (r'\b(product\s*description|customer\s*reviews|ratings)\b', 2),
            (r'\b(discount|sale|coupon|promo\s*code)\b', 2),
            (r'\b(wishlist|compare|sku:|upc:)\b', 2),
            (r'[\$€£¥]\s*\d+[.,]?\d*', 1),
            (r'★{1,5}|☆{1,5}|(\d+(\.\d+)?\s*stars)', 2)
        ],
        'forum/discussion': [
            (r'\b(reply|replies|posted:|thread|topic)\b', 2),
            (r'\b(member\s*since|posts:|joined:|reputation)\b', 3),
            (r'\b(quote|quoted|forum\s*rules|moderator|admin)\b', 2),
            (r'\b(sticky|pinned|upvote|downvote|karma)\b', 2),
            (r'\b(solved|answered|discussion|community)\b', 1),
            (r'\b(members\s*online|active\s*users|views:)\b', 2),
            (r'(reply\s*#\d+|post\s*#\d+)', 3),
            (r'\b\d+\s*(replies|posts|comments|views)\b', 2)
        ],
        'educational': [
            (r'\b(lesson|course|curriculum|syllabus|assignment)\b', 2),
            (r'\b(quiz|exam|test|grade|student|teacher)\b', 2),
            (r'\b(professor|lecture|learning\s*objectives|module)\b', 2),
            (r'\b(enroll|certificate|diploma|degree|academic)\b', 2),
            (r'\b(semester|tuition|scholarship|campus)\b', 2),
            (r'\b(learning|education|study|classroom)\b', 1),
            (r'(chapter\s*\d+|module\s*\d+|lesson\s*\d+|unit\s*\d+)', 3)
        ],
        'technical/documentation': [
            (r'\b(api|sdk|documentation|parameters|returns)\b', 2),
            (r'\b(example:|syntax|function|method|class)\b', 2),
            (r'\b(installation|requirements|dependencies)\b', 2),
            (r'\b(npm\s*install|pip\s*install|git\s*clone)\b', 3),
            (r'\b(import|code\s*example|endpoint)\b', 2),
            (r'\b(request|response|json|xml|deprecated)\b', 1),
            (r'\b(version:|changelog|release\s*notes)\b', 2),
            (r'(```|<code>|<pre>)', 2),
            (r'\b(def\s+\w+|function\s+\w+|class\s+\w+)\b', 2)
        ],
        'government': [
            (r'\b(official|federal|state|municipal|county)\b', 2),
            (r'\b(regulation|legislation|statute|law|policy)\b', 2),
            (r'\b(agency|department|bureau|commission|council)\b', 2),
            (r'\b(public\s*notice|government|citizen)\b', 2),
            (r'\b(tax|permit|license|compliance|jurisdiction)\b', 2),
            (r'\b(authority|ordinance|amendment|act\s*of)\b', 2),
            (r'(\d+\s*u\.?s\.?c\.?\s*§?\s*\d+)', 3),
            (r'\b(effective\s*date|pursuant\s*to|hereby)\b', 2)
        ]
    }
    
    for category, patterns in content_patterns.items():
        for pattern, weight in patterns:
            matches = re.findall(pattern, text_lower)
            scores[category] += len(matches) * weight
    
    # ========== STRUCTURAL PATTERNS ==========
    
    # Multiple prices = likely e-commerce
    price_count = len(re.findall(r'[\$€£¥]\s*\d+[.,]?\d*', text))
    if price_count >= 3:
        scores['e-commerce'] += price_count * 2
    
    # Multiple questions = likely forum or educational
    question_count = len(re.findall(r'\?', text))
    if question_count >= 5:
        scores['forum/discussion'] += question_count
        scores['educational'] += question_count // 2
    
    # Multiple code blocks = likely technical
    code_count = len(re.findall(r'```|<code>|<pre>', text_lower))
    if code_count >= 2:
        scores['technical/documentation'] += code_count * 3
    
    # Multiple reply patterns = likely forum
    reply_count = len(re.findall(r'(reply\s*#?\d*|post\s*#?\d*|@\w+\s*said)', text_lower))
    if reply_count >= 2:
        scores['forum/discussion'] += reply_count * 2
    
    # ========== DETERMINE WINNER ==========
    
    max_score = max(scores.values())
    
    if max_score == 0:
        return {
            'category': 'other',
            'confidence': 0,
            'scores': scores
        }
    
    winner = max(scores, key=scores.get)
    
    total_score = sum(scores.values())
    confidence = round((max_score / total_score) * 100, 2) if total_score > 0 else 0
    
    return {
        'category': winner,
        'confidence': confidence,
        'scores': scores
    }

# Apply to dataframe
df['category'] = df.apply(lambda row: classify_webpage(row['text'], url=""), axis=1)
# Display results
print("Webpage Classification:")
print("="*80)
for idx, row in df.iterrows():
    category = row['category']
    print(f"Row {row['page_id']}: Category: {category['category']}, Confidence: {category['confidence']}%")
    print(f"  Scores: {category['scores']}")
    print(f"  Text: {row['text'][:60]}...")
    print()
# Show dataframe with classification column
print("\nDataFrame with classification:")
print(df[['page_id', 'category']])

Webpage Classification:
Row 1: Category: e-commerce, Confidence: 100.0%
  Scores: {'news': 0, 'blog': 0, 'e-commerce': 2, 'forum/discussion': 0, 'educational': 0, 'technical/documentation': 0, 'government': 0}
  Text: Contact us at support@example.com or call 555-123-4567. Sale...

Row 2: Category: other, Confidence: 0%
  Scores: {'news': 0, 'blog': 0, 'e-commerce': 0, 'forum/discussion': 0, 'educational': 0, 'technical/documentation': 0, 'government': 0}
  Text: Email info@company.org for details. Our office is open Mon-F...

Row 3: Category: e-commerce, Confidence: 100.0%
  Scores: {'news': 0, 'blog': 0, 'e-commerce': 4, 'forum/discussion': 0, 'educational': 0, 'technical/documentation': 0, 'government': 0}
  Text: Buy now for $29.99! Use code SAVE20 for 20% off. Ships free!...

Row 4: Category: other, Confidence: 0%
  Scores: {'news': 0, 'blog': 0, 'e-commerce': 0, 'forum/discussion': 0, 'educational': 0, 'technical/documentation': 0, 'government': 0}
  Text: Visit our website at ht

## Deliverable

In [25]:
list_columns = ['page_id', 'emails', 'phones', 'dates', 
                'prices', 'urls', 'social_handles', 'all_caps_words', 'question_count', 'category',
                ]

for col in list_columns:
    df[col] = df[col].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

df.to_csv('../data/dataset_with_extractions.csv', index=False)


## Step 4: View Complete DataFrame

Now we have two new columns with lists of extracted patterns

In [16]:
# Display the full dataframe
print("Complete DataFrame:")
print("="*80)
print(df[['page_id', 'text', 'emails', 'phones']])

# Display with better formatting
print("\n\nDetailed View:")
print("="*80)
for idx, row in df.iterrows():
    print(f"\nPage ID: {row['page_id']}")
    print(f"Text: {row['text']}")
    print(f"Emails found: {row['emails']}")
    print(f"Phones found: {row['phones']}")
    print("-"*80)

Complete DataFrame:
      page_id                                               text emails phones
0           1  Best Resume Templates 2024 | Ready to Download...     []     []
1           4  SH/SZ-HK Stock Connect Quick Quote\n繁\n简\nMark...     []     []
2           6  A.D.A. Amiga Demoscene Archive\nA.D.A. Amiga D...     []     []
3           7  Time, Mortality And Memory - blog from trends\...     []     []
4           8  AFR News: JurisDictionary- How Your Constituti...     []     []
...       ...                                                ...    ...    ...
6309     8259  April 2022 Newsletter – Zettel Family Farms\nS...     []     []
6310     8261  PONDS BRIGHT BEAUTY SPOT-LESS GLOW FACEWASH 50...     []     []
6311     8262  Understanding Bitcoin And Its Market Trends Ho...     []     []
6312     8264  Woman and Dog Wood Figurine | Z Man's Wood Art...     []     []
6313     8268  People followed by GoodVans Roofing\nLog in\nS...     []     []

[6314 rows x 4 columns]


Detai

## Step 5: Additional Pattern Examples

Let's add a few more useful patterns

In [5]:
# Extract dates
def extract_dates(text):
    dates = []
    # ISO format: 2024-01-15
    dates.extend(re.findall(r'\b\d{4}-\d{2}-\d{2}\b', text))
    # Written: January 15, 2024
    dates.extend(re.findall(
        r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
        text
    ))
    return dates

# Extract prices
def extract_prices(text):
    prices = []
    # Dollar amounts: $29.99
    prices.extend(re.findall(r'\$\d+(?:\.\d{2})?', text))
    # Percentages: 20% off
    prices.extend(re.findall(r'\d+%\s*off', text, re.IGNORECASE))
    return prices

# Extract URLs
def extract_urls(text):
    return re.findall(r'https?://[^\s]+', text)

# Extract social media handles
def extract_social(text):
    social = []
    # @mentions
    social.extend(re.findall(r'@[A-Za-z0-9_]+', text))
    # #hashtags
    social.extend(re.findall(r'#[A-Za-z0-9_]+', text))
    return social

# Extract all-caps words
def extract_all_caps(text):
    return re.findall(r'\b[A-Z]{2,}\b', text)

# Apply all extractors
df['dates'] = df['text'].apply(extract_dates)
df['prices'] = df['text'].apply(extract_prices)
df['urls'] = df['text'].apply(extract_urls)
df['social'] = df['text'].apply(extract_social)
df['all_caps'] = df['text'].apply(extract_all_caps)

print("All Patterns Extracted!")
print("\nColumn names:")
print(df.columns.tolist())

All Patterns Extracted!

Column names:
['page_id', 'text', 'emails', 'phones', 'dates', 'prices', 'urls', 'social', 'all_caps']


## Step 6: View Final DataFrame

In [7]:
# Select columns to display
display_df = df[['page_id', 'text', 'emails', 'phones', 'dates', 'prices']]

print("Final DataFrame with Extracted Patterns:")
print("="*80)
print(display_df)

# Show each row in detail
print("\n\nDetailed View of Each Row:")
print("="*80)

for idx, row in df.iterrows():
    print(f"\n📄 Page {row['page_id']}")
    print(f"Text: {row['text']}")
    print(f"  └─ Emails: {row['emails']}")
    print(f"  └─ Phones: {row['phones']}")
    print(f"  └─ Dates: {row['dates']}")
    print(f"  └─ Prices: {row['prices']}")
    print(f"  └─ URLs: {row['urls']}")
    print(f"  └─ Social: {row['social']}")
    print(f"  └─ All-caps: {row['all_caps']}")

Final DataFrame with Extracted Patterns:
   page_id                                               text  \
0        1  Contact us at support@example.com or call 555-...   
1        2  Email info@company.org for details. Our office...   
2        3  Buy now for $29.99! Use code SAVE20 for 20% of...   
3        4  Visit our website at https://example.com/produ...   
4        5  Posted on 2024-01-10 by @johndoe. Follow me on...   
5        6  Call 800-555-0123 or (555) 987-6543 for assist...   
6        7  SPECIAL OFFER: BUY TWO GET ONE FREE! Limited t...   
7        8  For questions, email contact@site.com or visit...   

                  emails                                        phones  \
0  [support@example.com]                                [555-123-4567]   
1     [info@company.org]                                            []   
2                     []                                            []   
3                     []                                            []   
4  

## Step 7: Save to CSV

In [9]:
# Save the dataframe with extracted patterns
df.to_csv('sample_with_patterns.csv', index=False)

print("✓ Saved to: sample_with_patterns.csv")
print("\nNote: When saved to CSV, the lists are stored as strings")
print("Example: ['email@test.com', 'info@site.com']")

✓ Saved to: sample_with_patterns.csv

Note: When saved to CSV, the lists are stored as strings
Example: ['email@test.com', 'info@site.com']
