# Regular Expressions Tutorial - Pattern Extraction with Pandas

This notebook demonstrates how to use regex to extract patterns from text and store them as lists in DataFrame columns.

In [57]:
import pandas as pd
import re

# Create a sample dataset with 8 rows
data = {
    'page_id': [1, 2, 3, 4, 5, 6, 7, 8],
    'text': [
        "Contact us at support@example.com or call 555-123-4567. Sale ends January 15, 2024!",
        "Email info@company.org for details. Our office is open Mon-Fri.",
        "Buy now for $29.99! Use code SAVE20 for 20% off. Ships free!",
        "Visit our website at https://example.com/products for more info.",
        "Posted on 2024-01-10 by @johndoe. Follow me on Twitter! #tech #AI",
        "Call 800-555-0123 or (555) 987-6543 for assistance. Available 24/7.",
        "SPECIAL OFFER: BUY TWO GET ONE FREE! Limited time only.",
        "For questions, email contact@site.com or visit https://help.site.com"
    ]
}

df = pd.DataFrame(data)

print("Sample Dataset:")
print(df)

Sample Dataset:
   page_id                                               text
0        1  Contact us at support@example.com or call 555-...
1        2  Email info@company.org for details. Our office...
2        3  Buy now for $29.99! Use code SAVE20 for 20% of...
3        4  Visit our website at https://example.com/produ...
4        5  Posted on 2024-01-10 by @johndoe. Follow me on...
5        6  Call 800-555-0123 or (555) 987-6543 for assist...
6        7  SPECIAL OFFER: BUY TWO GET ONE FREE! Limited t...
7        8  For questions, email contact@site.com or visit...


## Step 2: Extract Email Addresses

Pattern: `username@domain.extension`

In [58]:
def extract_emails(text):
    """
    Extract all email addresses from text
    Returns: list of email addresses
    """
    # Regex pattern for emails
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    
    # Find all matches
    emails = re.findall(pattern, text)
    
    return emails

# Apply to dataframe - creates a new column with lists
df['emails'] = df['text'].apply(extract_emails)

# Display results
print("Emails Extracted:")
print("="*80)
for idx, row in df.iterrows():
    if len(row['emails']) > 0:
        print(f"Row {row['page_id']}: {row['emails']}")
        print(f"  Text: {row['text'][:60]}...")
        print()

# Show dataframe with new column
print("\nDataFrame with emails column:")
print(df[['page_id', 'emails']])

Emails Extracted:
Row 1: ['support@example.com']
  Text: Contact us at support@example.com or call 555-123-4567. Sale...

Row 2: ['info@company.org']
  Text: Email info@company.org for details. Our office is open Mon-F...

Row 8: ['contact@site.com']
  Text: For questions, email contact@site.com or visit https://help....


DataFrame with emails column:
   page_id                 emails
0        1  [support@example.com]
1        2     [info@company.org]
2        3                     []
3        4                     []
4        5                     []
5        6                     []
6        7                     []
7        8     [contact@site.com]


## Step 3: Extract Phone Numbers

Patterns: `555-123-4567`, `(555) 123-4567`, `5551234567`

In [59]:
def extract_phones(text):
    """
    Extract phone numbers in various formats
    Returns: list of phone numbers
    """
    phones = []
    
    # Pattern 1: 555-123-4567
    pattern1 = r'\b\d{3}-\d{3}-\d{4}\b'
    phones.extend(re.findall(pattern1, text))
    
    # Pattern 2: (555) 123-4567
    pattern2 = r'\(\d{3}\)\s*\d{3}-\d{4}'
    phones.extend(re.findall(pattern2, text))
    
    # Pattern 3: 800-555-0123 (toll-free)
    pattern3 = r'\b[8-9]00-\d{3}-\d{4}\b'
    phones.extend(re.findall(pattern3, text))
    
    return phones

# Apply to dataframe
df['phones'] = df['text'].apply(extract_phones)

# Display results
print("Phone Numbers Extracted:")
print("="*80)
for idx, row in df.iterrows():
    if len(row['phones']) > 0:
        print(f"Row {row['page_id']}: {row['phones']}")
        print(f"  Text: {row['text'][:60]}...")
        print()

# Show dataframe with both columns
print("\nDataFrame with emails and phones:")
print(df[['page_id', 'emails', 'phones']])

Phone Numbers Extracted:
Row 1: ['555-123-4567']
  Text: Contact us at support@example.com or call 555-123-4567. Sale...

Row 6: ['800-555-0123', '(555) 987-6543', '800-555-0123']
  Text: Call 800-555-0123 or (555) 987-6543 for assistance. Availabl...


DataFrame with emails and phones:
   page_id                 emails  \
0        1  [support@example.com]   
1        2     [info@company.org]   
2        3                     []   
3        4                     []   
4        5                     []   
5        6                     []   
6        7                     []   
7        8     [contact@site.com]   

                                         phones  
0                                [555-123-4567]  
1                                            []  
2                                            []  
3                                            []  
4                                            []  
5  [800-555-0123, (555) 987-6543, 800-555-0123]  
6                           

## Step 4: View Complete DataFrame

Now we have two new columns with lists of extracted patterns

In [60]:
# Display the full dataframe
print("Complete DataFrame:")
print("="*80)
print(df[['page_id', 'text', 'emails', 'phones']])

# Display with better formatting
print("\n\nDetailed View:")
print("="*80)
for idx, row in df.iterrows():
    print(f"\nPage ID: {row['page_id']}")
    print(f"Text: {row['text']}")
    print(f"Emails found: {row['emails']}")
    print(f"Phones found: {row['phones']}")
    print("-"*80)

Complete DataFrame:
   page_id                                               text  \
0        1  Contact us at support@example.com or call 555-...   
1        2  Email info@company.org for details. Our office...   
2        3  Buy now for $29.99! Use code SAVE20 for 20% of...   
3        4  Visit our website at https://example.com/produ...   
4        5  Posted on 2024-01-10 by @johndoe. Follow me on...   
5        6  Call 800-555-0123 or (555) 987-6543 for assist...   
6        7  SPECIAL OFFER: BUY TWO GET ONE FREE! Limited t...   
7        8  For questions, email contact@site.com or visit...   

                  emails                                        phones  
0  [support@example.com]                                [555-123-4567]  
1     [info@company.org]                                            []  
2                     []                                            []  
3                     []                                            []  
4                     []     

## Step 5: Additional Pattern Examples

Let's add a few more useful patterns

In [61]:
# Extract dates
def extract_dates(text):
    dates = []
    # ISO format: 2024-01-15
    dates.extend(re.findall(r'\b\d{4}-\d{2}-\d{2}\b', text))
    # Written: January 15, 2024
    dates.extend(re.findall(
        r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
        text
    ))
    return dates

# Extract prices
def extract_prices(text):
    prices = []
    # Dollar amounts: $29.99
    prices.extend(re.findall(r'\$\d+(?:\.\d{2})?', text))
    # Percentages: 20% off
    prices.extend(re.findall(r'\d+%\s*off', text, re.IGNORECASE))
    return prices

# Extract URLs
def extract_urls(text):
    return re.findall(r'https?://[^\s]+', text)

# Extract social media handles
def extract_social(text):
    social = []
    # @mentions
    social.extend(re.findall(r'@[A-Za-z0-9_]+', text))
    # #hashtags
    social.extend(re.findall(r'#[A-Za-z0-9_]+', text))
    return social

# Extract all-caps words
def extract_all_caps(text):
    return re.findall(r'\b[A-Z]{2,}\b', text)

# Apply all extractors
df['dates'] = df['text'].apply(extract_dates)
df['prices'] = df['text'].apply(extract_prices)
df['urls'] = df['text'].apply(extract_urls)
df['social'] = df['text'].apply(extract_social)
df['all_caps'] = df['text'].apply(extract_all_caps)

print("All Patterns Extracted!")
print("\nColumn names:")
print(df.columns.tolist())

All Patterns Extracted!

Column names:
['page_id', 'text', 'emails', 'phones', 'dates', 'prices', 'urls', 'social', 'all_caps']


## Step 6: View Final DataFrame

In [62]:
# Select columns to display
display_df = df[['page_id', 'text', 'emails', 'phones', 'dates', 'prices']]

print("Final DataFrame with Extracted Patterns:")
print("="*80)
print(display_df)

# Show each row in detail
print("\n\nDetailed View of Each Row:")
print("="*80)

for idx, row in df.iterrows():
    print(f"\n📄 Page {row['page_id']}")
    print(f"Text: {row['text']}")
    print(f"  └─ Emails: {row['emails']}")
    print(f"  └─ Phones: {row['phones']}")
    print(f"  └─ Dates: {row['dates']}")
    print(f"  └─ Prices: {row['prices']}")
    print(f"  └─ URLs: {row['urls']}")
    print(f"  └─ Social: {row['social']}")
    print(f"  └─ All-caps: {row['all_caps']}")

Final DataFrame with Extracted Patterns:
   page_id                                               text  \
0        1  Contact us at support@example.com or call 555-...   
1        2  Email info@company.org for details. Our office...   
2        3  Buy now for $29.99! Use code SAVE20 for 20% of...   
3        4  Visit our website at https://example.com/produ...   
4        5  Posted on 2024-01-10 by @johndoe. Follow me on...   
5        6  Call 800-555-0123 or (555) 987-6543 for assist...   
6        7  SPECIAL OFFER: BUY TWO GET ONE FREE! Limited t...   
7        8  For questions, email contact@site.com or visit...   

                  emails                                        phones  \
0  [support@example.com]                                [555-123-4567]   
1     [info@company.org]                                            []   
2                     []                                            []   
3                     []                                            []   
4  

## Step 7: Save to CSV

In [63]:
# Save the dataframe with extracted patterns
df.to_csv('sample_with_patterns.csv', index=False)

print("✓ Saved to: sample_with_patterns.csv")
print("\nNote: When saved to CSV, the lists are stored as strings")
print("Example: ['email@test.com', 'info@site.com']")

✓ Saved to: sample_with_patterns.csv

Note: When saved to CSV, the lists are stored as strings
Example: ['email@test.com', 'info@site.com']


In [64]:
# read data 
large_data = pd.read_csv('data/dataset_with_assignments.csv')

# create dataframe
df_large = pd.DataFrame(large_data)

In [65]:
# ensuring data is correct
df_large.head(5)

Unnamed: 0,page_id,url,domain,tld,date,word_count,text_length,sentence_count,paragraph_count,avg_word_length,path_depth,text,full_text,assigned_to,manual_label
0,1,http://0769sme.org/index-16.html,0769sme.org,org,2025-12-04T20:51:02Z,2377,14900,150,1,6.27,1,Best Resume Templates 2024 | Ready to Download...,Best Resume Templates 2024 | Ready to Download...,Gaurav Advani,
1,4,http://aastocks.com/en/cnhk/quote/quick-quote....,aastocks.com,com,2025-12-04T21:23:33Z,1738,12165,73,1,7.0,4,SH/SZ-HK Stock Connect Quick Quote\n繁\n简\nMark...,SH/SZ-HK Stock Connect Quick Quote\n繁\n简\nMark...,Gaurav Advani,
2,6,http://ada.untergrund.net/?p=boardthread&id=18...,ada.untergrund.net,net,2025-12-04T20:53:54Z,3759,20609,270,1,5.48,0,A.D.A. Amiga Demoscene Archive\nA.D.A. Amiga D...,A.D.A. Amiga Demoscene Archive\nA.D.A. Amiga D...,Gaurav Advani,
3,7,http://adrienedurand.wikidot.com/blog:127,adrienedurand.wikidot.com,com,2025-12-04T21:11:53Z,1328,7829,73,1,5.9,1,"Time, Mortality And Memory - blog from trends\...","Time, Mortality And Memory - blog from trends\...",Gaurav Advani,
4,8,http://afrafrontpagenews.blogspot.com/2012/03/...,afrafrontpagenews.blogspot.com,com,2025-12-04T19:58:47Z,5411,31154,654,1,5.76,3,AFR News: JurisDictionary- How Your Constituti...,AFR News: JurisDictionary- How Your Constituti...,Gaurav Advani,


In [66]:
# function to find number of '?' in each text
def count_Q(text):
    return len(re.findall(r'\?', text))


In [67]:
# testing 
res = count_Q('sdjhfr?fniue??jscdb?')
print(res)

4


In [68]:
df_large['emails'] = df_large['full_text'].apply(extract_emails)
df_large['phones'] = df_large['full_text'].apply(extract_phones)
df_large['dates'] = df_large['full_text'].apply(extract_dates)
df_large['prices'] = df_large['full_text'].apply(extract_prices)
df_large['urls'] = df_large['full_text'].apply(extract_urls)
df_large['social'] = df_large['full_text'].apply(extract_social)
df_large['all_caps'] = df_large['full_text'].apply(extract_all_caps)
df_large['count_?'] = df_large['full_text'].apply(count_Q)

In [69]:
# Select columns to display
display_df = df_large[['page_id', 'text', 'emails', 'phones', 'dates', 'prices', 'count_?']]

print("Final DataFrame with Extracted Patterns:")
print("="*80)
print(display_df)

# Show each row in detail
print("\n\nDetailed View of Each Row:")
print("="*80)

for idx, row in df_large.iterrows():
    print(f"\n📄 Page {row['page_id']}")
    print(f"Text: {row['text']}")
    print(f"  └─ Emails: {row['emails']}")
    print(f"  └─ Phones: {row['phones']}")
    print(f"  └─ Dates: {row['dates']}")
    print(f"  └─ Prices: {row['prices']}")
    print(f"  └─ URLs: {row['urls']}")
    print(f"  └─ Social: {row['social']}")
    print(f"  └─ All-caps: {row['all_caps']}")
    print(f"  └─ Count_?: {row['count_?']}")

Final DataFrame with Extracted Patterns:
      page_id                                               text  \
0           1  Best Resume Templates 2024 | Ready to Download...   
1           4  SH/SZ-HK Stock Connect Quick Quote\n繁\n简\nMark...   
2           6  A.D.A. Amiga Demoscene Archive\nA.D.A. Amiga D...   
3           7  Time, Mortality And Memory - blog from trends\...   
4           8  AFR News: JurisDictionary- How Your Constituti...   
...       ...                                                ...   
6309     8259  April 2022 Newsletter – Zettel Family Farms\nS...   
6310     8261  PONDS BRIGHT BEAUTY SPOT-LESS GLOW FACEWASH 50...   
6311     8262  Understanding Bitcoin And Its Market Trends Ho...   
6312     8264  Woman and Dog Wood Figurine | Z Man's Wood Art...   
6313     8268  People followed by GoodVans Roofing\nLog in\nS...   

                                                 emails  \
0                                                    []   
1                       

In [70]:
patterns = {
    'News': r'reported|announced|according to|breaking',
    'Blog': r'I think|in my opinion|posted by|comment below',
    'E-commerce': r'\$\d+|buy now|add to cart|shipping|in stock',
    'Forum/Discussion': r'reply|quote|thread|posted|member since|re:',
    'Educational': r'university|course|lesson|learn|student|professor',
    'Technical/Documentation': r'function|code|API|documentation|syntax|example',
    'Government': r'\.gov|department of|federal|state|legislation|official'
}

In [71]:
def classify_page(page_text):
    ''' function takes in a string and return a catogry based on matched patterns
        returns other if no match is found
    '''

    for cat, pattern in patterns.items():
        if re.search(pattern, page_text):
            return cat
        
    return 'Other'

In [72]:
# create category column and assign
df_large['Category'] = df_large['full_text'].apply(classify_page)

In [73]:
counts = df_large['Category'].value_counts()
display(counts)

Category
Other                      2136
E-commerce                 1199
Forum/Discussion            714
Technical/Documentation     649
News                        597
Educational                 542
Government                  334
Blog                        143
Name: count, dtype: int64

In [74]:
df_large.head(10)

Unnamed: 0,page_id,url,domain,tld,date,word_count,text_length,sentence_count,paragraph_count,avg_word_length,...,manual_label,emails,phones,dates,prices,urls,social,all_caps,count_?,Category
0,1,http://0769sme.org/index-16.html,0769sme.org,org,2025-12-04T20:51:02Z,2377,14900,150,1,6.27,...,,[],[],[],[],[],[],"[AI, AI, AI, GPT, AI, AI, AI, AI, AI, GPT, CV,...",15,News
1,4,http://aastocks.com/en/cnhk/quote/quick-quote....,aastocks.com,com,2025-12-04T21:23:33Z,1738,12165,73,1,7.0,...,,[support@aastocks.com],[],[],[],[],[@aastocks],"[SH, SZ, HK, US, STOCKS, FUNDS, FX, CRYPTO, SH...",0,Forum/Discussion
2,6,http://ada.untergrund.net/?p=boardthread&id=18...,ada.untergrund.net,net,2025-12-04T20:53:54Z,3759,20609,270,1,5.48,...,,"[kas1e@yandex.ru, kas1e@yandex.ru, kas1e@yande...",[],[],[],"[http://vague.lorraine-design.com/, http://vag...","[@yandex, @yandex, @Selectanovel, @kas1e, @kas...","[HOME, DEMOS, LOGOS, PARTIES, SCENERS, CHARTS,...",22,Blog
3,7,http://adrienedurand.wikidot.com/blog:127,adrienedurand.wikidot.com,com,2025-12-04T21:11:53Z,1328,7829,73,1,5.9,...,,[],[],[],[],[],[],"[US, FDA, FTC, FTC, URL]",1,E-commerce
4,8,http://afrafrontpagenews.blogspot.com/2012/03/...,afrafrontpagenews.blogspot.com,com,2025-12-04T19:58:47Z,5411,31154,654,1,5.76,...,,[],[866-529-3279],"[March 14, 2012, March 23, 2012]","[$5, $249, $7.50, $249]","[http://fairuse.stanford.edu), http://familyri...",[],"[AFR, HERE, YOU, HAVE, POWER, RULE, FREE, HERE...",24,Blog
5,9,http://aibirds.org/forum/viewtopic.php?f=10&t=...,aibirds.org,org,2025-12-04T21:08:29Z,978,5962,69,1,6.1,...,,[],[],[],[],[https://aibirds.org/basic-game-playing-softwa...,[],"[AI, AI, FAQ, OS, HI, FAQ, AI, IJCAI, AI, AI, ...",10,Forum/Discussion
6,10,http://airblowerservices.com/roots_blower/urai...,airblowerservices.com,com,2025-12-04T20:17:55Z,365,2318,19,1,6.35,...,,[],[],[],[],[],[],"[URAI, DSL, PD, PD, URAI, DSL, URAI, DSL, ROOT...",0,E-commerce
7,12,http://alphonsen69139265.wikidot.com/blog:610,alphonsen69139265.wikidot.com,com,2025-12-04T21:07:52Z,1231,7160,75,1,5.82,...,,[],[],[],[],[],[],"[THE, NEXT, THE, NEXT, THE, NEXT, GUIDELINES, ...",1,Educational
8,13,http://alumni-kolleg.de/bilderquiz/book/downlo...,alumni-kolleg.de,de,2025-12-04T20:19:58Z,321,2266,22,1,7.06,...,,[],[],[],[],[],[],"[US, UK, US, US, MSN, US, FIPS, PUB, III, FTC]",0,Educational
9,14,http://android-er.blogspot.com/2012/03/example...,android-er.blogspot.com,com,2025-12-04T20:04:13Z,2553,17776,137,15,6.96,...,,[],[],"[March 30, 2012, August 9, 2012, November 11, ...",[],"[https://coxxect.blogspot.com/, http://schemas...","[@Override, @Override, @Override, @string, @dr...","[API, IME, TODO, AM, AM, AM, API, SD, RSS, III...",4,Forum/Discussion


In [75]:
my_sites = df_large[df_large['assigned_to'] == 'Ghailan Fadah']
print(my_sites)

     page_id                                                url  \
150      232  http://shura.shu.ac.uk/view/journals/Journal_o...   
151      234  http://sibarber.ddns.net/archive/tags/cookery/...   
152      235  http://silkematzpohl.com/nbc-mortal-engines-tr...   
153      236  http://sintel.is.tue.mpg.de/hero?flow_type=Err...   
154      237  http://sintel.is.tue.mpg.de/hero?flow_type=Inp...   
155      238  http://site-celtic.soticcloud.net/teams/blues/...   
156      240  http://sonieland.com/2017/11/20/location-scout...   
157      241                            http://spiessdreher.de/   
158      243  http://stonechicago.com/img/library.php?q=shop...   
159      244  http://store.vangoghgallery.com/showprint.aspx...   
160      245  http://store.vangoghgallery.com/showprint.aspx...   
161      247  http://t.caareviews.org/dissertations/449/comp...   
162      248                http://t.caareviews.org/reviews/219   
163      249                 http://t4535.com/category/bloggin

In [76]:
my_classification = ['Other', 'Blog', 'News', 'Technical/Documentation', 'Technical/Documentation', 'News', 'Blog', 'E-commerce', 'Other', 'Other', 'Other', 'Forum/Discussion', 'Blog', 'Other',
                     'Educational', 'Other', 'Technical/Documentation', 'News', 'Other', 'Forum/Discussion', 'E-commerce', 'Other', 'E-commerce', 'Educational', 'Other']


df_large.loc[150:174, 'manual_label'] = my_classification


  df_large.loc[150:174, 'manual_label'] = my_classification


In [77]:
# Save the dataframe with extracted patterns
df_large.to_csv('student_FADAH_results.csv', index=False)

print("✓ Saved to: sample_with_patterns.csv")
print("\nNote: When saved to CSV, the lists are stored as strings")
print("Example: ['email@test.com', 'info@site.com']")

✓ Saved to: sample_with_patterns.csv

Note: When saved to CSV, the lists are stored as strings
Example: ['email@test.com', 'info@site.com']
