[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/M-Talha-Farooqi/NLP-CourseWork/blob/main/Assignments/Assignment_1/Instructions/Assignment_Sample_NLP_Regular_expressions_Examples.ipynb)

# Understanding of Regular Expressions

# üß† Introduction to Regular Expressions (Regex) in Python

## üîç What is a Regular Expression?

A **Regular Expression (Regex)** is a sequence of characters used to search, match, or manipulate text using **pattern matching**. It's extremely useful for text processing tasks like:

- Finding specific patterns (e.g., hashtags, email addresses)
- Replacing parts of text (e.g., removing mentions)
- Validating inputs (e.g., phone numbers)

---

## ‚ú® Common Regex Patterns

| Pattern       | Description                             | Example Match          |
|---------------|------------------------------------------|------------------------|
| `\d+`         | One or more digits                       | `2023`, `45`           |
| `\w+`         | One or more word characters (letters, digits, underscore) | `hello`, `abc_123` |
| `\bword\b`    | Whole word match                         | `word` but not `sword` |
| `#\w+`        | Hashtags                                 | `#TerrorAlert`         |
| `@\w+`        | Mentions                                 | `@IntelWatch`          |
| `[A-Z]{2,}`   | Two or more uppercase letters            | `ALERT`, `SOS`         |
| `^` / `$`     | Start / end of string                    | Useful in validation   |
| `.`           | Any character (except newline)           | `a.b` matches `aab`, `acb` |
| `*`, `+`, `?` | Quantifiers (`*`: 0+, `+`: 1+, `?`: 0 or 1) | `lo+l` ‚Üí `lol`, `lool` |
| `()`          | Grouping                                | Used for capturing      |

---


In [1]:
import pandas as pd
import re

# Sample dataset
data = {
    'ID': [1, 2],
    'Text': [
        "We can't ignore the 3 bombings in #Kabul last week ‚Äì @IntelWatch says it's the worst in 2023!",
        "Alert: 2 terrorists were defused near the airport!! #TerrorAlert  Contact @CT_Unit for details."
    ]
}

# Show entire text without truncation
pd.set_option('display.max_colwidth', None)

df = pd.DataFrame(data)
df

Unnamed: 0,ID,Text
0,1,We can't ignore the 3 bombings in #Kabul last week ‚Äì @IntelWatch says it's the worst in 2023!
1,2,Alert: 2 terrorists were defused near the airport!! #TerrorAlert Contact @CT_Unit for details.


In [2]:
# Regular Expressions

# 1. Extract Hashtags
df['Hashtags'] = df['Text'].str.findall(r'#\w+')
df

Unnamed: 0,ID,Text,Hashtags
0,1,We can't ignore the 3 bombings in #Kabul last week ‚Äì @IntelWatch says it's the worst in 2023!,[#Kabul]
1,2,Alert: 2 terrorists were defused near the airport!! #TerrorAlert Contact @CT_Unit for details.,[#TerrorAlert]


In [3]:
# 2. Extract Mentions (@usernames)
df['Mentions'] = df['Text'].str.findall(r'@\w+')
df

Unnamed: 0,ID,Text,Hashtags,Mentions
0,1,We can't ignore the 3 bombings in #Kabul last week ‚Äì @IntelWatch says it's the worst in 2023!,[#Kabul],[@IntelWatch]
1,2,Alert: 2 terrorists were defused near the airport!! #TerrorAlert Contact @CT_Unit for details.,[#TerrorAlert],[@CT_Unit]


In [4]:
# 3. Extract Numbers
df['Numbers'] = df['Text'].str.findall(r'\d+')
df

Unnamed: 0,ID,Text,Hashtags,Mentions,Numbers
0,1,We can't ignore the 3 bombings in #Kabul last week ‚Äì @IntelWatch says it's the worst in 2023!,[#Kabul],[@IntelWatch],"[3, 2023]"
1,2,Alert: 2 terrorists were defused near the airport!! #TerrorAlert Contact @CT_Unit for details.,[#TerrorAlert],[@CT_Unit],[2]


In [5]:

# 6. Extract Terror-related Words (like bombings, terrorists)
df['Terror_Keywords'] = df['Text'].str.findall(r'\b(bombings?|terrorists?|airport)\b')
df

Unnamed: 0,ID,Text,Hashtags,Mentions,Numbers,Terror_Keywords
0,1,We can't ignore the 3 bombings in #Kabul last week ‚Äì @IntelWatch says it's the worst in 2023!,[#Kabul],[@IntelWatch],"[3, 2023]",[bombings]
1,2,Alert: 2 terrorists were defused near the airport!! #TerrorAlert Contact @CT_Unit for details.,[#TerrorAlert],[@CT_Unit],[2],"[terrorists, airport]"


In [6]:
# 7. Check if Tweet contains "alert" (case-insensitive)
df['Contains_Alert'] = df['Text'].str.contains(r'alert', flags=re.IGNORECASE)
df

Unnamed: 0,ID,Text,Hashtags,Mentions,Numbers,Terror_Keywords,Contains_Alert
0,1,We can't ignore the 3 bombings in #Kabul last week ‚Äì @IntelWatch says it's the worst in 2023!,[#Kabul],[@IntelWatch],"[3, 2023]",[bombings],False
1,2,Alert: 2 terrorists were defused near the airport!! #TerrorAlert Contact @CT_Unit for details.,[#TerrorAlert],[@CT_Unit],[2],"[terrorists, airport]",True


In [7]:
# \b: This is the word boundary metacharacter.
# \w: This metacharacter stands for "word character".
df['Words_Start_A'] = df['Text'].str.findall(r'\b[aA]\w*')

# Show full DataFrame
df

Unnamed: 0,ID,Text,Hashtags,Mentions,Numbers,Terror_Keywords,Contains_Alert,Words_Start_A
0,1,We can't ignore the 3 bombings in #Kabul last week ‚Äì @IntelWatch says it's the worst in 2023!,[#Kabul],[@IntelWatch],"[3, 2023]",[bombings],False,[]
1,2,Alert: 2 terrorists were defused near the airport!! #TerrorAlert Contact @CT_Unit for details.,[#TerrorAlert],[@CT_Unit],[2],"[terrorists, airport]",True,"[Alert, airport]"


# Preprocessing of Data using RE

In [8]:
import pandas as pd
import re
import string

# Sample dataset
data = {
    'ID': [1, 2],
    'Text': [
        "We can't ignore the 3 bombings in #Kabul last week ‚Äì @IntelWatch says it's the worst in 2023!",
        "Alert: 2 terrorists were defused near the airport!! #TerrorAlert  Contact @CT_Unit for details."
    ]
}

df = pd.DataFrame(data)
df


Unnamed: 0,ID,Text
0,1,We can't ignore the 3 bombings in #Kabul last week ‚Äì @IntelWatch says it's the worst in 2023!
1,2,Alert: 2 terrorists were defused near the airport!! #TerrorAlert Contact @CT_Unit for details.


In [9]:
# Step 1: Lowercase
def to_lowercase(text):
    return text.lower()

# Apply each step one by one to a new column
df['Lowercase'] = df['Text'].apply(to_lowercase)
df

Unnamed: 0,ID,Text,Lowercase
0,1,We can't ignore the 3 bombings in #Kabul last week ‚Äì @IntelWatch says it's the worst in 2023!,we can't ignore the 3 bombings in #kabul last week ‚Äì @intelwatch says it's the worst in 2023!
1,2,Alert: 2 terrorists were defused near the airport!! #TerrorAlert Contact @CT_Unit for details.,alert: 2 terrorists were defused near the airport!! #terroralert contact @ct_unit for details.


In [10]:
# Step 2: Remove usernames (@username)
def remove_usernames(text):
    return re.sub(r'@\w+', '', text)

df['No_Usernames'] = df['Lowercase'].apply(remove_usernames)
df

Unnamed: 0,ID,Text,Lowercase,No_Usernames
0,1,We can't ignore the 3 bombings in #Kabul last week ‚Äì @IntelWatch says it's the worst in 2023!,we can't ignore the 3 bombings in #kabul last week ‚Äì @intelwatch says it's the worst in 2023!,we can't ignore the 3 bombings in #kabul last week ‚Äì says it's the worst in 2023!
1,2,Alert: 2 terrorists were defused near the airport!! #TerrorAlert Contact @CT_Unit for details.,alert: 2 terrorists were defused near the airport!! #terroralert contact @ct_unit for details.,alert: 2 terrorists were defused near the airport!! #terroralert contact for details.


In [11]:
# Step 3: Remove hashtags (#hashtag)
def remove_hashtags(text):
    return re.sub(r'#\w+', '', text)

df['No_Hashtags'] = df['No_Usernames'].apply(remove_hashtags)
df

Unnamed: 0,ID,Text,Lowercase,No_Usernames,No_Hashtags
0,1,We can't ignore the 3 bombings in #Kabul last week ‚Äì @IntelWatch says it's the worst in 2023!,we can't ignore the 3 bombings in #kabul last week ‚Äì @intelwatch says it's the worst in 2023!,we can't ignore the 3 bombings in #kabul last week ‚Äì says it's the worst in 2023!,we can't ignore the 3 bombings in last week ‚Äì says it's the worst in 2023!
1,2,Alert: 2 terrorists were defused near the airport!! #TerrorAlert Contact @CT_Unit for details.,alert: 2 terrorists were defused near the airport!! #terroralert contact @ct_unit for details.,alert: 2 terrorists were defused near the airport!! #terroralert contact for details.,alert: 2 terrorists were defused near the airport!! contact for details.


In [12]:
import contractions
# Function 1: Expanding contractions
def expand_contractions(text):
    return contractions.fix(text)

df['No_Contractions'] = df['No_Hashtags'].apply(expand_contractions)
df

ModuleNotFoundError: No module named 'contractions'

In [None]:
# Step 5: Remove numbers
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

df['No_Numbers'] = df['No_Contractions'].apply(remove_numbers)
df

In [None]:
# Tokenizer helper (split into words)
def tokenize(text):
    return text.split()

# Function 6: Removing special characters per token
def remove_special_characters(text):
    tokens = tokenize(text)
    return ' '.join([re.sub(r'[^A-Za-z0-9]+', '', word) for word in tokens if word])

df['No_SpecialChars'] = df['No_Numbers'].apply(remove_special_characters)
df

In [None]:

# Step 7: Remove extra whitespace
def remove_extra_whitespace(text):
    return re.sub(r'\s+', ' ', text).strip()

df['Cleaned_Text'] = df['No_SpecialChars'].apply(remove_extra_whitespace)
df

In [None]:
import pandas as pd
from nltk.corpus import stopwords

# Make sure you have the stopwords downloaded
# import nltk
# nltk.download('stopwords')

df['Cleaned_Text'] = df['Cleaned_Text'].apply(word_tokenize)

# Get the set of English stopwords
stop_words = set(stopwords.words('english'))
stop_words

# Define a function to remove stopwords
def remove_stopwords(tokens):
    return [word for word in tokens if word.lower() not in stop_words]

# Apply the function to the 'tokens' column
df['Cleaned_Text'] = df['Cleaned_Text'].apply(remove_stopwords)
df