Q1) Lab Assignment 1: Text Preprocessing and Regular Expressions
Implement tokenization, stemming, and lemmatization using NLTK and spaCy.
Use regular expressions to extract email addresses, phone numbers, and hashtags from a text dataset of at least 5 pages.
     

In [None]:
# Step 1: Install Required Libraries
# Notebook shell commands: install NLTK and spaCy, then download the
# "en_core_web_sm" spaCy model that is loaded below with spacy.load().
!pip install nltk spacy
!python -m spacy download en_core_web_sm

In [2]:
# Step 2: Import Libraries
import nltk
import re
import spacy
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources requested by this notebook, one after another.
for resource in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# spaCy pipeline object; the lemmatization step below calls it on each page.
nlp = spacy.load("en_core_web_sm")

# Step 3: Inline sample text standing in for a multi-page dataset.
# Replace this literal with the contents of your own .txt or .csv file.
text_data = """
Contact me at john.doe@example.com or jane_doe22@sample.org.
My phone number is +1-800-555-1234 or (212) 555-4567.
I love #MachineLearning and #AI!
Barack Obama was the 44th president of the United States.
SpaCy is great for NLP. NLTK is also useful.

Email me at test.email@gmail.com or hello@mydomain.org.
Call me at 987-654-3210 or 1234567890.
Follow #Python and #DataScience on Twitter.
The cat sat on the mat. The cats are sitting on the mats.
"""

# Every line of the trimmed text is treated as one simulated "page".
# NOTE: the blank separator line in the middle is kept, so it shows up as an
# empty page in the results.
_trimmed_text = text_data.strip()
pages = _trimmed_text.split('\n')

# Tokenize each page with NLTK's word_tokenize and show the token list.
print("=== Tokenization ===")
for page_no, page in enumerate(pages, start=1):
    print(f"\nPage {page_no} Tokens:\n", word_tokenize(page))

# Stem every token of every page with NLTK's PorterStemmer.
stemmer = PorterStemmer()

print("\n=== Stemming ===")
for page_no, page in enumerate(pages, start=1):
    # Pages are re-tokenized here; the tokenization step above only printed.
    stemmed = list(map(stemmer.stem, word_tokenize(page)))
    print(f"\nPage {page_no} Stemmed:\n", stemmed)

# Run each page through the spaCy pipeline and collect every token's lemma.
print("\n=== Lemmatization (spaCy) ===")
for page_no, page in enumerate(pages, start=1):
    lemmatized = [tok.lemma_ for tok in nlp(page)]
    print(f"\nPage {page_no} Lemmatized:\n", lemmatized)

# Regular expressions used for entity extraction with re.findall().

# Email: local part, "@", then two or more dot-separated domain labels.
# Each ".label" is matched as a unit so that a sentence-ending period
# (as in "... jane_doe22@sample.org.") is not swallowed into the address.
email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+'

# Phone: optional country code (e.g. "+1") with its separator, optional
# parentheses around the area code, and optional space/dot/hyphen separators
# between the digit groups.
# - Groups are non-capturing so re.findall() returns the whole number rather
#   than just the country-code group.
# - The lookarounds keep a match from starting inside a word or ending in the
#   middle of a longer run of digits.
phone_pattern = (
    r'(?<!\w)(?:\+?\d{1,3}[\s.-]?)?'
    r'\(?\d{2,4}\)?[\s.-]?\d{3,4}[\s.-]?\d{4}(?!\d)'
)

# Hashtag: "#" followed by one or more word characters.
hashtag_pattern = r'#\w+'

# Apply the three extraction patterns to every page and print what each finds.
print("\n=== Regex Extraction ===")
for page_no, page in enumerate(pages, start=1):
    # One findall per pattern, unpacked in the order they are reported below.
    emails, phones, hashtags = (
        re.findall(pattern, page)
        for pattern in (email_pattern, phone_pattern, hashtag_pattern)
    )

    print(f"\nPage {page_no} Results:")
    for label, found in (
        ("Emails:", emails),
        ("Phone Numbers:", phones),
        ("Hashtags:", hashtags),
    ):
        print(label, found)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


=== Tokenization ===

Page 1 Tokens:
 ['Contact', 'me', 'at', 'john.doe', '@', 'example.com', 'or', 'jane_doe22', '@', 'sample.org', '.']

Page 2 Tokens:
 ['My', 'phone', 'number', 'is', '+1-800-555-1234', 'or', '(', '212', ')', '555-4567', '.']

Page 3 Tokens:
 ['I', 'love', '#', 'MachineLearning', 'and', '#', 'AI', '!']

Page 4 Tokens:
 ['Barack', 'Obama', 'was', 'the', '44th', 'president', 'of', 'the', 'United', 'States', '.']

Page 5 Tokens:
 ['SpaCy', 'is', 'great', 'for', 'NLP', '.', 'NLTK', 'is', 'also', 'useful', '.']

Page 6 Tokens:
 []

Page 7 Tokens:
 ['Email', 'me', 'at', 'test.email', '@', 'gmail.com', 'or', 'hello', '@', 'mydomain.org', '.']

Page 8 Tokens:
 ['Call', 'me', 'at', '987-654-3210', 'or', '1234567890', '.']

Page 9 Tokens:
 ['Follow', '#', 'Python', 'and', '#', 'DataScience', 'on', 'Twitter', '.']

Page 10 Tokens:
 ['The', 'cat', 'sat', 'on', 'the', 'mat', '.', 'The', 'cats', 'are', 'sitting', 'on', 'the', 'mats', '.']

=== Stemming ===

Page 1 Stemmed:
 ['con