In [1]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter

# NLTK resources used by this notebook: tokenizer models, the stopword list
# and the WordNet data for lemmatization.
# 'punkt_tab' is the tokenizer resource that recent NLTK releases load for
# word_tokenize/sent_tokenize; 'punkt' is kept so older releases still work.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Sample text
text = """
Artificial Intelligence is transforming various industries, making tasks easier and more efficient. 
From healthcare to finance, AI-powered systems are becoming indispensable tools for decision-making. 
Technologies like deep learning and machine learning allow for better predictions, automated processes, and enhanced data analysis.
"""

# 1. Convert text to lowercase and remove punctuation
# Note: punctuation characters are deleted rather than replaced by a space,
# so hyphenated words fuse together ("ai-powered" -> "aipowered").
text_lower = text.lower()
text_clean = text_lower.translate(str.maketrans('', '', string.punctuation))
print("Cleaned Text:")
print(text_clean)

# 2. Tokenize the text into words and sentences
# Words come from the cleaned text; sentences come from the original text
# because the cleaned text no longer has the full stops that end a sentence.
words = word_tokenize(text_clean)
sentences = sent_tokenize(text)
print("\nTokens:")
print(words)
print("\nSentences:")
print(sentences)

# 3. Remove stopwords
# A set gives constant-time membership checks inside the comprehension.
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word not in stop_words]
print("\nFiltered Words:")
print(filtered_words)

# 4. Display word frequency distribution
word_freq = Counter(filtered_words)
print("Word Frequency Distribution (Excluding Stopwords):")
print(word_freq)



ModuleNotFoundError: No module named 'nltk'

In [2]:
# Build the two stemmers and the lemmatizer that are compared below.
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

# Run every stopword-filtered token through each stemmer.
porter_stemmed = list(map(porter_stemmer.stem, filtered_words))
lancaster_stemmed = list(map(lancaster_stemmer.stem, filtered_words))

# Lemmatize the same tokens; no part-of-speech argument is passed.
lemmatized = list(map(lemmatizer.lemmatize, filtered_words))

# Show the three result lists one after another for comparison.
for heading, tokens in (
    ("\nStemming (PorterStemmer):", porter_stemmed),
    ("\nStemming (LancasterStemmer):", lancaster_stemmed),
    ("\nLemmatization:", lemmatized),
):
    print(heading, tokens)



NameError: name 'PorterStemmer' is not defined

In [None]:
# 1. Use regular expressions to extract:
# a. All words with more than 5 letters
# (\w also matches digits and underscores, so such tokens count as well.)
long_words = re.findall(r'\b\w{6,}\b', text_clean)
print("\nWords with more than 5 letters:", long_words)

# b. All numbers 
numbers = re.findall(r'\b\d+\b', text_clean)
print("\nNumbers found in text:", numbers)

# c. All capitalized words
# Search the original `text`: `text_clean` has been lowercased, so it cannot
# contain an uppercase letter and the search would always return an empty list.
# `\w*` after the leading capital also lets all-caps tokens such as "AI" match.
capitalized_words = re.findall(r'\b[A-Z]\w*\b', text)
print("\nCapitalized words:", capitalized_words)

# 2. Use text splitting techniques:
# a. Split the text into words containing only alphabets 
alphabetic_words = re.findall(r'\b[a-zA-Z]+\b', text_clean)
print("\nAlphabetic words:", alphabetic_words)

# b. Extract words starting with a vowel
vowel_words = re.findall(r'\b[aeiouAEIOU]\w*\b', text_clean)
print("\nWords starting with a vowel:", vowel_words)



In [None]:
# 1. Custom tokenization function
def custom_tokenizer(text):
    """Lowercase ``text``, strip most punctuation and split it into tokens.

    Every character that is not a word character, whitespace, an apostrophe
    or a hyphen is removed before the string is handed to ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        The token list produced by ``word_tokenize``.
    """
    normalized = re.sub(r'[^\w\s\'-]', '', text.lower())
    return word_tokenize(normalized)

custom_tokens = custom_tokenizer(text)
print("\nCustom Tokenized Text:", custom_tokens)

# 2. Regex substitutions for cleaning
# Each step feeds the next one, so `text_cleaned` has all three kinds of
# placeholder applied. Emails go first so the URL pattern never sees them.
# a. Replace email addresses with '<EMAIL>'
# The top-level-domain class is [A-Za-z]: writing it as [A-Z|a-z] would also
# accept a literal '|'. No upper length limit, since TLDs can exceed 7 letters.
text_with_emails = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '<EMAIL>', text)
# b. Replace URLs with '<URL>'
text_with_urls = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '<URL>', text_with_emails)
# c. Replace phone numbers with '<PHONE>'
# The (?<!\d) / (?!\d) guards stop the pattern from matching a slice of a
# longer digit run (e.g. the first digits of an account or card number).
text_cleaned = re.sub(r'(?<!\d)(\+?\d{1,2}\s?)?(\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4})(?!\d)', '<PHONE>', text_with_urls)

print("\nText with Emails, URLs, and Phone Numbers Replaced:")
print(text_cleaned)
