# Experiment No : 10

In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
import re
from spellchecker import SpellChecker
from bs4 import BeautifulSoup

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Define example text to work with
text = "Hello, this is an example text! It contains some words and numbers like 12345. Also, it has some punctuations like commas, periods, and exclamation marks."

# Tokenize the text into individual words
tokens = word_tokenize(text)

# Convert all words to lowercase
tokens = [token.lower() for token in tokens]

# Remove English stop words (e.g. "the", "a", "an") from the tokens
stop_words = set(stopwords.words('english'))
tokens = [token for token in tokens if token not in stop_words]

# Apply stemming to the tokens (i.e. reduce each word to its root form)
stemmer = PorterStemmer()
tokens = [stemmer.stem(token) for token in tokens]

# Apply lemmatization to the tokens (i.e. convert each word to its base form based on its part of speech)
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(token) for token in tokens]

# Remove all punctuation from the tokens
tokens = [token.translate(str.maketrans('', '', string.punctuation)) for token in tokens]

# Remove all special characters from the tokens
tokens = [re.sub(r'[^a-zA-Z0-9]+', '', token) for token in tokens]

# Remove all numbers from the tokens
tokens = [re.sub(r'\d+', '', token) for token in tokens]

# Use a spell checker to correct any misspelled words in the tokens
spell = SpellChecker()
tokens = [spell.correction(token) for token in tokens]

# Remove any HTML tags from the text
html_text = "<p>Hello, this is <strong>bold</strong> text!</p>"
soup = BeautifulSoup(html_text, 'html.parser')
clean_text = soup.get_text()

# Print the final tokens and cleaned text
print(tokens)
print(clean_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kamran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kamran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kamran\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['hello', 'i', 'example', 'text', 'i', 'contain', 'word', 'number', 'like', 'i', 'i', 'also', 'i', 'punctual', 'like', 'comma', 'i', 'period', 'i', 'exclaim', 'mark', 'i']
Hello, this is bold text!
