<a href="https://www.kaggle.com/code/mh0386/text-processing?scriptVersionId=255653368" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Text Processing

## Import

In [None]:
from re import sub
from bs4 import BeautifulSoup
from nltk import download, PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from requests import get, Response
from rich import print

## Download NLTK Data (stopwords, WordNet, Punkt)

In [None]:
# Fetch the NLTK data packages this notebook uses.
# "stopwords" backs the stopwords.words("english") lookup in the stop-word step.
download("stopwords")
# Presumably the WordNet data that WordNetLemmatizer relies on -- confirm.
download("wordnet")
# Presumably the Punkt tokenizer data that word_tokenize relies on -- confirm.
download("punkt_tab")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Fetch the URL and Get Its HTML

In [None]:
# Download the Wikipedia article on natural language processing.
# The 5-second timeout stops the cell from hanging on a stalled connection.
response: Response = get(
    "https://en.wikipedia.org/wiki/Natural_language_processing", timeout=5
)
# Fail loudly on an HTTP error status (4xx/5xx) so that the following cells
# never process an error page as if it were the article.
response.raise_for_status()
# Body of the response decoded to text; every later step starts from this.
HTML: str = response.text

## Extract Text from HTML

In [None]:
# Parse the downloaded markup using the "html.parser" parser, then pull the
# document's text out of the parsed tree as one string.
soup: BeautifulSoup = BeautifulSoup(markup=HTML, features="html.parser")
paragraph: str = soup.get_text()

### Print it

In [None]:
# Show the text extracted from the page (print here is rich's print, imported above).
print(paragraph)

## Cleaning extracted text

In [None]:
# Replace every character that is not an ASCII letter or digit with a space.
# This removes punctuation and symbols, and also any non-ASCII characters.
text: str = sub(pattern=r"[^a-zA-Z0-9]", repl=" ", string=paragraph)

### Print it

In [None]:
# Show the text after non-alphanumeric characters were replaced by spaces.
print(text)

## Collapse Repeated Whitespace

In [None]:
# Collapse each run of whitespace into a single space. Words stay separated;
# only the redundant padding left by the previous step disappears.
text_without_whitespace: str = sub(pattern=r"\s+", repl=" ", string=text)

### Print it

In [None]:
# Show the text with whitespace runs collapsed to single spaces.
print(text_without_whitespace)

## Convert to Lower Case

In [None]:
# Lower-case the whole text so that later comparisons (e.g. against the
# stop-word list) do not depend on capitalisation.
text_lowercase: str = str.lower(text_without_whitespace)

### Print it

In [None]:
# Show the lower-cased text.
print(text_lowercase)

## Tokenization

In [None]:
# Split the normalised text into a list of word tokens with NLTK's tokenizer.
tokens: list[str] = word_tokenize(text=text_lowercase)

### Print it

In [None]:
# Show the token list produced by word_tokenize.
print(tokens)

## Remove Words That Contain Numbers

In [None]:
# Keep only the tokens in which no character is a digit; a token with even
# one digit (e.g. a year or a mixed letter/number token) is dropped.
tokens_without_numbers: list[str] = [
    word for word in tokens if all(not symbol.isdigit() for symbol in word)
]

### Print it

In [None]:
# Show the tokens that remain after dropping those containing digits.
print(tokens_without_numbers)

## Remove Stop Words

In [None]:
# Load the English stop-word list once and keep it in a set. Calling
# stopwords.words("english") inside the comprehension condition would
# rebuild the list for every token and scan it linearly each time.
english_stopwords: set[str] = set(stopwords.words("english"))

# Keep only the tokens that are not English stop words.
tokens_without_stopwords: list[str] = [
    token for token in tokens_without_numbers if token not in english_stopwords
]

### Print it

In [None]:
# Show the tokens that remain after stop-word removal.
print(tokens_without_stopwords)

## Words with Length Less than 3

In [None]:
# Collect the very short tokens (one or two characters) for inspection;
# these are the ones the next step discards.
token_less_3: list[str] = [
    word for word in tokens_without_stopwords if len(word) <= 2
]

### Print it

In [None]:
# Show the short tokens in sorted order (duplicates are kept).
print(sorted(token_less_3))

## Remove Words with Length Less than 3

In [None]:
# Keep only the tokens that are at least three characters long.
token_greater_2: list[str] = [
    word for word in tokens_without_stopwords if len(word) >= 3
]

### Print it

In [None]:
# Show the tokens of length three or more.
print(token_greater_2)

## Stemming

In [None]:
# Reduce every remaining token to its stem with NLTK's Porter stemmer.
ps = PorterStemmer()
token_stemming: list[str] = list(map(ps.stem, token_greater_2))

### Print it

In [None]:
# Show the stemmed tokens.
print(token_stemming)

## Lemmatization

In [None]:
# Run the WordNet lemmatizer over the stemmed tokens.
# NOTE(review): the input here is the stemmer's output, not the original
# words -- confirm that chaining the two steps (rather than comparing them
# side by side on token_greater_2) is intended.
lemmatizer = WordNetLemmatizer()
token_lemmatizer: list[str] = list(map(lemmatizer.lemmatize, token_stemming))

### Print it

In [None]:
# Show the lemmatized tokens.
print(token_lemmatizer)

## Unique words

In [None]:
# De-duplicate the tokens through a set, then sort the result. A bare
# list(set(...)) has arbitrary order that can differ from one run to the
# next; sorting makes the vocabulary reproducible.
unique_words: list[str] = sorted(set(token_lemmatizer))

### Print it

In [None]:
# Show the distinct tokens in sorted order.
print(sorted(unique_words))