In [17]:
import re

import nltk
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [18]:
# Fetch the NLTK resources the later cells rely on: tokenizer models,
# the English stop-word list, and WordNet for the lemmatizer.
nltk.download("punkt")
# Newer NLTK releases load the tokenizer tables from "punkt_tab" rather than
# the pickled "punkt" models; without it word_tokenize raises LookupError.
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Fetch the HTML of the Target URL


In [19]:
# Article whose paragraph text is processed by the following cells.
URL = "https://en.wikipedia.org/wiki/Natural_language_processing"
response = requests.get(URL, timeout=5)
# Fail fast on a 4xx/5xx answer so an error page is never parsed and
# tokenized as if it were the article.
response.raise_for_status()
HTML = response.text

# Extract Text from HTML


In [20]:
# Parse the downloaded markup and collect every <p> element.
soup = BeautifulSoup(HTML, "html.parser")
paragraph = soup.find_all("p")
# Join the paragraph texts with single spaces; iterating the tags directly
# avoids index bookkeeping and a temporary list bound to TEXT.
TEXT = " ".join(tag.text for tag in paragraph)

# Cleaning: Replace Non-Alphanumeric Characters with Spaces


In [21]:
# Every character that is not an ASCII letter or digit becomes a space.
TEXT = re.compile(r"[^a-zA-Z0-9]").sub(" ", TEXT)

# Collapse Repeated Whitespace


In [22]:
# Squeeze each run of whitespace characters down to one space.
TEXT = re.compile(r"\s+").sub(" ", TEXT)

# Convert to Lower Case


In [23]:
# Normalize case so later comparisons against the stop-word list match.
TEXT = str.lower(TEXT)

# Tokenization


In [24]:
# Split the cleaned string into word tokens; from here on TEXT is a list of str.
TEXT = word_tokenize(TEXT, language="english")

# Remove Stop Words


In [25]:
# Load the stop-word list once and keep it as a set: evaluating
# stopwords.words("english") inside the comprehension would rebuild the list
# for every token, and list membership is a linear scan.
english_stopwords = set(stopwords.words("english"))
TEXT = [word for word in TEXT if word not in english_stopwords]

# Collect Words Shorter than 3 Characters


In [26]:
# Keep a separate record of the one- and two-character tokens for inspection.
TEXT_LESS_3 = [token for token in TEXT if len(token) <= 2]

# Remove Words with Length Less than 3


In [27]:
# Retain only tokens that are at least three characters long.
TEXT = [token for token in TEXT if len(token) >= 3]

# Stemming


In [28]:
# Reduce every token to its Porter stem.
ps = nltk.PorterStemmer()
TEXT = list(map(ps.stem, TEXT))

# Lemmatization


In [29]:
# Run each token through the WordNet lemmatizer (default part of speech).
# NOTE(review): TEXT already holds Porter stems at this point, so the
# lemmatizer is fed truncated forms; that is the likely source of entries such
# as 'chine' and the 'corpu'/'corpus' pair in the saved vocabulary. Consider
# lemmatizing the unstemmed tokens instead.
lemmatizer = WordNetLemmatizer()
TEXT = list(map(lemmatizer.lemmatize, TEXT))

# Unique words


In [30]:
# Deduplicate the tokens. Sorting fixes the order: iterating a set of strings
# can differ from one kernel run to the next because of hash randomization,
# which would make this list irreproducible.
unique_words = sorted(set(TEXT))

## Print Unique Words


In [34]:
# Bare trailing expression: the notebook renders the sorted vocabulary as this cell's output.
sorted(unique_words)

['1940',
 '1980',
 '1990',
 '2003',
 '2010',
 '2015',
 '2020',
 '2023',
 'abil',
 'accur',
 'achiev',
 'acl',
 'acquir',
 'act',
 'action',
 'address',
 'advanc',
 'advantag',
 'age',
 'aid',
 'alan',
 'algorithm',
 'align',
 'along',
 'alreadi',
 'although',
 'among',
 'analyz',
 'announc',
 'answer',
 'anymor',
 'appar',
 'appli',
 'applic',
 'approach',
 'area',
 'art',
 'articl',
 'articul',
 'artifici',
 'aspect',
 'author',
 'autom',
 'base',
 'becam',
 'becom',
 'behaviour',
 'bengio',
 'best',
 'branch',
 'british',
 'brno',
 'broadli',
 'build',
 'call',
 'capabl',
 'captur',
 'care',
 'categor',
 'categori',
 'caus',
 'challeng',
 'chine',
 'chomskyan',
 'close',
 'cluster',
 'coars',
 'code',
 'cognit',
 'collect',
 'colleg',
 'combin',
 'commonli',
 'complex',
 'comprehens',
 'comput',
 'concern',
 'confer',
 'confront',
 'conll',
 'construct',
 'contain',
 'content',
 'context',
 'contextu',
 'conveni',
 'corpu',
 'corpus',
 'coupl',
 'cpu',
 'criterion',
 'data',
 'datase

## Print Words Shorter Than 3 Characters


In [35]:
# Display the short tokens in sorted order; repeats are kept because TEXT_LESS_3 is a plain list, not a set.
sorted(TEXT_LESS_3)

['1',
 '10',
 '11',
 '12',
 '13',
 '14',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '44',
 '45',
 '46',
 '47',
 '48',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '8',
 '9',
 'ai',
 'ai',
 'ai',
 'co',
 'co',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'e',
 'g',
 'g',
 'g',
 'g',
 'g',
 'g',
 'g',
 'j',
 'n',
 'r']