In [12]:
import requests
from bs4 import BeautifulSoup
import re


In [13]:
# Making a GET request
text = requests.get('https://en.wikipedia.org/wiki/Natural_language_processing')

# Parsing the HTML
soup = BeautifulSoup(text.content, 'html.parser')
#extract all visible text
raw_text = soup.get_text()

In [14]:
raw_text

'\n\n\n\nNatural language processing - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload fileSpecial pages\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAppearance\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDonate\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\nDonate Create account Log in\n\n\n\n\n\n\t\tPages for logged out editors learn more\n\n\n\nContributionsTalk\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nContents\nmove to sidebar\nhide\n\n\n\n\n(Top)\n\n\n\n\n\n1\nHistory\n\n\n\n\nToggle History subsection\n\n\n\n\n\n1.1\nSymbolic NLP (1950s – early 1990s)\n\n\n\n\n\n\n\n\n1.2\nStatistical NLP 

In [15]:
def clean_text(text):
  text = text.lower()
  text = re.sub(r"\d+", "", text)  # Remove numbers
  text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
  text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
  text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags
  return text

In [16]:
cleaned_text = clean_text(raw_text)
cleaned_text

'natural language processing wikipedia jump to content main menu main menu move to sidebar hide navigation main pagecontentscurrent eventsrandom articleabout wikipediacontact us contribute helplearn to editcommunity portalrecent changesupload filespecial pages search search appearance donate create account log in personal tools donate create account log in pages for logged out editors learn more contributionstalk contents move to sidebar hide top history toggle history subsection symbolic nlp s early s statistical nlp spresent approaches symbolic statistical neural networks toggle approaches symbolic statistical neural networks subsection statistical approach neural networks common nlp tasks toggle common nlp tasks subsection text and speech processing morphological analysis syntactic analysis lexical semantics of individual words in context relational semantics semantics of individual sentences discourse semantics beyond individual sentences higherlevel nlp applications general tenden

In [17]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt_tab')
# Sentence Tokenization
sentence_tokens = sent_tokenize(cleaned_text)
print("Sentence Tokenization:", sentence_tokens)

# Word Tokenization
word_tokens = word_tokenize(cleaned_text)
print("Word Tokenization:", word_tokens)

# Char Tokenization
char_tokens = list(cleaned_text)
print("Char Tokenization:",char_tokens)

Sentence Tokenization: ['natural language processing wikipedia jump to content main menu main menu move to sidebar hide navigation main pagecontentscurrent eventsrandom articleabout wikipediacontact us contribute helplearn to editcommunity portalrecent changesupload filespecial pages search search appearance donate create account log in personal tools donate create account log in pages for logged out editors learn more contributionstalk contents move to sidebar hide top history toggle history subsection symbolic nlp s early s statistical nlp spresent approaches symbolic statistical neural networks toggle approaches symbolic statistical neural networks subsection statistical approach neural networks common nlp tasks toggle common nlp tasks subsection text and speech processing morphological analysis syntactic analysis lexical semantics of individual words in context relational semantics semantics of individual sentences discourse semantics beyond individual sentences higherlevel nlp app

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [18]:
from nltk.corpus import stopwords

nltk.download('stopwords')

# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in word_tokens if word not in stop_words]

print("Filtered Text:", filtered_words)


Filtered Text: ['natural', 'language', 'processing', 'wikipedia', 'jump', 'content', 'main', 'menu', 'main', 'menu', 'move', 'sidebar', 'hide', 'navigation', 'main', 'pagecontentscurrent', 'eventsrandom', 'articleabout', 'wikipediacontact', 'us', 'contribute', 'helplearn', 'editcommunity', 'portalrecent', 'changesupload', 'filespecial', 'pages', 'search', 'search', 'appearance', 'donate', 'create', 'account', 'log', 'personal', 'tools', 'donate', 'create', 'account', 'log', 'pages', 'logged', 'editors', 'learn', 'contributionstalk', 'contents', 'move', 'sidebar', 'hide', 'top', 'history', 'toggle', 'history', 'subsection', 'symbolic', 'nlp', 'early', 'statistical', 'nlp', 'spresent', 'approaches', 'symbolic', 'statistical', 'neural', 'networks', 'toggle', 'approaches', 'symbolic', 'statistical', 'neural', 'networks', 'subsection', 'statistical', 'approach', 'neural', 'networks', 'common', 'nlp', 'tasks', 'toggle', 'common', 'nlp', 'tasks', 'subsection', 'text', 'speech', 'processing', 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

# Ensure resources are available
nltk.download('punkt')
nltk.download('wordnet')

# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Stemming
stemmed_words = [stemmer.stem(word) for word in filtered_words]
print("Stemming:", stemmed_words)

# Lemmatization by default By default, WordNetLemmatizer assumes the noun form of a word when lemmatizing, words "running" can have different lemmas based on their POS 'running' when noun and 'run' when verb

lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
print("Lemmatization:", lemmatized_words)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Stemming: ['natur', 'languag', 'process', 'wikipedia', 'jump', 'content', 'main', 'menu', 'main', 'menu', 'move', 'sidebar', 'hide', 'navig', 'main', 'pagecontentscurr', 'eventsrandom', 'articleabout', 'wikipediacontact', 'us', 'contribut', 'helplearn', 'editcommun', 'portalrec', 'changesupload', 'filespeci', 'page', 'search', 'search', 'appear', 'donat', 'creat', 'account', 'log', 'person', 'tool', 'donat', 'creat', 'account', 'log', 'page', 'log', 'editor', 'learn', 'contributionstalk', 'content', 'move', 'sidebar', 'hide', 'top', 'histori', 'toggl', 'histori', 'subsect', 'symbol', 'nlp', 'earli', 'statist', 'nlp', 'spresent', 'approach', 'symbol', 'statist', 'neural', 'network', 'toggl', 'approach', 'symbol', 'statist', 'neural', 'network', 'subsect', 'statist', 'approach', 'neural', 'network', 'common', 'nlp', 'task', 'toggl', 'common', 'nlp', 'task', 'subsect', 'text', 'speech', 'process', 'morpholog', 'analysi', 'syntact', 'analysi', 'lexic', 'semant', 'individu', 'word', 'contex