In [None]:
import requests
import nltk
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Access Website


In [None]:
url = "https://en.wikipedia.org/wiki/Muhammad_Ali_of_Egypt"

# Send a GET request to the URL.
# requests applies no timeout by default, so pass one explicitly to keep the
# cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)

# Raise requests.HTTPError on a 4xx/5xx status so that an error page is
# reported here instead of being handed to the parsing steps.
response.raise_for_status()

# The page HTML is now available as response.text

# Parse Text

In [None]:
# Build a BeautifulSoup tree from the downloaded HTML using Python's built-in parser
page_html = response.text
soup = BeautifulSoup(page_html, "html.parser")

# Look up the first <div> carrying this class string; the element that comes
# back still holds all of its nested markup (paragraph tags and so on)
content_div = soup.find("div", attrs={"class": "mw-content-ltr mw-parser-output"})

# Stop right away if the lookup came back empty
assert content_div is not None, "Class not found."

# Get Desired Text

In [None]:
# Get the text of the content element with the HTML markup stripped.
# Fail with a clear message if the element is missing instead of carrying a
# None forward and crashing on None.find(...) below.
if content_div is None:
    raise ValueError("Content element is missing; cannot extract the article text.")
text = content_div.get_text()

# The wanted passage runs from the first marker up to and including the second.
start_marker = "Muhammad Ali[a]"
end_marker = "Republic of Egypt."

# str.find returns -1 when the substring is absent; slicing with that value
# would silently produce the wrong text, so check each lookup explicitly.
start_index = text.find(start_marker)
if start_index == -1:
    raise ValueError(f"Start marker {start_marker!r} was not found in the page text.")

# Search for the end marker only after the start: first occurrence from there
end_index = text.find(end_marker, start_index)
if end_index == -1:
    raise ValueError(f"End marker {end_marker!r} was not found after the start marker.")
end_index += len(end_marker)  # move past the marker so the slice includes it

desired_text = text[start_index:end_index]
print(desired_text)


Muhammad Ali[a] (4 March 1769 – 2 August 1849) was the Ottoman Albanian[3] governor and de facto ruler of Egypt from 1805 to 1848, considered the founder of modern Egypt. At the height of his rule, he controlled Egypt, Sudan, Hejaz, Najd, the Levant, Crete and parts of Greece.
He was a military commander in an Albanian Ottoman force sent to recover Egypt from French occupation under Napoleon. Following Napoleon's withdrawal, Muhammad Ali rose to power through a series of political maneuvers, and in 1805 he was named Wāli (governor) of Egypt and gained the rank of Pasha.
As Wāli, Ali attempted to modernize Egypt by instituting dramatic reforms in the military, economic and cultural spheres. He also initiated a violent purge of the Mamluks, consolidating his rule and permanently ending the Mamluk hold over Egypt.
Militarily, Ali recaptured the Arabian territories for the sultan, and conquered Sudan of his own accord. His attempt at suppressing the Greek rebellion failed decisively, howev

# Text Preprocessing

## Removing reference sign [a]

In [None]:
# Strip bracketed reference marks such as [a] or [3].
# The pattern, piece by piece:
#   \[      a literal opening bracket
#   [^\]]+  one or more characters that are not a closing bracket
#   \]      a literal closing bracket
reference_mark = re.compile(r'\[[^\]]+\]')
cleaned_text = reference_mark.sub('', desired_text)
print(cleaned_text)

Muhammad Ali (4 March 1769 – 2 August 1849) was the Ottoman Albanian governor and de facto ruler of Egypt from 1805 to 1848, considered the founder of modern Egypt. At the height of his rule, he controlled Egypt, Sudan, Hejaz, Najd, the Levant, Crete and parts of Greece.
He was a military commander in an Albanian Ottoman force sent to recover Egypt from French occupation under Napoleon. Following Napoleon's withdrawal, Muhammad Ali rose to power through a series of political maneuvers, and in 1805 he was named Wāli (governor) of Egypt and gained the rank of Pasha.
As Wāli, Ali attempted to modernize Egypt by instituting dramatic reforms in the military, economic and cultural spheres. He also initiated a violent purge of the Mamluks, consolidating his rule and permanently ending the Mamluk hold over Egypt.
Militarily, Ali recaptured the Arabian territories for the sultan, and conquered Sudan of his own accord. His attempt at suppressing the Greek rebellion failed decisively, however, fo

# Removing symbols & punctuation

In [None]:
# Delete every character that is neither a word character (\w) nor whitespace (\s).
# For str patterns \w is Unicode-aware: besides ASCII letters, digits and "_"
# it also keeps letters such as the "ā" in "Wāli".
non_word_or_space = re.compile(r"[^\w\s]")
cleaned_text = non_word_or_space.sub("", cleaned_text)
print(cleaned_text)

Muhammad Ali 4 March 1769  2 August 1849 was the Ottoman Albanian governor and de facto ruler of Egypt from 1805 to 1848 considered the founder of modern Egypt At the height of his rule he controlled Egypt Sudan Hejaz Najd the Levant Crete and parts of Greece
He was a military commander in an Albanian Ottoman force sent to recover Egypt from French occupation under Napoleon Following Napoleons withdrawal Muhammad Ali rose to power through a series of political maneuvers and in 1805 he was named Wāli governor of Egypt and gained the rank of Pasha
As Wāli Ali attempted to modernize Egypt by instituting dramatic reforms in the military economic and cultural spheres He also initiated a violent purge of the Mamluks consolidating his rule and permanently ending the Mamluk hold over Egypt
Militarily Ali recaptured the Arabian territories for the sultan and conquered Sudan of his own accord His attempt at suppressing the Greek rebellion failed decisively however following an intervention by th

## Normalize the Data to Lower Case

In [None]:
# Fold the cleaned text to lower case (the normalization step of this section)
normalized_text = str.lower(cleaned_text)
print(normalized_text)

muhammad ali 4 march 1769  2 august 1849 was the ottoman albanian governor and de facto ruler of egypt from 1805 to 1848 considered the founder of modern egypt at the height of his rule he controlled egypt sudan hejaz najd the levant crete and parts of greece
he was a military commander in an albanian ottoman force sent to recover egypt from french occupation under napoleon following napoleons withdrawal muhammad ali rose to power through a series of political maneuvers and in 1805 he was named wāli governor of egypt and gained the rank of pasha
as wāli ali attempted to modernize egypt by instituting dramatic reforms in the military economic and cultural spheres he also initiated a violent purge of the mamluks consolidating his rule and permanently ending the mamluk hold over egypt
militarily ali recaptured the arabian territories for the sultan and conquered sudan of his own accord his attempt at suppressing the greek rebellion failed decisively however following an intervention by th

## Tokenize the data

In [None]:
# word_tokenize needs NLTK's Punkt tokenizer data. Older NLTK releases load the
# "punkt" package, while newer releases look up "punkt_tab" instead and raise
# LookupError without it, so fetch both to work on either.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

# Split the normalized text into word tokens
tokens = word_tokenize(normalized_text)

print(tokens)
print(f"unique words are: {len(set(tokens))}")


['muhammad', 'ali', '4', 'march', '1769', '2', 'august', '1849', 'was', 'the', 'ottoman', 'albanian', 'governor', 'and', 'de', 'facto', 'ruler', 'of', 'egypt', 'from', '1805', 'to', '1848', 'considered', 'the', 'founder', 'of', 'modern', 'egypt', 'at', 'the', 'height', 'of', 'his', 'rule', 'he', 'controlled', 'egypt', 'sudan', 'hejaz', 'najd', 'the', 'levant', 'crete', 'and', 'parts', 'of', 'greece', 'he', 'was', 'a', 'military', 'commander', 'in', 'an', 'albanian', 'ottoman', 'force', 'sent', 'to', 'recover', 'egypt', 'from', 'french', 'occupation', 'under', 'napoleon', 'following', 'napoleons', 'withdrawal', 'muhammad', 'ali', 'rose', 'to', 'power', 'through', 'a', 'series', 'of', 'political', 'maneuvers', 'and', 'in', '1805', 'he', 'was', 'named', 'wāli', 'governor', 'of', 'egypt', 'and', 'gained', 'the', 'rank', 'of', 'pasha', 'as', 'wāli', 'ali', 'attempted', 'to', 'modernize', 'egypt', 'by', 'instituting', 'dramatic', 'reforms', 'in', 'the', 'military', 'economic', 'and', 'cultur

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Remove Stop Words

In [None]:
nltk.download('stopwords')

# English stop-word list shipped with NLTK (kept as a list under its original name)
stop_words = stopwords.words('english')

# Testing membership against a list rescans it for every token; a set gives
# constant-time lookups and the same filtering result.
stop_word_set = set(stop_words)
filtered_tokens = [token for token in tokens if token not in stop_word_set]

# NOTE(review): punctuation was stripped from the text before this step, so any
# stop-list entry that contains an apostrophe can no longer match a token —
# confirm whether that matters for this analysis.

print(filtered_tokens)
print(f"unique words are: {len(set(filtered_tokens))}")

['muhammad', 'ali', '4', 'march', '1769', '2', 'august', '1849', 'ottoman', 'albanian', 'governor', 'de', 'facto', 'ruler', 'egypt', '1805', '1848', 'considered', 'founder', 'modern', 'egypt', 'height', 'rule', 'controlled', 'egypt', 'sudan', 'hejaz', 'najd', 'levant', 'crete', 'parts', 'greece', 'military', 'commander', 'albanian', 'ottoman', 'force', 'sent', 'recover', 'egypt', 'french', 'occupation', 'napoleon', 'following', 'napoleons', 'withdrawal', 'muhammad', 'ali', 'rose', 'power', 'series', 'political', 'maneuvers', '1805', 'named', 'wāli', 'governor', 'egypt', 'gained', 'rank', 'pasha', 'wāli', 'ali', 'attempted', 'modernize', 'egypt', 'instituting', 'dramatic', 'reforms', 'military', 'economic', 'cultural', 'spheres', 'also', 'initiated', 'violent', 'purge', 'mamluks', 'consolidating', 'rule', 'permanently', 'ending', 'mamluk', 'hold', 'egypt', 'militarily', 'ali', 'recaptured', 'arabian', 'territories', 'sultan', 'conquered', 'sudan', 'accord', 'attempt', 'suppressing', 'gr

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Stemming the Data

In [None]:
# A single PorterStemmer instance serves the whole token list
porter_stemmer = PorterStemmer()

# Stem each filtered token in turn, preserving order and duplicates
stemming_tokens = list(map(porter_stemmer.stem, filtered_tokens))

print(stemming_tokens)
unique_stem_count = len(set(stemming_tokens))
print(f"unique words are: {unique_stem_count}")

['muhammad', 'ali', '4', 'march', '1769', '2', 'august', '1849', 'ottoman', 'albanian', 'governor', 'de', 'facto', 'ruler', 'egypt', '1805', '1848', 'consid', 'founder', 'modern', 'egypt', 'height', 'rule', 'control', 'egypt', 'sudan', 'hejaz', 'najd', 'levant', 'crete', 'part', 'greec', 'militari', 'command', 'albanian', 'ottoman', 'forc', 'sent', 'recov', 'egypt', 'french', 'occup', 'napoleon', 'follow', 'napoleon', 'withdraw', 'muhammad', 'ali', 'rose', 'power', 'seri', 'polit', 'maneuv', '1805', 'name', 'wāli', 'governor', 'egypt', 'gain', 'rank', 'pasha', 'wāli', 'ali', 'attempt', 'modern', 'egypt', 'institut', 'dramat', 'reform', 'militari', 'econom', 'cultur', 'sphere', 'also', 'initi', 'violent', 'purg', 'mamluk', 'consolid', 'rule', 'perman', 'end', 'mamluk', 'hold', 'egypt', 'militarili', 'ali', 'recaptur', 'arabian', 'territori', 'sultan', 'conquer', 'sudan', 'accord', 'attempt', 'suppress', 'greek', 'rebellion', 'fail', 'decis', 'howev', 'follow', 'intervent', 'european', '

## Final Output

In [None]:
# Display the stop-word-filtered token list as this cell's output.
# NOTE(review): this shows filtered_tokens, not the stemming_tokens produced by
# the stemming step — confirm which list is meant to be the final output.
filtered_tokens

['muhammad',
 'ali',
 '4',
 'march',
 '1769',
 '2',
 'august',
 '1849',
 'ottoman',
 'albanian',
 'governor',
 'de',
 'facto',
 'ruler',
 'egypt',
 '1805',
 '1848',
 'considered',
 'founder',
 'modern',
 'egypt',
 'height',
 'rule',
 'controlled',
 'egypt',
 'sudan',
 'hejaz',
 'najd',
 'levant',
 'crete',
 'parts',
 'greece',
 'military',
 'commander',
 'albanian',
 'ottoman',
 'force',
 'sent',
 'recover',
 'egypt',
 'french',
 'occupation',
 'napoleon',
 'following',
 'napoleons',
 'withdrawal',
 'muhammad',
 'ali',
 'rose',
 'power',
 'series',
 'political',
 'maneuvers',
 '1805',
 'named',
 'wāli',
 'governor',
 'egypt',
 'gained',
 'rank',
 'pasha',
 'wāli',
 'ali',
 'attempted',
 'modernize',
 'egypt',
 'instituting',
 'dramatic',
 'reforms',
 'military',
 'economic',
 'cultural',
 'spheres',
 'also',
 'initiated',
 'violent',
 'purge',
 'mamluks',
 'consolidating',
 'rule',
 'permanently',
 'ending',
 'mamluk',
 'hold',
 'egypt',
 'militarily',
 'ali',
 'recaptured',
 'arabian'

In [None]:
# Gather the filtered tokens that are shorter than three characters,
# then record how many of them there are
words = [token for token in filtered_tokens if len(token) < 3]
count = len(words)

In [None]:
# Show the short tokens and, on the next line, how many were found
print(words, count, sep="\n")

['4', '2', 'de']
3
