# Data:
### Extract HTML form URL

In [64]:
import requests

url = "https://www.searchenginejournal.com/introduction-to-python-seo-spreadsheets/342779/"

response = requests.get(url)
html_content = response.text

### Extract text from HTML page

In [65]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_content, "html.parser")
text = soup.get_text()
# text = " ".join(
#     [p.get_text() for p in soup.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6"])]
# )

# Processing on Data:
## Convert to lower case

In [66]:
def convert_to_lower(text):
    copy_text = text
    return copy_text.lower()
text = convert_to_lower(text)

## Removal of punctuations

In [67]:
import string
def remove_punctuations(text):
    copy_text = text
    punc = string.punctuation
    return copy_text.translate(str.maketrans('', '', punc))
text = remove_punctuations(text)

## Removal of stopwords

In [68]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [69]:
STOPWORDS = stopwords.words('english')
def remove_stopwords(text):
    copy_text = text
    copy_text = " ".join([word for word in text.split() if word not in STOPWORDS])
    return copy_text
text = remove_stopwords(text)

## Removal of special characters

In [70]:
import re
def remove_special_chars(text):
    copy_text = text
    copy_text = re.sub('[^a-zA-Z0-9]', " ", copy_text)
    copy_text = re.sub('\s+', ' ', copy_text)
    return copy_text
text = remove_special_chars(text)

  copy_text = re.sub('\s+', ' ', copy_text)


## Stemming

In [71]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stem_words(text):
    copy_text = text
    return " ".join([ps.stem(word) for word in text.split()])
# text = stem_words(text)

In [72]:
stem_words("Going worried")

'go worri'

### Lemmatization

In [73]:
# import nltk
# # nltk.download('wordnet')
# # nltk.download('averaged_perceptron_tagger')

In [74]:
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}
def lemmatize_words(text):
    copy_text = text
    # find pos tags
    pos_text = pos_tag(copy_text.split())
    print(pos_text)
    # print(pos_text)
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])
text = lemmatize_words(text)

[('introduction', 'NN'), ('python', 'NN'), ('seo', 'NN'), ('pros', 'NNS'), ('using', 'VBG'), ('spreadsheets', 'NNS'), ('skip', 'JJ'), ('content', 'JJ'), ('latest', 'JJS'), ('news', 'NN'), ('seo', 'NN'), ('news', 'NN'), ('ppc', 'NN'), ('news', 'NN'), ('social', 'JJ'), ('media', 'NNS'), ('news', 'NN'), ('webinars', 'NNS'), ('podcast', 'VBP'), ('agencies', 'NNS'), ('career', 'VBP'), ('webinar', 'NN'), ('2024', 'CD'), ('google', 'NN'), ('serp', 'NN'), ('features', 'VBZ'), ('new', 'JJ'), ('strategies', 'NNS'), ('gain', 'VBP'), ('visibility', 'NN'), ('following', 'VBG'), ('presentation', 'NN'), ('tom', 'IN'), ('answering', 'VBG'), ('serp', 'JJ'), ('feature', 'JJ'), ('questions', 'NNS'), ('live', 'VBP'), ('qa', 'JJ'), ('won', 'VBD'), ('t', 'JJ'), ('want', 'JJ'), ('miss', 'NN'), ('come', 'VB'), ('prepared', 'JJ'), ('register', 'NN'), ('free', 'JJ'), ('seo', 'NN'), ('seo', 'NN'), ('seo', 'JJ'), ('news', 'NN'), ('seo', 'NN'), ('strategy', 'NN'), ('ask', 'VB'), ('seo', 'JJ'), ('enterprise', 'NN')

In [75]:
lemmatize_words("Going worried")

[('Going', 'VBG'), ('worried', 'VBD')]


'Going worry'

In [76]:
import spacy

def lemmatize_spacy(text):
    # Load the English language model
    nlp = spacy.load("en_core_web_sm")

    # Process the text with spaCy
    doc = nlp(text)

    # Lemmatize each token in the processed text
    lemmatized_text = " ".join([token.lemma_ for token in doc])
    return lemmatized_text
# Print the lemmatized text
lemmatize_spacy(text)

'introduction python seo pro use spreadsheet skip content late news seo news ppc news social medium news webinar podcast agency career webinar 2024 google serp feature new strategy gain visibility follow presentation tom answer serp feature question live qa win t want miss come prepared register free seo seo seo news seo strategy ask seo enterprise seo google algorithm update international seo link build local seo mobile seo onpage seo technical seo vertical seo wordpress seo web dev seo webinar holistic website health technical tip seo success learn tip improve website technical health include crawlability indexability site speed accessibility register free pay medium pay medium pay medium news pay strategy ask ppc expert display ad ppc programmatic social medium advertise video advertising ebook b2b lead generation create content convert strategy guide first step towards attract highquality lead revolutionize think lead generation download content content content news content strateg

In [77]:
lemmatize_spacy("Going")

'go'

## Removal of URLs

In [78]:
def remove_url(text):
    copy_text = text
    return re.sub(r'https?://\S+|www\.\S+', '', copy_text)
text = remove_url(text)

## Removal of HTML tags

In [79]:
def remove_html_tags(text):
    copy_text = text
    return re.sub(r'<.*?>', '', copy_text)
text = remove_html_tags(text)

In [80]:
def remove_digits(text):
    copy_text = text
    return ''.join([i for i in copy_text if not i.isdigit()])
text = remove_digits(text)

# Unique Words

In [81]:
print(len(text.split()))

2393


In [82]:
all_unique = set(text.split())
print(len(all_unique))

806


In [83]:
all_unique

{'able',
 'access',
 'accessibility',
 'action',
 'actionqueryformatjsonpropinfotitlesavengersaendgame',
 'actually',
 'ad',
 'adapt',
 'add',
 'adoption',
 'advertise',
 'advertisement',
 'advertising',
 'affiliate',
 'agency',
 'aggregation',
 'agile',
 'ai',
 'algorithm',
 'algorithmic',
 'algorithms',
 'alias',
 'allocation',
 'allow',
 'alonso',
 'alpha',
 'already',
 'also',
 'always',
 'anaconda',
 'analysis',
 'analytics',
 'analyze',
 'another',
 'answer',
 'anything',
 'api',
 'app',
 'apparently',
 'approach',
 'arithmetic',
 'article',
 'ask',
 'atoi',
 'attract',
 'audit',
 'auth',
 'authauthenticateuser',
 'authenticate',
 'authentication',
 'author',
 'authority',
 'automate',
 'automatically',
 'available',
 'avenger',
 'avoid',
 'awesome',
 'back',
 'bar',
 'basic',
 'batchcomplete',
 'batista',
 'bb',
 'begin',
 'beginner',
 'behavior',
 'best',
 'big',
 'binding',
 'bio',
 'bit',
 'blindly',
 'block',
 'boolean',
 'booleans',
 'brand',
 'break',
 'brings',
 'browser'

In [84]:
all  = text.split()
print(all)
g = []
for i in range(len(all)):
    if len(all[i]) < 3:
        g.append(all[i])
    
final_text = " ".join(g)
final_text

['introduction', 'python', 'seo', 'pro', 'use', 'spreadsheet', 'skip', 'content', 'late', 'news', 'seo', 'news', 'ppc', 'news', 'social', 'medium', 'news', 'webinars', 'podcast', 'agency', 'career', 'webinar', 'google', 'serp', 'feature', 'new', 'strategy', 'gain', 'visibility', 'follow', 'presentation', 'tom', 'answer', 'serp', 'feature', 'question', 'live', 'qa', 'win', 't', 'want', 'miss', 'come', 'prepared', 'register', 'free', 'seo', 'seo', 'seo', 'news', 'seo', 'strategy', 'ask', 'seo', 'enterprise', 'seo', 'google', 'algorithm', 'update', 'international', 'seo', 'link', 'build', 'local', 'seo', 'mobile', 'seo', 'onpage', 'seo', 'technical', 'seo', 'vertical', 'seo', 'wordpress', 'seo', 'web', 'dev', 'seo', 'webinar', 'holistic', 'website', 'health', 'technical', 'tip', 'seo', 'success', 'learn', 'tip', 'improve', 'website', 'technical', 'health', 'include', 'crawlability', 'indexability', 'site', 'speed', 'accessibility', 'register', 'free', 'pay', 'medium', 'pay', 'medium', 'pa

'qa t ad bb u u ai qa t ad u r k go s t go do go go in go s s v s s s id s id q q d q q s q u ns en en tz s ns en en tz ns en en tz jq we ve u s s ns rs rs r s go s q t c de la id df pd df s gc s s u ai'