In [2]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

def extract_text_from_html(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    paragraphs = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    text = ' '.join([p.get_text() for p in paragraphs])

    return text

def process_text(text):
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)

    normalized_text = cleaned_text.lower()

    words = word_tokenize(normalized_text)

    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in lemmatized_words if word not in stop_words]

    return filtered_words

def get_unique_words(words):
    return list(set(words))

url = 'https://en.wikipedia.org/wiki/Valorant'
html_text = extract_text_from_html(url)
processed_words = process_text(html_text)
unique_words = get_unique_words(processed_words)

print("Unique words:")
print(unique_words)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique words:
['service', 'fight', 'power', 'condition', 'strike', 'collecting', 'currency', 'concurrent', 'sitebetter', 'period', 'version', 'andor', 'formulating', 'qualify', 'definition', 'side', 'tpm', 'us', 'higher', 'followed', 'randomized', 'instant', 'lost', 'overall', 'regular', 'forfeit', 'spent', 'bot', 'competitive', 'unable', 'flash', 'often', 'initial', 'people', 'better', 'initiator', 'modification', 'tick', 'resultant', 'forwardbetter', 'onto', 'purchasing', 'shoot', 'attack', 'include', 'well', 'barrier', 'title', 'talked', 'orb', 'completing', 'creating', 'gamemodes', 'ever', 'entryway', 'ultimate', 'rather', 'november', 'among', 'premier', 'communication', 'enter', 'behind', 'watched', 'spray', 'director', 'ult', 'using', 'master', 'rank', 'signature', 'machine', 'found', 'goslin', 'lockin', 'blind', 'modern', 'trouble', 'kernel', 'trevor', 'dedicated', 'system', 'disabling', 'introduced', 'voicechat', 'third', 'hat', 'furthermore', 'yearlong', 'disabled', 'playstyle