In [None]:
import re
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import urllib.request

In [None]:
# Download the NLTK resources this notebook uses:
#   punkt     - tokenizer models behind word_tokenize
#   stopwords - the English stopword list read in preprocess_text
#   wordnet   - corpus that WordNetLemmatizer.lemmatize looks words up in;
#               without it the first lemmatize call raises LookupError
# NOTE(review): recent NLTK releases appear to load tokenizer data from
# 'punkt_tab' instead of 'punkt' - add that download if word_tokenize
# reports a missing resource. TODO confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def preprocess_text(text):
    """Turn raw text into a list of cleaned, lemmatized word tokens.

    Steps, in order: lowercase, strip HTML-like tags, tokenize with
    ``word_tokenize``, keep only purely alphabetic tokens, drop English
    stopwords, and lemmatize each remaining token with
    ``WordNetLemmatizer`` (called without a part-of-speech argument).

    Args:
        text: Input string, possibly containing HTML markup.

    Returns:
        A list of token strings in their original order. An empty list is
        returned when ``text`` is empty or ``None``.
    """
    # Nothing to tokenize; also avoids AttributeError on None.lower().
    if not text:
        return []

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags. Each tag is replaced by a space (not an empty string)
    # so words separated only by markup, e.g. "<p>foo</p><p>bar</p>", stay
    # separate tokens instead of fusing into "foobar".
    text = re.sub(r'<.*?>', ' ', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Remove punctuation and non-alphabetic characters
    tokens = [word for word in tokens if word.isalpha()]

    # Remove stopwords (set membership keeps the filter O(1) per token)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

In [None]:
# Download the article's HTML.
PAGE_URL = "https://cp-algorithms.com/data_structures/segment_tree.html"

# The context manager closes the connection even if read() raises, and the
# timeout keeps the cell from hanging forever on an unresponsive server.
with urllib.request.urlopen(PAGE_URL, timeout=30) as response:
    mybytes = response.read()
    # Prefer the charset announced in the Content-Type header; fall back to
    # UTF-8 when the server does not declare one.
    charset = response.headers.get_content_charset() or "utf8"

extracted_html = mybytes.decode(charset)

# Show only a preview: the full document is large and would flood the output.
print(extracted_html[:1000])


<!doctype html>
<html lang="en" class="no-js">
  <head>
    
      <meta charset="utf-8">
      <meta name="viewport" content="width=device-width,initial-scale=1">
      
        <meta name="description" content="The goal of this project is to translate the wonderful resource http://e-maxx.ru/algo which provides descriptions of many algorithms and data structures especially popular in field of competitive programming. Moreover we want to improve the collected knowledge by extending the articles and adding new articles to the collection.">
      
      
      
        <link rel="canonical" href="https://cp-algorithms.com/data_structures/segment_tree.html">
      
      
        <link rel="prev" href="sqrt_decomposition.html">
      
      
        <link rel="next" href="treap.html">
      
      
        <link rel="alternate" type="application/rss+xml" title="RSS feed" href="../feed_rss_created.xml">
        <link rel="alternate" type="application/rss+xml" title="RSS feed of updated co

In [None]:
# Build a parse tree from the downloaded markup with the 'html.parser' backend.
html_after_parsing = BeautifulSoup(markup=extracted_html, features='html.parser')

# Flatten the tree into one string of its text content.
text = html_after_parsing.get_text()

# Display the extracted text.
print(text)















Segment Tree - Algorithms for Competitive Programming
















          Skip to content
        


















              Algorithms for Competitive Programming
            





              
                Segment Tree
              
            





































            Initializing search
          












    cp-algorithms/cp-algorithms
  















          
  
  Home

        



          
  
  Algebra

        



          
  
  Data Structures

        



          
  
  Dynamic Programming

        



          
  
  String Processing

        



          
  
  Linear Algebra

        



          
  
  Combinatorics

        



          
  
  Numerical Methods

        



          
  
  Geometry

        



          
  
  Graphs

        



          
  
  Miscellaneous

        














    Algorithms for Competitive Programming
  






    cp-algorithms/cp-algorithms
  







    Home
 

In [None]:
# Run the page text through the cleaning pipeline and show the token list.
cleaned_tokens = preprocess_text(text=text)
print(cleaned_tokens)

['segment', 'tree', 'algorithm', 'competitive', 'programming', 'skip', 'content', 'algorithm', 'competitive', 'programming', 'segment', 'tree', 'initializing', 'search', 'home', 'algebra', 'data', 'structure', 'dynamic', 'programming', 'string', 'processing', 'linear', 'algebra', 'combinatorics', 'numerical', 'method', 'geometry', 'graph', 'miscellaneous', 'algorithm', 'competitive', 'programming', 'home', 'home', 'main', 'page', 'navigation', 'tag', 'index', 'contribute', 'code', 'conduct', 'preview', 'algebra', 'algebra', 'fundamental', 'fundamental', 'binary', 'exponentiation', 'euclidean', 'algorithm', 'computing', 'greatest', 'common', 'divisor', 'extended', 'euclidean', 'algorithm', 'linear', 'diophantine', 'equation', 'fibonacci', 'number', 'prime', 'number', 'prime', 'number', 'sieve', 'eratosthenes', 'linear', 'sieve', 'primality', 'test', 'integer', 'factorization', 'function', 'function', 'euler', 'totient', 'function', 'number', 'divisor', 'sum', 'divisor', 'modular', 'arit

In [None]:
# Unique tokens.
unique_tokens = set(cleaned_tokens)

# `cleaned_tokens` is still rebound to the set so any later cell that relies
# on that behaviour keeps working.
cleaned_tokens = unique_tokens

# Print in sorted order: iteration order of a set of strings can differ
# between kernel sessions, which would make this output irreproducible.
print(sorted(unique_tokens))

{'second', 'due', 'available', 'derived', 'generating', 'enumerating', 'estimated', 'compute', 'triangulation', 'factorial', 'obviously', 'noted', 'std', 'sieve', 'touch', 'applying', 'answering', 'minus', 'child', 'parallel', 'task', 'correct', 'idea', 'break', 'tarjan', 'halve', 'recursively', 'particular', 'tr', 'satisfy', 'multiple', 'reduction', 'entry', 'traversing', 'occurrence', 'determinant', 'dp', 'structure', 'split', 'return', 'table', 'take', 'component', 'least', 'induction', 'partial', 'an', 'whenever', 'geometry', 'slightly', 'divisor', 'contain', 'receive', 'rb', 'translated', 'alternatively', 'int', 'request', 'somewhat', 'call', 'jxu', 'text', 'exponentiation', 'conducted', 'incrementally', 'manipulation', 'range', 'thus', 'spanning', 'splitting', 'proposition', 'aryamn', 'arbitrary', 'natural', 'simpler', 'traversed', 'assigning', 'nullptr', 'kuhn', 'equivalent', 'perform', 'descending', 'moment', 'gaurangtandon', 'hashing', 'try', 'lazy', 'convenient', 'class', 'cl