## Imports

In [35]:
import re

#  Scraping

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin


# Langchain
from langchain.document_loaders import WebBaseLoader


## Data

In [2]:
def extract_links_from_url(url):
    """
    Extracts all the links from the given URL.
    
    Parameters:
    - url (str): The URL from which links are to be extracted.
    
    Returns:
    - List of unique URLs present on the webpage.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    links = set()
    base_url = urlparse(url).scheme + "://" + urlparse(url).hostname

    for a_tag in soup.find_all("a", href=True):
        href = a_tag.attrs["href"]
        full_url = urljoin(base_url, href)
        links.add(full_url)
    
    return list(links)

In [8]:
def scrape_multiple_webpages(urls, use_async=False, rps=2, verify_ssl=True, proxies=None):
    """
    Scrapes multiple webpages and returns them as documents.
    
    Parameters:
    - urls (list): A list of URLs to be scraped.
    - use_async (bool): Whether to load the URLs asynchronously.
    - rps (int): Requests per second, for asynchronous loading.
    - verify_ssl (bool): Whether to verify SSL certificates during requests.
    - proxies (dict): Dictionary containing http and https proxies.
    
    Returns:
    - List of documents corresponding to the URLs.
    """
    
    loader = WebBaseLoader(urls)
    loader.requests_kwargs = {'verify': verify_ssl}
    
    if proxies:
        loader.requests_kwargs['proxies'] = proxies

    if use_async:
        import nest_asyncio
        nest_asyncio.apply()
        
        loader.requests_per_second = rps
        return loader.aload()
    else:
        return loader.load()

## Get Data

In [3]:
#define origine
website_url = "https://www.sbert.net/docs/package_reference/SentenceTransformer.html"

In [4]:
#Scrap all url
all_links = extract_links_from_url(website_url)

In [6]:
all_links

['https://www.sbert.net/evaluation.html',
 'https://www.sphinx-doc.org/',
 'https://www.sbert.net/examples/unsupervised_learning/README.html',
 'https://twitter.com/Nils_Reimers',
 'https://www.sbert.net#sentence_transformers.SentenceTransformer',
 'https://www.sbert.net#sentence_transformers.SentenceTransformer.save',
 'https://www.sbert.net/examples/training/quora_duplicate_questions/README.html',
 'https://www.sbert.net/examples/applications/image-search/README.html',
 'https://www.sbert.net/examples/training/nli/README.html',
 'https://github.com/readthedocs/sphinx_rtd_theme',
 'https://www.sbert.net/pretrained_cross-encoders.html',
 'https://www.sbert.net#sentence_transformers.SentenceTransformer.tokenizer',
 'https://www.sbert.net#sentence_transformers.SentenceTransformer.tokenize',
 'https://www.sbert.net#sentence_transformers.SentenceTransformer.get_max_seq_length',
 'https://www.sbert.net/losses.html',
 'https://www.sbert.net#sentence_transformers.SentenceTransformer.evaluate'

## Filter web data

In [11]:
def filter_links(all_links):
    to_verify = []
    filtered_links = []
    
    for link in all_links:
        if link.startswith("https://www.sbert"):
            filtered_links.append(link)
        else:
            to_verify.append(link)
    
    return filtered_links, to_verify

In [12]:
all_links, to_verify = filter_links(all_links)

In [13]:
all_links

['https://www.sbert.net/evaluation.html',
 'https://www.sbert.net/examples/unsupervised_learning/README.html',
 'https://www.sbert.net#sentence_transformers.SentenceTransformer',
 'https://www.sbert.net#sentence_transformers.SentenceTransformer.save',
 'https://www.sbert.net/examples/training/quora_duplicate_questions/README.html',
 'https://www.sbert.net/examples/applications/image-search/README.html',
 'https://www.sbert.net/examples/training/nli/README.html',
 'https://www.sbert.net/pretrained_cross-encoders.html',
 'https://www.sbert.net#sentence_transformers.SentenceTransformer.tokenizer',
 'https://www.sbert.net#sentence_transformers.SentenceTransformer.tokenize',
 'https://www.sbert.net#sentence_transformers.SentenceTransformer.get_max_seq_length',
 'https://www.sbert.net/losses.html',
 'https://www.sbert.net#sentence_transformers.SentenceTransformer.evaluate',
 'https://www.sbert.net#sentence_transformers.SentenceTransformer.fit',
 'https://www.sbert.net/examples/applications/p

In [14]:
to_verify

['https://www.sphinx-doc.org/',
 'https://twitter.com/Nils_Reimers',
 'https://github.com/readthedocs/sphinx_rtd_theme',
 'https://github.com/UKPLab/sentence-transformers/blob/master/docs/package_reference/SentenceTransformer.md',
 'https://readthedocs.org']

## Etract text  data

In [15]:
# Now, we can use previously defined function to scrape these links:
documents = scrape_multiple_webpages(all_links)

In [17]:
type(documents)

list

In [19]:
documents[0]

Document(page_content='\n\n404 Not Found\n\nNot Found\nThe requested URL was not found on this server.\n\nApache/2.4.29 (Ubuntu) Server at www.sbert.net Port 443\n\n', metadata={'source': 'https://www.sbert.net/evaluation.html', 'title': '404 Not Found', 'language': 'No language found.'})

In [20]:
documents[1]

Document(page_content='\n\n\n\n\n\nUnsupervised Learning — Sentence-Transformers  documentation\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n Sentence-Transformers\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\nOverview\n\nInstallation\nQuickstart\nPretrained Models\nPretrained Cross-Encoders\nPublications\nHugging Face ðŸ¤—\n\nUsage\n\nComputing Sentence Embeddings\nSemantic Textual Similarity\nSemantic Search\nRetrieve & Re-Rank\nClustering\nParaphrase Mining\nTranslated Sentence Mining\nCross-Encoders\nImage Search\n\nTraining\n\nTraining Overview\nMultilingual-Models\nModel Distillation\nCross-Encoders\nAugmented SBERT\n\nTraining Examples\n\nSemantic Textual Similarity\nNatural Language Inference\nParaphrase Data\nQuora Duplicate Questions\nMS MARCO\n\nUnsupervised Learning\n\nUnsupervised Learning\nTSDAE\nSimCSE\nCT\nCT (In-Batch Negative Sampling)\nMasked Language Model (MLM)\nGenQ\nGPL\nPerformance Comparison\n\n\nDomain Adaptation\n\nPackage Reference\n\nSentenceTransformer\nutil

## Clean Text data
- Remove 404 error content
- Clean text
   - Removing HTML tags and content
   - Removing Markdown-specific syntax
   - Converting Unicode characters to their actual representation
   - Removing URLs
   - Removing extra white spaces

### Filter 404

In [23]:
def filter_documents(documents):
    manual_check = []
    filtered_documents = []
    
    for doc in documents:
        if doc.metadata['title'] == '404 Not Found':
            manual_check.append(doc)
        else:
            filtered_documents.append(doc)
    
    return filtered_documents, manual_check

In [24]:
documents, manual_check = filter_documents(documents)

In [30]:
documents[0].metadata['title']

'Unsupervised Learning — Sentence-Transformers  documentation'

In [31]:
documents[0].page_content

'\n\n\n\n\n\nUnsupervised Learning — Sentence-Transformers  documentation\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n Sentence-Transformers\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\nOverview\n\nInstallation\nQuickstart\nPretrained Models\nPretrained Cross-Encoders\nPublications\nHugging Face ðŸ¤—\n\nUsage\n\nComputing Sentence Embeddings\nSemantic Textual Similarity\nSemantic Search\nRetrieve & Re-Rank\nClustering\nParaphrase Mining\nTranslated Sentence Mining\nCross-Encoders\nImage Search\n\nTraining\n\nTraining Overview\nMultilingual-Models\nModel Distillation\nCross-Encoders\nAugmented SBERT\n\nTraining Examples\n\nSemantic Textual Similarity\nNatural Language Inference\nParaphrase Data\nQuora Duplicate Questions\nMS MARCO\n\nUnsupervised Learning\n\nUnsupervised Learning\nTSDAE\nSimCSE\nCT\nCT (In-Batch Negative Sampling)\nMasked Language Model (MLM)\nGenQ\nGPL\nPerformance Comparison\n\n\nDomain Adaptation\n\nPackage Reference\n\nSentenceTransformer\nutil\nModels\nLosses\nEval

In [27]:
manual_check

[Document(page_content='\n\n404 Not Found\n\nNot Found\nThe requested URL was not found on this server.\n\nApache/2.4.29 (Ubuntu) Server at www.sbert.net Port 443\n\n', metadata={'source': 'https://www.sbert.net/evaluation.html', 'title': '404 Not Found', 'language': 'No language found.'}),
 Document(page_content='\n\n404 Not Found\n\nNot Found\nThe requested URL was not found on this server.\n\nApache/2.4.29 (Ubuntu) Server at www.sbert.net Port 443\n\n', metadata={'source': 'https://www.sbert.net/pretrained_cross-encoders.html', 'title': '404 Not Found', 'language': 'No language found.'}),
 Document(page_content='\n\n404 Not Found\n\nNot Found\nThe requested URL was not found on this server.\n\nApache/2.4.29 (Ubuntu) Server at www.sbert.net Port 443\n\n', metadata={'source': 'https://www.sbert.net/losses.html', 'title': '404 Not Found', 'language': 'No language found.'}),
 Document(page_content='\n\n404 Not Found\n\nNot Found\nThe requested URL was not found on this server.\n\nApache

### Clean text

In [32]:
def clean_text(text):
    """
    Cleans the provided text by:
    - Removing HTML tags and content
    - Removing Markdown-specific syntax
    - Converting Unicode characters to their actual representation
    - Removing URLs
    - Removing extra white spaces
    
    Args:
    - text (str): The input string to be cleaned.
    
    Returns:
    - str: The cleaned string.
    """
    
    # 1. Remove HTML tags using BeautifulSoup
    soup = BeautifulSoup(text, "html.parser")
    no_html = soup.get_text(separator=' ')
    
    # 2. Remove Markdown Syntax
    no_markdown = re.sub(r'\!\[.*?\]\(.*?\)|\[(.*?)\]\(.*?\)|\*\*.*?\*\*|\*.*?\*|#[^\n]*', '', no_html)
    
    # 3. Convert Unicode characters (for common entities; can be expanded further)
    no_unicode = re.sub(r'&amp;', '&', no_markdown)
    no_unicode = re.sub(r'&lt;', '<', no_unicode)
    no_unicode = re.sub(r'&gt;', '>', no_unicode)
    
    # 4. Remove URLs
    no_urls = re.sub(r'http[s]?://\S+', '', no_unicode)
    
    # 5. Remove extra white spaces
    clean_string = ' '.join(no_urls.split())
    
    return clean_string

In [33]:
def clean_documents(documents):
    for doc in documents:
        doc.page_content = clean_text(doc.page_content)
    return documents

In [36]:
cleaned_documents = clean_documents(documents)

In [44]:
titles = [doc.metadata['title'] for doc in cleaned_documents]
print(titles)
len(titles)

['Unsupervised Learning — Sentence-Transformers  documentation', 'SentenceTransformers Documentation — Sentence-Transformers  documentation', 'SentenceTransformers Documentation — Sentence-Transformers  documentation', 'Quora Duplicate Questions — Sentence-Transformers  documentation', 'Image Search — Sentence-Transformers  documentation', 'Natural Language Inference — Sentence-Transformers  documentation', 'SentenceTransformers Documentation — Sentence-Transformers  documentation', 'SentenceTransformers Documentation — Sentence-Transformers  documentation', 'SentenceTransformers Documentation — Sentence-Transformers  documentation', 'SentenceTransformers Documentation — Sentence-Transformers  documentation', 'SentenceTransformers Documentation — Sentence-Transformers  documentation', 'Paraphrase Mining — Sentence-Transformers  documentation', 'Multilingual-Models — Sentence-Transformers  documentation', 'Cross-Encoders — Sentence-Transformers  documentation', 'Translated Sentence Mini

38

In [37]:
cleaned_documents[0].page_content

'Unsupervised Learning — Sentence-Transformers documentation Sentence-Transformers Overview Installation Quickstart Pretrained Models Pretrained Cross-Encoders Publications Hugging Face ðŸ¤— Usage Computing Sentence Embeddings Semantic Textual Similarity Semantic Search Retrieve & Re-Rank Clustering Paraphrase Mining Translated Sentence Mining Cross-Encoders Image Search Training Training Overview Multilingual-Models Model Distillation Cross-Encoders Augmented SBERT Training Examples Semantic Textual Similarity Natural Language Inference Paraphrase Data Quora Duplicate Questions MS MARCO Unsupervised Learning Unsupervised Learning TSDAE SimCSE CT CT (In-Batch Negative Sampling) Masked Language Model (MLM) GenQ GPL Performance Comparison Domain Adaptation Package Reference SentenceTransformer util Models Losses Evaluation Datasets cross_encoder Sentence-Transformers » Unsupervised Learning Edit on GitHub Unsupervised LearningÂ¶ This page contains a collection of unsupervised learning me