# Get sub URLs from a parent URL
Web page information embedding and retrieval
* https://www.youtube.com/watch?v=RBnuhhmD21U&t=5s
* https://colab.research.google.com/drive/1f_1HeD1mK_wXfjgvY4VGNFKSQBE5Imeh?usp=sharing
* https://python.langchain.com/docs/integrations/document_loaders/url

In [1]:
import requests
from bs4 import BeautifulSoup

In [12]:
# This function blindly take all the underlying url list

def get_all_links(url):
    # Send a GET request
    response = requests.get(url)
    # If the GET request is successful, the status code will be 200
    if response.status_code == 200:
        # Get the content of the response
        page_content = response.content
        # Create a BeautifulSoup object and specify the parser
        soup = BeautifulSoup(page_content, 'html.parser')
        # Find all 'a' tags on the page
        links = soup.find_all('a')
        # Create a list to store all valid URLs
        urls = []
        for link in links:
            # Get the href attribute of the link
            href = link.get('href')
            # If the href attribute is not None and it is a valid URL
            if href and href.startswith('http'):
                urls.append(href)
        return urls

In [13]:
PROGRAM_URL = ['https://www.cit.tum.de/en/cit/studies/degree-programs/master-informatics/', 
               "https://www.uni-mannheim.de/en/academics/programs/mannheim-master-in-management/",
               "https://www.tu.berlin/en/studying/study-programs/all-programs-offered/study-course/computer-science-informatik-m-sc",
               "https://www.tu.berlin/en/studierendensekretariat/dates-deadlines"]

In [4]:
SITEMAP = get_all_links(PROGRAM_URL)


InvalidSchema: No connection adapters were found for "['https://www.cit.tum.de/en/cit/studies/degree-programs/master-informatics/', 'https://www.uni-mannheim.de/en/academics/programs/mannheim-master-in-management/']"

In [5]:
SITEMAP;

NameError: name 'SITEMAP' is not defined

In [6]:
TEST_URLs = [
    'https://www.mosaicml.com/blog/mpt-7b',
    'https://stability.ai/blog/stability-ai-launches-the-first-of-its-stablelm-suite-of-language-models', 
    'https://lmsys.org/blog/2023-03-30-vicuna/'
]

# Scrape-langchain
https://github.com/linuxandchill/scrape-langchain-function-calling/blob/main/scrapey_scrape.ipynb

In [7]:
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from langchain.chains import create_extraction_chain



In [8]:
async def run_playwright(site):
    data = ""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)

        page = await browser.new_page()
        await page.goto(site)

        page_source = await page.content()
        soup = BeautifulSoup(page_source, "html.parser")
        
        for script in soup(["script", "style"]): # remove all javascript and stylesheet code
            script.extract()
        # get text
        text = soup.get_text()
        # break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in text.splitlines())
        # break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # drop blank lines
        data = '\n'.join(chunk for chunk in chunks if chunk)

        await browser.close()
    return data



In [9]:
program_description = await run_playwright(PROGRAM_URL)

Error: url: expected string, got object

In [None]:
program_description

'Master Informatics - TUM - TUM School of Computation, Information and Technology\nSkip to content\nGoogle Suche\nde\nen\nMenu\nTUM School of Computation, Information and Technology\nTechnical University of Munich\nHomeStudiesBefore your StudiesDegree ProgramsBachelor BioinformatikBachelor Elektrotechnik InformationstechnikBachelor InformatikBachelor Informatics: Games EngineeringBachelor Information EngineeringBachelor WirtschaftsinformatikMaster BioinformatikMaster Biomedical ComputingMaster Communications and Electronics EngineeringMaster Computational Science EngineeringMaster Data Engineering and AnalyticsMaster Electrical Engineering and Information TechnologyMaster InformaticsInterdisciplinary ProjectCurriculumpart-timeMaster Informatics: Games EngineeringMaster Information SystemsMaster Neuroengineering – ElitestudiengangMaster Robotics, Cognition, IntelligenceMaster Finance and Information ManagementSoftware Engineering - Elite Graduate ProgramDuring your StudiesInternationalR

# LangChain URL loader 

In [None]:
#!brew install libmagic
# this is to fix the issue: libmagic is unavailable but assists in filetype detection on file-like objects. Please consider installing libmagic for better results.
# https://github.com/langchain-ai/langchain/issues/5342

In [14]:
from langchain.document_loaders import UnstructuredURLLoader
loaders = UnstructuredURLLoader(urls=PROGRAM_URL)
data = loaders.load()

In [15]:
# Text Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# divide first 1000 words as one chunk, overlap is 200 words. if your setence has dependency on previous sentence, better have a overlap
# if you have temperal dependency -> use overlap
# remember that overlap will be more expensive for embedding as well
text_splitter = RecursiveCharacterTextSplitter( 
                                      chunk_size=1000, 
                                      chunk_overlap=200)


docs = text_splitter.split_documents(data)

In [9]:
docs

[Document(page_content='Skip to content\n\nGoogle Suche\n\nde\n\nen\n\nMenu\n\nGoogle Custom Search\n\nWir verwenden Google für unsere Suche. Mit Klick auf „Suche aktivieren“ aktivieren Sie das Suchfeld und akzeptieren die Nutzungsbedingungen.\n\nHinweise zum Einsatz der Google Suche\n\nTUM School of Computation, Information and Technology\n\nTechnical University of Munich\n\nHome', metadata={'source': 'https://www.cit.tum.de/en/cit/studies/degree-programs/master-informatics/'}),
 Document(page_content='Hinweise zum Einsatz der Google Suche\n\nTUM School of Computation, Information and Technology\n\nTechnical University of Munich\n\nHome\n\nStudiesBefore your StudiesDegree ProgramsBachelor BioinformatikBachelor Elektrotechnik InformationstechnikBachelor InformatikBachelor Informatics: Games EngineeringBachelor Information EngineeringBachelor WirtschaftsinformatikMaster BioinformatikMaster Biomedical ComputingMaster Communications and Electronics EngineeringMaster Computational Science 

In [16]:
len(docs)

113

# Information Embedding 
* you can use ChromaDB, Pinecone, FAISS
* ChromaDB: https://www.youtube.com/watch?v=3yPBVii7Ct0&list=PL8motc6AQftk1Bs42EW45kwYbyJ4jOdiZ&index=21
* ensure that the input will not exceeding the token limits

In [17]:
import sys
sys.path.append("..") # Adds higher directory to python module path
from dotenv import find_dotenv, load_dotenv

load_dotenv(find_dotenv())

In [18]:
# !pip install langchain
# !pip install chromadb
# !pip install openai

In [20]:
from langchain.callbacks import get_openai_callback # to get the cost
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings() # text-embedding-ada-002

In [21]:
# Create the DB
# Embed and store the texts
# Supplying a persist_directory will store the embedding on disk
persist_directory = 'vectorDatabase'

## Here is the new embedding being used
embedding = embeddings

with get_openai_callback() as cb:
    vectordb = Chroma.from_documents(documents = docs,
                                     embedding=embedding,
                                     persist_directory=persist_directory)
    print(f"Total Tokens: {cb.total_tokens}")
    print(f"Prompt Tokens: {cb.prompt_tokens}")
    print(f"Completion Tokens: {cb.completion_tokens}")
    print(f"Successful Requests: {cb.successful_requests}")
    print(f"Total Cost (USD): ${cb.total_cost}")

Total Tokens: 0
Prompt Tokens: 0
Completion Tokens: 0
Successful Requests: 0
Total Cost (USD): $0.0


In [22]:
# persiste the db to disk
vectordb.persist()