### Packages installation

In [1]:
!playwright install
%pip install -q langchain-openai langchain playwright beautifulsoup4
! pip install streamlit==1.25.0 chromadb==0.4.3 tiktoken faiss-cpu==1.7.4 pydantic==1.10.18
! pip install tml2text==2020.1.16 google-api-core==2.11.1 google-api-python-client==2.95.0
! pip install google-auth google-auth-httplib2 googleapis-common-protos==1.59.1 langchain_community

╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libwoff2dec.so.1.0.2                             ║
║     libgstgl-1.0.so.0                                ║
║     libgstcodecparsers-1.0.so.0                      ║
║     libharfbuzz-icu.so.0                             ║
║     libenchant-2.so.2                                ║
║     libsecret-1.so.0                                 ║
║     libhyphen.so.0                                   ║
║     libmanette-0.2.so.0                              ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.10/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:216:9)
[90m    at process.processTicksAndRejections (node:internal/process/task_queues:95:5)[39m
    at async Registry._validateHostRequirements (/usr/local/lib/python3.10/dist-p

In [2]:
!pip install nest_asyncio




### Webscraping with BeautifulSoup

In [3]:
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
import nest_asyncio


# nest_asyncio is applied first so the async loader can run inside the notebook.
nest_asyncio.apply()

# Load the page through AsyncChromiumLoader; `html` is the list it returns.
loader = AsyncChromiumLoader(
    ["https://en.wikipedia.org/wiki/The_Tortured_Poets_Department"]
)
html = loader.load()

# Reduce each loaded document to the text of its <span> tags.
bs_transformer = BeautifulSoupTransformer()
docs_transformed = bs_transformer.transform_documents(
    html,
    tags_to_extract=["span"],
)

# Preview: first 1000 characters of the first transformed document.
docs_transformed[0].page_content[:1000]



'Main menu Main page Contents Current events Random article About Wikipedia Contact us Donate Help Learn to edit Community portal Recent changes Upload file    Search Appearance Create account Log in Personal tools Create account Log in learn more Contributions Talk 1 Background and conception 2 Composition Toggle Composition subsection 2.1 Themes and lyrics 2.2 Production and music 3 Marketing Toggle Marketing subsection 3.1 Aesthetic 3.2 Promotion and release 4 Critical reception Toggle Critical reception subsection 4.1 Reviews 4.2 Post-review commentary 5 Commercial performance 6 Accolades 7 Track listing 8 Personnel 9 Charts Toggle Charts subsection 9.1 Weekly charts 9.2 Monthly charts 10 Certifications 11 Release history 12 Notes 13 References Toggle the table of contents 31 languages Azərbaycanca Català Čeština Deutsch Español Esperanto فارسی Français 한국어 Hrvatski Bahasa Indonesia Íslenska Italiano עברית Lietuvių Magyar Nederlands 日本語 Polski Português Română Русский Shqip Simple 

### Loader for Multiple URLs


In [4]:
from langchain_community.document_loaders import AsyncHtmlLoader

# Pages to fetch in a single AsyncHtmlLoader batch.
urls = [
    "https://www.espn.com",
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
]

# `docs` holds whatever the loader returns for these URLs.
loader = AsyncHtmlLoader(urls)
docs = loader.load()

Fetching pages: 100%|##########| 2/2 [00:00<00:00,  2.43it/s]


### Webscraping with Html2Text

In [5]:
!pip install html2text
from langchain_community.document_transformers import Html2TextTransformer

html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)
docs_transformed[0].page_content[0:500]



'Skip to main content  Skip to navigation\n\n<\n\n>\n\nMenu\n\n## ESPN\n\n  *   *   *   * scores\n\nNEW! Find where to watch all of your favorite sports!\n\n  * NFL\n  * NBA\n  * MLB\n  * NCAAF\n  * NHL\n  * Soccer\n  * …\n\n    * WNBA\n    * Boxing\n    * CFL\n    * NCAA\n    * Cricket\n    * F1\n    * Golf\n    * Horse\n    * LLWS\n    * MMA\n    * NASCAR\n    * NBA G League\n    * NBA Summer League\n    * NCAAM\n    * NCAAW\n    * NWSL\n    * Olympics\n    * PLL\n    * Professional Wrestling\n    * Racing\n    * RN BB\n    * RN FB\n    '

### Web Scraping and Extraction with schema

In [10]:
# Using OpenAI
from google.colab import userdata
from langchain_openai import ChatOpenAI

# The API key is read from Colab's user secrets rather than written in the notebook.
openai_api_key = userdata.get('OPENAI_API_KEY')

# Chat model shared by the extraction cells below.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=openai_api_key,
)

# # Using CohereAI
# !pip install --upgrade cohere
# from langchain_cohere import ChatCohere
# from google.colab import userdata
# cohere_api_key = userdata.get('COHERE_API_KEY')

# llm = ChatCohere(cohere_api_key=cohere_api_key)


In [16]:
from langchain.chains import create_extraction_chain

# Every field of an extracted record is a required plain string, so the schema
# is generated from the list of field names.
_required_fields = ["news_article_title", "news_article_summary"]

schema = {
    "properties": {field: {"type": "string"} for field in _required_fields},
    "required": list(_required_fields),
}


def extract(content: str, schema: dict):
    """Run schema-guided extraction over ``content`` using the notebook's ``llm``.

    Args:
        content: Text to extract structured fields from.
        schema: Schema dict passed through to ``create_extraction_chain``.

    Returns:
        Whatever ``create_extraction_chain(...).run(content)`` returns.
    """
    # Pass the chat model itself. `llm.with_structured_output` (without a call)
    # is just a bound method, not a language model, so the chain could not use it.
    chain = create_extraction_chain(schema=schema, llm=llm)
    return chain.run(content)

### Webscraping with Playwright

In [18]:
import pprint

from langchain_text_splitters import RecursiveCharacterTextSplitter


def scrape_with_playwright(urls, schema):
    """Load ``urls`` with AsyncChromiumLoader and return their <span> text as chunks.

    Args:
        urls: URLs handed to ``AsyncChromiumLoader``.
        schema: Only referenced by the disabled extraction step noted below;
            the active code path does not use it.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` on the
        span-only documents.
    """
    nest_asyncio.apply()
    raw_pages = AsyncChromiumLoader(urls).load()

    # Keep only the text inside <span> tags.
    span_only = BeautifulSoupTransformer().transform_documents(
        raw_pages, tags_to_extract=['span']
    )

    # Token-based splitter: chunk_size=1000, no overlap between chunks.
    chunker = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1000, chunk_overlap=0
    )

    # NOTE(review): the schema-driven extraction step is disabled, so the raw
    # chunks are returned instead of extracted records. The disabled step was:
    #   extracted_content = extract(schema=schema, content=splits[0].page_content)
    #   pprint.pprint(extracted_content)
    #   return extracted_content
    return chunker.split_documents(span_only)


# Scrape a single Wikipedia article.
urls = [
    "https://en.wikipedia.org/wiki/The_Tortured_Poets_Department",
]
# NOTE: scrape_with_playwright currently returns the split chunks, so despite
# its name this variable holds chunks rather than extracted records.
extracted_content = scrape_with_playwright(urls=urls, schema=schema)



### Research automation

In [None]:
from langchain.retrievers.web_research import WebResearchRetriever
from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [None]:
# Vector store persisted under ./chroma_db_oai, embedded with OpenAI embeddings.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
vectorstore = Chroma(
    embedding_function=embeddings,
    persist_directory="./chroma_db_oai",
)

# Chat model used by the research retriever and QA chain (rebinds `llm`).
llm = ChatOpenAI(
    temperature=0,
    openai_api_key=openai_api_key,
)

# Google search credentials, read from Colab user secrets.
google_api_key = userdata.get('GOOGLE_API_KEY')
google_cse_id = userdata.get('GOOGLE_CSE_ID')

# Search wrapper configured with those credentials.
search = GoogleSearchAPIWrapper(
    google_api_key=google_api_key,
    google_cse_id=google_cse_id,
)

In [None]:
# Build the web-research retriever from the vector store, LLM and search wrapper.
# Recent langchain_community releases refuse to construct WebResearchRetriever
# unless `allow_dangerous_requests=True` is passed, because the retriever fetches
# arbitrary URLs returned by the search. Only enable this in an environment
# where such outbound requests are acceptable.
web_research_retriever = WebResearchRetriever.from_llm(
    vectorstore=vectorstore,
    llm=llm,
    search=search,
    allow_dangerous_requests=True,
)

In [None]:
# Run
import logging

from langchain.chains import RetrievalQAWithSourcesChain

# Raise the web-research retriever's logger to INFO so its messages are shown.
logging.basicConfig()
logging.getLogger("langchain.retrievers.web_research").setLevel(logging.INFO)

user_input = "When does Taylor Swift release her new album?"

# QA-with-sources chain that pulls its context from the web-research retriever.
qa_chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm,
    retriever=web_research_retriever,
)
result = qa_chain({"question": user_input})
result

### WebScraping with Apify


In [None]:
!pip install apify-client langchain-openai langchain

from langchain.docstore.document import Document
from langchain.indexes import VectorstoreIndexCreator
from langchain_community.utilities import ApifyWrapper

from google.colab import userdata
apify_api_token = userdata.get('APIFY_API_TOKEN')

apify = ApifyWrapper(apify_api_token=apify_api_token)
# Call the Actor to obtain text from the crawled webpages
loader = apify.call_actor(
    actor_id="apify/website-content-crawler",
    run_input={"startUrls": [{"url": "/docs/integrations/chat/"}]},
    dataset_mapping_function=lambda item: Document(
        page_content=item["text"] or "", metadata={"source": item["url"]}
    ),
)

# Create a vector store based on the crawled data
index = VectorstoreIndexCreator().from_loaders([loader])

# Query the vector store
query = "Are any OpenAI chat models integrated in LangChain?"
result = index.query(query)
print(result)