# demo notebook

## setup

### imports

In [1]:
from dotenv import load_dotenv
from langchain.llms import OpenAI

import os


### api keys

In [2]:
# Pull settings from a local .env file into the environment; the call's return
# value is what this cell displays.
# NOTE(review): presumably this is where OPENAI_API_KEY comes from for the
# OpenAI calls further down — confirm against your .env file.
load_dotenv()
# Kept commented out: as the last expression of the cell it would be echoed
# into the saved notebook output.
#os.environ["OPENAI_API_KEY"]

True

## chain demo

In [3]:
# Imports for this cell, grouped up front.
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

# Model handle; temperature=0.9 is forwarded to the OpenAI wrapper as-is.
llm = OpenAI(temperature=0.9)

# One-slot prompt: {product} is filled in when the chain is run.
prompt = PromptTemplate(
    template="What is a good name for a company that makes {product}?",
    input_variables=["product"],
)

# Combine prompt and model into a chain and request a single suggestion;
# the returned string is the cell's output.
chain = LLMChain(prompt=prompt, llm=llm)
chain.run("colorful socks")

'\n\nBrightSteps Socks.'

## chromaDB vectorstore demo

In [4]:
%%capture
# Install the extra packages for the vector-store demos; %%capture keeps pip's
# log out of the notebook.
# NOTE(review): versions are not pinned — consider pinning for reproducible runs.
%pip install chromadb tiktoken

In [5]:
# Imports for this cell. RetrievalQA and OpenAI are not used in this cell;
# the imports are kept so the names stay available to later cells.
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.llms import OpenAI

# Loader for the demo text file, read as UTF-8.
loader = TextLoader("./data/demo/state_of_the_union.txt", encoding="utf8")

# Build a queryable index from the loader in a single step.
index = VectorstoreIndexCreator().from_loaders([loader])

# Ask one question; the result (question, answer, sources) is the cell output.
query = "What did the president say about Ketanji Brown Jackson"
index.query_with_sources(query)

Using embedded DuckDB without persistence: data will be transient


{'question': 'What did the president say about Ketanji Brown Jackson',
 'answer': " The president said that he nominated Circuit Court of Appeals Judge Ketanji Brown Jackson, one of the nation's top legal minds, to continue Justice Breyer's legacy of excellence. He also mentioned that she has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans.\n",
 'sources': './data/demo/state_of_the_union.txt'}

### chromaDB vectorstore demo step-by-step

In [6]:
# Everything this cell uses is imported here, so it also runs when the
# previous demo cell (which used to provide RetrievalQA) has been skipped.
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# 1. Load the demo text file (UTF-8).
loader = TextLoader('./data/demo/state_of_the_union.txt', encoding='utf8')
documents = loader.load()

# 2. Split the loaded documents into chunks (chunk_size=1000, no overlap).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# 3. Embed the chunks and put them into a Chroma vector store.
embeddings = OpenAIEmbeddings()
db = Chroma.from_documents(texts, embeddings)

# 4. Expose the store as a retriever and plug it into a "stuff" QA chain.
retriever = db.as_retriever()
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever)

# 5. Ask the same question that the URL-based section asks later on; the
#    answer string is the cell's output.
query = "what are some rocks you know?"
qa.run(query)

Using embedded DuckDB without persistence: data will be transient


" I don't know."

## urls to vectorstore demo

In [7]:
# Pages to scrape: three "minres" concept pages of thesaurus.geolba.ac.at,
# requested with lang=de. Only the numeric concept id differs between them.
urls = [
    f"https://thesaurus.geolba.ac.at/?uri=http://resource.geolba.ac.at/minres/{concept_id}&lang=de"
    for concept_id in (28, 83, 78)
]

### selenium

Requires (headless) WebDrivers to be installed for Chrome or Firefox.

In [8]:
%%capture
# Install the packages for the Selenium-based URL loader; %%capture keeps pip's
# log out of the notebook.
# NOTE(review): versions are not pinned — consider pinning for reproducible runs.
%pip install unstructured tabulate pdf2image pytesseract selenium

In [14]:
from langchain.document_loaders import SeleniumURLLoader

# Load every page listed in `urls` (defined in an earlier cell) with Firefox
# as the browser, and show the resulting documents as the cell output.
loader = SeleniumURLLoader(browser="firefox", urls=urls)
data = loader.load()
data

[Document(page_content='Home\n                            (current)\n\nTutorial\n\nAbout\n\nFeedback\n\nDE\n                        \n                            Deutsch\n                            English\n\nAlunit\n\nURI: \n                                    http://resource.geolba.ac.at/minres/28\n                                        \xa0\xa0\xa0⇒\n                                     RDF download\n\nAlunit  de\n\nAlunite  en\n\nNotation:\n\nAln\n\nVor allem im Verwitterungsbereich von Braunkohlenvorkommen aus den begleitenden Sulfidkomponenten (Pyrit) hervorgegangenes sulfatisches Mineral, welches insbesondere in der Vergangenheit lokal auch zur Gewinnung von „Alaun“ herangezogen wurde. Ein Gewinnungsort war der Tiefbau Eglsee bei Krems . Alunit wird der Gruppe der Industrieminerale zugeordnet. (Weber & Weiss, 1983)\n\n[PDF]\xa0-\n\n[Catalog]\n\nVerwandte Begriffe, Relationen\n\nbroader\n\nIndustrieminerale\n\nexactMatch\n\nCommodityCodeValue/alunite (INSPIRE)\n\nskos:prefLabel

In [15]:
# Everything this cell uses is imported here; previously RetrievalQA and
# OpenAI were only available if much earlier cells had been run.
from langchain.chains import RetrievalQA
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# 1. Split the scraped documents (`data` comes from the loader cell above)
#    into chunks (chunk_size=1000, no overlap).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

# 2. Embed the chunks and put them into a Chroma vector store.
embeddings = OpenAIEmbeddings()
db = Chroma.from_documents(texts, embeddings)

# 3. Expose the store as a retriever and plug it into a "stuff" QA chain.
retriever = db.as_retriever()
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever)

# 4. Same question as in the text-file demo, now asked against the scraped
#    pages; the answer string is the cell's output.
query = "what are some rocks you know?"
qa.run(query)

Using embedded DuckDB without persistence: data will be transient


' Alunit, Anhydrit, Anthrazit, Antimon, Arsen, Asbest, Baryt, Bauxit, Bentonit, Beryllium, Bismut, Blei, Braunkohle, Brecherprodukte aus Basalt, Diabas, Brecherprodukte aus Kalkstein, Dolomit, Brecherprodukte aus Sandstein, Quarzit, Diatomit, Disthen, Dolomit, Feldspat, Feuerfeste Tone, Fluorit, Gabbro, Dolerit (Dekorsteine), Gagat, Gangquarz, Gips, Glanzbraunkohle, Glimmer, Granit, Syenit u.s.w. (Dekorsteine), Hämatit.'

### playwright

Requires (headless) WebDrivers to be installed for Chrome or Firefox, plus an additional library (see the output of the shell command `playwright install`).

In [11]:
%%capture
%pip install unstructured tabulate pdf2image pytesseract playwright
!playwright install

Playwright requires async I/O, which we would have to custom-build here because Jupyter notebooks already run their own async event loop. Use a standalone Python file instead.

In [12]:
# Intentionally disabled: per the markdown note above this cell, the
# Playwright loader is not run inside the notebook.
# NOTE(review): demo.py (executed in the next cell) presumably contains the
# equivalent code — confirm, and consider deleting this block if so.
#from langchain.document_loaders import PlaywrightURLLoader
#
#urls = [
#    "https://thesaurus.geolba.ac.at/?uri=http://resource.geolba.ac.at/minres/28&lang=de",
#    "https://thesaurus.geolba.ac.at/?uri=http://resource.geolba.ac.at/minres/83&lang=de",
#    "https://thesaurus.geolba.ac.at/?uri=http://resource.geolba.ac.at/minres/78&lang=de"
#]
#
#loader = PlaywrightURLLoader(urls=urls, remove_selectors=["header", "footer"], headless=True)
#
#data = loader.load()
#
#data

In [13]:
!python demo.py

[Document(page_content='Alunit\n\nURI: \n                                    http://resource.geolba.ac.at/minres/28\n                                        \xa0\xa0\xa0⇒\n                                     RDF download\n\nAlunit  de\n\nAlunite  en\n\nNotation:\n\nAln\n\nVor allem im Verwitterungsbereich von Braunkohlenvorkommen aus den begleitenden Sulfidkomponenten (Pyrit) hervorgegangenes sulfatisches Mineral, welches insbesondere in der Vergangenheit lokal auch zur Gewinnung von „Alaun“ herangezogen wurde. Ein Gewinnungsort war der Tiefbau Eglsee bei Krems . Alunit wird der Gruppe der Industrieminerale zugeordnet. (Weber & Weiss, 1983)\n\nVerwandte Begriffe, Relationen\n\nbroader\n\nIndustrieminerale\n\nexactMatch\n\nCommodityCodeValue/alunite (INSPIRE)\n\nskos:prefLabel\n\nAlunit  deAlunite  en\n\nskos:definition\n\nVor allem im Verwitterungsbereich von Braunkohlenvorkommen aus den begleitenden Sulfidkomponenten (Pyrit) hervorgegangenes sulfatisches Mineral, welches insbesondere