In [13]:
import os
from dotenv import load_dotenv
load_dotenv()

# Service credentials are looked up in the process environment after the
# load_dotenv() call above; a variable that is not set yields None.

# OpenAI
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Hugging Face (note: the environment variable is named HF_ACCESS_TOKEN)
HF_TOKEN = os.environ.get('HF_ACCESS_TOKEN')

# Google Programmable Search: engine id and API key
GOOGLE_CSE_ID = os.environ.get('GOOGLE_CSE_ID')
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')

# HTML2Text

HTML2Text converts HTML into plain text (with markdown-like formatting) without any specific tag manipulation.

Use it to extract human-readable text when you do not need to manipulate specific HTML elements.

In [2]:
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer

# Pages to fetch: two documents hosted on gesetze-im-internet.de.
urls = ["https://www.gesetze-im-internet.de/baf_gzuschlagsv/BJNR009350986.html",
        "https://www.gesetze-im-internet.de/baf_g/BJNR014090971.html"
        ]

# Fetch every URL in `urls` and keep the loaded documents in `docs`.
loader = AsyncHtmlLoader(urls)
docs = loader.load()

# Run the loaded documents through Html2TextTransformer; the result is kept
# in `docs_transformed` for inspection in the following cell.
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)

Fetching pages: 100%|##########| 2/2 [00:04<00:00,  2.11s/it]


In [8]:
# Preview a 400-character window (offsets 1000-1400) of the first transformed document.
print(docs_transformed[0].page_content[1000:1400])

32 am 1.1.1991 in Kraft getreten

Nichtamtliches Inhaltsverzeichnis

### Eingangsformel

Auf Grund des § 13 Abs. 4 des Bundesausbildungsforderungsgesetzes in der
Fassung der Bekanntmachung vom 6. Juni 1983 (BGBl. I S. 645) verordnet die
Bundesregierung mit Zustimmung des Bundesrates:

Nichtamtliches Inhaltsverzeichnis

### § 1 Zuschlage zu dem Bedarf

(1) Bei einer Ausbildung im Ausland werden in 


# Beautiful Soup

Beautiful Soup offers more fine-grained control over HTML content, enabling specific tag extraction, removal, and content cleaning. It's suited for cases where you want to extract specific information and clean up the HTML content according to your needs.

In [None]:
import pprint

from langchain.chains import create_extraction_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import AsyncChromiumLoader
from langchain.document_transformers import BeautifulSoupTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

import nest_asyncio

# NOTE(review): presumably required so the async Chromium loader can run inside
# the notebook's already-running event loop — confirm.
nest_asyncio.apply()


# Chat model used by the extraction helper defined below (temperature 0,
# model pinned to a dated snapshot).
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")


# Schema passed to the extraction chain: two string properties, both required.
schema = {
    "properties": {
        "news_article_title": {"type": "string"},
        "news_article_summary": {"type": "string"},
    },
    "required": ["news_article_title", "news_article_summary"],
}


def extract(content: str, schema: dict):
    """Run an extraction chain built from ``schema`` over ``content``.

    A new chain is created on every call from the notebook-level ``llm``.

    Args:
        content: Text handed to the chain's ``run`` method.
        schema: Extraction schema forwarded to ``create_extraction_chain``.

    Returns:
        Whatever the chain's ``run`` call returns.
    """
    chain = create_extraction_chain(llm=llm, schema=schema)
    result = chain.run(content)
    return result


def scrape_with_playwright(urls, schema, tags_to_extract=("span",), chunk_size=1000):
    """Load pages with a headless browser and run LLM extraction on the first chunk.

    The pages are loaded with ``AsyncChromiumLoader``, reduced to the text of
    the requested tags with ``BeautifulSoupTransformer``, split into
    token-sized chunks, and only the first chunk is sent to ``extract``.

    Args:
        urls: List of page URLs to load.
        schema: Extraction schema forwarded to ``extract``.
        tags_to_extract: HTML tags whose text is kept. Defaults to ``("span",)``,
            the value that used to be hard-coded.
        chunk_size: Chunk size passed to the tiktoken-based splitter.
            Defaults to 1000, the value that used to be hard-coded.

    Returns:
        The result of ``extract`` for the first chunk, or an empty list when
        the pages produced no chunks at all (for example when nothing was
        found inside the requested tags).
    """
    loader = AsyncChromiumLoader(urls)
    docs = loader.load()
    bs_transformer = BeautifulSoupTransformer()
    docs_transformed = bs_transformer.transform_documents(
        docs, tags_to_extract=list(tags_to_extract)
    )
    print("Extracting content with LLM")

    # Split everything into chunks of `chunk_size` tokens; only the first
    # chunk is processed below to keep the LLM call small.
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=0
    )
    splits = splitter.split_documents(docs_transformed)

    # Without this guard an empty scrape would fail with IndexError on splits[0].
    if not splits:
        print("No text found in the requested tags; nothing to extract")
        return []

    extracted_content = extract(schema=schema, content=splits[0].page_content)
    pprint.pprint(extracted_content)
    return extracted_content


# Run the scraper on a single site. Note that this rebinds the notebook-level
# `urls` that the HTML2Text cell also assigns.
urls = ["https://www.wsj.com"]
extracted_content = scrape_with_playwright(urls, schema=schema)

# SerpAPI

In [8]:
<img src="https://miro.medium.com/v2/resize:fit:4800/format:webp/1*5TnpUZnp4-sq8TuJGYe_-w.png" alt="image info" />

SyntaxError: invalid syntax (4280331964.py, line 1)

In [39]:
from langchain import OpenAI, SerpAPIWrapper
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType

# SerpAPI key, read from the environment.
SERPAPI_API_KEY = os.environ.get('SERPAPI_API_KEY')

# Completion model (temperature=0) and the SerpAPI search wrapper.
llm = OpenAI(temperature=0)
search = SerpAPIWrapper(serpapi_api_key=SERPAPI_API_KEY)

# The agent gets exactly one tool, backed by the search wrapper.
# NOTE(review): the tool name looks significant to the SELF_ASK_WITH_SEARCH
# agent type — confirm before renaming it.
tools = [
    Tool(
        name="Intermediate Answer",
        func=search.run,
        description="useful for when you need to ask with search",
    ),
]

self_ask_with_search = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.SELF_ASK_WITH_SEARCH,
    verbose=True,
)

# The final answer is the cell's displayed value.
self_ask_with_search.run(
    "Where can I find the Wohnungsaufsichtsgesetz for Berlin Germany?"
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m Yes.
Follow up: What is the Wohnungsaufsichtsgesetz?[0m
Intermediate answer: [36;1m[1;3m['Ein Wohnungsaufsichtsgesetz regelt Pflichten der Vermieter. Deutschland Bearbeiten. In Deutschland existieren in einigen Bundesländern ...'][0m
[32;1m[1;3mFollow up: Where can I find the Wohnungsaufsichtsgesetz for Berlin Germany?[0m
Intermediate answer: [36;1m[1;3m["It was based on a paragraph in Berlin's Wohnungsaufsichtsgesetz (housing regulation) which states that every rental flat must have adequate ..."][0m
[32;1m[1;3mSo the final answer is: It was based on a paragraph in Berlin's Wohnungsaufsichtsgesetz (housing regulation) which states that every rental flat must have adequate ...[0m

[1m> Finished chain.[0m


"It was based on a paragraph in Berlin's Wohnungsaufsichtsgesetz (housing regulation) which states that every rental flat must have adequate ..."

# Google Programmable Search 
https://cse.google.com/cse.js?cx=8566b4b2cb2364df3

In [53]:
from langchain.tools import Tool
from langchain.utilities import GoogleSearchAPIWrapper


# Google search wrapper configured explicitly with the API key and search-engine
# id loaded from the environment in the first cell. This rebinds `search`,
# which the SerpAPI cell also assigns.
search = GoogleSearchAPIWrapper(google_api_key=GOOGLE_API_KEY, google_cse_id=GOOGLE_CSE_ID)

def top5_results(query, num_results=5):
    """Return the top search results for ``query``.

    Args:
        query: Search string forwarded to the notebook-level ``search`` wrapper.
        num_results: Number of results to request. Defaults to 5 so the
            behaviour matches the function name (6 used to be hard-coded).

    Returns:
        Whatever ``search.results`` returns for the query.
    """
    return search.results(query, num_results)


# Wrap the search helper as a Tool with a name and description.
tool = Tool(
    name="Google Search Snippets",
    description="Search Google for recent results.",
    func=top5_results,
)


# Invoke the tool directly with a sample query; the result is the cell's output.
tool.run("Wohnungsaufsichtsgesetz?")

[{'title': 'Wohnungsaufsichtsgesetz – Wikipedia',
  'link': 'https://de.wikipedia.org/wiki/Wohnungsaufsichtsgesetz',
  'snippet': 'Ein Wohnungsaufsichtsgesetz regelt Pflichten der Vermieter. Deutschland Bearbeiten. In Deutschland existieren in einigen Bundesländern\xa0...'},
 {'title': 'Wohnungsaufsichtsgesetz (Berlin) – Wikipedia',
  'link': 'https://de.wikipedia.org/wiki/Wohnungsaufsichtsgesetz_(Berlin)',
  'snippet': 'Das Gesetz zur Beseitigung von Wohnungsmißständen in Berlin (Wohnungsaufsichtsgesetz – WoAufG Bln) in der Fassung vom 3.'},
 {'title': 'Susanna Kahlefeld - Gesetz- und Verordnungsblatt',
  'link': 'https://www.berlin.de/sen/justiz/service/gesetze-und-verordnungen/2020/ausgabe-nr-15-vom-18-4-2020-s-245-256.pdf',
  'snippet': 'Apr 18, 2020 ... zur Änderung des Wohnungsaufsichtsgesetzes. Vom 7. April 2020 ... Das Wohnungsaufsichtsgesetz in der Fassung vom 3. April 1990.'},
 {'title': 'Gesetz- und Verordnungsblatt',
  'link': 'https://www.berlin.de/sen/justiz/service/geset

# HfAgent

In [54]:
from huggingface_hub import login
# Authenticate with the Hugging Face token loaded from the environment
# (HF_ACCESS_TOKEN, read into HF_TOKEN in the first cell) instead of pasting
# a token literal into the notebook.
login(token=HF_TOKEN)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/mz/.cache/huggingface/token
Login successful


In [None]:
from transformers.tools import HfAgent
import textract

# Extract the text of the local PDF; the result of textract.process is decoded as UTF-8.
text = textract.process('debatten.pdf').decode('utf-8')

# Agent pointed at the hosted StarCoder inference endpoint.
agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
# NOTE(review): the entire document text is interpolated into the prompt string;
# a long PDF may exceed what the endpoint accepts — consider passing the text
# to the agent as a separate variable instead. TODO confirm.
agent.run(f"can you summarize {text} for me")

# Chat Prompt Template

In [43]:
import openai
from langchain.prompts import ChatPromptTemplate
from langchain.prompts.chat import SystemMessage, HumanMessagePromptTemplate
from langchain.chat_models import ChatOpenAI

# Chat prompt: a fixed system instruction followed by the user's text.
# The system message is assembled from adjacent string literals, so every
# fragment after the first starts with a space to keep sentences separated
# (previously "living." ran straight into "You try").
template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content=(
                "You are a very technical economist, reformulating the user's text into economic terms,"
                " with references to income, work and quality of living."
                " You try to extract economic keywords from the user's text,"
                " the keywords should help to find the corresponding statistic in a statistical database."
            )
        ),
        HumanMessagePromptTemplate.from_template("{text}"),
    ]
)


# Send one sample sentence through the prompt; the model's reply is the cell's output.
llm = ChatOpenAI(model_name="gpt-3.5-turbo")
llm(template.format_messages(text='20 years ago I could afford to eat steak every day.'))

AIMessage(content='Two decades ago, my income allowed me to purchase steak on a daily basis, indicating a relatively high standard of living.', additional_kwargs={}, example=False)

# Retriever

In [None]:
from langchain.chat_models.openai import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.web_research import WebResearchRetriever
from langchain.utilities import GoogleSearchAPIWrapper
from langchain.vectorstores import Chroma

from langchain.chains import RetrievalQAWithSourcesChain

# Chroma vector store persisted under ./chroma_db_oai, using OpenAI embeddings.
vectorstore = Chroma(
    persist_directory="./chroma_db_oai",
    embedding_function=OpenAIEmbeddings(),
)

# Chat model with temperature=0.
llm = ChatOpenAI(temperature=0)

# Google Programmable Search wrapper; no credentials are passed explicitly here.
search = GoogleSearchAPIWrapper()

# Retriever assembled from the search wrapper, the chat model and the vector store.
web_research_retriever = WebResearchRetriever.from_llm(
    search=search,
    llm=llm,
    vectorstore=vectorstore,
)

# Question-answering chain with sources on top of the web-research retriever.
user_input = "What was the relative income 20 years ago in germany."
qa_chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    retriever=web_research_retriever,
)

result = qa_chain({"question": user_input})
result