### Download Dropbox data to use for demonstration

In [None]:
%pip install llama-index
%pip install azure-core
%pip install pdf2image
%pip install pandas 
%pip install matplotlib
%pip install opencv-python
%pip install Pillow
%pip install azure-cognitiveservices-vision-computervision
%pip install azure-cognitiveservices-search-imagesearch
%pip install azure-cognitiveservices-search-newssearch
%pip install numpy
%pip install azure-cognitiveservices-search-websearch
%pip install tabulate
%pip install html2text

# Hide output and messages
import warnings
warnings.filterwarnings('ignore')

In [None]:
# set text wrapping
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
from llama_index import download_loader
from pathlib import Path
import os
os.environ['OPENAI_API_KEY'] = "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"

In [None]:
UnstructuredReader = download_loader("UnstructuredReader", refresh_cache=True)
loader = UnstructuredReader()
doc_set = {}
all_docs = []
years = [2023]
for year in years:
    ubs_docs = loader.load_data(file=Path(f'../data/full-report-ubs-group-ag-consolidated-1q23.pdf'), split_documents=False)
    # insert year metadata into each year
    for d in ubs_docs:
        d.extra_info = {"year": year}
    doc_set[year] = ubs_docs
    all_docs.extend(ubs_docs)

### Setup Service Context

In [None]:
from llama_index import ServiceContext, GPTVectorStoreIndex, GPTTreeIndex
service_context = ServiceContext.from_defaults(chunk_size_limit=512)

In [None]:
# initialize simple vector indices + global vector index
# NOTE: don't run this cell if the indices are already loaded! 
index_set = {}
for year in years:
    cur_index = GPTVectorStoreIndex.from_documents(doc_set[year], service_context=service_context)
    index_set[year] = cur_index

In [None]:
help(index_set[2023])

In [None]:
index_set[2023].set_index_id("chatmyapp")

# Demonstrate MONGODB connection

In [None]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [None]:
from llama_index import GPTListIndex, SimpleMongoReader
from IPython.display import Markdown, display
import os

In [None]:
host = "<host>"
port = "<port>"
db_name = "<db_name>"
collection_name = "<collection_name>"
# query_dict is passed into db.collection.find()
query_dict = {}
field_names = ["text"]
reader = SimpleMongoReader(host, port)
documents = reader.load_data(db_name, collection_name, field_names, query_dict=query_dict)

In [None]:
query_dict = {}
field_names = ["text"]
reader = SimpleMongoReader(host, port)

# documents = reader.load_data(db_name, collection_name, field_names, query_dict=query_dict)
# set Logging to DEBUG for more detailed outputs
query_engine = new_var.as_query_engine()
response = query_engine.query("summarize financial results for the quarter")
display(Markdown(f"<b>{response}</b>"))

### Slack data connector



In [None]:
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [None]:
from llama_index import GPTListIndex, SlackReader
from IPython.display import Markdown, display

In [None]:
slack_token = os.getenv("SLACK_BOT_TOKEN")
channel_ids = ["<channel_id>"]
documents = SlackReader(slack_token=slack_token).load_data(channel_ids=channel_ids)
index = GPTListIndex.from_documents(documents)

In [None]:
# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
response = query_engine.query("<query_text>")
display(Markdown(f"<b>{response}</b>"))

### Web Page Reader

In [None]:
from llama_index import GPTListIndex, SimpleWebPageReader
documents = SimpleWebPageReader(html_to_text=True).load_data(["http://paulgraham.com/worked.html"])

In [None]:
display(Markdown(f"<b>{documents[0]}</b>"))

In [21]:
from llama_index import ServiceContext, GPTVectorStoreIndex

service_context = ServiceContext.from_defaults(chunk_size_limit=512)

In [23]:
global_index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 30776 tokens
> [build_index_from_nodes] Total embedding token usage: 30776 tokens
> [build_index_from_nodes] Total embedding token usage: 30776 tokens


In [36]:
# set Logging to DEBUG for more detailed outputs
query_engine = global_index.as_query_engine()
response = query_engine.query("List bullet points summarizing this text. Do not reapeat the author at the beginning of each bullet point.")

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 23 tokens
> [retrieve] Total embedding token usage: 23 tokens
> [retrieve] Total embedding token usage: 23 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1159 tokens
> [get_response] Total LLM token usage: 1159 tokens
> [get_response] Total LLM token usage: 1159 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens


In [37]:
display(Markdown(f"<b>{response}</b>"))

<b>
- Explored how author chose what to work on in the past
- Discovered answer was long and messy
- Thought others might find it interesting and encouraging
- Wrote a more detailed version for others to read
- Noted experience skipped step in evolution of computers
- Italian words for abstract concepts can be predicted from English cognates
- Described walk to Accademia in Florence
- Noted painting people like still lives
- Explained how Lisp was better than other languages
- Noted difference between putting something online and publishing it online
- Noted customs continue to constrain even after restrictions that caused them have disappeared
- Noted independent-minded people will have an advantage in fields affected by rapid change
- Noted can't always predict which fields will be affected by rapid change</b>

In [38]:
response = query_engine.query("How did the author explain the difference between putting something online and publishing it online?")
display(Markdown(f"<b>{response}</b>"))

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 16 tokens
> [retrieve] Total embedding token usage: 16 tokens
> [retrieve] Total embedding token usage: 16 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1082 tokens
> [get_response] Total LLM token usage: 1082 tokens
> [get_response] Total LLM token usage: 1082 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens


<b>
The author explained the difference between putting something online and publishing it online by noting that in the print era, there was a narrow channel to readers, guarded by fierce monsters known as editors. The only way to get an audience for anything you wrote was to get it published as a book, or in a newspaper or magazine. Now, with the internet, anyone could publish anything. This meant that there would be a whole new generation of essays that had never been written before because there had been no way to publish them.</b>

### Composing a Graph to synthesize answers across documents

In [44]:
from llama_index import GPTListIndex, LLMPredictor
from langchain import OpenAI
from llama_index.composability import ComposableGraph

In [45]:
# set number of output tokens
llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, max_tokens=512))
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)