### Download Dropbox data to use for demonstration

In [None]:
%pip install llama-index
%pip install azure-core
%pip install pdf2image
%pip install pandas 
%pip install matplotlib
%pip install opencv-python
%pip install Pillow
%pip install azure-cognitiveservices-vision-computervision
%pip install azure-cognitiveservices-search-imagesearch
%pip install azure-cognitiveservices-search-newssearch
%pip install numpy
%pip install azure-cognitiveservices-search-websearch
%pip install tabulate
%pip install html2text

# Hide pip messages 
import warnings
warnings.filterwarnings('ignore')

In [46]:
# set text wrapping
from IPython.display import HTML, display

def set_css(info=None):
    """Inject CSS that makes ``<pre>`` output blocks wrap long lines.

    Registered below as an IPython ``pre_run_cell`` callback, so it is invoked
    before each cell runs.

    Args:
        info: Object IPython passes to ``pre_run_cell`` callbacks. It is not
            used; the default of ``None`` keeps zero-argument calls working.
    """
    display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))


get_ipython().events.register('pre_run_cell', set_css)

In [47]:
from llama_index import download_loader
from pathlib import Path
import os
# Never store API keys in the notebook. Use the key already present in the
# environment, and only prompt for it (without echoing) when it is missing.
if not os.environ.get('OPENAI_API_KEY'):
    from getpass import getpass
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [49]:
from llama_index import SimpleDirectoryReader

UnstructuredReader = download_loader("UnstructuredReader", refresh_cache=True)
loader = UnstructuredReader()
doc_set = {}   # year -> documents loaded for that year
all_docs = []  # every loaded document, across all years
years = [2023, 2022]

# Loads everything under ../data. Reading PDFs this way requires the `pypdf`
# package (otherwise this line raises ImportError).
documents = SimpleDirectoryReader('../data').load_data()

for year in years:
    # Select the report that belongs to this year (…-1q23.pdf, …-1q22.pdf).
    # The original f-string had no placeholder, so the 1q23 report was loaded
    # for every year and then labelled with the wrong year.
    # NOTE(review): assumes each year's file follows the 1q<yy> naming — confirm.
    report_path = Path(f'../data/full-report-ubs-group-ag-consolidated-1q{year % 100:02d}.pdf')
    if not report_path.is_file():
        raise FileNotFoundError(
            f"No report found for {year}: expected {report_path}"
        )
    ubs_docs = loader.load_data(file=report_path, split_documents=False)
    # insert year metadata into each document
    for d in ubs_docs:
        d.extra_info = {"year": year}
    doc_set[year] = ubs_docs
    all_docs.extend(ubs_docs)

[nltk_data] Downloading package punkt to /Users/lnshuti/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/lnshuti/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


ImportError: pypdf is required to read PDF files: `pip install pypdf`

### Setup Service Context

In [None]:
from llama_index import ServiceContext, GPTVectorStoreIndex, GPTTreeIndex
# Notebook-level service context, created with chunk_size_limit=512.
# It is passed to the per-year index construction further down.
service_context = ServiceContext.from_defaults(chunk_size_limit=512)

In [None]:
# Initialize one vector index per year.
# NOTE: don't run this cell if the indices are already loaded!
# (index_set is reset to {} here, so re-running rebuilds every index.)
# NOTE(review): no global index is built in this cell.
index_set = {}
for year in years:
    # Build the index from that year's documents using the shared service context.
    cur_index = GPTVectorStoreIndex.from_documents(doc_set[year], service_context=service_context)
    index_set[year] = cur_index

In [None]:
# NOTE(review): exploratory introspection of the 2023 index object; prints its
# help text. Consider removing from the finished notebook.
help(index_set[2023])

In [None]:
# Give the 2023 index the explicit id "chatmyapp".
index_set[2023].set_index_id("chatmyapp")

### Demonstrate MongoDB connection

In [None]:
import logging
import sys

# Send INFO-level log records to stdout through exactly one handler.
# basicConfig() followed by addHandler() attached two stdout handlers, which
# printed every record twice. The check below also makes this cell safe to
# re-run without stacking further handlers.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
if not any(getattr(h, "stream", None) is sys.stdout for h in root_logger.handlers):
    stdout_handler = logging.StreamHandler(stream=sys.stdout)
    # Same "LEVEL:name:message" layout that basicConfig() uses by default.
    stdout_handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
    root_logger.addHandler(stdout_handler)

In [None]:
from llama_index import GPTListIndex, SimpleMongoReader
from IPython.display import Markdown, display
import os

In [None]:
# Connection placeholders: replace every "<...>" value before running this cell.
host = "<host>"
port = "<port>"  # NOTE(review): placeholder is a string; a real port is presumably an int — confirm
db_name = "<db_name>"
collection_name = "<collection_name>"
# query_dict is passed into db.collection.find()
query_dict = {}
# NOTE(review): presumably the record field(s) whose content is loaded — confirm
field_names = ["text"]
reader = SimpleMongoReader(host, port)
# Rebinds the notebook-level `documents` to what the Mongo reader returns.
documents = reader.load_data(db_name, collection_name, field_names, query_dict=query_dict)

In [None]:
# Build a list index over the currently loaded `documents` and query it.
# The original cell called `new_var.as_query_engine()`, but `new_var` is never
# defined anywhere in the notebook (NameError).
mongo_index = GPTListIndex.from_documents(documents)

# set Logging to DEBUG for more detailed outputs
query_engine = mongo_index.as_query_engine()
response = query_engine.query("summarize financial results for the quarter")
display(Markdown(f"<b>{response}</b>"))

### Slack data connector



In [None]:
# Make sure INFO-level log records go to stdout through a single handler.
# Calling basicConfig() + addHandler() again would attach one more stdout
# handler on every run, repeating each log line; this version only adds a
# handler when none is writing to stdout yet.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
if not any(getattr(h, "stream", None) is sys.stdout for h in root_logger.handlers):
    stdout_handler = logging.StreamHandler(stream=sys.stdout)
    # Same "LEVEL:name:message" layout that basicConfig() uses by default.
    stdout_handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
    root_logger.addHandler(stdout_handler)

In [None]:
from llama_index import GPTListIndex, SlackReader
from IPython.display import Markdown, display

In [None]:
# The bot token comes from the SLACK_BOT_TOKEN environment variable
# (os.getenv returns None when it is not set).
slack_token = os.getenv("SLACK_BOT_TOKEN")
channel_ids = ["<channel_id>"]  # placeholder: replace with real channel ids
# Rebinds the notebook-level `documents` to the Slack reader's result.
documents = SlackReader(slack_token=slack_token).load_data(channel_ids=channel_ids)
# List index over the Slack documents.
index = GPTListIndex.from_documents(documents)

In [None]:
# set Logging to DEBUG for more detailed outputs
# Query the Slack index; "<query_text>" is a placeholder to replace.
query_engine = index.as_query_engine()
response = query_engine.query("<query_text>")
# Render the response in bold Markdown.
display(Markdown(f"<b>{response}</b>"))

### Web Page Reader

In [None]:
from llama_index import GPTListIndex, SimpleWebPageReader
# Load a single web page with html_to_text=True; rebinds the notebook-level `documents`.
documents = SimpleWebPageReader(html_to_text=True).load_data(["http://paulgraham.com/worked.html"])

In [None]:
# Render the first loaded document in bold Markdown.
display(Markdown(f"<b>{documents[0]}</b>"))

In [None]:
from llama_index import ServiceContext, GPTVectorStoreIndex

# Rebinds the notebook-level `service_context`, again with chunk_size_limit=512.
service_context = ServiceContext.from_defaults(chunk_size_limit=512)

In [None]:
# Vector index over the current `documents`, using the shared service context.
global_index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)

In [None]:
# set Logging to DEBUG for more detailed outputs
# Rebinds `query_engine` to one backed by global_index; the later query cell reuses it.
query_engine = global_index.as_query_engine()
# NOTE(review): "reapeat" in the prompt below is a typo for "repeat".
response = query_engine.query("List bullet points summarizing this text. Do not reapeat the author at the beginning of each bullet point.")

In [None]:
# Render the current `response` in bold Markdown.
display(Markdown(f"<b>{response}</b>"))

In [None]:
# Follow-up question sent through the current `query_engine`; rebinds `response`.
response = query_engine.query("How did the author explain the difference between putting something online and publishing it online?")
display(Markdown(f"<b>{response}</b>"))

### Composing a Graph to synthesize answers across documents

In [None]:
from llama_index import GPTListIndex, LLMPredictor
from langchain import OpenAI
from llama_index.composability import ComposableGraph

In [None]:
# set number of output tokens
# The OpenAI LLM is created with temperature=0 and max_tokens=512.
llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, max_tokens=512))
# Rebinds the notebook-level `service_context`; this one is built from the
# predictor only, without the chunk_size_limit passed in earlier cells.
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)

In [None]:
# One short description per year, used as the summary text of each child
# index. `summaries` was referenced here but never defined anywhere in the
# notebook (NameError), so it is created in this cell.
summaries = {
    year: f"UBS Group AG consolidated financial report for {year}"
    for year in years
}

# Compose a list index on top of the per-year vector indices.
graph = ComposableGraph.from_indices(
    GPTListIndex,
    [index_set[y] for y in years],
    index_summaries=[summaries[y] for y in years],
    service_context=service_context
)

In [None]:
# Prompt reused by the risk-factor queries below: an instruction paragraph
# followed by two bullet lines, joined with newlines.
_risk_prompt_lines = [
    "Describe the current risk factors. If the year is provided in the information, "
    "provide that as well. If the context contains risk factors for multiple years, "
    "explicitly provide the following:",
    "- A description of the risk factors for each year",
    "- A summary of how these risk factors are changing across years",
]
risk_query_str = "\n".join(_risk_prompt_lines)

In [None]:
# Query settings, one entry per index struct type.
_dict_struct_config = dict(
    index_struct_type="dict",
    query_mode="default",
    # "include_summary": True is intentionally left out of query_kwargs.
    query_kwargs=dict(similarity_top_k=1),
)
_list_struct_config = dict(
    index_struct_type="list",
    query_mode="default",
    query_kwargs=dict(response_mode="tree_summarize"),
)
query_configs = [_dict_struct_config, _list_struct_config]

In [None]:
# The rest of this notebook queries indices through as_query_engine(), so the
# graph is queried the same way instead of graph.query(..., query_configs=...).
# The settings mirror query_configs: top-1 retrieval on each per-year vector
# index and tree summarization at the list-index root.
custom_query_engines = {
    index_set[y].index_id: index_set[y].as_query_engine(similarity_top_k=1)
    for y in years
}
custom_query_engines[graph.root_id] = graph.root_index.as_query_engine(
    response_mode="tree_summarize"
)
graph_query_engine = graph.as_query_engine(custom_query_engines=custom_query_engines)
response_summary = graph_query_engine.query(risk_query_str)

In [None]:
# Print the graph query's response.
print(response_summary)

In [None]:
# Print what get_formatted_sources() returns for the graph response.
print(response_summary.get_formatted_sources())

In [None]:
# Ask the same risk question of the 2022 index only. It goes through a query
# engine, like the other index queries in this notebook, rather than index.query().
response_tmp = index_set[2022].as_query_engine().query(risk_query_str)

In [None]:
# Last expression in the cell: shows the 2022 response as a string.
str(response_tmp)

In [None]:
# query a global index
# It goes through a query engine, like the other index queries in this
# notebook, rather than index.query(); the engine retrieves the top 4 matches.
response = global_index.as_query_engine(similarity_top_k=4).query(risk_query_str)

In [None]:
# Last expression in the cell: shows the global-index response as a string.
str(response)