# WORK IN PROGRESS

# Notebook 1: RAG Approaches for Q&A on your Documents

In this notebook, we will show how to use RAG with your documents, as well as a couple of methods for answering a broader array of question types.

First, we will import libraries and load the 10k. Then, using the Tesla (TSLA) 10k report, the notebook is divided into the following parts:
1. Part 1: Building a Basic LLM RAG Pipeline
2. Part 2: Basic Improvements: Trying tailored parsing
3. Part 3: Adding query-splitting / multi-query

I will not touch on adding chat history in this notebook, but certainly that is important as well. That can be as easy as saving the messages and adding that to the messages list. 

In this notebook, we will only use [langchain](https://python.langchain.com/docs/get_started/introduction) for parsing and retrieval to make the prompting explicit. 

# Import libraries and load the 10k

In [1]:
import subprocess
import tiktoken
import pandas as pd
import os
import csv
import json
import time
import re
import transformers
import torch
import numpy as np
from datetime import datetime

# We will use langchain to create a vector store to retrieve stronger negatives
import faiss
from langchain.vectorstores.faiss import FAISS
from langchain_community.document_loaders import PDFMinerPDFasHTMLLoader
from langchain_core.vectorstores import VectorStoreRetriever
from langchain.text_splitter import TokenTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PDFMinerPDFasHTMLLoader, UnstructuredPDFLoader
from langchain.docstore.document import Document

# OpenAI chat model passed to the completion calls in this notebook
MODEL = "gpt-4-1106-preview"
# HuggingFace embedding model used to build the FAISS index.
# Alternative mentioned in this notebook: "BAAI/bge-base-en-v1.5"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

In [2]:
import openai

from dotenv import load_dotenv
# Load environment variables
load_dotenv()

openai.api_key = os.getenv('OPENAI_API_KEY')

# Optional overrides: each one is applied only when its environment variable
# is set to a non-empty value.
# NOTE(review): openai>=1.0 appears to expose `base_url` (not `api_base`) at
# module level -- confirm that the OPENAI_API_BASE override takes effect with
# the installed client version.
for env_var, attr_name in (
    ('OPENAI_API_BASE', 'api_base'),
    ('OPENAI_API_TYPE', 'api_type'),
    ('OPENAI_API_VERSION', 'api_version'),
):
    env_value = os.getenv(env_var)
    if env_value:
        setattr(openai, attr_name, env_value)

In [3]:
# Relative path to the Tesla 10-K PDF
this_pdf = "./data/tsla-20231231-gen.pdf"

# Read and load the 10-K PDF file; `docs_10k` is the list of documents
# that gets split and indexed in Part 1
loader = UnstructuredPDFLoader(this_pdf)
docs_10k = loader.load()

# Part 1: Building a Basic LLM RAG Pipeline

### Split docs and store in FAISS Vector Database for 10k

Here, we use a chunk size of 128 and overlap of 64 because we found that to be successful for most use cases if using all-MiniLM-L6-v2. 

If using BAAI/bge-base-en-v1.5, chunk size of 256 with overlap of 64 seems to perform slightly better.

Also, for simplicity, this just uses the tiktoken encoder as opposed to the embedding model specific encoder. Feel free to change. 

In [4]:
# Let's split the documents into chunks (default: chunk size 128, overlap 64),
# embed them and index them in FAISS.
def create_and_get_retriever(docs, emb_model_name, chunk_size = 128, chunk_overlap = 64, top_k=8, add_start_index = True, 
                             separators = ["\n\n\n","\n\n", "\n", " ", ""], context_header = [], save_path = "./data/faiss"):
    """
    Split documents into token-sized chunks, embed them, index them in FAISS and return a retriever.

    Args:
    docs: List of langchain Documents to split and index.
    emb_model_name: Name of the HuggingFace embedding model to use.
    chunk_size: Maximum chunk size (measured with the tiktoken encoder).
    chunk_overlap: Overlap between consecutive chunks (same unit as chunk_size).
    top_k: Number of chunks the returned retriever fetches per query.
    add_start_index: Forwarded to the text splitter.
    separators: Separators tried by the recursive splitter. The default list is never mutated here.
    context_header: Metadata keys whose values are prepended to each chunk's text (as "key: value; ")
        before embedding, to aid retrieval. The default list is never mutated here.
    save_path: Folder where the FAISS index is saved. Pass a distinct path per index to avoid
        overwriting a previously saved one; pass None (or "") to skip saving.

    Returns:
    A VectorStoreRetriever over the new FAISS index, configured with k=top_k.
    """
    # Idea not yet enabled: a regex separator such as r"^(\d+)\s*\n\1\s*\n\1\s*$" (with is_separator_regex=True)
    print(f"Original # of docs {len(docs)}")
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = chunk_size, chunk_overlap  = chunk_overlap, add_start_index = add_start_index,
                                                                         separators=separators)
    docs_split = text_splitter.split_documents(docs)
    print(f"Split # of docs {len(docs_split)}")

    embedding_function = HuggingFaceEmbeddings(
            model_name=emb_model_name,
            cache_folder="./models/sentencetransformers"
        )
    if len(context_header) > 0: #if adding metadata to the page content to aid retrieval
        for doc in docs_split:
            header_str = ""
            for header in context_header:
                if header in doc.metadata.keys():
                    # str() so non-string metadata values (e.g. integer indices) do not raise TypeError
                    header_str += str(header)+ ": " + str(doc.metadata[header]) + "; " 
            doc.page_content = header_str +"\n"+ doc.page_content
            # NOTE(review): this replaces the chunk's metadata (any keys it carried are dropped)
            # with just the list of header keys -- confirm that losing provenance here is intended.
            doc.metadata = {"context_header": context_header}

    db = FAISS.from_documents(docs_split, embedding_function)
    if save_path:
        db.save_local(save_path)
    return VectorStoreRetriever(vectorstore=db, search_kwargs={"k": top_k})

In [14]:
# Retrieve 16 chunks per query (instead of the function's default top_k=8) since the chunk size is small
retriever = create_and_get_retriever(docs_10k, EMBEDDING_MODEL_NAME, top_k=16)

Original # of docs 1
Split # of docs 2531


In [15]:
def generate_kb_response(prompt, model, retriever, system_prompt="",template=None, temperature=0, include_source=False):
    """
    Generate a response to a prompt using the given model and the knowledge base retriever.

    The OpenAI call is attempted twice (with a 5 second pause in between). If both
    attempts fail, a dict with an "Exception: ..." answer is returned instead of raising.

    Args:
    prompt: The prompt to generate a response to.
    model: OpenAI model to use to generate the response (can change to other models if needed)
    retriever: The knowledge base retriever to use to retrieve relevant documents.
    system_prompt: The system prompt to use for the OpenAI model.
    template: The template to use for the prompt. If None, a default template will be used. Please use {prompt} and {context} as placeholders for the prompt and context.
    temperature: The temperature to use for the OpenAI model.
    include_source: Whether to include the source documents metadata as part of context for LLM. Useful if you want the LLM to include the source documents in the response.

    Returns:
    A dict with keys 'answer' (the model's reply, or the failure message) and
    'source_documents' (the retrieved chunks, each followed by its metadata).
    """

    relevant_docs = retriever.get_relevant_documents(prompt)

    # The source-annotated text is always returned to the caller; it is only
    # shown to the LLM when include_source is set.
    docs_with_source = "".join(
        doc.page_content + "\n" + "Source: " + str(doc.metadata) + "\n\n" for doc in relevant_docs
    )
    if include_source:
        relevant_docs_str = docs_with_source
    else:
        relevant_docs_str = "".join(doc.page_content + "\n\n" for doc in relevant_docs)

    if template is None:
        prompt_full = f"""Answer based on the following context

        {relevant_docs_str}

        Question: {prompt}"""
    else:
        prompt_full = template.format(prompt=prompt, context=relevant_docs_str)

    messages = [{"role": "system", "content": system_prompt},{"role": "user", "content": prompt_full }]

    def _request_completion():
        # Single chat-completion request; shared by the first attempt and the retry.
        return openai.chat.completions.create(model=model, messages=messages, temperature=temperature)

    try:
        response_full = _request_completion()
    except Exception:
        print("OpenAI API call failed. Waiting 5 seconds and trying again.")
        time.sleep(5)
        # The retry needs its own try/except: an exception raised inside an
        # except handler is not caught by sibling except clauses.
        try:
            response_full = _request_completion()
        except Exception:
            return {'answer':"Exception: OpenAI API call failed. Please check code or try again later.", 'source_documents':docs_with_source}

    response = response_full.choices[0].message.content

    return {'answer':response, 'source_documents':docs_with_source}


In [7]:
# First check of the basic pipeline: a broad question using the default prompt template
prompt = "What products and services does Tesla offer?"
result = generate_kb_response(prompt, MODEL, retriever)
print(result['answer'])

Tesla offers a range of products and services related to electric vehicles, sustainable energy, and artificial intelligence. Here's a summary of what they offer:

1. Electric Vehicles: Tesla designs, manufactures, and sells electric vehicles, including sedans like the Model S, crossovers like the Model X, and more affordable models like the Model 3 and Model Y. They have also begun production and deliveries of a commercial electric vehicle, the Tesla Semi, and have plans for additional electric vehicles to address various market segments.

2. Solar Energy Generation Systems: Tesla provides solar energy solutions, including solar panels and Solar Roof tiles for residential and commercial customers who wish to generate their own renewable energy.

3. Energy Storage Products: The company offers energy storage products like the Powerwall for residential use, Powerpack for commercial use, and Megapack for utility-scale projects, which store energy for later use and help manage energy consum

Wow! That looks great. Let's try another prompt

In [16]:
# A question that targets one specific section of the filing.
# `prompt` is reused by the later retrieval-inspection cells.
prompt = "What was the most unexpected and material headwind discussed in the Management's Discussion section?"
result = generate_kb_response(prompt, MODEL, retriever)
print(result['answer'])

Based on the provided context, the most unexpected and material headwind discussed in the Management's Discussion section appears to be the severe winter storms in the first quarter of 2021 that had a widespread impact on utilities and transportation. This event is highlighted as a significant and unforeseen challenge that affected the company's operations. Natural disasters like this can have a substantial impact on production, supply chains, and overall business continuity, which makes them material headwinds for any company.


So we seem to have an issue here. This is referencing the Risk Factors section and is not particularly relevant.

But, how could our retriever know to bring in the needed context?

Let's look at what was pulled for Part 2.

# Part 2: Basic Improvements

Let's take a look at the documents that were pulled.

In [17]:
# Show the chunks the basic retriever returns for the current prompt.
returned_docs = retriever.get_relevant_documents(prompt)
docs_str = "".join(
    f"\nDoc {position}\n" + chunk.page_content.replace("\t", " ")  # replacing \t with space for better readability
    for position, chunk in enumerate(returned_docs, start=1)
)
print(docs_str)


Doc 1
increased our employee headcount and operations, we are and may continue to be subject to increased scrutiny, including litigation and government investigations, that we will need to defend against. If we are unable to successfully defend ourselves in such litigation or government investigations, it
Doc 2
As discussed in and subject to the considerations referenced in Part II, Item 7, Management's Discussion and Analysis of Financial Condition and Results of Operations—Management Opportunities, Challenges and Uncertainties and 2023 Outlook—Cash Flow and Capital Expenditure Trends in this
Doc 3
commentary by a range of third parties. Such attention can include criticism, which may be exaggerated or unfounded, such as speculation regarding the sufficiency or stability of our management team. Any such negative perceptions, whether caused by us or not, may harm our business and make it more difficult to raise additional funds if needed.
Doc 4
Under the supervision and with the parti

We notice very little from the "Management's Discussion and Analysis of Financial Condition and Results of Operations" section. The second doc references that section, but comes from a different one. What should we do? Let's try parsing the document differently so that each chunk keeps some of its context.

A nice overview of PDF parsing methods can be found at: https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf

Here we create our own for the SEC html version, but use some of the learnings there.

In [18]:
from bs4 import BeautifulSoup
this_10k = "./data/tsla-20231231.html"

# Load the 10-K filing (SEC HTML version)
with open(this_10k, 'r', encoding='windows-1252') as file:
    contents = file.read()

soup = BeautifulSoup(contents, 'html.parser')

# Extract all text within <div> tags (REVISIT THIS TO MAKE SURE NOT MISSING ANYTHING)
# NOTE(review): find_all('div') also returns nested <div>s, so text inside nested
# containers may be visited more than once -- confirm against the filing's markup.
content = soup.find_all('div')

# Raw-string patterns: '\d' inside a normal string literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
font_size_pattern = re.compile(r'font-size:(\d+)px')
font_weight_pattern = re.compile(r'font-weight:(\d+)')

snippets = []        # accumulated (text, section) tuples
curr_txt = ""        # text of the snippet currently being built
section = ""         # most recent "Item ..." heading seen
cur_fs = None        # font size of the previous styled div
cur_fw = None        # font weight of the previous styled div
new_section = False
for c in content:
    sp = c.find('span')
    if len(c.text)==0:
        # Empty div: record a line break so the next styled div can start a new snippet
        curr_txt += "\n"
        continue
    if not sp:
        continue
    st = sp.get('style')
    if not st:
        continue

    #if font weight is bold or larger font size, then it's likely a new section
    fs = font_size_pattern.findall(st)
    if fs:
        if cur_fs and int(fs[0]) > cur_fs and curr_txt.endswith("\n"):
            new_section = True
        cur_fs = int(fs[0])
    fw = font_weight_pattern.findall(st)
    if fw:
        if cur_fw and int(fw[0]) > cur_fw and curr_txt.endswith("\n"):
            new_section = True
        cur_fw = int(fw[0])

    is_underline = len(c.find_all('span', style=lambda value: value and 'text-decoration:underline' in value))>0
    is_italic = len(c.find_all('span', style=lambda value: value and 'font-style:italic' in value))>0

    #potential to be new section if new line
    new_section = new_section or is_underline or is_italic
    
    if curr_txt.endswith("\n") and c.text.lower().startswith("item "):
        # An "Item ..." heading closes the current snippet and becomes the active section label
        snippets.append((curr_txt,section))
        section = c.text
        curr_txt = c.text
    elif curr_txt.endswith("\n") and new_section:
        # Style change after a line break: start a new snippet within the same section
        snippets.append((curr_txt,section))
        curr_txt = c.text 
    else:
        curr_txt = curr_txt + "\n" + c.text
    new_section = False
# Flush the last snippet
snippets.append((curr_txt,section))

# Wrap each snippet in a Document that carries its section label and position
html_10k_split = []
for idx, snippet in enumerate(snippets):
    html_10k_split.append(Document(snippet[0], metadata={"source":this_10k,"section":snippet[1], "idx":idx}))
html_10k_split

[Document(page_content='\n\n\n\n\nUNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\nFORM 10-K\n(Mark One)\nxANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the fiscal year ended December 31, 2023\nOR\noTRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the transition period from _________ to _________\nCommission File Number: 001-34756\nTesla, Inc.\n(Exact name of registrant as specified in its charter)\nDelaware91-2197729(State or other jurisdiction ofincorporation or organization)(I.R.S. EmployerIdentification No.)\n1 Tesla RoadAustin, Texas78725(Address of principal executive offices)(Zip Code)\n1 Tesla Road\nAustin, Texas\n(512) 516-8177\n(Registrant’s telephone number, including area code)\nSecurities registered pursuant to Section 12(b) of the Act:\nTitle of each classTrading Symbol(s)Name of each exchange on which registeredCommon stockTSLAThe Nasdaq Global Select Market

This is looking promising!

In [19]:
# Index the section-aware HTML snippets; context_header=['section'] prepends each
# chunk's "section" metadata value to its text before it is embedded.
new_retriever = create_and_get_retriever(html_10k_split, EMBEDDING_MODEL_NAME, chunk_size = 128, chunk_overlap = 64, top_k=16, add_start_index = True, context_header = ['section'])

Original # of docs 98
Split # of docs 1261


Let's check out the updated retriever and notice that most of the top docs are from the desired section(s)!

In [20]:
# Show the chunks the section-aware retriever returns for the same prompt.
print(prompt)
returned_docs = new_retriever.get_relevant_documents(prompt)
docs_str = "".join(
    f"\nDoc {position}\n" + chunk.page_content.replace("\t", " ")  # replacing \t with space for better readability
    for position, chunk in enumerate(returned_docs, start=1)
)
print(docs_str)

What was the most unexpected and material headwind discussed in the Management's Discussion section?

Doc 1
section: ITEM 1A. RISK FACTORS; 
products, business, results of operations, and statements and actions of Tesla and its management are subject to significant amounts of commentary by a range of third parties. Such attention can include criticism, which may be exaggerated or unfounded, such as speculation regarding the sufficiency or stability of our management team. Any such negative perceptions, whether caused by us or not, may harm our business and make it more difficult to raise additional funds if needed.
Doc 2
section: ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS; 
34
34
34
Doc 3
section: ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS; 
39
39
39
Doc 4
section: ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS; 
33
33
33
Doc 5
section: IT

In [21]:
# Same `prompt` as in the previous cells, now answered with the section-aware retriever
result = generate_kb_response(prompt, MODEL, new_retriever)
print(result['answer'])

The most unexpected and material headwind discussed in the Management's Discussion section appears to be the significant decrease in gross margin for the total automotive & services and other segment, which decreased from 26.5% to 18.2% in the year ended December 31, 2023, as compared to the year ended December 31, 2022. This decrease is primarily attributed to the automotive gross margin decrease. This substantial reduction in gross margin would likely be considered both unexpected and material, as it directly impacts profitability.


WOW - what a difference! 

Let's see if we can stump this new pipeline :) 

In [24]:
# A two-part question, answered with the section-aware retriever
prompt = "What are the key risk factors for Tesla and do they hedge foreign currency risk?"

result = generate_kb_response(prompt, MODEL, new_retriever)
print(result['answer'])

The key risk factors for Tesla, as outlined in the provided context, include:

1. **Public Perception and Commentary**: Tesla's products, business, and management are subject to significant public attention and commentary, including criticism that may be exaggerated or unfounded. Negative perceptions can harm Tesla's business and its ability to raise additional funds.

2. **Foreign Currency Risk**: Tesla conducts business globally and is exposed to foreign currency risks as it transacts in multiple currencies. This risk affects Tesla's operating results when expressed in U.S. dollars because they do not typically hedge foreign currency risk.

3. **Market Demand for Electric Vehicles**: Tesla's growth is dependent on the demand for electric vehicles. If the market does not develop as expected, or if demand decreases, Tesla's business could be adversely affected.

4. **Competition for Talent**: Tesla operates in a competitive labor market and competes for talented individuals with automo

Looks pretty good again! Note: when I tried this with the old retriever, it did not know whether TSLA hedges foreign currency risk.

# Part 3: Add Query Splitting

I am not going to motivate this one as it is easy to motivate by thinking about multiple documents (say we were comparing Ford and Tesla), but often it can be very beneficial to ask an LLM to transform a query into one or more queries for the retrieval process. Let's see how this can be done.

In [33]:
# Function to generate queries using OpenAI's ChatGPT
def generate_subqueries(original_query, model, second_try=False):
    """
    Ask the chat model for sub-queries that help answer `original_query`.

    Args:
    original_query: The user's question.
    model: OpenAI chat model name.
    second_try: Internal retry flag. When the API call fails and this is False, the
        function waits 5 seconds and retries once; when True, it gives up and returns [].

    Returns:
    A list of sub-query strings with any leading list numbering ("1. ", "2) ") removed.
    Lines of 20 characters or fewer are dropped. Returns [] when the reply is
    empty/too short, starts with "n/a", or both API attempts fail.
    """
    MIN_QUESTION_LENGTH = 20
    try:
        response = openai.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that helps users identify how to search for relevant information. You do not introduce any new topics, specific details, or key words, but provide users with sub-queries phrased as questions to gather information to answer the original input query."},
                {"role": "user", "content": 
                f"""Please return 2 sub-queries that would be helpful to gather the required information to answer the original query. The format of your response should be:
                1. Sub-query 1
                2. Sub-query 2

                Original query: {original_query}

                Sub-queries:"
                """}
            ]
        )
    except Exception as e:
        print(e)
        if second_try:
            # Already retried once: give up immediately instead of sleeping first
            return []
        time.sleep(5)
        return generate_subqueries(original_query, model, second_try=True)

    # `content` can be None, so guard before stripping
    reply_text = (response.choices[0].message.content or "").strip()
    if len(reply_text) < MIN_QUESTION_LENGTH or "n/a" in reply_text[:MIN_QUESTION_LENGTH].lower():
        return []

    # Matches only real list numbering at the start of a line ("1. ", "2)", "10. ").
    # The lookahead keeps decimals such as "3.5 ..." intact, and lines that merely
    # start with a number (e.g. "2023 revenue ...") are left untouched.
    list_numbering = re.compile(r"^\d{1,2}[.)](?!\d)\s*")

    generated_queries = []
    for line in reply_text.split("\n"):
        line = line.strip()
        if len(line) > MIN_QUESTION_LENGTH:
            line = list_numbering.sub("", line, count=1)
        if len(line) > MIN_QUESTION_LENGTH:
            generated_queries.append(line)

    return generated_queries


In [34]:
# Expand the original question into sub-queries; the original query is appended
# last so it is also used for lookup.
orig_query = "What are they major technology difference in Volkswagen's and Tesla's electric vehicles?"
generated_queries = generate_subqueries(orig_query, MODEL, second_try=False)
lookup_queries = generated_queries + [orig_query]
lookup_queries

['What are the key technology features of Volkswagen electric vehicles?',
 'What are the key technology features of Tesla electric vehicles?',
 "What are they major technology difference in Volkswagen's and Tesla's electric vehicles?"]