# WORK IN PROGRESS

# Notebook 2: Approaches for Tabular Data

In this notebook we will show a few ways to use LLM pipelines with tabular data.

We will focus on Tesla (TSLA) stock price data.
* TBD
* TBD

# Import libraries and load the 10k and stock data

In [None]:
import subprocess
import tiktoken
import pandas as pd
import os
import csv
import json
import time
import re
import transformers
import torch
import numpy as np
from datetime import datetime

# To use with the router
from rouge_score import rouge_scorer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

# We will use LangChain to create a vector store to retrieve stronger negatives
import faiss
from langchain.vectorstores.faiss import FAISS
from langchain.docstore import InMemoryDocstore
from langchain_core.vectorstores import VectorStoreRetriever
from langchain.text_splitter import TokenTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader, csv_loader
# from langchain.embeddings.sentence_transformer import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.retrievers import TimeWeightedVectorStoreRetriever
from langchain.utils import mock_now



# Name of the Hugging Face embedding model passed to HuggingFaceEmbeddings below.
# Alternative candidate listed by the author: "BAAI/bge-base-en-v1.5".
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

In [None]:
# Read and load the 10-K PDF file as LangChain documents.
loader = UnstructuredPDFLoader("./data/tsla-20231231-gen.pdf")
docs_10k = loader.load()

# Load the stock data twice: once as a DataFrame, and once directly as documents
# (the sample output shows one Document per CSV row, with the row index in metadata).
tsla_stock = pd.read_csv("./data/TSLA.csv")
# NOTE: `loader` is rebound here from the PDF loader to the CSV loader.
loader  = csv_loader.CSVLoader(file_path="./data/TSLA.csv")
stock_data_docs = loader.load()
# Display the first row-document as a sanity check.
stock_data_docs[0]

Document(page_content='Date: 2010-06-29\nOpen: 1.266667\nHigh: 1.666667\nLow: 1.169333\nClose: 1.592667\nAdj Close: 1.592667\nVolume: 281494500', metadata={'source': './data/TSLA.csv', 'row': 0})

# Split docs and store in FAISS Vector Database for 10k

In [None]:
# Split the 10-K into chunks of size 128 with an overlap of 64; sizes are measured
# by the tiktoken encoder. add_start_index=True asks the splitter to record where
# each chunk starts in its source document.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 128, chunk_overlap  = 64, add_start_index = True)
docs_split = text_splitter.split_documents(docs_10k)
# Number of chunks produced.
print(len(docs_split))

2530


In [None]:
# Embedding model shared by every vector store in this notebook and by the
# dense-similarity router; cache_folder points at ./models/sentencetransformers.
embedding_function = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        cache_folder="./models/sentencetransformers"
    )

In [None]:
# Embed the 10-K chunks into a FAISS vector store and persist it to disk.
db = FAISS.from_documents(docs_split, embedding_function)
db.save_local("./data/faiss")

# Build Static Knowledge Base of Stock Data

In [None]:
# Add a descriptive header to each row-document to help retrieval, and store the row's
# trading date in metadata under 'last_accessed_at' (read again later in the notebook).
stock_doc_header = "Daily stock market data for Tesla (TSLA):\n"
for doc in stock_data_docs:
    # Guard so that re-running this cell does not stack the header a second time.
    if not doc.page_content.startswith(stock_doc_header):
        doc.page_content = stock_doc_header + doc.page_content
    date_match = re.search(r'Date: (\d{4}-\d{2}-\d{2})', doc.page_content)
    # Rows without a parsable date get an explicit None so the key always exists.
    doc.metadata['last_accessed_at'] = (
        datetime.strptime(date_match.group(1), '%Y-%m-%d') if date_match else None
    )
stock_data_docs[0]

Document(page_content='Daily stock market data for Tesla (TSLA):\nDate: 2010-06-29\nOpen: 1.266667\nHigh: 1.666667\nLow: 1.169333\nClose: 1.592667\nAdj Close: 1.592667\nVolume: 281494500', metadata={'source': './data/TSLA.csv', 'row': 0, 'last_accessed_at': datetime.datetime(2010, 6, 29, 0, 0)})

In [None]:
# Embed the header-augmented stock row-documents into their own FAISS store and persist it.
db_data = FAISS.from_documents(stock_data_docs, embedding_function)
db_data.save_local("./data/faiss_stock")

# Build Question Bank and Similarity Functions

To get the idea across, I used a simplistic method to route each question/prompt to a given LLM pipeline, using ROUGE F1 and dense similarity against a templated question bank. A more common approach is to use an LLM for orchestration, but this can incur additional cost.

In [None]:
# Templated question banks used for routing. Every template carries a
# {company_name} placeholder that the similarity functions fill in with str.format.

# Questions about what the company is and does.
COMPANY_DETAILS_QUESTIONS = ["Describe {company_name}'s business.", "Describe what {company_name} does.", "Describe {company_name}.", "What sector or industry does {company_name} operate in?",
                             "What market does {company_name} serve?", "What products does {company_name} offer?", "What services does the {company_name} offer?",
                             "Who are the {company_name}'s clients or customers?", "Who are the {company_name}'s suppliers?", "Who are the {company_name}'s vendors?"]

# Questions about daily stock data (price, volume, open/close/high/low).
STOCK_MARKET_QUESTIONS = ["What is the current stock price for {company_name}?", "What is the stock price for {company_name}?", "What is the stock price for {company_name} today?",
                          "What is the latest stock price for {company_name}?", "What was the trading volume for {company_name}?", "What was the open price for {company_name}?",
                          "What was the close price for {company_name}?", "What was the high price for {company_name}?", "What was the low price for {company_name}?"]

# Equity research questions (other research questions can use the same format).
COVERAGE_QUESTIONS = ["What is the analyst coverage for {company_name}?", "What is the analyst rating for {company_name}?", "What is the analyst price target for {company_name}?"]

# Route that would be added if we had financial data.
FINANCIALS_QUESTIONS = ["What is the revenue for {company_name}?", "What is the net income for {company_name}?","What is the operating income for {company_name}?","What's {company_name}'s EBITDA?",
                        "What is the gross profit for {company_name}?", "What's {company_name}'s gross margin like?", "How much cash does {company_name} have?", "How much revenue did {company_name} earn in the last quarter?"]

# Only these two banks are collected here; COVERAGE_QUESTIONS and FINANCIALS_QUESTIONS
# are defined above but not included.
QUESTION_LISTS = [COMPANY_DETAILS_QUESTIONS, STOCK_MARKET_QUESTIONS]

In [None]:
# Initialize the lemmatizer and the English stop-word set once at module level;
# both are read by get_max_rouge_f1 below.
# NOTE(review): presumably requires the NLTK corpora (stopwords, wordnet) to be
# downloaded beforehand - confirm for fresh environments.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def _clean_for_rouge(text):
    """Tokenize ``text``, drop English stop words, lemmatize, and re-join with spaces."""
    tokens = word_tokenize(text)
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens if token not in stop_words)


def get_max_rouge_f1(prompt, question_list, company_name, lower=True):
    """Find the templated question with the highest ROUGE-1 F1 against ``prompt``.

    Args:
        prompt: User prompt to route.
        question_list: Question templates containing a ``{company_name}`` placeholder.
        company_name: Value substituted into each template.
        lower: If True, lower-case the prompt and the templates before scoring.
            Templates are lower-cased *before* ``company_name`` is inserted, so the
            returned question keeps the company name's original casing.

    Returns:
        Tuple ``(max_score, best_question)``. ``(0, '')`` when the list is empty or
        no template scores above zero. On ties the earliest template wins.
    """
    # Empty bank: same result the scoring loop would produce, without any setup work.
    if len(question_list) == 0:
        return 0, ''

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    if lower:
        prompt = prompt.lower()
        question_list = [question.lower() for question in question_list]
    prompt_clean = _clean_for_rouge(prompt)

    max_score = 0
    best_question = ''
    for question in question_list:
        question = question.format(company_name=company_name)
        scores = scorer.score(prompt_clean, _clean_for_rouge(question))
        # Only the ROUGE-1 F-measure drives the routing decision.
        f1 = scores['rouge1'].fmeasure
        if f1 > max_score:
            max_score = f1
            best_question = question
    return max_score, best_question

In [None]:
def get_max_similarity(prompt, question_list, company_name):
    """Find the templated question whose embedding is most cosine-similar to ``prompt``.

    Args:
        prompt: User prompt to route.
        question_list: Question templates containing a ``{company_name}`` placeholder.
        company_name: Value substituted into each template.

    Returns:
        Tuple ``(max_score, best_question)`` where ``best_question`` is the formatted
        template with the highest cosine similarity. ``(0, '')`` when the list is
        empty, matching the "no match" result of ``get_max_rouge_f1``.
    """
    # An empty bank cannot be scored; return the "no match" result instead of
    # failing further down in the embedding / similarity calls.
    if len(question_list) == 0:
        return 0, ''

    prompt_embedding = embedding_function.embed_documents([prompt])
    questions = [question.format(company_name=company_name) for question in question_list]
    question_embeddings = embedding_function.embed_documents(questions)

    # One row (the single prompt) by len(questions) columns.
    similarity = cosine_similarity(prompt_embedding, question_embeddings)

    # Locate the best column once and reuse it for both the score and the question.
    best_idx = int(np.argmax(similarity))
    return similarity[0, best_idx], questions[best_idx]

In [None]:
# Sanity check: score one sample prompt against two question banks with ROUGE-1 F1.
question = "How much was stock price recently?"

print("COMPANY_DETAILS_QUESTIONS: {}".format(get_max_rouge_f1(question, COMPANY_DETAILS_QUESTIONS, company_name="Tesla")))
print("STOCK_MARKET_QUESTIONS: {}".format(get_max_rouge_f1(question, STOCK_MARKET_QUESTIONS, company_name="Tesla")))

COMPANY_DETAILS_QUESTIONS: (0, '')
STOCK_MARKET_QUESTIONS: (0.5714285714285715, 'what is the stock price for Tesla?')


# Build Basic LLM Pipelines for Structured Data

Note: Our stock data only runs through 2024-02-02.

In [None]:
# Number of documents each retriever is asked to return per query.
top_k=16
# Plain vector-store retrievers: one over the stock row-documents, one over the 10-K chunks.
retriever_stock = VectorStoreRetriever(vectorstore=db_data, search_kwargs={"k": top_k})
retriever = VectorStoreRetriever(vectorstore=db, search_kwargs={"k": top_k})

def generate_response(prompt, retriever, today="2024-02-02"):
    """Retrieve the documents most relevant to ``prompt`` after pinning relative dates.

    The whole words "current" and "today" (case-insensitive) are replaced with
    ``today`` so the query carries a concrete date. The rewritten prompt is printed.

    Args:
        prompt: User question.
        retriever: Object exposing ``get_relevant_documents(query)``.
        today: Date string substituted for "current"/"today". Defaults to
            "2024-02-02", the date this notebook pretends is today.

    Returns:
        Whatever ``retriever.get_relevant_documents`` returns for the rewritten prompt.
    """
    # \b keeps words such as "currently" or "concurrent" intact. The callable
    # replacement inserts `today` verbatim (no backslash / group-reference expansion).
    prompt = re.sub(r'\b(?:current|today)\b', lambda _match: today, prompt, flags=re.IGNORECASE)
    print("Prompt: ", prompt)
    # Get the top k most similar documents (k is configured on the retriever).
    results = retriever.get_relevant_documents(prompt)
    return results

In [None]:
# "current" is rewritten to the pinned date inside generate_response before retrieval.
question ="What is TSLA's current close price?"
generate_response(question, retriever_stock)

Prompt:  What is TSLA's 2024-02-02 close price?


[Document(page_content='Daily stock market data for Tesla (TSLA):\nDate: 2021-08-03\nOpen: 239.666672\nHigh: 240.883331\nLow: 233.669998\nClose: 236.580002\nAdj Close: 236.580002\nVolume: 64860900', metadata={'source': './data/TSLA.csv', 'row': 2793}),
 Document(page_content='Daily stock market data for Tesla (TSLA):\nDate: 2021-04-20\nOpen: 239.139999\nHigh: 245.750000\nLow: 236.896667\nClose: 239.663330\nAdj Close: 239.663330\nVolume: 106827000', metadata={'source': './data/TSLA.csv', 'row': 2720}),
 Document(page_content='Daily stock market data for Tesla (TSLA):\nDate: 2022-08-18\nOpen: 306.000000\nHigh: 306.500000\nLow: 301.853333\nClose: 302.869995\nAdj Close: 302.869995\nVolume: 47500500', metadata={'source': './data/TSLA.csv', 'row': 3056}),
 Document(page_content='Daily stock market data for Tesla (TSLA):\nDate: 2023-10-25\nOpen: 215.880005\nHigh: 220.100006\nLow: 212.199997\nClose: 212.419998\nAdj Close: 212.419998\nVolume: 107065100', metadata={'source': './data/TSLA.csv', '

Notice that even when we replace "current" with today's date, our retrieval process is not strong enough to only pick recent dates.

Option 1: TimeWeightedVectorStoreRetriever<br>
Option 2: Add date to the top of context

In [None]:
# Solution 1
embedding_size = len(embedding_function.embed_documents([question])[0])
index = faiss.IndexFlatL2(384)
vectorstore = FAISS(embedding_function, index, InMemoryDocstore({}), {})
tw_retriever_stock = TimeWeightedVectorStoreRetriever(vectorstore=vectorstore,decay_rate=0.005, k=top_k)
# Notice the last access time is that date time
tw_retriever_stock.add_documents(stock_data_docs)
with mock_now(datetime(2024, 2, 2, 23, 0)):
    rel_docs = generate_response(question, tw_retriever_stock)
rel_docs

Prompt:  What is TSLA's 2024-02-02 close price?


[Document(page_content='Daily stock market data for Tesla (TSLA):\nDate: 2024-02-02\nOpen: 185.039993\nHigh: 188.690002\nLow: 182.000000\nClose: 187.910004\nAdj Close: 187.910004\nVolume: 110505100', metadata={'source': './data/TSLA.csv', 'row': 3422, 'last_accessed_at': MockDateTime(2024, 2, 2, 23, 0), 'created_at': datetime.datetime(2024, 2, 7, 17, 12, 39, 463554), 'buffer_idx': 3422}),
 Document(page_content='Daily stock market data for Tesla (TSLA):\nDate: 2024-02-01\nOpen: 188.500000\nHigh: 189.880005\nLow: 184.279999\nClose: 188.860001\nAdj Close: 188.860001\nVolume: 91843300', metadata={'source': './data/TSLA.csv', 'row': 3421, 'last_accessed_at': MockDateTime(2024, 2, 2, 23, 0), 'created_at': datetime.datetime(2024, 2, 7, 17, 12, 39, 463554), 'buffer_idx': 3421}),
 Document(page_content='Daily stock market data for Tesla (TSLA):\nDate: 2024-01-31\nOpen: 187.000000\nHigh: 193.970001\nLow: 185.850006\nClose: 187.289993\nAdj Close: 187.289993\nVolume: 103221400', metadata={'source

This technically worked, but it is not what we want. The 'last_accessed_at' value was updated, so it no longer reflects the date of the stock data, and this will hurt performance when asking about previous dates. Let's try option 2.

In [None]:
# Option 2: put each row's trading date at the very top of its text.
for doc in stock_data_docs:
    trading_date = doc.metadata.get('last_accessed_at')
    # Rows whose date could not be parsed are stored with None; leave their header
    # unchanged instead of failing on None.date().
    if trading_date is None:
        continue
    doc.page_content = doc.page_content.replace(
        'Daily stock market data for Tesla (TSLA)',
        str(trading_date.date()) + ' stock market data for Tesla (TSLA)',
    )
stock_data_docs[0]

Document(page_content='2010-06-29 stock market data for Tesla (TSLA):\nDate: 2010-06-29\nOpen: 1.266667\nHigh: 1.666667\nLow: 1.169333\nClose: 1.592667\nAdj Close: 1.592667\nVolume: 281494500', metadata={'source': './data/TSLA.csv', 'row': 0, 'last_accessed_at': datetime.datetime(2010, 6, 29, 0, 0)})

In [None]:
# Re-embed the stock documents (now headed by their date) into a second FAISS store.
db_data_v2 = FAISS.from_documents(stock_data_docs, embedding_function)
db_data_v2.save_local("./data/faiss_stock_v2")

# Same retrieval test as before, against the date-headed documents.
retriever_stock_v2 = VectorStoreRetriever(vectorstore=db_data_v2, search_kwargs={"k": top_k})
generate_response(question, retriever_stock_v2)


Prompt:  What is TSLA's 2024-02-02 close price?


[Document(page_content='2022-12-19 stock market data for Tesla (TSLA):\nDate: 2022-12-19\nOpen: 154.000000\nHigh: 155.250000\nLow: 145.820007\nClose: 149.869995\nAdj Close: 149.869995\nVolume: 139390600', metadata={'source': './data/TSLA.csv', 'row': 3141, 'last_accessed_at': datetime.datetime(2022, 12, 19, 0, 0)}),
 Document(page_content='2022-05-20 stock market data for Tesla (TSLA):\nDate: 2022-05-20\nOpen: 237.996674\nHigh: 240.526672\nLow: 211.000000\nClose: 221.300003\nAdj Close: 221.300003\nVolume: 144973200', metadata={'source': './data/TSLA.csv', 'row': 2995, 'last_accessed_at': datetime.datetime(2022, 5, 20, 0, 0)}),
 Document(page_content='2022-10-12 stock market data for Tesla (TSLA):\nDate: 2022-10-12\nOpen: 215.330002\nHigh: 219.300003\nLow: 211.509995\nClose: 217.240005\nAdj Close: 217.240005\nVolume: 66860700', metadata={'source': './data/TSLA.csv', 'row': 3094, 'last_accessed_at': datetime.datetime(2022, 10, 12, 0, 0)}),
 Document(page_content='2022-02-25 stock market 

This still didn't work. Looks like we may need a more complex approach.

# Create CSV Agent

### To be implemented at a later date

# Route Questions using Similarity Functions