# WORK IN PROGRESS 
# LLM Pipeline Routing

In this notebook we show how to route questions to different LLM pipelines and how to work with tabular data.

We will work with Tesla's (TSLA) 10-K report and stock price data.
* If a question pertains to the 10-K, we will route it to a vanilla RAG pipeline
* If a question pertains to the stock price, we will route it to the tabular-data GenAI path

# Import libraries and load the 10k and stock data

In [12]:
import subprocess
import tiktoken
import pandas as pd
import os
import csv
import json
import time
import re
import transformers
import torch
import numpy as np

# To use with the router
from rouge_score import rouge_scorer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

# We will use langchain to load and split the documents and to build a FAISS vector store for retrieval
from langchain.vectorstores.faiss import FAISS
from langchain.text_splitter import TokenTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader, csv_loader 
# from langchain.embeddings.sentence_transformer import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings


# Name of the embedding model passed to HuggingFaceEmbeddings below; the same
# embeddings are used for the 10-K chunks and for question-similarity routing.
# Other candidate noted by the author: "BAAI/bge-base-en-v1.5"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

In [15]:
# Parse the Tesla 10-K PDF into LangChain documents.
docs_10k = UnstructuredPDFLoader("./data/tsla-20231231-gen.pdf").load()

# The stock history is read twice from the same CSV: once as a pandas
# DataFrame and once through the CSV loader as LangChain documents.
tsla_stock = pd.read_csv("./data/TSLA.csv")
loader = csv_loader.CSVLoader(file_path="./data/TSLA.csv")
data = loader.load()

# Show the first loaded document as a sanity check.
data[0]

Document(page_content='Date: 2010-06-29\nOpen: 1.266667\nHigh: 1.666667\nLow: 1.169333\nClose: 1.592667\nAdj Close: 1.592667\nVolume: 281494500', metadata={'source': './data/TSLA.csv', 'row': 0})

# Split docs and store in FAISS Vector Database for 10k

In [4]:
# Split the 10-K into chunks of size 128 with an overlap of 64; the splitter
# is built via the tiktoken encoder and records each chunk's start index.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=128,
    chunk_overlap=64,
    add_start_index=True,
)
docs_split = text_splitter.split_documents(docs_10k)
print(len(docs_split))

2530


In [5]:
# Embedding model shared by the FAISS index and the similarity-based router.
# Model files are kept under the local cache_folder given below.
embedding_function = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        cache_folder="./models/sentencetransformers"
    )

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [9]:
# Build a FAISS vector store from the 10-K chunks using embedding_function,
# then persist it to ./data/faiss.
db = FAISS.from_documents(docs_split, embedding_function)
db.save_local("./data/faiss")

# Build Static Knowledge Base of Stock Data

# Create CSV Agent

# Build Question Bank and Similarity Functions to Help Route Questions

In [17]:
# Question templates, grouped by topic. Each template carries a
# {company_name} placeholder that is filled in with str.format at routing time.

# Questions about what the company is and does.
COMPANY_DETAILS_QUESTIONS = [
    "Describe {company_name}'s business?",
    "What sector or industry does {company_name} operate in?",
    "What market does {company_name} serve?",
    "What products does {company_name} offer?",
    "What services does the {company_name} offer?",
    "Who are the {company_name}'s clients or customers?",
    "Who are the {company_name}'s suppliers?",
    "Who are the {company_name}'s vendors?",
    "Who are the {company_name}'s competitors?",
]

# Equity research questions (other research topics can follow the same format).
COVERAGE_QUESTIONS = [
    "What is the analyst coverage for {company_name}?",
    "What is the analyst rating for {company_name}?",
    "What is the analyst price target for {company_name}?",
]

# Questions about reported financial figures.
FINANCIALS_QUESTIONS = [
    "What is the revenue for {company_name}?",
    "What is the net income for {company_name}?",
    "What is the operating income for {company_name}?",
    "What's {company_name}'s EBITDA?",
    "What is the gross profit for {company_name}?",
    "What's {company_name}'s gross margin like?",
    "How much cash does {company_name} have?",
    "How much revenue did {company_name} earn in the last quarter?",
]

# All question banks, in the order: company details, coverage, financials.
QUESTION_LISTS = [
    COMPANY_DETAILS_QUESTIONS,
    COVERAGE_QUESTIONS,
    FINANCIALS_QUESTIONS,
]

In [18]:
# Initialize the lemmatizer and the set of English stop words once at module
# level; get_max_rouge_f1 reads both as globals when cleaning text.
# NOTE(review): presumably requires the NLTK 'wordnet' and 'stopwords' corpora
# to be installed already — no nltk.download call is visible in this notebook.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def _normalize_for_rouge(text):
    """Tokenize ``text``, drop stop words, lemmatize what is left, and re-join with spaces."""
    tokens = word_tokenize(text)
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(kept)


def get_max_rouge_f1(prompt, question_list, company_name, lower=True):
    """Find the question template that best matches ``prompt`` by ROUGE-1 F1.

    Both the prompt and every formatted question are tokenized, stripped of
    stop words and lemmatized before scoring.

    Args:
        prompt: The user question to route.
        question_list: Iterable of template strings containing a
            ``{company_name}`` placeholder.
        company_name: Value substituted into each template.
        lower: When True (default), lowercase the prompt and the templates
            before cleaning. Templates are lowercased *before* the company
            name is substituted, so the returned question keeps
            ``company_name`` in its original case.

    Returns:
        A ``(max_score, best_question)`` tuple: the highest ROUGE-1 F-measure
        and the formatted question that achieved it. If no question scores
        above zero (including an empty ``question_list``), returns ``(0, '')``.
    """
    # Only the ROUGE-1 F-measure is consulted below, so it is the only metric requested.
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    if lower:
        prompt = prompt.lower()
        question_list = [question.lower() for question in question_list]
    prompt_clean = _normalize_for_rouge(prompt)

    max_score = 0
    best_question = ''
    for question in question_list:
        question = question.format(company_name=company_name)
        question_clean = _normalize_for_rouge(question)
        f1 = scorer.score(prompt_clean, question_clean)['rouge1'].fmeasure
        # Strict comparison: on ties the earliest question in the list wins.
        if f1 > max_score:
            max_score = f1
            best_question = question
    return max_score, best_question

In [19]:
def get_max_similarity(prompt, question_list, company_name):
    """Find the question template most similar to ``prompt`` by embedding cosine similarity.

    Args:
        prompt: The user question to route.
        question_list: List of template strings containing a
            ``{company_name}`` placeholder.
        company_name: Value substituted into each template.

    Returns:
        A ``(max_score, best_question)`` tuple: the highest cosine similarity
        between the prompt embedding and any formatted question embedding, and
        the formatted question that achieved it. Returns ``(0, '')`` when
        ``question_list`` is empty.
    """
    # Nothing to compare against: return the same "no match" value that
    # get_max_rouge_f1 uses instead of raising on empty input.
    if not question_list:
        return 0, ''

    prompt_embedding = embedding_function.embed_documents([prompt])
    questions = [template.format(company_name=company_name) for template in question_list]
    question_embeddings = embedding_function.embed_documents(questions)

    # Comparing one prompt against all questions; with a single prompt row the
    # flat argmax index is also the index into ``questions``.
    similarity = cosine_similarity(prompt_embedding, question_embeddings)

    best_index = int(np.argmax(similarity))
    max_score = np.max(similarity)
    return max_score, questions[best_index]

In [20]:
# Score one sample question against each question bank with the ROUGE router.
question = "How much was Tesla's revenue for the 2023 fiscal year?"

print(f"company: {get_max_rouge_f1(question, COMPANY_DETAILS_QUESTIONS, company_name='Tesla')}")
print(f"FINANCIALS_QUESTIONS: {get_max_rouge_f1(question, FINANCIALS_QUESTIONS, company_name='Tesla')}")
print(f"COVERAGE_QUESTIONS: {get_max_rouge_f1(question, COVERAGE_QUESTIONS, company_name='Tesla')}")

company: (0.4, "who are the Tesla's suppliers?")
FINANCIALS_QUESTIONS: (0.4615384615384615, 'how much revenue did Tesla earn in the last quarter?')
COVERAGE_QUESTIONS: (0.2, 'what is the analyst coverage for Tesla?')
