In [None]:
# imports
import ast  # for converting embeddings saved as strings back to arrays
import openai  # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
import tiktoken  # for counting tokens
from scipy import spatial # for calculating vector similarities for search
import os
 
# Read the OpenAI API key from the environment; this raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Presumably the CSV file that vectorized output is written to; it is not referenced in the cells shown here — TODO confirm.
VECTOR_OUTPUT_FILE = "vectorized_beige.csv"

# Model identifiers
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"

In [2]:
from langchain.document_loaders import NotionDirectoryLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [3]:
# Headers in the Beige documents, as (markdown marker, metadata key) pairs.
#
# Only ONE pair per marker is listed on purpose: MarkdownHeaderTextSplitter
# uses the first pair whose marker matches a header line, so additional pairs
# for the same marker (previously ("#", "District:"), ("##", "District:") and
# ("##", "Overall Economic Activity:")) never took effect. As a consequence
# every "#" header, including the district headers, is stored under the
# "Overall" key and every "##" header under the "Category:" key.
headers_to_split_on = [
    ("#", "Overall"),
    ("##", "Category:"),
    ("###", "Period:"),
]

In [4]:
# Read the whole document into the memory

def read_large_file_into_buffer(file_path, encoding="utf-8"):
    """Read an entire text file into memory and return it as one string.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The complete contents of the file as a ``str``.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
        UnicodeDecodeError: If the file's bytes are not valid in ``encoding``.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.read()

# Load the combined Beige text file; the relative path is resolved against the current working directory.
file_content = read_large_file_into_buffer("combined-beige.txt")

In [5]:
# Break the loaded text into chunks at the configured markdown headers
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(file_content)

In [6]:
# Show the number of chunks produced by the split

len(md_header_splits)

756

In [7]:
# Inspect the first chunk

md_header_splits[0]

Document(page_content="Period: This report was prepared based on information collected from December 01,2022 to January 8, 2023.\nOverall economic activity was relatively unchanged since the previous report. Five Districts reported slight or modest increases in overall activity, six noted no change or slight declines, and one cited a significant decline. On balance, contacts generally expected little growth in the months ahead. Consumer spending increased slightly, with some retailers reporting more robust sales over the holidays. Other retailers noted that high inflation continued to reduce consumers' purchasing power, particularly among low- and moderate-income households. Auto sales were flat on average, but some dealers noted that increased vehicle availability had boosted sales. Tourism contacts reported moderate to robust activity augmented by strong holiday travel. Manufacturers indicated that activity declined modestly on average, and, in many Districts, reported that supply ch

In [8]:
# Inspect the first chunk's page content

md_header_splits[0].page_content

"Period: This report was prepared based on information collected from December 01,2022 to January 8, 2023.\nOverall economic activity was relatively unchanged since the previous report. Five Districts reported slight or modest increases in overall activity, six noted no change or slight declines, and one cited a significant decline. On balance, contacts generally expected little growth in the months ahead. Consumer spending increased slightly, with some retailers reporting more robust sales over the holidays. Other retailers noted that high inflation continued to reduce consumers' purchasing power, particularly among low- and moderate-income households. Auto sales were flat on average, but some dealers noted that increased vehicle availability had boosted sales. Tourism contacts reported moderate to robust activity augmented by strong holiday travel. Manufacturers indicated that activity declined modestly on average, and, in many Districts, reported that supply chain disruptions had ea

In [9]:
# Inspect the first chunk's metadata

md_header_splits[0].metadata

{'Overall': 'Overall', 'Category:': 'Overall Economic Activity:'}

In [10]:
# Merge each chunk's metadata back into its page content so that the metadata is
# embedded as well and becomes searchable in the similarity search.
# The prefix is only added when it is not already present, so re-running this
# cell does not stack a second copy of the metadata onto every chunk.

for chunk in md_header_splits:
    metadata_prefix = str(chunk.metadata) + "\n"
    if not chunk.page_content.startswith(metadata_prefix):
        chunk.page_content = metadata_prefix + chunk.page_content

In [11]:
# Inspect the first chunk again: its page content now starts with the metadata dict
md_header_splits[0]

Document(page_content="{'Overall': 'Overall', 'Category:': 'Overall Economic Activity:'}\nPeriod: This report was prepared based on information collected from December 01,2022 to January 8, 2023.\nOverall economic activity was relatively unchanged since the previous report. Five Districts reported slight or modest increases in overall activity, six noted no change or slight declines, and one cited a significant decline. On balance, contacts generally expected little growth in the months ahead. Consumer spending increased slightly, with some retailers reporting more robust sales over the holidays. Other retailers noted that high inflation continued to reduce consumers' purchasing power, particularly among low- and moderate-income households. Auto sales were flat on average, but some dealers noted that increased vehicle availability had boosted sales. Tourism contacts reported moderate to robust activity augmented by strong holiday travel. Manufacturers indicated that activity declined m

In [13]:
# Initialize the embedding function.
# The model is passed explicitly so that the EMBEDDING_MODEL constant from the
# configuration cell is what controls which embedding model is used.

from langchain.embeddings.openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings(model=EMBEDDING_MODEL)


In [14]:
# Do the embedding and persist the embedded info into the Chroma vector DB

from langchain.vectorstores import Chroma

# NOTE(review): no visible cell defines `persist_directory` (execution count
# In [12] is missing), so this cell failed with NameError on a fresh kernel.
# The directory name below is an assumption — point it at the intended
# Chroma persistence directory.
persist_directory = "chroma_beige"

vectordb = Chroma.from_documents(
    documents=md_header_splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [15]:
# Check whether the number of embedded entries matches the number of documents.
# `_collection` is an underscore-prefixed attribute of the vector store, i.e. by
# Python convention not part of its public interface.

print(vectordb._collection.count())
len(md_header_splits)

756


756

In [16]:
# Run a similarity search: ask the Chroma vector DB for the 12 documents (k=12) most similar to the question

question = "Tell me the Federal Reserve Bank of San Francisco District labor markets situation in the month of January"
docs = vectordb.similarity_search(question,k=12)

In [17]:
# Check the number of documents returned by the Chroma DB (compare against the k=12 requested in the search)
len(docs)

12

In [18]:
# Inspect the first document in the returned list

docs[0]

Document(page_content="{'Overall': 'District: Federal Reserve Bank of San Francisco', 'Category:': 'Category: Labor Markets'}\nPeriod: This report was prepared based on information collected from December 01,2022 to January 8, 2023.\nEmployment levels grew at a modest pace during the reporting period as labor availability improved across the District. Job turnover and voluntary quits reportedly fell in recent weeks, and hiring difficulties eased in consumer services sectors such as retail, food services, and hospitality. Contacts reported strong competition for labor and difficulties attracting experienced talent in health care, legal services, manufacturing, and skilled trades. Several real estate firms and mortgage providers reported reducing the number of open positions in response to moderating demand and noted that recent hiring freezes and layoffs in the technology sector improved the size and quality of the applicant pool. Contacts in Alaska and Hawaii continued to report challe