In [None]:
# Identify the rough cost of semantic chunking

In [5]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
import tiktoken
import os
from dotenv import load_dotenv
import pandas as pd

# Pull variables from the repository-level .env file into the process environment,
# then read the OpenAI key from it (None when the variable is not set).
load_dotenv("../../.env")
openai_api_key = os.environ.get("OPENAI_API_KEY")

In [36]:
# Read in files (average size and largest)

In [37]:
# Label -> path of the transcript text files used for the cost estimate.
transcript_paths = {
    'largest_transcript': "../data/dss-non-medicaid-pt2.txt",
    'average_transcript': "../data/ltgov-dhss-p1.txt",
}

# Read each file fully into memory, keyed by its label.
transcripts = {}
for label, path in transcript_paths.items():
    with open(path) as f:
        transcripts[label] = f.read()

# Keep the individual names available at notebook scope as well.
largest_transcript = transcripts['largest_transcript']
average_transcript = transcripts['average_transcript']

In [38]:
# Estimate text splitting cost for documents

In [39]:
# Price per single token for each embedding model, written as
# (price per 1,000,000 tokens) / 1,000,000 -- presumably USD; confirm against current pricing.
embedding_cost = {
    'text-embedding-3-small': .02/1000000,
    'text-embedding-3-large': .13/1000000
}

# The tokenizer does not depend on the transcript, so look it up once rather than per iteration.
# NOTE(review): the text-embedding-3-small tokenizer is used to count tokens for both models --
# assumes they tokenize identically; confirm.
encoding = tiktoken.encoding_for_model('text-embedding-3-small')

# NOTE(review): this prices one pass over each transcript's tokens. No model is passed to
# OpenAIEmbeddings when the splitter is created, so confirm the model actually billed is one of
# those priced here, and that the splitter embeds each token only once.
for transcript, text in transcripts.items():
    num_tokens = len(encoding.encode(text))
    print(f"{transcript} ({num_tokens} tokens)")
    # Iterate the price table itself so every priced model is reported.
    for model, cost_per_token in embedding_cost.items():
        print(f"* {model} estimated cost: {cost_per_token * num_tokens}")

largest_transcript (49384 tokens)
* text-embedding-3-small estimated cost: 0.0009876800000000001
* text-embedding-3-large estimated cost: 0.00641992
average_transcript (17929 tokens)
* text-embedding-3-small estimated cost: 0.00035858
* text-embedding-3-large estimated cost: 0.00233077


In [15]:
# Instantiate Text Splitter

In [6]:
# Build the semantic splitter on top of an OpenAI embeddings client.
# No model name is passed here, so OpenAIEmbeddings uses whatever its default model is.
text_splitter = SemanticChunker(
    OpenAIEmbeddings(openai_api_key=openai_api_key)
)

In [40]:
# Execute chunking on the largest transcript (text_splitter uses the OpenAIEmbeddings default model, since none is specified)

In [43]:
# Split the largest transcript into semantic chunks; create_documents takes a list of texts,
# so the single transcript is wrapped in a one-element list.
docs = text_splitter.create_documents(
    [transcripts['largest_transcript']]
)
# actual cost based on looking at the dashboard: <$0.01

In [51]:
# Convert the chunks into a DataFrame and export it to CSV for labeling, to facilitate evals

In [54]:
# One row per chunk: the single 'chunk' column holds each document's text.
data = {'chunk': [doc.page_content for doc in docs]}

# Write the chunks out without the DataFrame index column.
df = pd.DataFrame(data)
df.to_csv("../data/dss-non-medicaid-pt2.csv", index=False)

In [2]:
# Chunk the files with the most references to IT spend, according to ChatGPT

In [None]:
def chunk_file(file_name:str, data_dir:str="../data", splitter=None):
    """
    Semantically chunk one text file and write the chunks to a CSV file.

    Reads '<data_dir>/<file_name>.txt', splits it with the splitter, and writes the
    chunks (one per row, in a single 'chunk' column, no index) to
    '<data_dir>/<file_name>-chunked.csv'. Prints the output path when done and
    returns None.

    file_name: base name of the file, without directory or extension.  See example below
        - file_name: 'sa-dolir'
        - read path: '../data/sa-dolir.txt'
        - output path: '../data/sa-dolir-chunked.csv'
    data_dir: directory that holds the input file and receives the output file.
        Defaults to '../data'.
    splitter: object with a create_documents(list_of_texts) method returning items
        that expose a page_content attribute. Defaults to the notebook-level
        text_splitter.
    """
    # Fall back to the notebook-level splitter only when the caller did not supply one.
    if splitter is None:
        splitter = text_splitter
    # Read in the file; the encoding is explicit so the result does not depend on the platform default
    with open(f"{data_dir}/{file_name}.txt", encoding="utf-8") as f:
        txt_file = f.read()
    # Chunk text
    docs = splitter.create_documents([txt_file])
    # Transform into dataframe
    df = pd.DataFrame({'chunk': [doc.page_content for doc in docs]})
    # Output file to csv
    output_file_path = f"{data_dir}/{file_name}-chunked.csv"
    df.to_csv(output_file_path, index=False)
    print(f"Chunked file successfully output to {output_file_path}")

In [10]:
# Chunk the sa-dolir transcript through the chunk_file helper rather than repeating its
# read / split / export steps inline.
# Reads '../data/sa-dolir.txt' and writes '../data/sa-dolir-chunked.csv'.
chunk_file("sa-dolir")

In [8]:
# Load the ded transcript and split it into semantic chunks.
# `docs` is kept at notebook scope because the following cell reads it again.
with open("../data/ded.txt") as f:
    ded = f.read()
docs = text_splitter.create_documents([ded])

# One row per chunk, exported without the DataFrame index.
data = {'chunk': [doc.page_content for doc in docs]}
df = pd.DataFrame(data)
df.to_csv("../data/ded-chunked.csv", index=False)

Document(metadata={}, page_content="So, this is the core budget. This funds everything except Medicaid fraud compliance and our safe kit. Absent any questions, I'll move to page 18. This is an NDI for our Public Protection and Criminal Appeals Division. Last year, we had 473 appeals. We filed 412 briefs, argued more than 100 cases in the Missouri Court of Appeals and the Missouri Supreme Court. As of January 1st, 2025, we've got 1,367 pending appeals. As this body knows, we handle 100% of felony appeals. Cops catch bad guys, prosecutors lock them up, and then we defend those convictions on appeal on behalf of the state. So the volume and workload has increased over time. We have special prosecutors that prosecute across the state of Missouri. They civilly commit sexually violent predators. Most of the cases that we were involved in at the trial court level are child sex cases, sexual assault cases, and murders, so we're putting the worst of the worst away. In 2024, we had cases in 28 d

In [9]:
# Export whatever chunks are currently held in `docs` to the ded CSV.
# This repeats the export step at the end of the previous cell and overwrites the same file.
data = {'chunk': [doc.page_content for doc in docs]}
df = pd.DataFrame(data)
df.to_csv("../data/ded-chunked.csv", index=False)