In [None]:
# Estimate the rough cost of doing semantic chunking

In [48]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
import tiktoken
import os
from dotenv import load_dotenv
import pandas as pd

# Populate the process environment from the .env file two directories up,
# then pull the OpenAI API key out of it (None when the variable is unset).
load_dotenv(dotenv_path="../../.env")
openai_api_key = os.environ.get("OPENAI_API_KEY")

In [36]:
# Read in transcript files (one average-sized and the largest)

In [37]:
# Load the two sample transcripts used for the cost estimate and chunking run.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (assumes the transcripts are UTF-8 — TODO confirm).
with open("../data/dss-non-medicaid-pt2.txt", encoding="utf-8") as f:
    largest_transcript = f.read()
with open("../data/ltgov-dhss-p1.txt", encoding="utf-8") as f:
    average_transcript = f.read()

# Label -> full transcript text; later cells look transcripts up by these keys.
transcripts = {
    'largest_transcript': largest_transcript,
    'average_transcript': average_transcript,
}

In [38]:
# Estimate text splitting cost for documents

In [39]:
# Per-token embedding price for each candidate model, written as
# (price per 1,000,000 tokens) / 1,000,000.
# Presumably USD per OpenAI's published pricing — verify against current rates.
embedding_cost = {
    'text-embedding-3-small': .02/1000000,
    'text-embedding-3-large': .13/1000000
}

# The tokenizer lookup does not depend on the transcript, so resolve it once
# instead of on every loop iteration. The same token count is used to price
# both models below.
encoding = tiktoken.encoding_for_model('text-embedding-3-small')

# NOTE(review): this prices a single pass over each transcript. SemanticChunker
# may embed overlapping sentence windows, so real usage could be higher — confirm.
for transcript, text in transcripts.items():
    num_tokens = len(encoding.encode(text))
    print(f"{transcript} ({num_tokens} tokens)")
    # Iterate the price table itself so the model list cannot drift from it.
    for model, cost_per_token in embedding_cost.items():
        print(f"* {model} estimated cost: {cost_per_token * num_tokens}")

largest_transcript (49384 tokens)
* text-embedding-3-small estimated cost: 0.0009876800000000001
* text-embedding-3-large estimated cost: 0.00641992
average_transcript (17929 tokens)
* text-embedding-3-small estimated cost: 0.00035858
* text-embedding-3-large estimated cost: 0.00233077


In [15]:
# Instantiate Text Splitter

In [41]:
# Build the semantic splitter on top of OpenAI embeddings.
# The model is pinned explicitly: without `model=`, OpenAIEmbeddings falls back
# to its library default, which is neither of the text-embedding-3 models costed
# in this notebook nor the "largest model" the chunking step says it uses.
text_splitter = SemanticChunker(
    OpenAIEmbeddings(model="text-embedding-3-large", openai_api_key=openai_api_key)
)

In [40]:
# Execute chunking on longer transcript with largest model

In [43]:
# Chunk the largest transcript; create_documents takes a list of texts, so the
# single transcript is wrapped in a one-element list.
# Actual cost observed on the OpenAI dashboard for this run: <$0.01
docs = text_splitter.create_documents(
    texts=[transcripts['largest_transcript']],
)

In [51]:
# Convert chunks into a DataFrame and export a CSV for labeling to facilitate evals

In [54]:
# One row per chunk: collect each document's text under a single 'chunk' column.
data = {'chunk': [doc.page_content for doc in docs]}

# Write the chunks next to the source transcript, without the index column,
# so the CSV holds only the chunk text.
df = pd.DataFrame(data)
df.to_csv("../data/dss-non-medicaid-pt2.csv", index=False)