In [1]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import pandas as pd

In [2]:
# Strip the columns that should not reach the vector store, then write the
# trimmed table to a separate CSV that the document loader reads.
columns_to_drop = [
    'name',
    'email_address',
    'd4_staff_member',
    'constituent_email_2',
    'd4_response_2',
]
d4_emails_df = pd.read_csv('../resources/d4_emails_topics.csv').drop(columns=columns_to_drop)
# index=False keeps the pandas row index out of the output file.
d4_emails_df.to_csv('../resources/d4_emails_responses.csv', index=False)

In [3]:
# Create a document loader for the trimmed emails/responses CSV
# (d4_emails_responses.csv).
loader = CSVLoader('../resources/d4_emails_responses.csv', encoding='utf-8')

# Load the documents from the CSV
data = loader.load()

# Print the first loaded document as a quick sanity check.
# Note: this raises IndexError if the CSV produced no documents.
print(data[0])

page_content='affected_address: 6864 East Bucknell Place
case_number: 0
date: 2024-08-05
constituent_email_1: The lack of police presence and code enforcement is sending a growing message that these violations are not important…and that reckless behavior is not of great concern. Second item: affordable denver and wanting more information about how the tax will accomplish the goals set by Mayor.
d4_response_1: Good morning Ron, 

Thank you for reaching out, and I apologize for the delayed response. Council Pro Tem Romero Campbell maintains regular communication with DPD District 3, which serves Southeast Denver. We have a strong relationship with Commander Bell and Chief Thomas, consistently supporting DPD's resource and policy needs. With budget season approaching, we carefully consider input from our officers during council votes.
We also attend monthly community advisory board meetings to address concerns. For more details on DPD’s recruitment and specific traffic enforcement, I reco

In [4]:
# Chunking parameters for the text splitter
chunk_size, chunk_overlap = 5000, 100

# Build the splitter; candidate separators are newline, space, then the
# empty string, in that order.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n", " ", ""],
)

# Split the loaded documents into chunks and show the text of the first one
docs = splitter.split_documents(data)
print(docs[0].page_content)

affected_address: 6864 East Bucknell Place
case_number: 0
date: 2024-08-05
constituent_email_1: The lack of police presence and code enforcement is sending a growing message that these violations are not important…and that reckless behavior is not of great concern. Second item: affordable denver and wanting more information about how the tax will accomplish the goals set by Mayor.
d4_response_1: Good morning Ron, 

Thank you for reaching out, and I apologize for the delayed response. Council Pro Tem Romero Campbell maintains regular communication with DPD District 3, which serves Southeast Denver. We have a strong relationship with Commander Bell and Chief Thomas, consistently supporting DPD's resource and policy needs. With budget season approaching, we carefully consider input from our officers during council votes.
We also attend monthly community advisory board meetings to address concerns. For more details on DPD’s recruitment and specific traffic enforcement, I recommend reaching

In [5]:
# Embedding model used to vectorize the chunks
embeddings = OpenAIEmbeddings()

# Build the Chroma vector store from the chunks, backed by ../chroma_db
vector_store = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory='../chroma_db',
)

# NOTE(review): no explicit vector_store.persist() call is made here;
# presumably the store is written to persist_directory automatically —
# confirm for the installed Chroma version.
# NOTE(review): re-running this cell against an existing ../chroma_db may
# add the same chunks again — verify before re-executing.

  embeddings = OpenAIEmbeddings()
