In [1]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.documents.base import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from typing import List
from uuid import uuid4
import pandas as pd
import os


In [2]:
# Remove unwanted columns before loading into vectorstore
def preprocess_d4_emails(file_in: str, file_out: str, columns_to_drop=None) -> pd.DataFrame:
    """Drop unwanted columns from a D4 emails CSV and save the trimmed copy.

    Args:
        file_in: Path of the CSV file to read.
        file_out: Path the trimmed CSV is written to (index column omitted).
        columns_to_drop: Optional column name, or iterable of column names,
            to remove. When omitted, the original fixed set is dropped:
            'name', 'email_address', 'd4_staff_member',
            'constituent_email_2' and 'd4_response_2'.

    Returns:
        The DataFrame that was written to ``file_out``.

    Raises:
        KeyError: If a requested column is not present in ``file_in``
            (raised by ``DataFrame.drop``).
    """
    if columns_to_drop is None:
        columns_to_drop = ['name', 'email_address', 'd4_staff_member', 'constituent_email_2', 'd4_response_2']
    elif isinstance(columns_to_drop, str):
        # A bare string would otherwise be iterated character by character
        columns_to_drop = [columns_to_drop]

    d4_emails_df = pd.read_csv(file_in).drop(columns=list(columns_to_drop))
    d4_emails_df.to_csv(file_out, index=False)
    return d4_emails_df

# Split the D4 Emails into document chunks

In [3]:
# Split the emails into document chunks
def split_docs(csv_file_path, chunk_size=5000, chunk_overlap=100) -> tuple[List[Document], list[str]]:
    """Load a CSV file, split it into chunks and build one unique id per chunk.

    Args:
        csv_file_path: Path of the CSV file handed to ``CSVLoader``.
        chunk_size: Maximum chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.

    Returns:
        A ``(docs, uuids)`` tuple of equal length. Each id is
        ``<file stem>_<row>`` for the first chunk of a CSV row and
        ``<file stem>_<row>_<n>`` (n = 1, 2, ...) for any further chunks of
        that row, so ids stay unique when a long row is split into several
        chunks.
    """
    # Create a document loader for D4 Emails and load the rows
    loader = CSVLoader(csv_file_path, encoding='utf-8')
    data = loader.load()

    # Create an instance of the splitter class with the given chunk size and overlap
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # Split the emails into document chunks
    docs = splitter.split_documents(data)

    # Build the ids. Chunks cut from the same row share the same 'source' and
    # 'row' metadata, so a per-row counter is needed to keep the ids unique.
    uuids = []
    chunks_seen = {}
    for doc in docs:
        file_stem = os.path.splitext(os.path.basename(str(doc.metadata['source'])))[0]
        base_id = f"{file_stem}_{doc.metadata['row']}"
        chunk_no = chunks_seen.get(base_id, 0)
        chunks_seen[base_id] = chunk_no + 1
        # The first chunk keeps the plain id so single-chunk rows are unchanged
        uuids.append(base_id if chunk_no == 0 else f"{base_id}_{chunk_no}")

    return docs, uuids

# Load the vector store with the data from the preprocessed csv file

In [4]:
def get_vector_store(embeddings, persist_directory='../chroma_db'):
    """Open the Chroma vector store located at ``persist_directory``.

    Args:
        embeddings: Embedding function handed to Chroma as ``embedding_function``.
        persist_directory: Directory of the Chroma database on disk.

    Returns:
        The ``Chroma`` instance bound to that directory and embedding function.
    """
    store = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings,
    )
    return store

In [5]:
def load_vectorstore_docs(docs: List[Document], uuids: list[str], embeddings, persist_directory='../chroma_db'):
    """Embed the documents and store them in the Chroma DB at ``persist_directory``.

    Args:
        docs: Document chunks to store.
        uuids: Ids for the chunks, passed to Chroma as ``ids`` (one per document).
        embeddings: Embedding function used to embed the documents.
        persist_directory: Directory of the Chroma database on disk.

    Returns:
        The Chroma vector store on success. On failure the error is printed
        (not raised) and ``None`` is returned.
    """
    try:
        # A non-empty directory is reported as an update, anything else as a
        # fresh creation; both cases go through the same from_documents call.
        if os.path.exists(persist_directory) and os.listdir(persist_directory):
            print(f"Updating vector store at {persist_directory}", flush=True)
        else:
            print(f"Creating vector store at {persist_directory}", flush=True)

        # Create the vector_store with the documents and save it to disk
        return Chroma.from_documents(
            docs,
            embeddings,
            persist_directory=persist_directory,
            ids=uuids
        )
    except Exception as e:
        # Best effort: report the problem and let the caller check for None
        print(f"Error creating/updating vector store: {str(e)}", flush=True)
        return None
        
    

## Define variables for the functions

In [None]:
# Paths to the csv files used by the preprocessing step:
#   d4_emails_file           - input read by preprocess_d4_emails
#   d4_emails_responses_file - trimmed output written by preprocess_d4_emails
#                              and then loaded by split_docs
d4_emails_file = '../resources/d4_emails_topics.csv'
d4_emails_responses_file = '../resources/d4_emails_responses.csv'

# Embedding model instance passed to load_vectorstore_docs and get_vector_store
# NOTE(review): OpenAIEmbeddings presumably needs an OpenAI API key in the
# environment - confirm before running on a fresh machine
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")


## Preprocess the D4 emails and split them into document chunks

In [6]:
# Preprocess the input csv (d4_emails_file) and write the trimmed copy
# to d4_emails_responses_file
preprocess_d4_emails(d4_emails_file, d4_emails_responses_file)

# Split the preprocessed emails into document chunks with matching ids
docs, uuids = split_docs(d4_emails_responses_file)

# Sanity check: show the content and metadata of the first chunk
print(docs[0].page_content, docs[0].metadata)

  embeddings = OpenAIEmbeddings(model="text-embedding-3-large")


## Load the document chunks into the vector store

In [7]:
# NOTE: Running this cell more than once against the same Chroma DB kills the kernel.
# Delete the chroma_db directory (the default persist_directory is '../chroma_db')
# if you want to start from a clean slate.

# Embed the chunks from the preprocessed csv file and store them under their ids
load_vectorstore_docs(docs=docs, uuids=uuids, embeddings=embeddings)


Creating vector store at ../chroma_db


## Perform various queries on the vector store

In [9]:
# Open the vector store at the default persist_directory and
# get the first document from it by id
vector_store = get_vector_store(embeddings=embeddings)
vector_store.get([uuids[0]])

  return Chroma(embedding_function=embeddings, persist_directory=persist_directory)


{'ids': ['d4_emails_responses_0'],
 'embeddings': None,
 'documents': ["affected_address: 6864 East Bucknell Place\ncase_number: 0\ndate: 2024-08-05\nconstituent_email_1: The lack of police presence and code enforcement is sending a growing message that these violations are not important…and that reckless behavior is not of great concern. Second item: affordable denver and wanting more information about how the tax will accomplish the goals set by Mayor.\nd4_response_1: Good morning Ron, \n\nThank you for reaching out, and I apologize for the delayed response. Council Pro Tem Romero Campbell maintains regular communication with DPD District 3, which serves Southeast Denver. We have a strong relationship with Commander Bell and Chief Thomas, consistently supporting DPD's resource and policy needs. With budget season approaching, we carefully consider input from our officers during council votes.\nWe also attend monthly community advisory board meetings to address concerns. For more deta

In [10]:
# Query the vector store by csv row number, using a filter on the
# "row" metadata field
vector_store.get(where={"row": 0})

{'ids': ['d4_emails_responses_0'],
 'embeddings': None,
 'documents': ["affected_address: 6864 East Bucknell Place\ncase_number: 0\ndate: 2024-08-05\nconstituent_email_1: The lack of police presence and code enforcement is sending a growing message that these violations are not important…and that reckless behavior is not of great concern. Second item: affordable denver and wanting more information about how the tax will accomplish the goals set by Mayor.\nd4_response_1: Good morning Ron, \n\nThank you for reaching out, and I apologize for the delayed response. Council Pro Tem Romero Campbell maintains regular communication with DPD District 3, which serves Southeast Denver. We have a strong relationship with Commander Bell and Chief Thomas, consistently supporting DPD's resource and policy needs. With budget season approaching, we carefully consider input from our officers during council votes.\nWe also attend monthly community advisory board meetings to address concerns. For more deta

In [11]:
# Get the ids portion of the result returned by the query and take the first id
# (raises IndexError if no document matches the filter)
vector_store.get(where={"row": 1})['ids'][0]

'd4_emails_responses_1'

In [15]:
# Perform a similarity search and keep the 3 best matches
results = vector_store.similarity_search(
    "Dahlia & High Line intersection",
    k=3,
    # Uncomment to restrict the search to a single csv row:
    #filter={"row": 12},
)
# Print each match as its text followed by its metadata
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* affected_address: Dahlia & High Line intersection
case_number: 0
date: 2024-08-19
constituent_email_1: I would like to know if there is a possible to put a yellow flashing pedestrian light on Dahlia and the intersection of the Highline Canal.. Also is there aywya that a turn signal could be implemented at Hampden and Dahlia?
d4_response_1: Thank you for reaching out to our office.  The area engineers have looked at this intersection where the High Line Canal crosses Dahlia multiple times at our request and have determined that a flashing signal is not warranted at this location.  However, they are looking at better signage and clearer striping for this crossing.  Councilwoman Romero Campbell will continue to advocate for this in her quarterly meetings with the area engineers.

Please report the overgrown vegetation to 311.  By reporting this online the case will be reviewed by Forestry and put on the list for a trimming if necessary.

The intersection at Dahlia and Happy Canyon is be