In [None]:
from llama_index.core import (
    VectorStoreIndex,
    SummaryIndex,
    SimpleKeywordTableIndex,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    load_index_from_storage,
    Settings,
    Document
)

from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from collections import defaultdict

In [None]:
import os

# Fail fast, with an actionable message, when the OpenAI credential is absent.
# The original bare lookup raised the same exception type (KeyError) but with
# no explanation of what to do about it.
if "OPENAI_API_KEY" not in os.environ:
    raise KeyError(
        "OPENAI_API_KEY is not set; export it in the environment before running this notebook."
    )

# Register GPT-4o at temperature 0 as the LLM on llama_index's global Settings
# object, so later cells do not pass an LLM explicitly.
Settings.llm = OpenAI(temperature=0, model="gpt-4o")

In [None]:
import pandas as pd

# Empty results table: one row per data folder, one column per generated
# description. Rows are added by the folder loop in a later cell.
df = pd.DataFrame(
    columns=[
        "folder_name",
        "data_folder_path",
        "vector_description",
        "summary_description",
        "top_level_description",
    ]
)

In [None]:
# Walk the sub-folders of ./Data and generate three LLM-written descriptions
# for each one (vector, summary, top-level), then collect them into `df`.
DATA_DIR = "./Data"

# The three prompts differ only in the qualifier placed before "questions".
# Keeping one template stops the copies from drifting apart. The wording is
# reproduced exactly as sent before, so the LLM input is unchanged.
DESCRIPTION_PROMPT = (
    "What types of {kind}questions would the documents be useful for answering? "
    "Tell me in two sentences. "
    "The users are generally interested in Community-Led Monitoring. "
    "This information will help guide a chatbot to the correct information, "
    "so be sure to capture the definining topics in these documents. "
    "Be sure to emphasize the relevant geographies, document types, program areas, etc."
)


def _ask_usefulness(query_engine, kind=""):
    """Ask which `kind` of questions the indexed documents can answer.

    Args:
        query_engine: Object with a ``query(str)`` method whose result exposes
            a ``.response`` attribute (here, the engine built from the
            folder's VectorStoreIndex).
        kind: Qualifier inserted directly before the word "questions",
            including its trailing space ("specific ", "summary ") or ""
            for the unqualified prompt.

    Returns:
        The ``.response`` value of the query result.
    """
    return query_engine.query(DESCRIPTION_PROMPT.format(kind=kind)).response


# Collect rows in a list and build the frame once, instead of growing `df` one
# row at a time. This also makes the cell idempotent: re-running it replaces
# the rows rather than appending duplicates.
records = []

# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# row order reproducible across runs and machines.
for folder in sorted(os.listdir(DATA_DIR)):
    folder_path = DATA_DIR + "/" + folder  # relative path, e.g. "./Data/<folder>"

    # Skip stray files (e.g. .DS_Store, README): only directories are valid
    # input_dir values for SimpleDirectoryReader.
    if not os.path.isdir(folder_path):
        continue

    this_documents = SimpleDirectoryReader(input_dir=folder_path).load_data()
    # Merge everything loaded from the folder into a single Document so the
    # index is built over the folder's combined text.
    this_documents_joined = Document(text="\n\n".join(doc.text for doc in this_documents))
    this_index = VectorStoreIndex.from_documents([this_documents_joined])

    this_query_engine = this_index.as_query_engine()

    vector_description = _ask_usefulness(this_query_engine, "specific ")
    vector_description = "Useful for specific CLM questions related to " + folder + ". " + vector_description

    summary_description = _ask_usefulness(this_query_engine, "summary ")
    summary_description = "Useful for CLM questions that require a wholistic summary related to " + folder + ". " + summary_description

    top_level_description = _ask_usefulness(this_query_engine)
    top_level_description = top_level_description + " If using this tool, mention that the response comes from " + folder + " guidance."

    records.append({
        'folder_name': folder,
        'data_folder_path': folder_path,
        'vector_description': vector_description,
        'summary_description': summary_description,
        'top_level_description': top_level_description,
    })

# Reuse the column schema declared when `df` was created, so the column order
# is fixed even when no folders were found.
df = pd.DataFrame(records, columns=df.columns)

# Last expression of the cell: rendered by the notebook as a rich table.
df

In [None]:
# Write the generated descriptions to a spreadsheet in the working directory.
# index=False leaves the DataFrame's row index out of the sheet.
OUTPUT_XLSX = "CLM_Agents_AutoGen.xlsx"
df.to_excel(OUTPUT_XLSX, index=False)