In [None]:
from llama_index.core import (
    VectorStoreIndex,
    SummaryIndex,
    SimpleKeywordTableIndex,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    load_index_from_storage,
    Settings,
    Document
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from collections import defaultdict
import pandas as pd

In [None]:
import os
from getpass import getpass

# Never store the API key in the notebook itself: use the key already present in
# the environment, and only prompt (without echoing the input) when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Default LLM for all llama_index components; temperature=0 is requested here.
Settings.llm = OpenAI(temperature=0, model="gpt-4o")

In [None]:
# Load the folder manifest from the Excel file: one row per folder to index.
excel_file_path = 'CLM_Agents_AutoGen.xlsx'
df = pd.read_excel(excel_file_path)

# Fail fast, with a readable message, if the columns read by the indexing loop
# ('folder_name' and 'data_folder_path') are absent from the sheet.
required_columns = {'folder_name', 'data_folder_path'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(
        f"{excel_file_path} is missing required column(s): {sorted(missing_columns)}"
    )

In [None]:
# For every row of the manifest DataFrame: load the folder's files, merge the
# loaded documents per source file, then build and persist a vector index and a
# summary index under ./storage/<folder_name>_vector and ..._summary.
for index, row in df.iterrows():
    folder_name = row['folder_name']
    data_folder_path = row['data_folder_path']
    # f-string instead of "+" so a non-string folder name (e.g. a number read
    # from Excel) does not raise TypeError; the path is unchanged for strings.
    index_storage_path = f"./storage/{folder_name}"

    # Keep regular files only: os.listdir() also returns sub-directories, and
    # SimpleDirectoryReader rejects input_files entries that are not files.
    # Sorting makes the processing order deterministic across runs.
    this_files = sorted(os.listdir(data_folder_path))
    this_input_files = [
        path
        for path in (os.path.join(data_folder_path, f) for f in this_files)
        if os.path.isfile(path)
    ]

    # An empty file list makes SimpleDirectoryReader raise; skip this folder
    # (visibly) rather than aborting the remaining folders.
    if not this_input_files:
        print(f"Skipped folder (no files found): {folder_name}")
        continue

    # Load documents from the specified folder
    this_documents = SimpleDirectoryReader(input_files=this_input_files).load_data()

    # Group the loaded documents by the 'file_name' entry of their metadata.
    # NOTE(review): documents without that key are all grouped under None.
    grouped_documents = defaultdict(list)
    for doc in this_documents:
        grouped_documents[doc.metadata.get('file_name')].append(doc)

    # One combined Document per source file: texts are joined with a blank line
    # and only 'file_name' is carried over as metadata.
    combined_documents = [
        Document(
            text="\n\n".join(doc.text for doc in docs),
            metadata={"file_name": file_name},
        )
        for file_name, docs in grouped_documents.items()
    ]

    # Create and persist the vector store index
    vector_index = VectorStoreIndex.from_documents(combined_documents)
    vector_index.storage_context.persist(persist_dir=index_storage_path + "_vector")

    # Create and persist the summary index (separate directory from the vector index)
    summary_index = SummaryIndex.from_documents(combined_documents)
    summary_index.storage_context.persist(persist_dir=index_storage_path + "_summary")

    print(f"Processed folder: {folder_name}")

print("Indexing completed for all folders.")