In [None]:
%pip install Flask
%pip install llama-index
%pip install openai
%pip install python-dotenv

In [9]:
import os
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    PromptTemplate,
)
from llama_index.readers.file import CSVReader
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
)
from llama_index.llms.openai import OpenAI
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.node_parser import SimpleNodeParser
from dotenv import load_dotenv

# Load environment variables from the local env file. With python-dotenv's
# default settings, variables already set in the process environment are
# not overridden by the file.
load_dotenv(dotenv_path="./variable.env")


# Retrieve the OpenAI API key and paths from environment variables
api_key = os.getenv("OPENAI_API_KEY")      # credential for the OpenAI client
persist_dir = os.getenv("Storage")         # root directory for persisted indexes
documents_path = os.getenv("DATA_path")    # root directory holding the source data

# Fail fast with a readable message. Without this check a missing key
# surfaces as an opaque TypeError on the assignment below, a missing
# DATA_path makes os.listdir(None) silently enumerate the current working
# directory, and a missing Storage path only fails after the first batch of
# embeddings has already been computed.
missing_vars = [
    var_name
    for var_name, var_value in (
        ("OPENAI_API_KEY", api_key),
        ("Storage", persist_dir),
        ("DATA_path", documents_path),
    )
    if not var_value
]
if missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_vars)
        + " (expected in ./variable.env or the process environment)"
    )

# Re-export the key so libraries that read it from the environment see it.
os.environ["OPENAI_API_KEY"] = api_key

# Embedding model shared by every subdirectory processed below
embed_model = OpenAIEmbedding()

# Map the .csv extension to a dedicated reader so the directory loader
# parses CSV files with it instead of its default handling
csv_reader = CSVReader()
file_extractor = {}
file_extractor[".csv"] = csv_reader

# Build and persist one vector index per subdirectory of the DATA directory.
#
# Neither splitter depends on the subdirectory being processed, so both are
# constructed once here instead of on every loop iteration.
splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,
    embed_model=embed_model,
)

# Baseline sentence splitter; not used by the loop below, but kept defined
# so any later code that compares against it keeps working.
base_splitter = SentenceSplitter(chunk_size=512)

for subdir in os.listdir(documents_path):
    subdir_path = os.path.join(documents_path, subdir)

    # Only directories are indexed; loose files at the top level are skipped.
    if not os.path.isdir(subdir_path):
        continue

    # Load every document under the subdirectory (recursively), routing
    # .csv files through the configured CSV reader.
    documents = SimpleDirectoryReader(
        subdir_path, file_extractor=file_extractor, recursive=True
    ).load_data()

    # Split the documents into nodes with the semantic splitter, then
    # build the vector index over those nodes.
    nodes = splitter.get_nodes_from_documents(documents)
    index = VectorStoreIndex(nodes)

    # Create a storage path for the current subdirectory
    subdir_storage_path = os.path.join(persist_dir, subdir)
    os.makedirs(subdir_storage_path, exist_ok=True)

    # Persist the vector index to the subdirectory's storage path
    index.storage_context.persist(persist_dir=subdir_storage_path)
