In [1]:
import os
from langchain.document_loaders import PyPDFDirectoryLoader, CSVLoader, JSONLoader
from langchain.text_splitter import CharacterTextSplitter

In [2]:
def load_documents(data_path, json_jq_schema="."):
  """
  Load documents from the per-format subdirectories of ``<data_path>/raw_data``.

  Three subdirectories are consulted; each one is skipped when it is not an
  existing directory:

    * ``PDF Files``  - passed as a whole to ``PyPDFDirectoryLoader``.
    * ``CSV Files``  - every ``*.csv`` file gets its own ``CSVLoader``.
    * ``JSON Files`` - every ``*.json`` file gets its own ``JSONLoader``.

  ``CSVLoader`` and ``JSONLoader`` each read one file, so the CSV and JSON
  directories are expanded into their files here instead of handing the
  directory path to the loader.

  Args:
    data_path: The root directory containing the ``raw_data`` folder.
    json_jq_schema: jq expression forwarded to ``JSONLoader`` to select the
      content of each JSON file. The default ``"."`` selects the whole file.

  Returns:
    List of Document objects: PDF documents first, then CSV, then JSON. Within
    the CSV and JSON groups, files are visited in sorted filename order.
  """
  raw_root = os.path.join(data_path, "raw_data")
  documents = []

  # PDF files: this loader accepts a directory directly.
  pdf_path = os.path.join(raw_root, "PDF Files")
  if os.path.isdir(pdf_path):
    documents.extend(PyPDFDirectoryLoader(pdf_path).load())

  # CSV files: one loader per file.
  csv_path = os.path.join(raw_root, "CSV Files")
  for csv_file in _files_with_extension(csv_path, ".csv"):
    documents.extend(CSVLoader(csv_file).load())

  # JSON files: one loader per file. jq_schema is a required argument, and
  # text_content=False lets the selected content be a non-string JSON value.
  json_path = os.path.join(raw_root, "JSON Files")
  for json_file in _files_with_extension(json_path, ".json"):
    json_loader = JSONLoader(json_file, jq_schema=json_jq_schema, text_content=False)
    documents.extend(json_loader.load())

  return documents


def _files_with_extension(directory, extension):
  """Return sorted paths of regular files in `directory` ending in `extension` (case-insensitive)."""
  if not os.path.isdir(directory):
    return []
  candidates = (os.path.join(directory, name) for name in sorted(os.listdir(directory)))
  return [
    path for path in candidates
    if path.lower().endswith(extension) and os.path.isfile(path)
  ]


In [3]:
def save_documents(documents, output_dir=None):
    """
    Save each document's text as ``document_<i>.txt`` in an output directory.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string.
        output_dir: Directory that receives the text files; it is created if
            missing. When omitted, the notebook-level ``PROCESSED_DATA_PATH``
            is used.

    Returns:
        None. Files are written as a side effect; an existing file with the
        same name is overwritten.
    """
    if output_dir is None:
        # Looked up at call time, so the constant may be defined in a cell
        # that runs after this definition but before the first call.
        output_dir = PROCESSED_DATA_PATH

    os.makedirs(output_dir, exist_ok=True)
    for index, doc in enumerate(documents):
        target = os.path.join(output_dir, f"document_{index}.txt")
        # Explicit UTF-8 so non-ASCII text does not depend on the platform's
        # default encoding.
        with open(target, "w", encoding="utf-8") as handle:
            handle.write(doc.page_content)

In [4]:
# Notebook configuration: the root of the data tree and the folder that
# receives the extracted text files.
DATA_PATH = "data/"
PROCESSED_DATA_PATH = "data/processed_data/documents"

# Load the raw files found under DATA_PATH, then write one text file per
# loaded document into PROCESSED_DATA_PATH.
documents = load_documents(data_path=DATA_PATH)
save_documents(documents=documents)