In [1]:
#!pip install -U langchain-community
#!pip install sentence-transformers chroma
#!pip install langchain

In [2]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma
import os
import shutil
import pandas as pd

In [3]:
CHROMA_PATH = "chroma"
DATA_PATH = "data"

In [4]:
def load_documents2():
    loader = DirectoryLoader(DATA_PATH, glob="*.csv")
    documents = loader.load()
    return documents

In [5]:
# Function to load documents
def load_documents():
    documents = []
    for root, _, files in os.walk(DATA_PATH):  # Walk through the directory
        for file in files:  # Iterate over the files
            if file.endswith(".csv"):  # Check if the file is a CSV
                file_path = os.path.join(root, file)  # Get the full file path
                try:
                    #df = pd.read_csv(file_path, encoding='ISO-8859-1')
                    df = pd.read_csv(file_path, encoding='utf-8')  # Read the CSV file
                    for _, row in df.iterrows():  # Iterate over each row in the CSV
                        content = ' '.join(map(str, row))  # Convert row to string content
                        documents.append(Document(page_content=content))  # Append as Document
                except UnicodeDecodeError as e:  # Handle possible encoding errors
                    print(f"Error loading file {file_path}: {e}")  # Print error message
    return documents  # Return the list of documents

In [6]:
def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Print the content and metadata of one example chunk
    document = chunks[10]
    print(document.page_content)
    print(document.metadata)

    return chunks

In [7]:
def save_to_chroma(chunks: list[Document]):
    # Clear out the database first
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new DB from the documents
    db = Chroma.from_documents(
        chunks, HuggingFaceEmbeddings(), persist_directory=CHROMA_PATH
    )
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")


In [8]:
documents = load_documents()

In [9]:
chunks = split_text(documents)

Split 3020 documents into 25007 chunks.
Le cyborg dans la mégapole en ruine Le cyborg restaure l'ordre en franchissant 7 niveaux urbains, piratant des systèmes de sécurité, désactivant des drones, traversant des bâtiments effondrés, déjouant des robots, reprogrammant des systèmes de défense, sauvant des civils et combattant un boss
{'start_index': 0}


In [None]:
save_to_chroma(chunks)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
print("OK")