In [None]:
import re
import os
import csv
import datetime
from datetime import datetime
import pandas as pd
from collections import Counter

# vector store set up  

import chromadb

from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS

#Langchain
from langchain.chains import RetrievalQAWithSourcesChain

# Sentence Transformers
from sentence_transformers import SentenceTransformer

In [None]:
def remove_chars_before_first_letter(input_string):
    """Drop everything that precedes the first alphabetic character.

    Returns the slice of ``input_string`` starting at its first character for
    which ``str.isalpha()`` is true. When the string contains no alphabetic
    character at all (including the empty string) it is returned unchanged.
    """
    first_letter_index = next(
        (position for position, symbol in enumerate(input_string) if symbol.isalpha()),
        None,
    )
    # Compare against None explicitly: index 0 is a valid (falsy) match.
    if first_letter_index is None:
        return input_string
    return input_string[first_letter_index:]

In [None]:
def parse_source(source):
    """Split a '/'-separated source string into (title, chapter, paragraph).

    The string is split on '/' and each used segment is cleaned with
    ``remove_chars_before_first_letter``. The segment before the first '/'
    (index 0) is never used -- presumably sources start with a leading '/';
    verify against the stored metadata.

    * title     -- segment 1, or "" when there is no '/'.
    * chapter   -- with four or more segments: segments 2 .. second-to-last,
                   each cleaned and re-joined with '/'. With exactly three
                   segments: segment 2. Otherwise "".
    * paragraph -- the last segment when there are four or more segments,
                   otherwise "".

    Example: "/1. Guide/2. Setup/2.1 Install/3. Step"
             -> ("Guide", "Setup/Install", "Step")
    """
    parts = source.split('/')
    title = remove_chars_before_first_letter(parts[1]) if len(parts) > 1 else ""

    if len(parts) > 3:
        # Everything between the title and the last segment forms the chapter path.
        chapter = "/".join(remove_chars_before_first_letter(part) for part in parts[2:-1])
        paragraph = remove_chars_before_first_letter(parts[-1])
    elif len(parts) == 3:
        # Title and chapter only: no paragraph segment.
        chapter = remove_chars_before_first_letter(parts[2])
        paragraph = ""
    else:
        chapter = ""
        paragraph = ""
    return title, chapter, paragraph

# Retrieve Data

In [None]:
# Location of the on-disk Chroma database and the device used for embedding.
# Both can be overridden through environment variables so the notebook is not
# tied to one machine; the defaults are the values that were hard-coded before.
CHROMA_DB_PATH = os.environ.get("CHROMA_DB_PATH", 'C:/Users/Nathan/Kratos_data-Science/Chroma/v7')
EMBEDDING_DEVICE = os.environ.get("EMBEDDING_DEVICE", 'cuda')

# `client` is kept as a second name bound to the same client object.
chroma_client = client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

# Sentence-embedding model used to re-embed every document further down.
model = SentenceTransformer('all-MiniLM-L6-v2', device=EMBEDDING_DEVICE)

In [None]:
# Show the collections currently stored in the persistent Chroma client
# (the value of this expression is the cell output).
chroma_client.list_collections()

In [None]:
# Drop the "EPOCH-T" collection left over from a previous run so the rebuild
# below starts from an empty target. delete_collection() raises when the
# collection does not exist, which would break a run against a fresh database,
# so only delete it when it is actually there.
# list_collections() yields Collection objects or plain names depending on the
# chromadb version; getattr(..., "name", entry) handles both.
existing_collection_names = {
    getattr(entry, "name", entry) for entry in chroma_client.list_collections()
}
if "EPOCH-T" in existing_collection_names:
    chroma_client.delete_collection(name="EPOCH-T")

In [None]:
# Open the source collection "EPOCH" whose documents are re-embedded below.
# The collection is requested with cosine distance ("hnsw:space": "cosine").
# NOTE(review): get_or_create_collection creates the collection when it is
# missing, so a wrong database path would yield an empty source rather than an
# error -- confirm this is intended.
vectorstore = chroma_client.get_or_create_collection(name="EPOCH",  metadata={"hnsw:space": "cosine"})

In [None]:
# Accumulators for the rebuilt collection: document texts, their embeddings,
# their metadata dicts and their ids -- four independent empty lists.
new_documents_list, new_embeddings_list, new_metadatas_list, new_ids_list = [], [], [], []


In [None]:
# Fetch the content of the source collection by calling get() with no
# arguments. The result is used below as a mapping: its items are counted in
# the next cells and its 'metadatas' and 'documents' entries feed the rebuild.
old_documents = vectorstore.get()

In [None]:
# Quick sanity check: the container type returned by get() and the number of
# top-level entries it holds (displayed as a tuple).
type(old_documents),len(old_documents)

In [None]:
# Report how many elements each entry of the get() result holds; entries whose
# value is None are reported as such instead of being measured.
for key, value in old_documents.items():
    if value is None:
        print(f"{key} = None (no elements to count)")
        continue
    print(f"{key} = {len(value)}")

# Create new Vector Store 

In [None]:
# Rebuild every document of the source collection: prefix its text with a
# "Context : (...)" header derived from its metadata, then re-embed the result.
# Reads:    old_documents (result of vectorstore.get()), parse_source, model.
# Produces: new_documents_list, new_embeddings_list, new_metadatas_list and
#           new_ids_list -- parallel lists with one entry per document.


def _build_context_prefix(documentation, title, chapter, paragraph):
    """Return the "Context : (...) " prefix for one document.

    Only non-empty fields are listed, in the order Documentation, Title,
    Chapter, Paragraph. Returns "" when every field is empty.
    """
    labelled_fields = (
        ("Documentation", documentation),
        ("Title", title),
        ("Chapter", chapter),
        ("Paragraph", paragraph),
    )
    context_elements = [f"{label} = {value}" for label, value in labelled_fields if value]
    if not context_elements:
        return ""
    return "Context : (" + ", ".join(context_elements) + ") "


# Extracting the lists of metadata and documents
old_metadatas = old_documents['metadatas']
old_docs_content = old_documents['documents']

# zip() below would silently truncate to the shorter list, so fail loudly instead.
if len(old_metadatas) != len(old_docs_content):
    raise ValueError("Metadata and document content lists are not the same length!")

# Re-created on every run of this cell so re-running it never appends twice.
new_documents_list = []
new_metadatas_list = []

for metadata, content in zip(old_metadatas, old_docs_content):
    # Extracting metadata components (all four keys are required).
    documentation = metadata['documentation']
    source = metadata['source']
    file_path = metadata['file_path']
    word_count = metadata['word_count']

    # 'source' encodes the position of the text: title / chapter(s) / paragraph.
    title, chapter, paragraph = parse_source(source)

    # New document text = context header + original content.
    new_documents_list.append(
        _build_context_prefix(documentation, title, chapter, paragraph) + content
    )

    # Metadata is carried over unchanged (only these four keys are kept).
    new_metadatas_list.append({
        "source": source,
        "documentation": documentation,
        "file_path": file_path,
        "word_count": word_count
    })

# Embed all texts with one batched encode() call instead of one model call per
# document; encode() accepts a list of sentences and returns one row per input.
# NOTE(review): batched inference may differ from per-item encoding in the last
# float bits; this is not expected to matter for similarity search.
new_embeddings_list = model.encode(new_documents_list).tolist() if new_documents_list else []

# Generate new IDs for the documents: "v1", "v2", ... in document order.
new_ids_list = ["v" + str(i + 1) for i in range(len(new_documents_list))]

In [None]:
# Preview the first few rebuilt documents; displaying the whole list would
# flood the cell output and bloat the saved notebook.
new_documents_list[:5]

In [None]:
# Open the target collection "EPOCH-T" (cosine distance). get_or_create_collection
# returns an existing collection as-is -- it does NOT clear it; clearing is the
# job of the delete_collection cell earlier in the notebook.
new_vectorstore = chroma_client.get_or_create_collection(name="EPOCH-T", metadata={"hnsw:space": "cosine"})

# Add the rebuilt documents in fixed-size chunks rather than in one call:
# - with nothing to add the loop simply does not run (add() rejects empty input);
# - large collections stay below Chroma's per-call batch limit
#   (NOTE(review): the exact limit depends on the chromadb version/backend;
#   5000 is a conservative choice -- confirm).
ADD_BATCH_SIZE = 5000
for start in range(0, len(new_ids_list), ADD_BATCH_SIZE):
    stop = start + ADD_BATCH_SIZE
    new_vectorstore.add(
        documents=new_documents_list[start:stop],
        embeddings=new_embeddings_list[start:stop],
        metadatas=new_metadatas_list[start:stop],
        ids=new_ids_list[start:stop]
    )

In [None]:
# List the collections again to check that "EPOCH-T" is now present alongside
# the source collection (the value of this expression is the cell output).
chroma_client.list_collections()