In [None]:
import os
import re
from typing import List, Dict, Any

import chromadb
import ollama

In [None]:
def build_rag_dict(directory: str, method_chunks=False) -> Dict[str, Any]:
    """
    Read every `.txt` file directly inside `directory` into a dictionary.

    Args:
        directory: Folder to scan. Sub-folders are not descended into.
        method_chunks: When False (the default) each value is the file's text.
            When True each value is a dict holding the text under 'contents'
            and, under 'file_name', the part of the filename before the first
            '--' (the whole filename when it contains no '--').

    Returns:
        A dict keyed by filename, inserted in sorted filename order.
    """
    text_contents = {}

    # os.listdir returns entries in arbitrary order; sorting keeps the result
    # (and every id/document list derived from it) reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(directory, filename)

        # A directory whose name happens to end in ".txt" cannot be opened as
        # a file, so skip anything that is not a regular file.
        if not os.path.isfile(file_path):
            continue

        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

        if method_chunks:
            original_file_name = filename.split('--')[0]
            text_contents[filename] = {'contents': content, 'file_name': original_file_name}
        else:
            text_contents[filename] = content

    return text_contents

In [None]:
def collect_code_files(path: str) -> Dict[str, Any]:
    """
    Load the documents stored in the `code` folder under `path` so they can
    be passed to the Chroma database.

    Each listed sub-directory is read with `build_rag_dict` in
    `method_chunks` mode and the results are merged into one dictionary.
    """
    code_documents = {}

    for folder in ('code',):
        code_documents.update(
            build_rag_dict(os.path.join(path, folder), method_chunks=True)
        )

    return code_documents

In [None]:
def chunk_files(files: Dict[str, str], chunk_size=1400) -> Dict[str, Any]:
    """
    Split each file's text into consecutive, non-overlapping chunks.

    Args:
        files: Mapping of file name to that file's full text.
        chunk_size: Maximum number of characters per chunk. Must be positive.

    Returns:
        A flat dict keyed by "<file_name>_<index>", where the index restarts
        at 0 for every file. Each value is a dict with the chunk text under
        'contents' and the originating file name under 'file_name'. A file
        whose text is empty contributes no chunks.

    Raises:
        ValueError: If `chunk_size` is zero or negative.
    """
    # A non-positive step would never advance through the text.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    chunked_files = {}

    for file_name, contents in files.items():
        chunk_starts = range(0, len(contents), chunk_size)
        for chunk_index, start in enumerate(chunk_starts):
            chunked_files[f"{file_name}_{chunk_index}"] = {
                'contents': contents[start:start + chunk_size],
                'file_name': file_name,
            }

    return chunked_files

In [None]:
def collect_chunked_classes_files(path: str) -> Dict[str, Any]:
    """
    Load the documents stored in the `classes` folder under `path` and split
    them into chunks so they can be passed to the Chroma database.

    Each listed sub-directory is read with `build_rag_dict`, the resulting
    texts are split by `chunk_files`, and all chunks are merged into one
    dictionary.
    """
    class_chunks = {}

    for folder in ('classes',):
        folder_path = os.path.join(path, folder)
        class_chunks.update(chunk_files(build_rag_dict(folder_path)))

    return class_chunks

In [None]:
def get_embeddings(chunks):
    """
    Embed `chunks` through `ollama.embed` with the "snowflake-arctic-embed" model.

    Returns whatever the response holds under 'embeddings', or an empty list
    when that key is absent.
    """
    response = ollama.embed(input=chunks, model="snowflake-arctic-embed")
    return response.get('embeddings', [])

In [None]:
# Module-level Chroma HTTP client used by `embed`; it targets a Chroma server
# on localhost, port 8000.
chroma_client = chromadb.HttpClient(host="localhost", port=8000)

In [None]:
def embed(collection_name: str, data_source: str, source_type: str):
    """
    Embed the documents found under `data_source` and add them to a Chroma
    collection.

    Args:
        collection_name: Name of the Chroma collection to add to. It is
            created with cosine distance if it does not exist yet.
        data_source: Root folder that is handed to the collector functions.
        source_type: 'code' loads documents via `collect_code_files`;
            'class' loads chunks via `collect_chunked_classes_files`.

    Raises:
        ValueError: If `source_type` is neither 'code' nor 'class'.
        RuntimeError: If the embedding call does not yield exactly one vector
            for a document.
    """
    if source_type == 'code':
        data = collect_code_files(data_source)
    elif source_type == 'class':
        data = collect_chunked_classes_files(data_source)
    else:
        # ValueError is a subclass of Exception, so callers that catch the
        # previous generic Exception keep working.
        raise ValueError("Invalid source type")

    # Only this many leading characters of each document are embedded.
    max_embed_chars = 1400

    data_names = list(data.keys())
    contents = [item['contents'] for item in data.values()]
    metadata = [{"source": item['file_name']} for item in data.values()]

    embeddings = []
    for name, item in zip(data_names, contents):
        # NOTE: the vector covers just the first `max_embed_chars` characters,
        # while the full text is still stored as the document below.
        vectors = get_embeddings(item[:max_embed_chars])

        # get_embeddings falls back to an empty list; catching that here keeps
        # ids, documents and embeddings aligned before anything is written.
        if len(vectors) != 1:
            raise RuntimeError(
                f"Expected exactly one embedding for {name!r}, got {len(vectors)}"
            )
        embeddings += vectors

    collection = chroma_client.get_or_create_collection(
        name=collection_name, metadata={"hnsw:space": "cosine"}
    )
    collection.add(
        ids=data_names, documents=contents, embeddings=embeddings, metadatas=metadata
    )

In [None]:
# Every (collection name, source folder, source type) combination to index,
# processed in the order listed.
EMBED_JOBS = [
    ('desktop_code', 'desktop_documents', 'code'),
    ('mobile_code', 'mobile_documents', 'code'),
    ('driver_code', 'driver_documents', 'code'),
    ('desktop_class', 'desktop_documents', 'class'),
    ('mobile_class', 'mobile_documents', 'class'),
    ('driver_class', 'driver_documents', 'class'),
]

for job_collection, job_source, job_type in EMBED_JOBS:
    embed(job_collection, job_source, job_type)