In [22]:
"""This is a simple jupyter notebook just for the purposes of testing the ingestion pipeline. It tests that following works as desired:
- Ingestion functionality (using the test dataset)
- Embedding functionality (using the test dataset, small so can be run on CPU with a small model)
- Fixing metadata

The entire process works like this
(1) ingestion [dataset download, embedding, documentation: text2dataset] => (2) training [training layers: dataset2dataset] => (3) comparison/analysis [dataset2viz]

Ingestion refers to:
1. Downloading the dataset from huggingface (this is done via `download_datsets.py` in `flask download_ds`)
2. Ingesting the dataset (this is done via `ingest_ds.py` in `flask ingest_ds` and also `../modern/ingestion.py`)
3. Fixing the metadata (this is done via `fix_metadata.py` in `flask metadata_ds` + `../modern/ingestion.py`)
    (there is help from `model_sizes.py` among others).
"""
import os

import click
import torch
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from sentence_transformers import SentenceTransformer

from owlergpt.modern.collection_utils import MODEL_NAMES


print("=" * 40 + " Testing me" + "=" * 40)
# Only prompt for the GPU selection when it has not already been exported in
# the environment of the kernel.
if os.environ.get("CUDA_VISIBLE_DEVICES") is None:
    os.environ["CUDA_VISIBLE_DEVICES"] = click.prompt(
        "Please enter the CUDA_VISIBLE_DEVICES value", type=str
    )
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cpu":
    print("WARNING: CUDA is not available, using CPU")
else:
    print(f"Using CUDA device {os.environ['CUDA_VISIBLE_DEVICES']}")

# default parameters
selected_folder = "test_dataset"
tokens_per_chunk = 256
chunk_overlap = 25

# Filter for small models, cap at 3
small_model_names = [m for m in MODEL_NAMES if "small" in m.lower()][:3]
# The empty case is checked FIRST: `all(...)` is vacuously true on an empty
# list and the length assert below would otherwise pre-empt this error, leaving
# the explicit ValueError unreachable.
if len(small_model_names) == 0:
    raise ValueError("No small models found in MODEL_NAMES")
assert all("/" in m for m in small_model_names), f"Expected all small models to be HF models, got {small_model_names}"  # fmt: skip
assert (
    len(small_model_names) == 3
), f"Expected 3 small models, got {len(small_model_names)}"  # should be multiple

# One embedding model per selected name, placed on the chosen device.
models = [
    SentenceTransformer(model_name, device=device) for model_name in small_model_names
]
# Part after the last "/" of each model name; used when naming collections.
model_names = [
    s_model_name.split("/")[-1] for s_model_name in small_model_names
]  # get the model names for saving
# One token-based text splitter per model, all using the shared chunk settings.
text_splitters = [
    SentenceTransformersTokenTextSplitter(
        model_name=s_model_name,
        chunk_overlap=chunk_overlap,
        tokens_per_chunk=tokens_per_chunk,
    )
    for s_model_name in small_model_names
]
print(small_model_names)
print(models)
print(model_names)
print(text_splitters)

['BAAI/bge-small-en-v1.5', 'intfloat/e5-small-v2', 'thenlper/gte-small']
[SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), SentenceTransformer(
  (0): Transformer({'max

In [23]:
# Ten short sentences used as the test corpus.
test_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "A journey of a thousand miles begins with a single step.",
    "All that glitters is not gold.",
    "Actions speak louder than words.",
    "Beauty is in the eye of the beholder.",
    "Every cloud has a silver lining.",
    "Fortune favors the bold.",
    "Knowledge is power.",
    "Practice makes perfect.",
    "Time heals all wounds.",
]
# Plus one deliberately long entry so that the 256-token chunk limit is
# exceeded and the text has to be split into several chunks.
test_sentences.append("donkey is happy, " * 400)

# Queries stored alongside the documents in the JSONL datasets.
test_queries = [
    "What is the meaning of life?",
    "Do donkeys like apples?",
    "Where is 4??",
]

# One stable identifier per sentence, numbered by position.
record_ids = [f"test_record_{position}" for position, _ in enumerate(test_sentences)]
print(test_sentences)
print(record_ids)

['The quick brown fox jumps over the lazy dog.', 'A journey of a thousand miles begins with a single step.', 'All that glitters is not gold.', 'Actions speak louder than words.', 'Beauty is in the eye of the beholder.', 'Every cloud has a silver lining.', 'Fortune favors the bold.', 'Knowledge is power.', 'Practice makes perfect.', 'Time heals all wounds.', 'donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is happy, donkey is 

In [40]:
"""
Test that we are able to create a test dataset and ingest it into ChromaDB from text and via the StringsToJSONDataset class.
"""
import importlib
import tempfile
from pathlib import Path

import chromadb
from tqdm import tqdm

import owlergpt.modern.ingestion.ingestors


# Re-execute the ingestors module so that edits made to the library while
# debugging are visible without restarting the kernel, then re-bind the two
# names used below to the freshly reloaded definitions.
_ingestors = owlergpt.modern.ingestion.ingestors
importlib.reload(_ingestors)
StringsToJSONDataset = _ingestors.StringsToJSONDataset
OriginalIngestion = _ingestors.OriginalIngestion

with tempfile.TemporaryDirectory() as temp_dir:
    print("Creating dataset and regular collection")
    chroma_client = chromadb.PersistentClient(
        path=temp_dir, settings=chromadb.Settings(anonymized_telemetry=False)
    )

    # The per-record metadata does not depend on the embedding model, so it is
    # built once up front instead of once per model. `strict=True` guards
    # against `record_ids` and `test_sentences` drifting out of sync.
    metadatas = [
        {
            "record_id": record_id,
            "record_text": sentence,
            "record_type": "document",
        }
        for record_id, sentence in zip(record_ids, test_sentences, strict=True)
    ]

    collections = []
    # Create collections and populate with embeddings.
    # `total=` is passed explicitly because tqdm cannot infer a length from a
    # zip iterator; without it the bar only shows an iteration count.
    for model_short_name, model in tqdm(
        zip(model_names, models, strict=True),
        total=len(models),
        desc="Creating collections",
    ):
        # Create collection
        # NOTE this has to be parseable
        # `{selected_folder}_{transformer_model}_CharacterSplitting_{tokens_per_chunk}`
        collection_name = (
            f"{selected_folder}_{model_short_name}_CharacterSplitting_{tokens_per_chunk}"
        )
        collection = chroma_client.create_collection(
            name=collection_name,
            metadata={
                "hnsw:space": "cosine"
            },  # Using cosine as default distance function
        )

        # Generate embeddings (one vector per test sentence, no chunking here)
        embeddings = model.encode(test_sentences, convert_to_tensor=False)

        # Add to collection
        collection.add(
            embeddings=embeddings.tolist(),
            documents=test_sentences,
            metadatas=metadatas,
            ids=record_ids,
        )

        # This notebook is a test: verify that every record was stored.
        assert collection.count() == len(test_sentences), (
            f"Expected {len(test_sentences)} records in {collection_name}, "
            f"got {collection.count()}"
        )

        collections.append(collection)
# NOTE: the temporary directory backing the client is removed at this point;
# the collection objects are only printed for inspection below.
print("Done creating collections")
print(collections)

"""
Try a cartesian product of 2 datasets (the second swaps the texts and queries of the first) w/ 3 models.
"""

with tempfile.TemporaryDirectory() as _temp_dir:
    # Everything (JSONL inputs and the Chroma store) lives under one temporary
    # root that is deleted when the block exits.
    temp_dir = Path(_temp_dir)
    chroma_path, jsonls_path = temp_dir / "chroma", temp_dir / "jsonls"
    print("Using JSONDataset to embed...")
    print(">>>> >>>> >>>> >>>> 1. Creating JSONL files")
    # Two datasets: the second one swaps the roles of texts and queries.
    datasets_pre_jsonl = [
        (test_sentences, test_queries),
        (test_queries, test_sentences),
    ]
    dataset_names = [
        f"test_dataset_{idx}" for idx, _ in enumerate(datasets_pre_jsonl)
    ]
    locations = [jsonls_path / name for name in dataset_names]
    # Write each (texts, queries) pair into its own folder.
    for location, (sentences, queries) in zip(
        locations, datasets_pre_jsonl, strict=False
    ):
        obj = StringsToJSONDataset(output_path=location)
        obj.create_dataset(texts=sentences, queries=queries)
    print(
        ">>>> >>>> >>>> >>>> 2. Creating JSONL files using the default ingestion pipeline"
    )

    total_num_collections = len(small_model_names) * len(locations)

    # Arguments that are the same for every (model, dataset) combination.
    shared_ingestion_kwargs = dict(
        chroma_client=None,  # use a new client every time
        vector_dataset_path=chroma_path.as_posix(),
        tokens_per_chunk=tokens_per_chunk,
        chunk_overlap=chunk_overlap,
        normalize_embeddings=False,
        batch_size=1,
        dataset_folder_path=jsonls_path.as_posix(),
        vector_search_chunk_prefix="passage: ",
        vector_search_distance_function="cosine",
        num_workers=0,  # TODO(Adriano) WTF is going on with Mac local runtime non-cpu errors?
    )

    # Every model is paired with every dataset: models vary slowest, datasets
    # fastest.
    ingestion_jobs = [
        (model_name, location, dataset_name)
        for model_name in small_model_names
        for location, dataset_name in zip(locations, dataset_names, strict=False)
    ]
    for i, (model_name, location, dataset_name) in enumerate(ingestion_jobs, start=1):
        print(
            f">>>> >>>> >>>> >>>> 3. Creating collection for {model_name} w/ {dataset_name} (location={location}) ({i}/{total_num_collections})"
        )
        chroma_client, collection = OriginalIngestion.create_collection(
            selected_folders=[dataset_name],
            model_name=model_name,
            **shared_ingestion_kwargs,
        )

Creating dataset and regular collection


Creating collections: 3it [00:01,  1.81it/s]


Done creating collections
[Collection(name=test_dataset_bge-small-en-v1.5_CharacterSplitting_256), Collection(name=test_dataset_e5-small-v2_CharacterSplitting_256), Collection(name=test_dataset_gte-small_CharacterSplitting_256)]
Using JSONDataset to embed...
>>>> >>>> >>>> >>>> 1. Creating JSONL files
>>>> >>>> >>>> >>>> 2. Creating JSONL files using the default ingestion pipeline
>>>> >>>> >>>> >>>> 3. Creating collection for BAAI/bge-small-en-v1.5 w/ test_dataset_0 (location=/var/folders/j5/qcb5hwfx53q57rz91nm4rvr40000gn/T/tmp5dhsws26/jsonls/test_dataset_0) (1/6)
Getting text_splitter, transformer_model, client
Creating split embedding models
Creating collection
Creating 1 databases


| Creating databases + collections |: 0it [00:00, ?it/s]

Created dataset-specific DB test_dataset_0_256 to store embeddings.


| Creating databases + collections |: 1it [00:00, 304.97it/s]


Processing dataset test_dataset_0
Processing dataset /var/folders/j5/qcb5hwfx53q57rz91nm4rvr40000gn/T/tmp5dhsws26/jsonls/test_dataset_0


| Computing embeddings |: 100%|██████████| 11/11 [00:00<00:00, 51.32it/s]

Processed 11 documents, generated 13 embeddings.



| Computing embeddings |: 100%|██████████| 11/11 [00:00<00:00, 53.61it/s]

Processed 22 documents, generated 26 embeddings.





>>>> >>>> >>>> >>>> 3. Creating collection for BAAI/bge-small-en-v1.5 w/ test_dataset_1 (location=/var/folders/j5/qcb5hwfx53q57rz91nm4rvr40000gn/T/tmp5dhsws26/jsonls/test_dataset_1) (2/6)
Getting text_splitter, transformer_model, client
Creating split embedding models
Creating collection
Creating 1 databases


| Creating databases + collections |: 0it [00:00, ?it/s]

Created dataset-specific DB test_dataset_1_256 to store embeddings.


| Creating databases + collections |: 1it [00:00, 300.84it/s]


Processing dataset test_dataset_1
Processing dataset /var/folders/j5/qcb5hwfx53q57rz91nm4rvr40000gn/T/tmp5dhsws26/jsonls/test_dataset_1


| Computing embeddings |: 100%|██████████| 3/3 [00:00<00:00, 102.53it/s]

Processed 3 documents, generated 1 embeddings.



| Computing embeddings |: 100%|██████████| 3/3 [00:00<00:00, 149.60it/s]

Processed 6 documents, generated 2 embeddings.
>>>> >>>> >>>> >>>> 3. Creating collection for intfloat/e5-small-v2 w/ test_dataset_0 (location=/var/folders/j5/qcb5hwfx53q57rz91nm4rvr40000gn/T/tmp5dhsws26/jsonls/test_dataset_0) (3/6)
Getting text_splitter, transformer_model, client





Creating split embedding models
Creating collection
Creating 1 databases


| Creating databases + collections |: 0it [00:00, ?it/s]

Dataset-specific DB test_dataset_0_256 already exists. Using it to store embeddings


| Creating databases + collections |: 1it [00:00, 239.35it/s]


Processing dataset test_dataset_0
Processing dataset /var/folders/j5/qcb5hwfx53q57rz91nm4rvr40000gn/T/tmp5dhsws26/jsonls/test_dataset_0


| Computing embeddings |: 100%|██████████| 11/11 [00:00<00:00, 52.63it/s]

Processed 11 documents, generated 13 embeddings.



| Computing embeddings |: 100%|██████████| 11/11 [00:00<00:00, 59.75it/s]

Processed 22 documents, generated 26 embeddings.





>>>> >>>> >>>> >>>> 3. Creating collection for intfloat/e5-small-v2 w/ test_dataset_1 (location=/var/folders/j5/qcb5hwfx53q57rz91nm4rvr40000gn/T/tmp5dhsws26/jsonls/test_dataset_1) (4/6)
Getting text_splitter, transformer_model, client
Creating split embedding models
Creating collection
Creating 1 databases


| Creating databases + collections |: 0it [00:00, ?it/s]

Dataset-specific DB test_dataset_1_256 already exists. Using it to store embeddings


| Creating databases + collections |: 1it [00:00, 426.42it/s]


Processing dataset test_dataset_1
Processing dataset /var/folders/j5/qcb5hwfx53q57rz91nm4rvr40000gn/T/tmp5dhsws26/jsonls/test_dataset_1


| Computing embeddings |: 100%|██████████| 3/3 [00:00<00:00, 85.05it/s]

Processed 3 documents, generated 1 embeddings.



| Computing embeddings |: 100%|██████████| 3/3 [00:00<00:00, 168.38it/s]

Processed 6 documents, generated 2 embeddings.
>>>> >>>> >>>> >>>> 3. Creating collection for thenlper/gte-small w/ test_dataset_0 (location=/var/folders/j5/qcb5hwfx53q57rz91nm4rvr40000gn/T/tmp5dhsws26/jsonls/test_dataset_0) (5/6)
Getting text_splitter, transformer_model, client





Creating split embedding models
Creating collection
Creating 1 databases


| Creating databases + collections |: 0it [00:00, ?it/s]

Dataset-specific DB test_dataset_0_256 already exists. Using it to store embeddings


| Creating databases + collections |: 1it [00:00, 326.81it/s]


Processing dataset test_dataset_0
Processing dataset /var/folders/j5/qcb5hwfx53q57rz91nm4rvr40000gn/T/tmp5dhsws26/jsonls/test_dataset_0


| Computing embeddings |: 100%|██████████| 11/11 [00:00<00:00, 52.49it/s]

Processed 11 documents, generated 13 embeddings.



| Computing embeddings |: 100%|██████████| 11/11 [00:00<00:00, 56.86it/s]

Processed 22 documents, generated 26 embeddings.





>>>> >>>> >>>> >>>> 3. Creating collection for thenlper/gte-small w/ test_dataset_1 (location=/var/folders/j5/qcb5hwfx53q57rz91nm4rvr40000gn/T/tmp5dhsws26/jsonls/test_dataset_1) (6/6)
Getting text_splitter, transformer_model, client
Creating split embedding models
Creating collection
Creating 1 databases


| Creating databases + collections |: 0it [00:00, ?it/s]

Dataset-specific DB test_dataset_1_256 already exists. Using it to store embeddings


| Creating databases + collections |: 1it [00:00, 423.75it/s]


Processing dataset test_dataset_1
Processing dataset /var/folders/j5/qcb5hwfx53q57rz91nm4rvr40000gn/T/tmp5dhsws26/jsonls/test_dataset_1


| Computing embeddings |: 100%|██████████| 3/3 [00:00<00:00, 196.61it/s]

Processed 3 documents, generated 1 embeddings.



| Computing embeddings |: 100%|██████████| 3/3 [00:00<00:00, 231.82it/s]

Processed 6 documents, generated 2 embeddings.





In [44]:
"""
Now we try to visualize and make sure that all the metadata is indeed correct.
"""

import owlergpt.modern.ingestion.fix_metadata


# Reload both library modules (in the same order as before) so that edits made
# while debugging are picked up, then re-bind the names used in this cell to
# the reloaded definitions.
_ingestors = owlergpt.modern.ingestion.ingestors
importlib.reload(_ingestors)
_fix_metadata = owlergpt.modern.ingestion.fix_metadata
importlib.reload(_fix_metadata)

OriginalIngestion = _ingestors.OriginalIngestion
EmbeddingMetadataPopulatorArgs = _fix_metadata.EmbeddingMetadataPopulatorArgs
EmbeddingMetadataPopulatorCoordinator = (
    _fix_metadata.EmbeddingMetadataPopulatorCoordinator
)
EmbeddingMetadataPopulator = _fix_metadata.EmbeddingMetadataPopulator


with tempfile.TemporaryDirectory() as _temp_dir:
    # JSONL inputs and the Chroma store both live under one temporary root.
    temp_dir = Path(_temp_dir)
    chroma_path = temp_dir / "chroma"
    jsonls_path = temp_dir / "jsonls"
    print("Using JSONDataset to embed...")
    print(">>>> >>>> >>>> >>>> 1. Creating JSONL files")
    datasets_pre_jsonl = [
        (test_sentences, test_queries)
    ]  # NOTE <---- one dataset (as we will do) but multiple models => x3
    locations = [
        jsonls_path / f"test_dataset_{i}" for i in range(len(datasets_pre_jsonl))
    ]
    dataset_names = [f"test_dataset_{i}" for i in range(len(datasets_pre_jsonl))]
    # Write each dataset into its own folder (a single one in this cell).
    # `strict=True` because both lists are derived from `datasets_pre_jsonl`
    # and must have the same length.
    for location, (sentences, queries) in zip(
        locations, datasets_pre_jsonl, strict=True
    ):
        obj = StringsToJSONDataset(output_path=location)
        obj.create_dataset(texts=sentences, queries=queries)
    print(
        ">>>> >>>> >>>> >>>> 2. Creating JSONL files using the default ingestion pipeline"
    )

    total_num_collections = len(small_model_names) * len(locations)

    # Create collections
    # Validate the single-dataset assumption BEFORE indexing, so a violation
    # is reported by this assert rather than by an IndexError (or silently
    # ignored when there is more than one dataset).
    assert (
        len(locations) == len(dataset_names) == 1
    ), f"Expected exactly one dataset, got {len(locations)} locations and {len(dataset_names)} names"
    location = locations[0]
    dataset_name = dataset_names[0]

    collections: list[chromadb.Collection] = []
    chroma_client = None
    for i, model_name in enumerate(small_model_names, start=1):
        print(
            f">>>> >>>> >>>> >>>> 3. Creating collection for {model_name} w/ {dataset_name} (location={location}) ({i}/{total_num_collections})"
        )
        # The client returned by one call is handed to the next one, so a
        # single client is shared across all models.
        chroma_client, collection = OriginalIngestion.create_collection(
            chroma_client=chroma_client,  # <--- try reusing the client
            vector_dataset_path=chroma_path.as_posix(),
            selected_folders=[dataset_name],
            tokens_per_chunk=tokens_per_chunk,
            chunk_overlap=chunk_overlap,
            normalize_embeddings=False,
            model_name=model_name,
            batch_size=1,
            dataset_folder_path=jsonls_path.as_posix(),
            vector_search_chunk_prefix="passage: ",
            vector_search_distance_function="cosine",
            num_workers=0,  # TODO(Adriano) WTF is going on with Mac local runtime non-cpu errors?
        )
        collections.append(collection)
    print("We have a client: ", chroma_client)
    print("We have collections: ", collections)

    # TODO: steps 4 and 5 are placeholders. The cell currently always stops at
    # the first raise below.
    print(">>>> >>>> >>>> >>>> 4. Visualizing the metadata FML")
    raise NotImplementedError("Not implemented")

    # NOTE: unreachable until step 4 above is implemented.
    print(">>>> >>>> >>>> >>>> 5. Testing metadata correction")
    raise NotImplementedError("Not implemented")

Using JSONDataset to embed...
>>>> >>>> >>>> >>>> 1. Creating JSONL files
>>>> >>>> >>>> >>>> 2. Creating JSONL files using the default ingestion pipeline
>>>> >>>> >>>> >>>> 3. Creating collection for BAAI/bge-small-en-v1.5 w/ test_dataset_0 (location=/var/folders/j5/qcb5hwfx53q57rz91nm4rvr40000gn/T/tmp_l6joizl/jsonls/test_dataset_0) (1/3)
Getting text_splitter, transformer_model, client
Creating split embedding models
Creating collection
Creating 1 databases


| Creating databases + collections |: 0it [00:00, ?it/s]

Created dataset-specific DB test_dataset_0_256 to store embeddings.


| Creating databases + collections |: 1it [00:00, 314.34it/s]


Processing dataset test_dataset_0
Processing dataset /var/folders/j5/qcb5hwfx53q57rz91nm4rvr40000gn/T/tmp_l6joizl/jsonls/test_dataset_0


| Computing embeddings |: 100%|██████████| 11/11 [00:00<00:00, 54.69it/s]

Processed 11 documents, generated 13 embeddings.



| Computing embeddings |: 100%|██████████| 11/11 [00:00<00:00, 51.19it/s]

Processed 22 documents, generated 26 embeddings.





>>>> >>>> >>>> >>>> 3. Creating collection for intfloat/e5-small-v2 w/ test_dataset_0 (location=/var/folders/j5/qcb5hwfx53q57rz91nm4rvr40000gn/T/tmp_l6joizl/jsonls/test_dataset_0) (2/3)
Getting text_splitter, transformer_model, client
Creating split embedding models
Creating collection
Creating 1 databases


| Creating databases + collections |: 0it [00:00, ?it/s]

Dataset-specific DB test_dataset_0_256 already exists. Using it to store embeddings


| Creating databases + collections |: 1it [00:00, 365.77it/s]


Processing dataset test_dataset_0
Processing dataset /var/folders/j5/qcb5hwfx53q57rz91nm4rvr40000gn/T/tmp_l6joizl/jsonls/test_dataset_0


| Computing embeddings |: 100%|██████████| 11/11 [00:00<00:00, 48.92it/s]

Processed 11 documents, generated 13 embeddings.



| Computing embeddings |: 100%|██████████| 11/11 [00:00<00:00, 45.71it/s]

Processed 22 documents, generated 26 embeddings.





>>>> >>>> >>>> >>>> 3. Creating collection for thenlper/gte-small w/ test_dataset_0 (location=/var/folders/j5/qcb5hwfx53q57rz91nm4rvr40000gn/T/tmp_l6joizl/jsonls/test_dataset_0) (3/3)
Getting text_splitter, transformer_model, client
Creating split embedding models
Creating collection
Creating 1 databases


| Creating databases + collections |: 0it [00:00, ?it/s]

Dataset-specific DB test_dataset_0_256 already exists. Using it to store embeddings


| Creating databases + collections |: 1it [00:00, 402.56it/s]


Processing dataset test_dataset_0
Processing dataset /var/folders/j5/qcb5hwfx53q57rz91nm4rvr40000gn/T/tmp_l6joizl/jsonls/test_dataset_0


| Computing embeddings |: 100%|██████████| 11/11 [00:00<00:00, 54.07it/s]

Processed 11 documents, generated 13 embeddings.



| Computing embeddings |: 100%|██████████| 11/11 [00:00<00:00, 54.43it/s]

Processed 22 documents, generated 26 embeddings.



