# Kolosal Plane
Dataset augmentation for LLM or Embedding Fine-tuning.

In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

True

## Augmentation Knowledge

In [2]:
# lets start by defining the LLM were going to use to augment the dataset
from distilabel.llms import AzureOpenAILLM, OpenAILLM

# For AzureOpenAI
llm = AzureOpenAILLM(base_url=os.getenv("AZURE_OPENAI_ENDPOINT"),
                     api_key=os.getenv("AZURE_OPENAI_API_KEY"),
                     api_version=os.getenv("AZURE_API_VERSION"),
                     model="gpt-4o",
                     generation_kwargs={
                         "max_new_tokens": 1024
})

# For OpenAI directly
# llm = OpenAILLM(api_key=os.getenv("OPENAI_API_KEY"),
#                 model="gpt-4o",
#                 generation_kwargs={
#                     "max_new_tokens": 1024
# })

# Refer to the documentation for more details on the LLM class and its parameters:
# https://distilabel.argilla.io/latest/api/models/llm/llm_gallery/

  from distilabel.llms import AzureOpenAILLM, OpenAILLM
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Lets load the dataset we want to augment
import pandas as pd

dataset = pd.read_csv("example/documents.csv")

dataset.head()

Unnamed: 0,Documents
0,The Sun accounts for 99.86% of the mass in the...
1,A day on Venus is longer than its year.
2,Neutron stars can spin up to 700 times per sec...
3,The largest volcano in the Solar System is Oly...
4,Light from the Moon is actually reflected sunl...


In [5]:
# Lets define the instruction we want to give to the LLM
conversation_starter_instruction = "Act like an Ivy Leauge students that is passionate in astronomy, generate questions that would be asked to a professor based on the given topic."
conversation_personalization_instruction = "Answer the questions in a friendly manner, as if you are talking to a friend."
system_prompt = "You are an Ivy League professor, answer the questions in a friendly manner, as if you are teaching passionatly about astronomy"

In [6]:
# The augmentation knowledge pipeline can operate in either synchronous or asynchronous mode. For optimal performance, we recommend using the asynchronous mode.
from kolosal_plane.augmentations.knowledge_simplified import AsyncSimpleKnowledge

pipeline_async = AsyncSimpleKnowledge(conversation_starter_instruction=conversation_starter_instruction,
                                       conversation_personalization_instruction=conversation_personalization_instruction,
                                       system_prompt=system_prompt,
                                       conversation_starter_count=3,
                                       documents=dataset["Documents"].tolist(),
                                       max_conversations=3,
                                       llm_model=llm)

In [None]:
# To start the augmentation process, we can use the start_augmentation() function
pipeline_async.start_augmentation()

<Task pending name='Task-1' coro=<AsyncSimpleKnowledge.augmentate_async() running at d:\Genta Technology\Development\Kolosal-Data-Augmentation\kolosal_plane\augmentations\knowledge_simplified.py:189>>

Step 'None' hasn't received a pipeline, and it hasn't been created within a `Pipeline` context. Please, use `with Pipeline() as pipeline:` and create the step within the context.
Step 'None' hasn't received a pipeline, and it hasn't been created within a `Pipeline` context. Please, use `with Pipeline() as pipeline:` and create the step within the context.
Step 'None' hasn't received a pipeline, and it hasn't been created within a `Pipeline` context. Please, use `with Pipeline() as pipeline:` and create the step within the context.
Step 'None' hasn't received a pipeline, and it hasn't been created within a `Pipeline` context. Please, use `with Pipeline() as pipeline:` and create the step within the context.
Step 'None' hasn't received a pipeline, and it hasn't been created within a `Pipeline` context. Please, use `with Pipeline() as pipeline:` and create the step within the context.
Augmenting conversations:   0%|          | 0/3 [00:00<?, ?it/s]Step 'None' hasn't received a pipeline, an

In [8]:
# To get the current status we can use the get_status() function
pipeline_async.get_status()

('Running',
 {'llm_input_token_count': 11869,
  'llm_output_token_count': 5659,
  'tlm_input_token_count': 0,
  'tlm_output_token_count': 0})

In [9]:
# To get the dataset result we can use the get_result() function
pipeline_async.get_result()

(shape: (45, 3)
 ┌────────────────────────────┬──────────────────────────────────┬──────────────────────────────────┐
 │ chat_history               ┆ document                         ┆ response                         │
 │ ---                        ┆ ---                              ┆ ---                              │
 │ list[struct[2]]            ┆ str                              ┆ str                              │
 ╞════════════════════════════╪══════════════════════════════════╪══════════════════════════════════╡
 │ [{"system","You are an Ivy ┆ The Sun accounts for 99.86% of…  ┆ Oh, that's a fantastic questio…  │
 │ Lea…                       ┆                                  ┆                                  │
 │ [{"system","You are an Ivy ┆ The Sun accounts for 99.86% of…  ┆ Oh, absolutely! Determining th…  │
 │ Lea…                       ┆                                  ┆                                  │
 │ [{"system","You are an Ivy ┆ The Sun accounts for 99.86% of…  ┆

In [23]:
# The augmented dataset is in a form of polars
augmented_dataset, metadata = pipeline_async.get_result()
augmented_dataset

chat_history,document,response
list[struct[2]],str,str
"[{""system"",""You are an Ivy League professor, answer the questions in a friendly manner, as if you are teaching passionatly about astronomy""}, {""user"",""How does the Sun's massive presence influence the orbital paths of planets in the Solar System?""}]","""The Sun accounts for 99.86% of…","""Oh, that's a fantastic questio…"
"[{""system"",""You are an Ivy League professor, answer the questions in a friendly manner, as if you are teaching passionatly about astronomy""}, {""user"",""Could you explain the process by which the Sun's mass was determined to be 99.86% of the total Solar System mass?""}]","""The Sun accounts for 99.86% of…","""Oh, absolutely! Determining th…"
"[{""system"",""You are an Ivy League professor, answer the questions in a friendly manner, as if you are teaching passionatly about astronomy""}, {""user"",""In what ways has the Sun's dominant mass impacted the formation and evolution of other celestial bodies in our Solar System?""}]","""The Sun accounts for 99.86% of…","""Oh, that's such a fascinating …"
"[{""system"",""You are an Ivy League professor, answer the questions in a friendly manner, as if you are teaching passionatly about astronomy""}, {""user"",""What factors contribute to the Sun's dominance in the mass distribution of our Solar System?""}]","""A day on Venus is longer than …","""Great question! The Sun truly …"
"[{""system"",""You are an Ivy League professor, answer the questions in a friendly manner, as if you are teaching passionatly about astronomy""}, {""user"",""Can you elucidate the astronomical phenomena that cause a day on Venus to be longer than its year?""}]","""A day on Venus is longer than …","""Of course, I'd be delighted to…"
…,…,…
"[{""system"",""You are an Ivy League professor, answer the questions in a friendly manner, as if you are teaching passionatly about astronomy""}, {""user"",""What factors contribute to the exceptionally long day on Venus compared to its year?""}, … {""user"",""Explain how Olympus Mons on Mars compares to the largest volcanic structures on Earth in terms of size and formation processes.""}]","""The largest volcano in the Sol…","""Oh, I'd love to chat about Oly…"
"[{""system"",""You are an Ivy League professor, answer the questions in a friendly manner, as if you are teaching passionatly about astronomy""}, {""user"",""Describe the mechanisms that allow neutron stars to achieve such incredibly high rotational speeds.""}, … {""user"",""Can you explain how the immense mass of the Sun influences the orbits and behaviors of other objects within the Solar System?""}]","""The largest volcano in the Sol…","""Oh, absolutely! The Sun's mass…"
"[{""system"",""You are an Ivy League professor, answer the questions in a friendly manner, as if you are teaching passionatly about astronomy""}, {""user"",""Could you explain the implications of the Sun comprising 99.86% of the mass in our Solar System?""}, … {""user"",""Can you illustrate the significance of the Great Red Spot on Jupiter and its impact on our understanding of planetary atmospheres?""}]","""Light from the Moon is actuall…","""Of course, let's chat about th…"
"[{""system"",""You are an Ivy League professor, answer the questions in a friendly manner, as if you are teaching passionatly about astronomy""}, {""user"",""How does the length of a day on Venus affect our understanding of its atmospheric conditions?""}, … {""user"",""Given the unique length of a day on Venus, could you elaborate on how this affects the potential for creating a stable climate model for the planet?""}]","""Light from the Moon is actuall…","""Oh, absolutely! It’s quite a t…"


In [None]:
# We can save the dataset to a JSON file, which could be further used for training a model
augmented_dataset.write_json("example/augmented_dataset.json")

## Augmentation Embedding

In [4]:
# Lets define the instruction we want for the augmentation and use the same documents as the base knowledge
instruction = "Create potential user questions that could be asked to a chatbot based on the content of the following document."


In [5]:
# Similar to the augmentation knowledge, there is a non-async and async version of the pipeline, async is recommended for performance
from kolosal_plane.augmentations.embeddings import AsyncEmbeddingAugmentation

pipeline_async = AsyncEmbeddingAugmentation(documents=dataset["Documents"].tolist(),
                                            instruction=instruction,
                                            lm=llm,
                                            question_per_document=100,
                                            batch_size=10)

In [None]:
# To start the augmentation process, we can use the start_augmentation() function
pipeline_async.start_augmentation()

<Task pending name='Task-1' coro=<AsyncEmbeddingAugmentation.augmentate_async() running at d:\Genta Technology\Development\Kolosal-Data-Augmentation\kolosal_plane\augmentations\embeddings.py:94>>

Step 'None' hasn't received a pipeline, and it hasn't been created within a `Pipeline` context. Please, use `with Pipeline() as pipeline:` and create the step within the context.
Augmenting documents: 100%|██████████| 5/5 [00:11<00:00,  2.34s/it]


In [8]:
# To get the current status we can use the get_status() function
pipeline_async.get_status()

('Running', {'input_token_count': 1920, 'output_token_count': 1633})

In [13]:
# To get the dataset result we can use the get_result() function
pipeline_async.get_result()

(shape: (500, 2)
 ┌─────────────────────────────────┬─────────────────────────────────┐
 │ question                        ┆ document                        │
 │ ---                             ┆ ---                             │
 │ str                             ┆ str                             │
 ╞═════════════════════════════════╪═════════════════════════════════╡
 │ Explain how the Sun's mass inf… ┆ The Sun accounts for 99.86% of… │
 │ What would happen to the Solar… ┆ The Sun accounts for 99.86% of… │
 │ Describe the process by which … ┆ The Sun accounts for 99.86% of… │
 │ How does the Sun's mass compar… ┆ The Sun accounts for 99.86% of… │
 │ Why is the Sun's mass crucial … ┆ The Sun accounts for 99.86% of… │
 │ …                               ┆ …                               │
 │ Illustrate how sunlight travel… ┆ Light from the Moon is actuall… │
 │ Is the reflected sunlight from… ┆ Light from the Moon is actuall… │
 │ Analyze the differences betwee… ┆ Light from the Moon is 

In [14]:
# The augmented dataset is in a form of polars
augmented_dataset, metadata = pipeline_async.get_result()
augmented_dataset

question,document
str,str
"""Explain how the Sun's mass inf…","""The Sun accounts for 99.86% of…"
"""What would happen to the Solar…","""The Sun accounts for 99.86% of…"
"""Describe the process by which …","""The Sun accounts for 99.86% of…"
"""How does the Sun's mass compar…","""The Sun accounts for 99.86% of…"
"""Why is the Sun's mass crucial …","""The Sun accounts for 99.86% of…"
…,…
"""Illustrate how sunlight travel…","""Light from the Moon is actuall…"
"""Is the reflected sunlight from…","""Light from the Moon is actuall…"
"""Analyze the differences betwee…","""Light from the Moon is actuall…"
"""Discuss the impact of the Moon…","""Light from the Moon is actuall…"


In [15]:
# We can save the dataset to a JSON file, which could be further used for training a model
augmented_dataset.write_json("example/augmented_dataset.json")