In [3]:
import os
from groq import Groq
import sys
from dotenv import load_dotenv

sys.path.insert(1, "source")

# dotenv_path = os.path.join(os.path.dirname(__file__), ".env")
# load_dotenv(dotenv_path)

from prompts.agent_prompts import (
    agent_manager_prompt,
    data_agent_prompt,
    model_agent_prompt,
    prompt_agent,
    operation_agent_prompt,
)


In [4]:
# Initialize Groq client
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
api_key=os.environ.get("GROQ_API_KEY")
# print(api_key)

In [5]:
#########################
# 1. AGENT BASE CLASSES #
#########################

class AgentBase:
    def __init__(self, role, model, description, **kwargs):
        self.role = role
        self.model = model
        self.description = description
        self.kwargs = kwargs

    def execute(self, messages):
        """Executes a task using the defined role and model."""
        return client.chat.completions.create(
            messages=messages,
            model=self.model,
            **self.kwargs
        )

In [6]:
# ----------------------------
# Manager Agent (inherits from AgentBase)
# ----------------------------
class AgentManager(AgentBase):
    def __init__(self, role, model, description, json_schema, **kwargs):
        super().__init__(role, model, description, **kwargs)
        self.json_schema = json_schema

    def parse_to_json(self, user_input):
        """Parses the user input into a JSON format based on the schema."""
        messages = [
            {
                "role": "system",
                "content": f"""
{agent_manager_prompt.strip()}

# JSON SPECIFICATION SCHEMA #
{self.json_schema}
""",
            },
            {"role": "user", "content": user_input},
        ]
        response = self.execute(messages)
        return response.choices[0].message.content


In [7]:

# ----------------------------
# Prompt Agent (inherits from AgentBase)
# ----------------------------
class PromptAgent(AgentBase):
    def __init__(self, role, model, description, json_specification, **kwargs):
        super().__init__(role, model, description, **kwargs)
        self.json_specification = json_specification

    def generate_json(self, user_input):
        """Generates a JSON response strictly adhering to the specification."""
        messages = [
            {
                "role": "system",
                "content": f"""
{prompt_agent.strip()}

# JSON SPECIFICATION SCHEMA #
'''json
{self.json_specification}
'''
""",
            },
            {"role": "user", "content": user_input},
        ]
        response = self.execute(messages)
        return response.choices[0].message.content



In [8]:
# ----------------------------
# AutoML Agent (inherits from AgentBase)
# ----------------------------
class AutoMLAgent(AgentBase):
    def __init__(self, role, model, description, data_path="./data", **kwargs):
        super().__init__(role, model, description, **kwargs)
        self.data_path = data_path

    def retrieve_dataset(self, query):
        """Retrieves a dataset based on user instructions or searches for one."""
        dataset_path = os.path.join(self.data_path, "renttherunway_cleaned.csv")
        messages = [
            {"role": "system", "content": data_agent_prompt.strip()},
            {"role": "user", "content": query},
        ]
        response = self.execute(messages)
        # Save the retrieved dataset to the specified path (placeholder implementation)
        with open(dataset_path, "w") as file:
            file.write(response.choices[0].message.content)
        return dataset_path

    def preprocess_data(self, instructions):
        """Performs data preprocessing based on user instructions or best practices."""
        messages = [
            {"role": "system", "content": data_agent_prompt.strip()},
            {"role": "user", "content": f"Instructions: {instructions}"},
        ]
        response = self.execute(messages)
        return response.choices[0].message.content

    def augment_data(self, augmentation_details):
        """Performs data augmentation as necessary."""
        messages = [
            {"role": "system", "content": data_agent_prompt.strip()},
            {"role": "user", "content": f"Augmentation Details: {augmentation_details}"},
        ]
        response = self.execute(messages)
        return response.choices[0].message.content

    def visualize_data(self, visualization_request):
        """Generates meaningful visualizations to understand the dataset."""
        messages = [
            {"role": "system", "content": data_agent_prompt.strip()},
            {"role": "user", "content": visualization_request},
        ]
        response = self.execute(messages)
        return response.choices[0].message.content



In [9]:

# ----------------------------
# Model Agent (inherits from AgentBase)
# ----------------------------
class ModelAgent(AgentBase):
    def __init__(self, role, model, description, **kwargs):
        super().__init__(role, model, description, **kwargs)

    def retrieve_models(self, dataset_details):
        """Retrieve a list of well-performing models or algorithms based on dataset details."""
        messages = [
            {"role": "system", "content": model_agent_prompt.strip()},
            {"role": "user", "content": dataset_details},
        ]
        response = self.execute(messages)
        return response.choices[0].message.content

    def optimize_model(self, hyperparameter_details):
        """Perform hyperparameter optimization on candidate models."""
        messages = [
            {"role": "system", "content": model_agent_prompt.strip()},
            {"role": "user", "content": hyperparameter_details},
        ]
        response = self.execute(messages)
        return response.choices[0].message.content

    def profile_models(self, profiling_details):
        """Perform metadata extraction and profiling on candidate models."""
        messages = [
            {"role": "system", "content": model_agent_prompt.strip()},
            {"role": "user", "content": profiling_details},
        ]
        response = self.execute(messages)
        return response.choices[0].message.content



In [10]:
# ----------------------------
# Operations Agent (inherits from AgentBase)
# ----------------------------
class OperationsAgent(AgentBase):
    def __init__(self, role, model, description, **kwargs):
        super().__init__(role, model, description, **kwargs)

    def deploy_model(self, deployment_details):
        """Prepare and deploy the model based on the provided details."""
        messages = [
            {"role": "system", "content": operation_agent_prompt.strip()},
            {"role": "user", "content": deployment_details},
        ]
        response = self.execute(messages)
        return response.choices[0].message.content


In [11]:
#################################
# 2. AGENT INSTANTIATION SETUP  #
#################################

# Define JSON specification schema
JSON_SCHEMA = """json
{
    "task": "string",
    "priority": "string",
    "deadline": "string",
    "resources": [
        {
            "type": "string",
            "quantity": "integer"
        }
    ]
}
"""

# Create agent instances
manager_agent = AgentManager(
    role="manager",
    model="llama-3.3-70b-versatile",
    description="Assistant project manager for parsing user requirements into JSON.",
    json_schema=JSON_SCHEMA,
    stream=False
)

prompt_parser_agent = PromptAgent(
    role="prompt_parser",
    model="llama-3.3-70b-versatile",
    description="Assistant project manager for JSON parsing.",
    json_specification=JSON_SCHEMA,
    stream=False
)

automl_agent = AutoMLAgent(
    role="data_scientist",
    model="llama-3.3-70b-versatile",
    description="Automated machine learning agent for dataset retrieval, preprocessing, augmentation, and visualization.",
    data_path="data",
    stream=False
)

model_agent = ModelAgent(
    role="ml_researcher",
    model="llama-3.3-70b-versatile",
    description="Machine learning research agent for model optimization and profiling.",
    stream=False
)

operations_agent = OperationsAgent(
    role="mlops",
    model="llama-3.3-70b-versatile",
    description="MLOps agent for deployment and application development.",
    stream=False
)

# A dictionary to hold all agents if needed
agents = {
    "manager": manager_agent,
    "prompt": prompt_parser_agent,
    "automl": automl_agent,
    "model": model_agent,
    "operations": operations_agent
}

In [12]:
from source.embedding_manager import State
from source.memory import CSVEmbeddingManager

In [40]:
#################################
# 3. PIPELINE AGENT DEFINITION  #
#################################

class PipelineAgent(AgentBase):
    """
    PipelineAgent is responsible for orchestrating the full data-to-deployment pipeline.
    It uses the various agents (data, model, operations, etc.) to execute their tasks sequentially.
    This version augments each step (preprocessing, model selection, and deployment)
    with additional context retrieved from the embedded dataset.
    """
    def __init__(self, agents, state: State, memory_manager: CSVEmbeddingManager,
                 dataset_dir="data", **kwargs):
        # You can give this pipeline a role and model description if needed.
        super().__init__(role="pipeline", model="n/a", description="Pipeline to orchestrate all agents", **kwargs)
        self.agents = agents
        self.state = state
        self.memory_manager = memory_manager
        self.dataset_dir = dataset_dir

    def run_pipeline(self, preprocessing_input, model_request, deployment_details):
        """
        Executes a full pipeline:
          1. Preprocess data (augmented with context from the embedded dataset).
          2. Retrieve candidate models (augmented with context).
          3. Deploy the selected model (augmented with context).
        """
        os.makedirs(self.dataset_dir, exist_ok=True)
        self.state.make_dir()

        # --- Step 0: Query the embedded dataset ---
        # Use the memory manager (Chroma DB) to get relevant dataset rows based on the input.
        print("[Pipeline] Querying embedded dataset for additional context...")
        memory_results = self.memory_manager.query_collection(preprocessing_input, n_results=5)
        # Convert each document to a string to avoid type errors.
        memory_context = "\n".join([str(doc) for doc in memory_results.get("documents", [])])
        print("[Pipeline] Retrieved memory context:\n", memory_context)

        # --- Step 1: Preprocess the dataset using the AutoML agent ---
        # Combine the original preprocessing input with the retrieved memory context.
        combined_preprocessing_input = (
            f"{preprocessing_input}\n\nRelevant dataset context:\n{memory_context}"
        )
        preprocessed_data = self.agents["automl"].preprocess_data(combined_preprocessing_input)
        preprocessed_path = os.path.join(self.dataset_dir, "preprocessed_data.md")
        with open(preprocessed_path, "w") as f:
            f.write(preprocessed_data)
        print(f"Preprocessed data saved to: {preprocessed_path}")

        # Update state memory for preprocessing step.
        self.state.update_memory({"preprocessing": preprocessed_data})
        self.state.persist_memory()  # save memory to disk
        self.state.next_step()

        # --- Step 2: Retrieve candidate models using the Model agent ---
        # Augment the model request with the same memory context.
        combined_model_request = (
            f"{model_request}\n\nRelevant dataset context:\n{memory_context}"
        )
        model_list = self.agents["model"].retrieve_models(combined_model_request)
        model_list_path = os.path.join(self.dataset_dir, "model_list.md")
        with open(model_list_path, "w") as f:
            f.write(model_list)
        print(f"Model list saved to: {model_list_path}")

        # Update state memory for model retrieval step.
        self.state.update_memory({"model_list": model_list})
        self.state.persist_memory()
        self.state.next_step()

        # --- Step 3: Deploy the model using the Operations agent ---
        # Similarly, augment the deployment details with the memory context.
        combined_deployment_details = (
            f"{deployment_details}\n\nRelevant dataset context:\n{memory_context}"
        )
        deployment_output = self.agents["operations"].deploy_model(combined_deployment_details)
        deployment_output_path = os.path.join(self.dataset_dir, "deployment_output.md")
        with open(deployment_output_path, "w") as f:
            f.write(deployment_output)
        print(f"Deployment output saved to: {deployment_output_path}")

        # Update state memory for deployment step.
        self.state.update_memory({"deployment_output": deployment_output})
        self.state.persist_memory()
        self.state.next_step()

        # Return a dictionary of results (if needed).
        return {
            "preprocessed_data": preprocessed_data,
            "model_list": model_list,
            "deployment_output": deployment_output
        }


In [41]:
import os
import json
import pandas as pd
from tqdm import tqdm
import re
import chromadb
from chromadb.config import Settings

def split_text(text: str, max_chunk_length: int = 8000, overlap_ratio: float = 0.1):
    """
    Splits a long string into overlapping chunks.
    """
    if not (0 <= overlap_ratio < 1):
        raise ValueError("Overlap ratio must be between 0 and 1 (exclusive).")
    
    overlap_length = int(max_chunk_length * overlap_ratio)
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + max_chunk_length, len(text))
        chunks.append(text[start:end])
        start += max_chunk_length - overlap_length
    return chunks

class ImprovedCSVEmbeddingManager:
    """
    ImprovedCSVEmbeddingManager embeds CSV data into a Chroma DB collection
    using batch processing and optional text chunking. This should speed up the
    embedding process compared to row-by-row insertion.
    """
    def __init__(self, collection_name="default_collection", db_path="chromadb", embedding_model=None, cache_size=10_000_000_000):
        self.settings = Settings(
            chroma_segment_cache_policy="LRU",
            chroma_memory_limit_bytes=cache_size
        )
        # Initialize persistent client for Chroma DB
        self.client = chromadb.PersistentClient(path=db_path, settings=self.settings)
        # Create or get the collection, specifying cosine similarity
        self.collection = self.client.get_or_create_collection(collection_name, metadata={"hnsw:space": "cosine"})
        if embedding_model is None:
            raise ValueError("An embedding_model must be provided.")
        self.embedding_model = embedding_model
        self.id_counter = 0  # To assign unique IDs if needed

    def embed_csv(self, csv_file_path: str, batch_size: int = 100):
        """
        Reads a CSV file and embeds its content into the collection in batches.
        Each row is converted to a JSON string (excluding the 'id' column if present).
        If a row's text is too long, it is split into chunks.
        """
        if not os.path.exists(csv_file_path):
            raise FileNotFoundError(f"CSV file not found: {csv_file_path}")

        # Read the CSV into a DataFrame
        df = pd.read_csv(csv_file_path)
        # Ensure there is an 'id' column; if not, create one
        if 'id' not in df.columns:
            df['id'] = df.index.astype(str)
        
        # Convert each row into a dictionary
        rows = df.to_dict(orient='records')
        
        batch_ids = []
        batch_documents = []
        batch_metadatas = []
        
        for row in tqdm(rows, desc="Embedding CSV rows"):
            # Get the document id (as a string)
            doc_id = str(row.get('id', self.id_counter))
            # Remove the 'id' field for the embedding
            row_copy = {k: v for k, v in row.items() if k != 'id'}
            # Convert the remaining data to a JSON string
            doc_text = json.dumps(row_copy)
            
            # Check if the document is too long; if so, split into chunks.
            if len(doc_text) > 8000:
                chunks = split_text(doc_text, max_chunk_length=8000, overlap_ratio=0.1)
                for chunk in chunks:
                    batch_documents.append(chunk)
                    # Create a unique id for each chunk
                    batch_ids.append(f"{doc_id}_{self.id_counter}")
                    batch_metadatas.append({"doc_name": os.path.basename(csv_file_path)})
                    self.id_counter += 1
            else:
                batch_documents.append(doc_text)
                batch_ids.append(doc_id)
                batch_metadatas.append({"doc_name": os.path.basename(csv_file_path)})
                self.id_counter += 1

            # If the batch is full, upsert into the collection in one call.
            if len(batch_documents) >= batch_size:
                # Compute embeddings for the entire batch at once.
                # Directly convert each embedding (a NumPy array) to a list.
                embeddings = [self.embedding_model.encode(doc).tolist() for doc in batch_documents]
                self.collection.add(
                    documents=batch_documents,
                    ids=batch_ids,
                    embeddings=embeddings,
                    metadatas=batch_metadatas
                )
                batch_ids = []
                batch_documents = []
                batch_metadatas = []

        # Upsert any remaining documents not in a full batch.
        if batch_documents:
            embeddings = [self.embedding_model.encode(doc).tolist() for doc in batch_documents]
            self.collection.add(
                documents=batch_documents,
                ids=batch_ids,
                embeddings=embeddings,
                metadatas=batch_metadatas
            )
        
        print(f"Finished embedding CSV: {csv_file_path}")

    def query_collection(self, query: str, n_results: int = 5) -> dict:
        """
        Queries the collection using the provided query string and returns the results.
        """
        query_embedding = self.embedding_model.encode(query).tolist()
        results = self.collection.query(query_embeddings=query_embedding, n_results=n_results, include=['documents', 'metadatas', 'distances'])
        return results


In [42]:
# !pip install sentence_transformers

In [43]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embedding_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [61]:
state = State(phase="Model development", competition="MyCompetition")
state.make_context() # build context info

# embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# create an instance of the csv embedding manager
memory_manager = ImprovedCSVEmbeddingManager(collection_name="auto_ml_memory",
                                             embedding_model=embedding_model)
# memory_manager.embed_csv("data/renttherunway_cleaned.csv")


# Define sample inputs for the pipeline.
preprocessing_input = (
        "I have uploaded the dataset obtained from Rent the Runway, "
        "which relates to fit fiber clothing for women. Develop a model with at least 90 percent F1 score. "
        "The target variable is fit."
    )
model_request = "Find the top 3 models for classifying this dataset."
deployment_details = "Deploy the selected model as a web application."

# Create the pipeline agent with the dictionary of agents and dataset directory.
pipeline = PipelineAgent(agents=agents, state=state, memory_manager=memory_manager,
                         dataset_dir="data")

# Execute the pipeline
results = pipeline.run_pipeline(preprocessing_input, model_request, deployment_details)

# Optionally, print the results for debugging.
print("Pipeline execution completed. Results:")

print(results)

[Pipeline] Querying embedded dataset for additional context...
[Pipeline] Retrieved memory context:
 ['Here\'s how you can deploy the selected model as a web application using Gradio.\n\n### Step 1: Install Gradio\n\nFirst, we need to install Gradio. You can do this by running the following command in your terminal:\n\n```bash\npip install gradio\n```\n\n### Step 2: Import Libraries and Load Model\n\nNext, we import the necessary libraries and load the trained model.\n\n```python\nimport gradio as gr\nimport torch\nfrom PIL import Image\nfrom torchvision import transforms\n\n# Load the model\nmodel = torch.load(\'model.pth\', map_location=torch.device(\'cpu\'))\n```\n\n### Step 3: Preprocess Input\n\nWe define a function to preprocess the input image.\n\n```python\n# Define the preprocessing function\ndef preprocess_image(image):\n    transform = transforms.Compose([\n        transforms.Resize((224, 224)),\n        transforms.ToTensor(),\n        transforms.Normalize(mean=[0.485, 0.456

## Additional test cases

In [55]:
import os
import asyncio
import json
import logging
from datetime import datetime

# Assume these imports come from your modules
# from state import State
# from memory import CSVEmbeddingManager  # or ImprovedCSVEmbeddingManager if you renamed it
# Your agent prompt texts (agent_manager_prompt, prompt_agent, data_agent_prompt, model_agent_prompt, operation_agent_prompt)
# and agent classes (AgentBase, AgentManager, PromptAgent, AutoMLAgent, ModelAgent, OperationsAgent) are defined elsewhere

# For asynchronous HTTP calls to your LLM, you might need to use an async HTTP client.
# For simplicity, we wrap synchronous calls with asyncio.to_thread() if needed.

# -----------------------------------------------------------------------------
# Asynchronous Wrappers for Agent Methods
# -----------------------------------------------------------------------------

async def async_execute(agent, method_name, *args, **kwargs):
    """
    Run an agent method asynchronously.
    """
    # Use asyncio.to_thread to offload synchronous work to a thread
    method = getattr(agent, method_name)
    return await asyncio.to_thread(method, *args, **kwargs)

# -----------------------------------------------------------------------------
# MemoryAgent (Asynchronous Variant)
# -----------------------------------------------------------------------------
import numpy as np

class AsyncMemoryAgent:
    """
    Asynchronous MemoryAgent that queries and updates the shared memory (Chroma DB)
    and the state. It is responsible for checking if similar output exists,
    and if not, storing the new output.
    """
    def __init__(self, state: State, memory_manager: CSVEmbeddingManager, similarity_threshold: float = 0.2):
        self.state = state
        self.memory_manager = memory_manager
        self.similarity_threshold = similarity_threshold

    async def query_memory(self, query: str, n_results: int = 1) -> dict:
        # Wrap the memory manager query in an async call.
        return await asyncio.to_thread(self.memory_manager.query_collection, query, n_results)

    async def remember(self, new_output: str) -> None:
        # Update state memory and persist the new output.
        self.state.update_memory({"memory_agent": new_output})
        self.state.persist_memory()
        
        # Compute the embedding.
        embedding_result = self.memory_manager.embedding_model.encode(new_output)[0]
        
        # Ensure the embedding is at least 1-D.
        embedding = np.atleast_1d(embedding_result)
        
        # Check if the embedding has the expected dimension.
        expected_dim = 384
        if embedding.shape[0] != expected_dim:
            logging.warning(
                f"The embedding dimension is {embedding.shape[0]} instead of the expected {expected_dim}. "
                "Using fallback method to create a dummy embedding."
            )
            # Option 1: Replicate the scalar value across the expected dimensions.
            # This assumes the embedding is a scalar (or 1-element array).
            if embedding.size == 1:
                embedding = np.full((expected_dim,), embedding[0])
            else:
                # Option 2: Use zeros (or any other fallback logic).
                embedding = np.zeros(expected_dim, dtype=float)
        
        # Convert the numpy array to a list of floats.
        embedding_list = embedding.tolist()
        
        # Create a unique ID for this memory record.
        unique_id = f"memory_{len(self.state.memory)}"
        
        # Add the document, embedding, and metadata to the collection.
        self.memory_manager.collection.add(
            documents=[new_output],
            ids=[unique_id],
            embeddings=[embedding_list],
            metadatas=[{"source": "MemoryAgent", "timestamp": str(datetime.now())}]
        )
        logging.info(f"[MemoryAgent] Stored new output with id: {unique_id}")
        
    async def check_and_remember(self, new_output: str) -> str:
        results = await self.query_memory(new_output, n_results=1)
        # Check if a similar output exists.
        if results.get("distances") and results["distances"][0]:
            distance = results["distances"][0][0]
            if distance < self.similarity_threshold:
                repeated_output = results["documents"][0][0]
                logging.info(f"[MemoryAgent] Found similar output (distance {distance}). Reusing it.")
                return repeated_output
        logging.info(f"[MemoryAgent] No similar output found. Storing new output.")
        await self.remember(new_output)
        return new_output

# -----------------------------------------------------------------------------
# Asynchronous Pipeline Agent
# -----------------------------------------------------------------------------

class AsyncPipelineAgent:
    """
    This pipeline agent runs the entire sequence asynchronously. It runs each agent's
    work in an asynchronous task, passing along shared state and memory.
    Finally, it compiles a single unified output that includes the entire context.
    """
    def __init__(self, agents: dict, state: State, memory_agent: AsyncMemoryAgent, dataset_dir: str = "./data"):
        self.agents = agents
        self.state = state
        self.memory_agent = memory_agent
        self.dataset_dir = dataset_dir

    async def run_pipeline(self, preprocessing_input: str, model_request: str, deployment_details: str) -> dict:
        os.makedirs(self.dataset_dir, exist_ok=True)
        self.state.make_dir()

        outputs = {}

        # Step 1: Preprocessing (AutoML Agent)
        preprocessed_data = await async_execute(self.agents["automl"], "preprocess_data", preprocessing_input)
        preprocessed_path = os.path.join(self.dataset_dir, "preprocessed_data.md")
        with open(preprocessed_path, "w") as f:
            f.write(preprocessed_data)
        logging.info(f"[Pipeline] Preprocessed data saved to: {preprocessed_path}")

        # Use memory agent to check/store/reuse the preprocessed output.
        preprocessed_final = await self.memory_agent.check_and_remember(preprocessed_data)
        self.state.update_memory({"preprocessing": preprocessed_final})
        self.state.persist_memory()
        self.state.next_step()
        outputs["preprocessing"] = preprocessed_final

        # Step 2: Model Retrieval (Model Agent)
        model_list = await async_execute(self.agents["model"], "retrieve_models", model_request)
        model_list_path = os.path.join(self.dataset_dir, "model_list.md")
        with open(model_list_path, "w") as f:
            f.write(model_list)
        logging.info(f"[Pipeline] Model list saved to: {model_list_path}")

        model_list_final = await self.memory_agent.check_and_remember(model_list)
        self.state.update_memory({"model_list": model_list_final})
        self.state.persist_memory()
        self.state.next_step()
        outputs["model_list"] = model_list_final

        # Step 3: Deployment (Operations Agent)
        deployment_output = await async_execute(self.agents["operations"], "deploy_model", deployment_details)
        deployment_output_path = os.path.join(self.dataset_dir, "deployment_output.md")
        with open(deployment_output_path, "w") as f:
            f.write(deployment_output)
        logging.info(f"[Pipeline] Deployment output saved to: {deployment_output_path}")

        deployment_final = await self.memory_agent.check_and_remember(deployment_output)
        self.state.update_memory({"deployment_output": deployment_final})
        self.state.persist_memory()
        self.state.next_step()
        outputs["deployment_output"] = deployment_final

        # Compile final output by combining context and each step's output.
        final_output = {
            "context": self.state.context,
            "outputs": outputs,
            "full_pipeline": f"Preprocessing: {outputs['preprocessing']}\n"
                             f"Model Selection: {outputs['model_list']}\n"
                             f"Deployment: {outputs['deployment_output']}"
        }

        # Update state with the final output.
        self.state.update_memory({"final_output": final_output})
        self.state.persist_memory()

        return final_output

# -----------------------------------------------------------------------------
# Example Usage
# -----------------------------------------------------------------------------

async def main():
    # Initialize shared state.
    state = State(phase="Model Development", competition="MyCompetition")
    state.make_context()  # build context info

    # # Initialize your embedding model. (Make sure it has an encode() method.)
    # from llm import OpenaiEmbeddings
    # embedding_model = OpenaiEmbeddings(api_key="your_api_key")  # Adjust as needed

    # Initialize the CSV embedding manager (memory manager).
    # from memory import CSVEmbeddingManager  # or use your improved version if available
    memory_manager = ImprovedCSVEmbeddingManager(collection_name="auto_ml_memory", embedding_model=embedding_model)
    
    # Optionally, embed your initial CSV (if not already embedded)
    # if not os.path.exists("data/embedded_flag.txt"):
    #     memory_manager.embed_csv("data/renttherunway_cleaned.csv")
    #     # Create a flag file to avoid re-embedding on every run.
    #     with open("data/embedded_flag.txt", "w") as f:
    #         f.write("embedded")
    
    # Create your agents.
    # (Assume you have already defined your AgentManager, PromptAgent, AutoMLAgent, ModelAgent, and OperationsAgent.)
    # For brevity, reusing your synchronous agent instantiations:
    JSON_SCHEMA = """json
    {
        "task": "string",
        "priority": "string",
        "deadline": "string",
        "resources": [
            {
                "type": "string",
                "quantity": "integer"
            }
        ]
    }
    """
    manager_agent = AgentManager(role="manager", model="llama-3.3-70b-versatile",
                                  description="Assistant project manager", json_schema=JSON_SCHEMA, stream=False)
    prompt_parser_agent = PromptAgent(role="prompt_parser", model="llama-3.3-70b-versatile",
                                      description="Assistant project manager", json_specification=JSON_SCHEMA, stream=False)
    automl_agent = AutoMLAgent(role="data_scientist", model="llama-3.3-70b-versatile",
                               description="AutoML agent for data tasks", data_path="data", stream=False)
    model_agent = ModelAgent(role="ml_researcher", model="llama-3.3-70b-versatile",
                             description="ML research agent", stream=False)
    operations_agent = OperationsAgent(role="mlops", model="llama-3.3-70b-versatile",
                                       description="MLOps agent", stream=False)

    agents = {
        "manager": manager_agent,
        "prompt": prompt_parser_agent,
        "automl": automl_agent,
        "model": model_agent,
        "operations": operations_agent
    }

    # Instantiate the asynchronous MemoryAgent.
    memory_agent = AsyncMemoryAgent(state=state, memory_manager=memory_manager, similarity_threshold=0.2)

    # Create the asynchronous pipeline agent.
    pipeline = AsyncPipelineAgent(agents=agents, state=state, memory_agent=memory_agent, dataset_dir="data")

    # Define pipeline inputs.
    preprocessing_input = (
        "I have uploaded the dataset obtained from Rent the Runway, "
        "which relates to fit fiber clothing for women. Develop a model with at least 90 percent F1 score. "
        "The target variable is fit."
    )
    model_request = "Find the top 3 models for classifying this dataset."
    deployment_details = "Deploy the selected model as a web application."

    # Run the asynchronous pipeline.
    final_output = await pipeline.run_pipeline(preprocessing_input, model_request, deployment_details)
    print("Final Pipeline Output:")
    print(json.dumps(final_output, indent=2))

# if __name__ == "__main__":
#     asyncio.run(main())


In [56]:
import nest_asyncio
nest_asyncio.apply()

import asyncio

# Now you can safely call asyncio.run() even if an event loop is already running.
asyncio.run(main())




[State] Updating memory for agent 'Agent Manager' in Phase: Model Development.
[State] Memory persisted to competition/MyCompetition/Model_Development/memory.json
[State] Updating memory for agent 'Agent Manager' in Phase: Model Development.
[State] Memory persisted to competition/MyCompetition/Model_Development/memory.json




[State] Updating memory for agent 'Prompt Agent' in Phase: Model Development.
[State] Memory persisted to competition/MyCompetition/Model_Development/memory.json
[State] Updating memory for agent 'Prompt Agent' in Phase: Model Development.
[State] Memory persisted to competition/MyCompetition/Model_Development/memory.json




[State] Updating memory for agent 'Data Agent' in Phase: Model Development.
[State] Memory persisted to competition/MyCompetition/Model_Development/memory.json
[State] Updating memory for agent 'Data Agent' in Phase: Model Development.
[State] Memory persisted to competition/MyCompetition/Model_Development/memory.json
[State] Updating memory for agent 'Model Agent' in Phase: Model Development.
[State] Memory persisted to competition/MyCompetition/Model_Development/memory.json
Final Pipeline Output:
{
  "context": "Competition: MyCompetition\nPhase: Model Development\nAgents in workflow:\n1. Agent Manager\n2. Prompt Agent\n3. Data Agent\n4. Model Agent\n5. Operations Agent",
  "outputs": {
    "preprocessing": "**Dataset Overview**\nThe dataset obtained from Rent the Runway relates to fit fiber clothing for women. The target variable is \"fit\", which indicates whether the clothing fits well or not.\n\n**Step 1: Retrieve and Explore the Dataset**\nI have retrieved the dataset and explor