In [3]:
import os
from groq import Groq
import sys
from dotenv import load_dotenv

sys.path.insert(1, "source")

# dotenv_path = os.path.join(os.path.dirname(__file__), ".env")
# load_dotenv(dotenv_path)

from prompts.agent_prompts import (
    agent_manager_prompt,
    data_agent_prompt,
    model_agent_prompt,
    prompt_agent,
    operation_agent_prompt,
)


In [4]:
# Shared Groq client used by every agent's execute() call.
# The key is looked up once in the environment and reused for the client;
# it is deliberately never printed so it cannot leak into saved cell output.
api_key = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=api_key)

In [5]:
#########################
# 1. AGENT BASE CLASSES #
#########################

class AgentBase:
    """Common base for the chat-completion backed agents in this notebook.

    Attributes:
        role: Label identifying the agent.
        model: Model name sent with every completion request.
        description: Human-readable summary of the agent's purpose.
        kwargs: Extra keyword arguments forwarded unchanged to each
            completion request (for example ``stream=False``).
    """

    def __init__(self, role, model, description, **kwargs):
        self.role, self.model, self.description = role, model, description
        self.kwargs = kwargs

    def execute(self, messages):
        """Send ``messages`` to the module-level ``client`` and return its response.

        Args:
            messages: List of chat messages to submit.

        Returns:
            Whatever ``client.chat.completions.create`` returns for this
            agent's model and stored keyword arguments.
        """
        completions_api = client.chat.completions
        return completions_api.create(
            model=self.model,
            messages=messages,
            **self.kwargs
        )

In [6]:
# ----------------------------
# Manager Agent (inherits from AgentBase)
# ----------------------------
class AgentManager(AgentBase):
    """Manager agent that asks the model to restate user input against a JSON schema.

    Attributes:
        json_schema: Schema text appended to the system prompt.
    """

    def __init__(self, role, model, description, json_schema, **kwargs):
        super().__init__(role, model, description, **kwargs)
        self.json_schema = json_schema

    def parse_to_json(self, user_input):
        """Submit ``user_input`` together with the manager prompt and schema.

        Args:
            user_input: Text sent as the user message.

        Returns:
            The message content of the first choice in the model response.
        """
        # System prompt = stripped manager prompt followed by the schema section.
        system_content = f"""
{agent_manager_prompt.strip()}

# JSON SPECIFICATION SCHEMA #
{self.json_schema}
"""
        system_message = {"role": "system", "content": system_content}
        user_message = {"role": "user", "content": user_input}
        completion = self.execute([system_message, user_message])
        return completion.choices[0].message.content


In [7]:

# ----------------------------
# Prompt Agent (inherits from AgentBase)
# ----------------------------
class PromptAgent(AgentBase):
    """Agent that asks the model for output following a JSON specification.

    Attributes:
        json_specification: Specification text embedded in the system prompt.
    """

    def __init__(self, role, model, description, json_specification, **kwargs):
        super().__init__(role, model, description, **kwargs)
        self.json_specification = json_specification

    def generate_json(self, user_input):
        """Submit ``user_input`` with the prompt-agent instructions and specification.

        Args:
            user_input: Text sent as the user message.

        Returns:
            The message content of the first choice in the model response.
        """
        # NOTE(review): the ''' delimiters around the specification look like a
        # stand-in for a Markdown code fence -- confirm this is intended.
        system_content = f"""
{prompt_agent.strip()}

# JSON SPECIFICATION SCHEMA #
'''json
{self.json_specification}
'''
"""
        system_message = {"role": "system", "content": system_content}
        user_message = {"role": "user", "content": user_input}
        completion = self.execute([system_message, user_message])
        return completion.choices[0].message.content



In [8]:
# ----------------------------
# AutoML Agent (inherits from AgentBase)
# ----------------------------
class AutoMLAgent(AgentBase):
    """Data-side agent: dataset retrieval, preprocessing, augmentation, visualization.

    Every public method sends ``data_agent_prompt`` as the system message plus a
    method-specific user message and works with the text the model returns.

    Attributes:
        data_path: Directory in which ``retrieve_dataset`` writes its output.
    """

    def __init__(self, role, model, description, data_path="./data", **kwargs):
        super().__init__(role, model, description, **kwargs)
        self.data_path = data_path

    def _ask(self, user_content):
        """Run one system+user exchange and return the first choice's message content."""
        response = self.execute([
            {"role": "system", "content": data_agent_prompt.strip()},
            {"role": "user", "content": user_content},
        ])
        return response.choices[0].message.content

    def retrieve_dataset(self, query):
        """Ask the model about ``query`` and write its reply to the dataset file.

        Args:
            query: Text sent as the user message.

        Returns:
            Path of the written file (``<data_path>/renttherunway_cleaned.csv``).
        """
        dataset_path = os.path.join(self.data_path, "renttherunway_cleaned.csv")
        content = self._ask(query)
        # NOTE(review): placeholder implementation -- this overwrites any existing
        # file at dataset_path with the model's text reply; confirm before using
        # it next to a real dataset of the same name.
        if self.data_path:
            # Create the target directory so the write cannot fail on a fresh checkout.
            os.makedirs(self.data_path, exist_ok=True)
        # Explicit UTF-8: model output may contain characters that the platform
        # default encoding cannot represent.
        with open(dataset_path, "w", encoding="utf-8") as file:
            file.write(content)
        return dataset_path

    def preprocess_data(self, instructions):
        """Return the model's reply to ``"Instructions: <instructions>"``."""
        return self._ask(f"Instructions: {instructions}")

    def augment_data(self, augmentation_details):
        """Return the model's reply to ``"Augmentation Details: <augmentation_details>"``."""
        return self._ask(f"Augmentation Details: {augmentation_details}")

    def visualize_data(self, visualization_request):
        """Return the model's reply to ``visualization_request``."""
        return self._ask(visualization_request)



In [9]:

# ----------------------------
# Model Agent (inherits from AgentBase)
# ----------------------------
class ModelAgent(AgentBase):
    """Modelling agent; every request uses ``model_agent_prompt`` as the system message."""

    def __init__(self, role, model, description, **kwargs):
        super().__init__(role, model, description, **kwargs)

    def _respond(self, user_content):
        """Send ``user_content`` after the system prompt; return the first choice's content."""
        conversation = [
            {"role": "system", "content": model_agent_prompt.strip()},
            {"role": "user", "content": user_content},
        ]
        completion = self.execute(conversation)
        return completion.choices[0].message.content

    def retrieve_models(self, dataset_details):
        """Return the model's reply about candidate models for ``dataset_details``."""
        return self._respond(dataset_details)

    def optimize_model(self, hyperparameter_details):
        """Return the model's reply about hyperparameter optimization for the given details."""
        return self._respond(hyperparameter_details)

    def profile_models(self, profiling_details):
        """Return the model's reply about metadata extraction and profiling for the given details."""
        return self._respond(profiling_details)



In [10]:
# ----------------------------
# Operations Agent (inherits from AgentBase)
# ----------------------------
class OperationsAgent(AgentBase):
    """Operations agent; requests use ``operation_agent_prompt`` as the system message."""

    def __init__(self, role, model, description, **kwargs):
        super().__init__(role, model, description, **kwargs)

    def deploy_model(self, deployment_details):
        """Submit ``deployment_details`` and return the model's reply.

        Args:
            deployment_details: Text sent as the user message.

        Returns:
            The message content of the first choice in the model response.
        """
        system_message = {"role": "system", "content": operation_agent_prompt.strip()}
        user_message = {"role": "user", "content": deployment_details}
        completion = self.execute([system_message, user_message])
        return completion.choices[0].message.content


In [11]:
#################################
# 2. AGENT INSTANTIATION SETUP  #
#################################

# Model used by every agent below; change it here to switch all agents at once.
MODEL_NAME = "llama-3.3-70b-versatile"

# JSON specification schema handed to the manager and prompt-parser agents.
# NOTE(review): the leading "json" token looks like the remainder of a Markdown
# code fence -- confirm it is meant to be part of the schema text.
JSON_SCHEMA = """json
{
    "task": "string",
    "priority": "string",
    "deadline": "string",
    "resources": [
        {
            "type": "string",
            "quantity": "integer"
        }
    ]
}
"""

# Create agent instances. ``stream=False`` is stored by AgentBase and forwarded
# to every completion request the agent makes.
manager_agent = AgentManager(
    role="manager",
    model=MODEL_NAME,
    description="Assistant project manager for parsing user requirements into JSON.",
    json_schema=JSON_SCHEMA,
    stream=False
)

prompt_parser_agent = PromptAgent(
    role="prompt_parser",
    model=MODEL_NAME,
    description="Assistant project manager for JSON parsing.",
    json_specification=JSON_SCHEMA,
    stream=False
)

automl_agent = AutoMLAgent(
    role="data_scientist",
    model=MODEL_NAME,
    description="Automated machine learning agent for dataset retrieval, preprocessing, augmentation, and visualization.",
    data_path="data",
    stream=False
)

model_agent = ModelAgent(
    role="ml_researcher",
    model=MODEL_NAME,
    description="Machine learning research agent for model optimization and profiling.",
    stream=False
)

operations_agent = OperationsAgent(
    role="mlops",
    model=MODEL_NAME,
    description="MLOps agent for deployment and application development.",
    stream=False
)

# Registry of all agents, keyed by the short names the pipeline looks up.
agents = {
    "manager": manager_agent,
    "prompt": prompt_parser_agent,
    "automl": automl_agent,
    "model": model_agent,
    "operations": operations_agent
}

In [12]:
from source.state import State
from source.memory import CSVEmbeddingManager

In [13]:

#################################
# 3. PIPELINE AGENT DEFINITION  #
#################################

class PipelineAgent(AgentBase):
    """
    Orchestrates the preprocessing -> model retrieval -> deployment sequence.

    The pipeline calls the "automl", "model" and "operations" entries of
    ``agents`` in order, writes each textual result to ``dataset_dir`` and
    records it in ``state`` after every step.

    Attributes:
        agents: Mapping of agent name to agent instance.
        state: Object tracking pipeline memory and the current step.
        memory_manager: Embedding manager kept for later use; ``run_pipeline``
            does not call it.
        dataset_dir: Directory that receives the ``*.md`` output files.
    """
    def __init__(self, agents, state: State,memory_manager: CSVEmbeddingManager,
                 dataset_dir="data", **kwargs):
        # The pipeline never sends completion requests itself, hence the
        # placeholder model name.
        super().__init__(role="pipeline", model="n/a", description="Pipeline to orchestrate all agents", **kwargs)
        self.agents = agents
        self.state = state
        self.memory_manager = memory_manager
        self.dataset_dir = dataset_dir

    def _save_output(self, filename, content, label):
        """Write ``content`` to ``dataset_dir/filename``, report it, and return the path."""
        path = os.path.join(self.dataset_dir, filename)
        # Explicit UTF-8: model output may contain characters that the platform
        # default encoding cannot represent.
        with open(path, "w", encoding="utf-8") as f:
            f.write(content)
        print(f"{label} saved to: {path}")
        return path

    def _record(self, key, value):
        """Store ``value`` under ``key`` in the state memory and persist it."""
        self.state.update_memory({key: value})
        self.state.persist_memory()

    def run_pipeline(self, preprocessing_input, model_request, deployment_details):
        """
        Executes a full pipeline:
          1. Preprocess data.
          2. Retrieve candidate models.
          3. Deploy the selected model.

        Args:
            preprocessing_input: Text passed to the AutoML agent's ``preprocess_data``.
            model_request: Text passed to the Model agent's ``retrieve_models``.
            deployment_details: Text passed to the Operations agent's ``deploy_model``.

        Returns:
            Dict with keys ``"preprocessed_data"``, ``"model_list"`` and
            ``"deployment_output"`` holding each agent's textual reply.
        """
        os.makedirs(self.dataset_dir, exist_ok=True)
        self.state.make_dir()

        # 1. Preprocess the dataset using the AutoML agent
        preprocessed_data = self.agents["automl"].preprocess_data(preprocessing_input)
        self._save_output("preprocessed_data.md", preprocessed_data, "Preprocessed data")
        self._record("preprocessing", preprocessed_data)
        # Optionally update csv embedding if preprocessing produces a CSV
        # self.memory_manager.update_embedding(preprocesses_csv_file)

        # Generate the current agent rules (only after this first step), then advance.
        rules = self.state.generate_rules()
        print("[Pipeline] Current agent rules saved:\n", rules)
        self.state.next_step()

        # 2. Retrieve candidate models using the Model agent
        model_list = self.agents["model"].retrieve_models(model_request)
        self._save_output("model_list.md", model_list, "Model list")
        self._record("model_list", model_list)
        self.state.next_step()

        # 3. Deploy the model using the Operations agent
        deployment_output = self.agents["operations"].deploy_model(deployment_details)
        self._save_output("deployment_output.md", deployment_output, "Deployment output")
        self._record("deployment_output", deployment_output)
        self.state.next_step()

        # Return a dictionary of results (if needed)
        return {
            "preprocessed_data": preprocessed_data,
            "model_list": model_list,
            "deployment_output": deployment_output
        }


In [21]:
import os
import json
import pandas as pd
from tqdm import tqdm
import re
import chromadb
from chromadb.config import Settings

def split_text(text: str, max_chunk_length: int = 8000, overlap_ratio: float = 0.1):
    """
    Splits a long string into overlapping chunks.

    Consecutive chunks share ``int(max_chunk_length * overlap_ratio)``
    characters. Splitting stops as soon as a chunk reaches the end of the
    text, so no trailing chunk is emitted that would only repeat the overlap
    region of the previous one.

    Args:
        text: String to split.
        max_chunk_length: Maximum number of characters per chunk; must be > 0.
        overlap_ratio: Fraction of a chunk repeated at the start of the next
            one; must satisfy ``0 <= overlap_ratio < 1``.

    Returns:
        List of chunks in order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``max_chunk_length`` or ``overlap_ratio`` is out of range.
    """
    if max_chunk_length <= 0:
        # A non-positive length would never advance `start` and loop forever.
        raise ValueError("max_chunk_length must be a positive integer.")
    if not (0 <= overlap_ratio < 1):
        raise ValueError("Overlap ratio must satisfy 0 <= overlap_ratio < 1.")

    overlap_length = int(max_chunk_length * overlap_ratio)
    # Guard against float rounding pushing the overlap up to the full chunk length.
    step = max(1, max_chunk_length - overlap_length)

    text_length = len(text)
    chunks = []
    start = 0
    while start < text_length:
        end = min(start + max_chunk_length, text_length)
        chunks.append(text[start:end])
        if end == text_length:
            # The whole text is covered; a further chunk would be pure overlap.
            break
        start += step
    return chunks

class ImprovedCSVEmbeddingManager:
    """
    Embeds CSV rows into a persistent Chroma DB collection.

    Rows are serialized to JSON, optionally split into overlapping chunks, and
    written to the collection in batches (one collection call per batch
    instead of one per row). Batches are written with ``upsert``, so embedding
    the same CSV again against the same database replaces the stored rows
    instead of colliding with their IDs.

    Attributes:
        settings: Chroma ``Settings`` used to build the client.
        client: Persistent Chroma client rooted at ``db_path``.
        collection: Collection the documents are written to (cosine space).
        embedding_model: Object whose ``encode(text)`` returns a value with a
            ``tolist()`` method.
        id_counter: Number of documents queued so far; also used to build
            chunk IDs.
    """
    def __init__(self, collection_name="default_collection", db_path="chromadb", embedding_model=None, cache_size=10_000_000_000):
        # Validate before building the client so an invalid call has no side effects.
        if embedding_model is None:
            raise ValueError("An embedding_model must be provided.")
        self.embedding_model = embedding_model
        self.settings = Settings(
            chroma_segment_cache_policy="LRU",
            chroma_memory_limit_bytes=cache_size
        )
        # Initialize persistent client for Chroma DB
        self.client = chromadb.PersistentClient(path=db_path, settings=self.settings)
        # Create or get the collection, specifying cosine similarity
        self.collection = self.client.get_or_create_collection(collection_name, metadata={"hnsw:space": "cosine"})
        self.id_counter = 0  # To assign unique IDs if needed

    def _flush_batch(self, ids, documents, metadatas):
        """Embed ``documents`` and upsert them into the collection; no-op when empty."""
        if not documents:
            return
        # Each document is encoded on its own, so the only requirement on the
        # model is encode(str) -> object with .tolist().
        embeddings = [self.embedding_model.encode(doc).tolist() for doc in documents]
        self.collection.upsert(
            ids=ids,
            documents=documents,
            embeddings=embeddings,
            metadatas=metadatas
        )

    def embed_csv(self, csv_file_path: str, batch_size: int = 100,
                  max_chunk_length: int = 8000, overlap_ratio: float = 0.1):
        """
        Reads a CSV file and embeds its content into the collection in batches.

        Each row is converted to a JSON string (excluding the 'id' column).
        If a row's text is longer than ``max_chunk_length`` characters, it is
        split into overlapping chunks, each stored as its own document.

        Args:
            csv_file_path: Path of the CSV file to embed.
            batch_size: Number of queued documents that triggers a write.
            max_chunk_length: Maximum characters per stored document.
            overlap_ratio: Overlap between consecutive chunks of one row.

        Raises:
            FileNotFoundError: If ``csv_file_path`` does not exist.
        """
        if not os.path.exists(csv_file_path):
            raise FileNotFoundError(f"CSV file not found: {csv_file_path}")

        # Read the CSV into a DataFrame
        df = pd.read_csv(csv_file_path)
        # Ensure there is an 'id' column; if not, create one from the row index
        if 'id' not in df.columns:
            df['id'] = df.index.astype(str)

        # Convert each row into a dictionary
        rows = df.to_dict(orient='records')
        doc_name = os.path.basename(csv_file_path)

        batch_ids = []
        batch_documents = []
        batch_metadatas = []

        for row in tqdm(rows, desc="Embedding CSV rows"):
            # Get the document id (as a string)
            doc_id = str(row.get('id', self.id_counter))
            # Remove the 'id' field for the embedding
            row_copy = {k: v for k, v in row.items() if k != 'id'}
            # Convert the remaining data to a JSON string
            doc_text = json.dumps(row_copy)

            if len(doc_text) > max_chunk_length:
                # Too long for one document: store each chunk under its own id.
                pieces = split_text(doc_text, max_chunk_length=max_chunk_length,
                                    overlap_ratio=overlap_ratio)
                for piece in pieces:
                    batch_documents.append(piece)
                    batch_ids.append(f"{doc_id}_{self.id_counter}")
                    batch_metadatas.append({"doc_name": doc_name})
                    self.id_counter += 1
            else:
                batch_documents.append(doc_text)
                batch_ids.append(doc_id)
                batch_metadatas.append({"doc_name": doc_name})
                self.id_counter += 1

            # If the batch is full, upsert into the collection in one call.
            if len(batch_documents) >= batch_size:
                self._flush_batch(batch_ids, batch_documents, batch_metadatas)
                batch_ids = []
                batch_documents = []
                batch_metadatas = []

        # Upsert any remaining documents not in a full batch.
        self._flush_batch(batch_ids, batch_documents, batch_metadatas)

        print(f"Finished embedding CSV: {csv_file_path}")

    def query_collection(self, query: str, n_results: int = 5) -> dict:
        """
        Queries the collection using the provided query string and returns the results.

        Args:
            query: Text to embed and search for.
            n_results: Maximum number of matches requested.

        Returns:
            The collection's query result, requested with documents,
            metadatas and distances included.
        """
        query_embedding = self.embedding_model.encode(query).tolist()
        results = self.collection.query(query_embeddings=query_embedding, n_results=n_results, include=['documents', 'metadatas', 'distances'])
        return results


In [22]:
# !pip install sentence_transformers

In [24]:
from sentence_transformers import SentenceTransformer

# Load the sentence-transformers model used to embed CSV rows and queries.
# NOTE(review): the repr shown as this cell's output reports max_seq_length=256
# and a 384-dimensional embedding; inputs longer than 256 tokens are presumably
# truncated by the model -- confirm this against the 8000-character chunk size.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Bare expression so the notebook displays the model's repr.
embedding_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [25]:
# --- Sample inputs for the three pipeline steps ---
preprocessing_input = (
    "I have uploaded the dataset obtained from Rent the Runway, "
    "which relates to fit fiber clothing for women. Develop a model with at least 90 percent F1 score. "
    "The target variable is fit."
)
model_request = "Find the top 3 models for classifying this dataset."
deployment_details = "Deploy the selected model as a web application."

# --- Pipeline state ---
state = State(phase="Model development", competition="MyCompetition")
state.make_context()  # build context info

# --- Embedding memory: load the cleaned CSV into the Chroma collection ---
memory_manager = ImprovedCSVEmbeddingManager(
    collection_name="auto_ml_memory",
    embedding_model=embedding_model,
)
memory_manager.embed_csv("data/renttherunway_cleaned.csv")

# --- Build the pipeline from the agent registry and run it ---
pipeline = PipelineAgent(
    agents=agents,
    state=state,
    memory_manager=memory_manager,
    dataset_dir="data",
)
results = pipeline.run_pipeline(preprocessing_input, model_request, deployment_details)

# Print the collected results for debugging.
print("Pipeline execution completed. Results:")
print(results)

Embedding CSV rows:   2%|▏         | 99/6516 [00:00<00:03, 1938.33it/s]


AttributeError: 'numpy.float32' object has no attribute 'embedding'