In [1]:
import os
from groq import Groq
import sys
from dotenv import load_dotenv

sys.path.insert(1, "source")

# dotenv_path = os.path.join(os.path.dirname(__file__), ".env")
# load_dotenv(dotenv_path)

from prompts.agent_prompts import (
    agent_manager_prompt,
    data_agent_prompt,
    model_agent_prompt,
    prompt_agent,
    operation_agent_prompt,
)


In [2]:
# Initialize Groq client
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
api_key=os.environ.get("GROQ_API_KEY")
# print(api_key)

In [14]:
import os
import json
import asyncio
import logging
import chromadb
import pandas as pd
import numpy as np
from datetime import datetime
from typing import Dict, Any
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Import your existing classes with minor async modifications
from source.state import State
from source.memory import CSVEmbeddingManager  # Assuming you have these

# Initialize logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)




In [15]:

class AsyncAgentBase:
    """Base class for all asynchronous agents"""
    def __init__(self, role: str, model: str, description: str):
        self.role = role
        self.model = model
        self.description = description
        self.client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

    async def execute(self, messages: list) -> str:
        """Async version of execute method"""
        try:
            response = await asyncio.to_thread(
                self.client.chat.completions.create,
                messages=messages,
                model=self.model
            )
            return response.choices[0].message.content
        except Exception as e:
            logger.error(f"Error in {self.role} agent: {str(e)}")
            raise

class AsyncAutoMLAgent(AsyncAgentBase):
    """Asynchronous AutoML agent with enhanced context handling"""
    def __init__(self, *args, data_path: str = "data", **kwargs):
        super().__init__(*args, **kwargs)
        self.data_path = data_path
        self.context = []

    async def preprocess_data(self, instructions: str, context: list) -> str:
        """Enhanced preprocessing with context awareness"""
        self.context.append(("preprocessing", instructions))
        messages = [
            {"role": "system", "content": f"Current context: {json.dumps(context)}\n\n{data_agent_prompt}"},
            {"role": "user", "content": instructions}
        ]
        return await self.execute(messages)

class AsyncModelAgent(AsyncAgentBase):
    """Asynchronous Model agent with cross-agent awareness"""
    async def retrieve_models(self, dataset_details: str, context: list) -> str:
        """Model selection with context from previous steps"""
        messages = [
            {"role": "system", "content": f"Pipeline Context: {json.dumps(context)}\n\n{model_agent_prompt}"},
            {"role": "user", "content": dataset_details}
        ]
        return await self.execute(messages)

class AsyncOpsAgent(AsyncAgentBase):
    """Asynchronous Operations agent with deployment capabilities"""
    async def deploy_model(self, deployment_details: str, context: list) -> str:
        """Context-aware deployment"""
        messages = [
            {"role": "system", "content": f"Full Context: {json.dumps(context)}\n\n{operation_agent_prompt}"},
            {"role": "user", "content": deployment_details}
        ]
        return await self.execute(messages)


In [36]:
import os
import json
import pandas as pd
from tqdm import tqdm
import re
import chromadb
from chromadb.config import Settings
import asyncio
from sentence_transformers import SentenceTransformer

def split_text(text: str, max_chunk_length: int = 8000, overlap_ratio: float = 0.1):
    """
    Splits a long string into overlapping chunks.
    """
    if not (0 <= overlap_ratio < 1):
        raise ValueError("Overlap ratio must be between 0 and 1 (exclusive).")
    
    overlap_length = int(max_chunk_length * overlap_ratio)
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + max_chunk_length, len(text))
        chunks.append(text[start:end])
        start += max_chunk_length - overlap_length
    return chunks

class ImprovedCSVEmbeddingManager:
    """
    ImprovedCSVEmbeddingManager embeds CSV data into a Chroma DB collection
    using batch processing and optional text chunking.
    """
    def __init__(self, collection_name="default_collection", db_path="chromadb", embedding_model=None, cache_size=10_000_000_000):
        self.settings = Settings(
            chroma_segment_cache_policy="LRU",
            chroma_memory_limit_bytes=cache_size
        )
        # Initialize persistent client for Chroma DB
        self.client = chromadb.PersistentClient(path=db_path, settings=self.settings)
        # Create or get the collection, specifying cosine similarity
        self.collection = self.client.get_or_create_collection(collection_name, metadata={"hnsw:space": "cosine"})
        if embedding_model is None:
            raise ValueError("An embedding_model must be provided.")
        self.embedding_model = embedding_model
        self.id_counter = 0  # To assign unique IDs if needed

    def embed_csv(self, csv_file_path: str, batch_size: int = 1000):
        """
        Reads a CSV file and embeds its content into the collection in batches.
        Each row is converted to a JSON string (excluding the 'id' column if present).
        If a row's text is too long, it is split into chunks.
        """
        if not os.path.exists(csv_file_path):
            raise FileNotFoundError(f"CSV file not found: {csv_file_path}")
        try:
            df = pd.read_csv(csv_file_path)
        except FileNotFoundError as e:
            print(f"Error: {e}")

        # Ensure there is an 'id' column; if not, create one
        if 'id' not in df.columns:
            df['id'] = df.index.astype(str)
        
        # Convert each row into a dictionary
        rows = df.to_dict(orient='records')
        
        batch_ids = []
        batch_documents = []
        batch_metadatas = []
        
        for row in tqdm(rows, desc="Embedding CSV rows"):
            # Get the document id (as a string)
            doc_id = str(row.get('id', self.id_counter))
            # Remove the 'id' field for the embedding
            row_copy = {k: v for k, v in row.items() if k != 'id'}
            # Convert the remaining data to a JSON string
            doc_text = json.dumps(row_copy)
            
            # Check if the document is too long; if so, split into chunks.
            if len(doc_text) > 8000:
                chunks = split_text(doc_text, max_chunk_length=8000, overlap_ratio=0.1)
                for chunk in chunks:
                    batch_documents.append(chunk)
                    # Create a unique id for each chunk
                    batch_ids.append(f"{doc_id}_{self.id_counter}")
                    batch_metadatas.append({"doc_name": os.path.basename(csv_file_path)})
                    self.id_counter += 1
            else:
                batch_documents.append(doc_text)
                batch_ids.append(doc_id)
                batch_metadatas.append({"doc_name": os.path.basename(csv_file_path)})
                self.id_counter += 1

            # If the batch is full, upsert into the collection in one call.
            if len(batch_documents) >= batch_size:
                embeddings = [self.embedding_model.encode(doc).tolist() for doc in batch_documents]
                self.collection.add(
                    documents=batch_documents,
                    ids=batch_ids,
                    embeddings=embeddings,
                    metadatas=batch_metadatas
                )
                batch_ids = []
                batch_documents = []
                batch_metadatas = []

        # Upsert any remaining documents not in a full batch.
        if batch_documents:
            embeddings = [self.embedding_model.encode(doc).tolist() for doc in batch_documents]
            self.collection.add(
                documents=batch_documents,
                ids=batch_ids,
                embeddings=embeddings,
                metadatas=batch_metadatas
            )
        
        print(f"Finished embedding CSV: {csv_file_path}")

    import asyncio

    async def query_collection(self, query: str, n_results: int = 5) -> dict:
        """Async version of query_collection."""
        query_embedding = await asyncio.to_thread(self.embedding_model.encode, query)
        results = await asyncio.to_thread(self.collection.query, query_embeddings=query_embedding.tolist(), n_results=n_results, include=['documents', 'metadatas', 'distances'])
        return results


In [37]:

class UnifiedPipeline:
    """Orchestrates the entire async pipeline with shared context and memory"""
    def __init__(self, state: State, memory_manager: ImprovedCSVEmbeddingManager):
        self.state = state
        self.memory_manager = memory_manager
        self.context_store = []
        self.results = {}
        
        # Initialize agents
        self.agents = {
            "automl": AsyncAutoMLAgent(
                role="data_scientist",
                model="llama-3.3-70b-versatile",
                description="AutoML expert",
                data_path="data"
            ),
            "model": AsyncModelAgent(
                role="ml_researcher",
                model="llama-3.3-70b-versatile",
                description="Model expert"
            ),
            "ops": AsyncOpsAgent(
                role="mlops",
                model="llama-3.3-70b-versatile",
                description="Deployment expert"
            )
        }

    async def _update_context(self, stage: str, output: str):
        """Update shared context with memory integration"""
        # Store in memory
        await self.memory_manager.embed_csv(output)
        # Update pipeline context
        self.context_store.append({stage: output})
        self.state.update_memory({stage: output})
        self.state.persist_memory()

    async def run_pipeline(self, user_input: str) -> Dict[str, Any]:
        """Execute the full async pipeline with integrated context handling"""
        try:
            # Initial memory query
            initial_context = await self.memory_manager.query_collection(user_input, n_results=3)
            self.context_store = initial_context.get("documents", [])
            
            # Parallel execution of pipeline stages
            tasks = {
                "preprocessing": self.agents["automl"].preprocess_data(
                    user_input, self.context_store
                ),
                "model_selection": self.agents["model"].retrieve_models(
                    "Find top models", self.context_store
                ),
                "deployment": self.agents["ops"].deploy_model(
                    "Deploy best model", self.context_store
                )
            }
            
            # Run all tasks concurrently
            completed = await asyncio.gather(*tasks.values(), return_exceptions=True)
            
            # Process results
            for stage, result in zip(tasks.keys(), completed):
                if isinstance(result, Exception):
                    logger.error(f"Error in {stage}: {str(result)}")
                    continue
                
                await self._update_context(stage, result)
                self.results[stage] = result

            # Generate unified output
            return await self._generate_unified_output()
            
        except Exception as e:
            logger.error(f"Pipeline failed: {str(e)}")
            raise

    async def _generate_unified_output(self) -> str:
        """Create a single coherent output from all pipeline stages"""
        unified = [
            "# Automated ML Pipeline Report",
            "## Context Summary",
            f"Pipeline Phase: {self.state.phase}",
            f"Competition: {self.state.competition}",
            "### Memory Context:",
            *[f"- {ctx}" for ctx in self.context_store[-3:]],
            "\n## Pipeline Execution Details"
        ]
        
        for stage, result in self.results.items():
            unified.extend([
                f"\n### {stage.replace('_', ' ').title()}",
                f"```\n{result}\n```"
            ])
            
        unified.append("\n## Final Recommendations")
        unified.append(await self._generate_summary())
        
        return "\n".join(unified)

    async def _generate_summary(self) -> str:
        """Generate final summary using context"""
        summary_prompt = f"""
        Generate a comprehensive summary of the ML pipeline execution using this context:
        {json.dumps(self.context_store)}
        """
        return await self.agents["automl"].execute([
            {"role": "system", "content": "Summarize the pipeline execution"},
            {"role": "user", "content": summary_prompt}
        ])


In [38]:
# -----------------------------------------------------------------------------
# Usage Example and Main Execution
# -----------------------------------------------------------------------------

async def main():
    # Initialize state.
    state = State(phase="Model Development", competition="MyCompetition")
    
    # Initialize the embedding model.
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    
    # Instantiate the CSV embedding manager.
    memory_manager = ImprovedCSVEmbeddingManager(
        collection_name="auto_ml_memory",
        embedding_model=embedding_model
    )
    
    # Ensure the data directory exists.
    data_dir = "data"
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    
    csv_file_path = os.path.join(data_dir, "renttherunway_cleaned.csv")
    
    # For demonstration, if the CSV doesn't exist, create a dummy CSV.
    if not os.path.exists(csv_file_path):
        df_dummy = pd.DataFrame({
            "id": [1, 2],
            "feature": ["value1", "value2"],
            "label": ["A", "B"]
        })
        df_dummy.to_csv(csv_file_path, index=False)
    
    # Embed dataset if not already embedded.
    if memory_manager.collection.count() == 0:
        memory_manager.embed_csv(csv_file_path)
    
    # Create the pipeline.
    pipeline = UnifiedPipeline(state, memory_manager)
    
    # Define user input.
    user_input = (
        "I have uploaded the Rent the Runway dataset for women's clothing fit prediction. "
        "Develop a model with ≥90% F1 score and deploy as a web service."
    )
    
    # Execute pipeline.
    final_report = await pipeline.run_pipeline(user_input)
    
    # Save the final report as a Markdown file.
    final_md_path = os.path.join(data_dir, "final_report.md")
    with open(final_md_path, "w") as f:
        f.write("## Final Pipeline Report\n")
        f.write(final_report)
        f.write("\n")
    
    print("Pipeline execution completed. Final report saved at:", final_md_path)
    print("Final Report:")
    print(final_report)

if __name__ == "__main__":
    # Allow nested event loops if needed.
    import nest_asyncio
    nest_asyncio.apply()
    asyncio.run(main())

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 1/1 [00:00<00:00, 42.55it/s]
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
ERROR:__main__:Pipeline failed: CSV file not found: **Rent the Runway Dataset Analysis and Model Deployment**

### Dataset Overview

The provided dataset contains information about women's clothing fit, including user ID, item ID, category, size, body type, bust size, height, weight, age, and rating. The dataset is used to predict the fit of clothing items for users.

### Data Preprocessing

```python
import pandas as pd
import numpy as np
from sklearn.model_selection import 

FileNotFoundError: CSV file not found: **Rent the Runway Dataset Analysis and Model Deployment**

### Dataset Overview

The provided dataset contains information about women's clothing fit, including user ID, item ID, category, size, body type, bust size, height, weight, age, and rating. The dataset is used to predict the fit of clothing items for users.

### Data Preprocessing

```python
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data = pd.read_json('[{"fit": "fit", "user_id": 978643, "bust size": "34a", "item_id": 144714, "weight": NaN, "rating": 10.0, "body type": "athletic", "category": "gown", "height": 170.18, "size": 8, "age": 26.0}, \
                    {"fit": "fit", "user_id": 510619, "bust size": "34dd", "item_id": 1833819, "weight": NaN, "rating": 10.0, "body type": "athletic", "category": "gown", "height": 157.48000000000002, "size": 16, "age": 34.0}, \
                    {"fit": "fit", "user_id": 316065, "bust size": "32d", "item_id": 1585757, "weight": 53.523856, "rating": 10.0, "body type": NaN, "category": "gown", "height": 157.48000000000002, "size": 4, "age": 38.0}]')

# Drop rows with missing values
data.dropna(inplace=True)

# Encode categorical variables
le = LabelEncoder()
data['category'] = le.fit_transform(data['category'])
data['body type'] = le.fit_transform(data['body type'].fillna('Unknown'))
data['fit'] = le.fit_transform(data['fit'])

# Split data into training and testing sets
X = data.drop(['fit'], axis=1)
y = data['fit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
```

### Data Augmentation

To improve the model's performance, we can create additional features through data augmentation. One possible approach is to extract features from the user's body type and clothing size.

```python
# Create new features
data['body_type_size'] = data['body type'] * data['size']
data['height_weight_ratio'] = data['height'] / data['weight']
```

### Model Selection and Training

To achieve an F1 score of ≥90%, we can use a random forest classifier with hyperparameter tuning.

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define hyperparameter tuning space
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}

# Perform hyperparameter tuning
rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

# Train the model with the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)
```

### Model Evaluation

```python
from sklearn.metrics import f1_score

# Evaluate the model on the test set
y_pred = best_model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='macro')
print(f'F1 score: {f1:.3f}')
```

### Model Deployment

To deploy the model as a web service, we can use Flask and create a RESTful API.

```python
from flask import Flask, request, jsonify
import pickle

app = Flask(__name__)

# Load the trained model
with open('best_model.pkl', 'rb') as f:
    best_model = pickle.load(f)

# Define the API endpoint
@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json()
    # Preprocess the input data
    input_data = pd.DataFrame([data])
    input_data = input_data.drop(['fit'], axis=1)
    input_data['category'] = le.transform(input_data['category'])
    input_data['body type'] = le.transform(input_data['body type'].fillna('Unknown'))
    # Make predictions
    predictions = best_model.predict(input_data)
    return jsonify({'prediction': predictions.tolist()})

if __name__ == '__main__':
    app.run(debug=True)
```

**Example Use Case**

To use the deployed model, send a POST request to the `/predict` endpoint with the input data in JSON format.

```bash
curl -X POST -H "Content-Type: application/json" -d '{"user_id": 978643, "item_id": 144714, "category": "gown", "size": 8, "body type": "athletic", "height": 170.18, "weight": 53.523856, "age": 26.0}' http://localhost:5000/predict
```

This should return a JSON response with the predicted fit.

```json
{
    "prediction": [1]
}
```