<a href="https://colab.research.google.com/github/NeetishPathak/colab-notebooks/blob/main/final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's Python dependencies into the running kernel's environment.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different releases.
%pip install requests crewai crewai[tools] openai termcolor

In [None]:
# Download Redis.pdf into the working directory; the PDF search tool created later reads this file.
!wget https://raw.githubusercontent.com/NeetishPathak/dbops-agentic-crew/main/app/resources/pdfs/Redis.pdf -O Redis.pdf

In [None]:
import os
from google.colab import userdata

# Expose the OpenAI key stored as a Colab secret through the process environment.
os.environ.update({'OPENAI_API_KEY': userdata.get('OPENAI_API_KEY')})

### GCP Setup for Agents to use later

In [None]:
## A GCP VM is provisioned later in the notebook; this cell prepares the runtime for that.

# Install the Google Cloud CLI from Google's apt repository.
!apt-get install -y lsb-release
# Register Google's package signing key with apt.
!curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Add the cloud-sdk repository. NOTE(review): `tee -a` appends, so every re-run of this
# cell adds the same line to google-cloud-sdk.list again.
!echo "deb http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
# Refresh the package index so the new repository is visible, then install the CLI.
!apt-get update -q
!apt-get install -y google-cloud-sdk


In [None]:
## Set up Terraform from HashiCorp's apt repository
# Fetch HashiCorp's signing key and store it de-armored as a keyring file.
# `--yes` lets gpg overwrite an existing keyring instead of prompting, so the cell can be re-run.
!wget -O - https://apt.releases.hashicorp.com/gpg | gpg --dearmor --yes -o /usr/share/keyrings/hashicorp-archive-keyring.gpg
# Register the repository for this machine's architecture and Ubuntu codename (tee overwrites the list file).
!echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(grep -oP '(?<=UBUNTU_CODENAME=).*' /etc/os-release || lsb_release -cs) main" | tee /etc/apt/sources.list.d/hashicorp.list
# `-y` answers any confirmation prompt so the install cannot stall under "Run all".
!apt update && apt install -y terraform
# Print the installed version as a quick sanity check.
!terraform version

In [None]:
# Authenticate the user to use GCP.
# Runs Colab's user authentication flow; presumably this is what lets the later
# gcloud commands act on the user's account — TODO confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Read the target project id from the Colab secret MY_GCP_PROJECT.
# NOTE: `userdata` is imported in the earlier OpenAI-key cell, so that cell must run first.
my_gcp_project_name = userdata.get('MY_GCP_PROJECT')
# Make it the active project for subsequent gcloud commands.
!gcloud config set project {my_gcp_project_name}

# Enable the Compute Engine API on that project.
!gcloud services enable compute.googleapis.com

In [None]:
# Create the `terraform-agent` service account.
# NOTE(review): re-running this cell repeats the create call for an account that already exists.
!gcloud iam service-accounts create terraform-agent --display-name="Terraform Agent"

# Grant the service account each role listed below on the project.
for role in [
    "roles/compute.admin",
    "roles/iam.serviceAccountUser",
    "roles/serviceusage.serviceUsageAdmin"
]:
    # One policy binding per role; `--quiet` suppresses interactive prompts.
    !gcloud projects add-iam-policy-binding {my_gcp_project_name} \
        --member="serviceAccount:terraform-agent@{my_gcp_project_name}.iam.gserviceaccount.com" \
        --role="{role}" --quiet

# Create a key for the service account and save it as terraform-agent-key.json in the
# working directory (GCPVMTool reads this file). The file is a credential: do not commit
# or share it. NOTE(review): each run of this command requests a new key.
!gcloud iam service-accounts keys create terraform-agent-key.json \
    --iam-account=terraform-agent@{my_gcp_project_name}.iam.gserviceaccount.com

## CrewAI Agentic Workflow

In [None]:
# Terraform Config + CrewAI Tooling: Provision GCP VM

# --- Terraform Config (main.tf) ---
# Template text that GCPVMTool writes to terraform/main.tf.
# It configures the google provider from variables and declares one
# google_compute_instance ("docker_vm") booting the ubuntu-2204-lts image with a
# boot disk `size` of 30, attached to the "default" network with an empty
# access_config block.
main_tf = """
provider "google" {
  project     = var.project
  region      = var.region
  zone        = var.zone
  credentials = file(var.credentials_file)
}

resource "google_compute_instance" "docker_vm" {
  name         = var.instance_name
  machine_type = var.machine_type
  zone         = var.zone

  boot_disk {
    initialize_params {
      image = "ubuntu-os-cloud/ubuntu-2204-lts"
      size  = 30
    }
  }

  network_interface {
    network = "default"
    access_config {}
  }
}
"""

# --- Terraform Variables (variables.tf) ---
# Template text that GCPVMTool writes to terraform/variables.tf.
# `project` has no default and must be supplied; the others carry defaults.
# NOTE: GCPVMTool also writes a terraform.tfvars that sets machine_type to
# "e2-medium", which differs from the "e2-micro" default declared here.
variables_tf = """
variable "project" {}
variable "region" { default = "us-central1" }
variable "zone" { default = "us-central1-a" }
variable "credentials_file" { default = "account.json" }
variable "instance_name" { default = "docker-agent-vm" }
variable "machine_type" { default = "e2-micro" }
"""

# --- CrewAI Tool: Terraform VM Creator ---
import subprocess
import os, time, stat
from crewai.tools import BaseTool

class GCPVMTool(BaseTool):
    """CrewAI tool that provisions the `docker-agent-vm` instance on GCP via Terraform.

    Every run rebuilds a local `terraform/` working directory from the
    module-level `main_tf` / `variables_tf` templates, copies the service-account
    key next to them, writes `terraform.tfvars`, and then runs `terraform init`,
    `terraform plan -detailed-exitcode` and, when the plan reports pending
    changes, `terraform apply -auto-approve`.
    """

    def __init__(self, name="GCPVMTool", description="Creates a GCP VM using Terraform if it doesn't already exist", **kwargs):
        super().__init__(name=name, description=description, **kwargs)

    def _run(self, input_type: dict) -> str:
        """Create the VM unless Terraform reports that it already exists.

        Args:
            input_type: Tool input supplied by the agent; this method does not read it.

        Returns:
            A status message starting with ✅ on success or ❌ on failure. Errors
            are reported through the return value rather than raised.
        """
        try:
            # Imported here because this cell's own imports do not include shutil.
            import shutil
            from google.colab import userdata  # Only available in Colab
            tf_path = "terraform"

            def remove_readonly(func, path, _):
                # Change the file to be writable, then retry the failed operation.
                os.chmod(path, stat.S_IWRITE)
                func(path)

            # Start from a clean working directory. Only attempt the removal when the
            # directory exists: on a fresh runtime rmtree would otherwise hand the
            # "not found" error to remove_readonly, whose chmod fails too, and the
            # tool would report a failure before doing any work.
            # NOTE(review): wiping the directory also discards any Terraform state
            # kept in it, which is presumably why an existing VM is detected below
            # through the apply error text.
            if os.path.isdir(tf_path):
                try:
                    shutil.rmtree(tf_path, onerror=remove_readonly)
                except Exception as e:
                    return f"❌ Failed to remove existing terraform directory: {e}"

            os.makedirs(tf_path, exist_ok=True)

            # Write Terraform files
            with open(os.path.join(tf_path, "main.tf"), "w") as f:
                f.write(main_tf)
            with open(os.path.join(tf_path, "variables.tf"), "w") as f:
                f.write(variables_tf)

            project_id = userdata.get("MY_GCP_PROJECT")

            if not project_id:
                return "❌ Missing required secrets: MY_GCP_PROJECT"

            # Load the service-account key created earlier in the notebook.
            key_file_path = "terraform-agent-key.json"

            if not os.path.exists(key_file_path):
                return f"❌ Key file '{key_file_path}' does not exist."

            with open(key_file_path, "r") as f:
                key_data = f.read().strip()

            # Cheap sanity check only: the content must at least look like a JSON object.
            if not key_data or not key_data.startswith('{'):
                return "❌ Key file is empty or not a valid JSON key."

            # Terraform reads the credentials relative to its working directory.
            credentials_path = os.path.join(tf_path, "account.json")
            with open(credentials_path, "w") as f:
                f.write(key_data)

            # Create terraform.tfvars
            with open(os.path.join(tf_path, "terraform.tfvars"), "w") as f:
                f.write(f"""
project = "{project_id}"
region = "us-central1"
zone = "us-central1-a"
credentials_file = "account.json"
instance_name = "docker-agent-vm"
machine_type = "e2-medium"
""")

            # No check=True here: a failing init must reach the branch below so its
            # stdout/stderr are returned instead of a bare CalledProcessError.
            init_proc = subprocess.run(["terraform", "init"], cwd=tf_path, capture_output=True, text=True)
            if init_proc.returncode != 0:
                return f"❌ Terraform init failed:\nSTDOUT:\n{init_proc.stdout}\nSTDERR:\n{init_proc.stderr}"

            # Plan with exit code check: 0 = no changes, 1 = error, 2 = changes pending.
            plan_proc = subprocess.run(["terraform", "plan", "-detailed-exitcode"], cwd=tf_path, capture_output=True, text=True)

            if plan_proc.returncode == 0:
                return "✅ VM already exists — Terraform reports no changes needed."
            elif plan_proc.returncode == 1:
                return f"❌ Terraform plan failed:\n{plan_proc.stderr}"
            elif plan_proc.returncode == 2:
                # Proceed to apply
                apply_proc = subprocess.run(["terraform", "apply", "-auto-approve"], cwd=tf_path, capture_output=True, text=True)
                # Checked before the return code: an "already exists" error from the
                # apply is treated as success.
                if "already exists" in apply_proc.stderr.lower():
                    return "✅ VM already exists — Terraform reports no changes needed."
                if apply_proc.returncode != 0:
                    return f"❌ Terraform apply failed:\n{apply_proc.stderr}"
                return "✅ VM created successfully via Terraform."

            return "❌ Unknown terraform plan exit code — something went wrong."

        except subprocess.CalledProcessError as e:
            return f"❌ Terraform failed: {e}"
        except Exception as e:
            return f"❌ Unexpected error: {str(e)}"

In [None]:
import subprocess
import os, time
from crewai.tools import BaseTool

class DockerInstallTool(BaseTool):
    """CrewAI tool that makes sure Docker is installed on the `docker-agent-vm` instance.

    All work happens on the VM through `gcloud compute ssh`; nothing is installed
    in the notebook runtime itself.
    """

    def __init__(self, name="DockerInstaller", description="Installs and validates Docker on an existing GCP VM.", **kwargs):
        super().__init__(name=name, description=description, **kwargs)

    def _run(self, input_type: dict) -> str:
        """Install Docker on the VM unless `docker --version` already succeeds there.

        Args:
            input_type: Tool input supplied by the agent; this method does not read it.

        Returns:
            A status message starting with ✅ on success or ❌ on failure. Errors
            are reported through the return value rather than raised.
        """
        vm_name = "docker-agent-vm"
        zone = "us-central1-a"
        try:
            # Check if Docker is already installed. `--quiet` matches the other SSH
            # calls in this notebook and keeps gcloud from waiting on a prompt.
            check_cmd = [
                "gcloud", "compute", "ssh", vm_name, "--zone", zone,
                "--command", "docker --version", "--quiet"
            ]
            result = subprocess.run(check_cmd, capture_output=True, text=True)
            if result.returncode == 0:
                return f"✅ Docker already installed: {result.stdout.strip()}"

            # Install Docker with the convenience script, then add the login user to
            # the `docker` group so later non-sudo `docker` / `kind` commands (run in
            # new SSH sessions) can reach the daemon. `$USER` is expanded on the VM:
            # the command is passed as a single argv element, not through a local shell.
            install_cmd = [
                "gcloud", "compute", "ssh", vm_name, "--zone", zone,
                "--command",
                "curl -fsSL https://get.docker.com | sh && sudo usermod -aG docker \"$USER\"",
                "--quiet"
            ]
            install_result = subprocess.run(install_cmd, capture_output=True, text=True)
            if install_result.returncode != 0:
                return f"❌ Docker installation failed:\n{install_result.stderr}"

            return "✅ Docker installed successfully."
        except Exception as e:
            return f"❌ Docker installation error: {str(e)}"

In [None]:
import subprocess
from crewai.tools import BaseTool

class VMReadinessValidator(BaseTool):
    """CrewAI tool that checks the `docker-agent-vm` instance is RUNNING and accepts SSH."""

    def __init__(self, name="VMReadinessValidator", description="Checks if the GCP VM is running and SSH-ready.", **kwargs):
        super().__init__(name=name, description=description, **kwargs)

    def _run(self, input_type: dict) -> str:
        """Query the instance status, wait briefly, then try a trivial SSH command.

        Args:
            input_type: Tool input supplied by the agent; this method does not read it.

        Returns:
            A status message starting with ✅ when the VM is running and SSH works,
            or ❌ with the captured error output otherwise. Nothing is raised.
        """
        try:
            # Imported here because this cell's own imports do not include `time`.
            import time

            vm_name = "docker-agent-vm"
            zone = "us-central1-a"

            # Check VM status. subprocess.run with captured output is used instead of
            # check_output so that gcloud's stderr is available for the error message
            # (check_output leaves `CalledProcessError.stderr` as None).
            status_cmd = [
                "gcloud", "compute", "instances", "describe", vm_name,
                "--zone", zone,
                "--format=value(status)"
            ]
            status_proc = subprocess.run(status_cmd, capture_output=True, text=True)
            if status_proc.returncode != 0:
                return f"❌ Error checking VM readiness: {status_proc.stderr}"

            status = status_proc.stdout.strip()
            if status != "RUNNING":
                return f"❌ VM exists but is not ready (status: {status})"

            # Wait buffer before SSH attempt: a VM can report RUNNING before it
            # accepts SSH connections.
            wait_time = 10  # seconds
            print(f"⏳ Waiting {wait_time} seconds before checking VM SSH readiness...")
            time.sleep(wait_time)

            # Try a lightweight SSH check
            ssh_check_cmd = [
                "gcloud", "compute", "ssh", vm_name,
                "--zone", zone,
                "--command", "echo READY",
                "--quiet"
            ]
            result = subprocess.run(ssh_check_cmd, capture_output=True, text=True)
            if result.returncode != 0 or "READY" not in result.stdout:
                return f"❌ VM is running but SSH failed:\n{result.stderr}"

            return "✅ VM is running and SSH-ready."

        except Exception as e:
            return f"❌ Unexpected error during readiness check: {str(e)}"

In [None]:
# Custom CrewAI Tools
import subprocess
from crewai.tools import BaseTool

class KindInstallerTool(BaseTool):
    """CrewAI tool that installs the `kind` binary on the `docker-agent-vm` instance over SSH."""

    def __init__(self, name="KindInstaller", description="Installs Kind on a GCP VM via SSH", **kwargs):
        super().__init__(name=name, description=description, **kwargs)

    def _run(self, input_type: dict) -> str:
        """Install kind v0.29.0 on the VM unless it is already present.

        Args:
            input_type: Tool input supplied by the agent; this method does not read it.

        Returns:
            A status message (✅ / ⚠️ / ❌). Errors are returned, not raised.
        """
        target_vm = "docker-agent-vm"
        target_zone = "us-central1-a"

        def over_ssh(remote_command):
            # Run one command on the VM and capture its output as text.
            return subprocess.run(
                [
                    "gcloud", "compute", "ssh", target_vm, "--zone", target_zone,
                    "--command", remote_command,
                    "--quiet",
                ],
                capture_output=True,
                text=True,
            )

        try:
            # Already installed? Then report the version and stop.
            probe = over_ssh("which kind && kind --version")
            if probe.returncode == 0:
                return f"✅ Kind is already installed:\n{probe.stdout.strip()}"

            # Download the release binary, make it executable, move it onto PATH.
            download = over_ssh(
                "curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.29.0/kind-linux-amd64 && chmod +x ./kind && sudo mv ./kind /usr/local/bin/kind"
            )
            if download.returncode != 0:
                return f"❌ Failed to install Kind:\n{download.stderr}"

            # Confirm the freshly installed binary actually runs.
            confirm = over_ssh("kind --version")
            if confirm.returncode != 0:
                return f"⚠️ Kind install ran, but verification failed:\n{confirm.stderr}"

            return f"✅ Kind installed successfully:\n{confirm.stdout.strip()}"

        except Exception as e:
            return f"❌ Kind installation error: {str(e)}"

In [None]:
from crewai.tools import BaseTool
import subprocess

class KindClusterDeployerTool(BaseTool):
    """CrewAI tool that creates the `redis-cluster` Kind cluster on the `docker-agent-vm` instance."""

    def __init__(self, name="KindClusterDeployer", description="Creates a Kubernetes cluster using Kind on a GCP VM.", **kwargs):
        super().__init__(name=name, description=description, **kwargs)

    def _run(self, input_type: dict) -> str:
        """Write a Kind config locally, copy it to the VM, and run `kind create cluster`.

        The config requests one control-plane node and two worker nodes.

        Args:
            input_type: Tool input supplied by the agent; this method does not read it.

        Returns:
            A status message starting with ✅ on success or ❌ on failure. Errors
            are reported through the return value rather than raised.
        """
        try:
            # The content (including its leading indentation) is written verbatim.
            kind_config = """
          kind: Cluster
          apiVersion: kind.x-k8s.io/v1alpha4
          nodes:
            - role: control-plane
            - role: worker
            - role: worker
          """
            with open("kind-config.yaml", "w") as f:
                f.write(kind_config)

            # Step 1: Copy kind config to VM
            scp_config = subprocess.run([
                "gcloud", "compute", "scp", "kind-config.yaml",
                "docker-agent-vm:~/kind-config.yaml",
                "--zone", "us-central1-a",
                "--quiet"
            ], capture_output=True, text=True)
            # Stop here if the upload failed; otherwise `kind create` would fail
            # later with a less helpful "config not found" style error.
            if scp_config.returncode != 0:
                return f"❌ Failed to copy kind-config.yaml to VM:\n{scp_config.stderr}"

            # Step 2: Create the cluster from the uploaded config
            cluster_name = "redis-cluster"
            create_cmd = [
                "gcloud", "compute", "ssh", "docker-agent-vm",
                "--zone", "us-central1-a",
                "--command", f"echo '☸️ Creating Kind cluster'; kind create cluster --name {cluster_name} --config kind-config.yaml",
                "--quiet"
            ]
            result = subprocess.run(create_cmd, capture_output=True, text=True)

            if result.returncode != 0:
                return f"❌ Failed to create Kind cluster:\n{result.stderr}"

            return f"✅ Kind cluster '{cluster_name}' created successfully:\n{result.stdout}"
        except Exception as e:
            return f"❌ Error during Kind cluster creation: {str(e)}"

In [None]:
from crewai.tools import BaseTool
import subprocess

class RedisDeployerTool(BaseTool):
    """CrewAI tool that uploads `redis.yaml` to the VM and applies it with kubectl."""

    def __init__(self, name="RedisDeployer", description="Deploys a Redis pod/service using redis.yaml into Kind cluster.", **kwargs):
        super().__init__(name=name, description=description, **kwargs)

    def _run(self, input_type: dict) -> str:
        """Copy the local `redis.yaml` to the VM's home directory and `kubectl apply` it.

        Args:
            input_type: Tool input supplied by the agent; this method does not read it.

        Returns:
            A status message starting with ✅ on success or ❌ on failure. Errors
            are reported through the return value rather than raised.
        """
        gcloud_target = ("--zone", "us-central1-a", )
        try:
            # Upload the manifest from the notebook's working directory.
            upload = subprocess.run(
                ["gcloud", "compute", "scp", "redis.yaml", "docker-agent-vm:~/redis.yaml",
                 *gcloud_target, "--quiet"],
                capture_output=True,
                text=True,
            )
            if upload.returncode:
                return f"❌ Failed to copy redis.yaml to VM:\n{upload.stderr}"

            # Apply the uploaded manifest on the VM.
            deploy = subprocess.run(
                ["gcloud", "compute", "ssh", "docker-agent-vm",
                 *gcloud_target,
                 "--command", "echo '🔴 Deploying Redis to Kind'; kubectl apply -f redis.yaml",
                 "--quiet"],
                capture_output=True,
                text=True,
            )
            if deploy.returncode:
                return f"❌ Failed to apply redis.yaml:\n{deploy.stderr}"

            return f"✅ Redis deployed successfully:\n{deploy.stdout}"

        except Exception as e:
            return f"❌ Error during Redis deploy: {str(e)}"


In [None]:
# Tools List

# Crew AI Tools

from crewai_tools import FileWriterTool, PDFSearchTool
from pathlib import Path
import os
import platform
import subprocess
import shutil
import urllib.request

# File Write Tools
# Given to the manifest writer agent so it can write files.
file_writer_tool = FileWriterTool()

# PDF Search Tool
# Built over the Redis.pdf downloaded at the top of the notebook; given to the architect agent.
pdf_file = "Redis.pdf"
pdf_rag_search_tool = PDFSearchTool(pdf_file)


# Custom Tools
# One instance per tool class defined in the cells above; each `description`
# overrides the default declared in the class's __init__.
# NOTE(review): `kind_k8s_creator_tool` is created here but is not attached to any
# agent defined below, and RedisDeployerTool is defined above but never instantiated.
gcp_vm_tool = GCPVMTool(description="Deploys a VM in GCP.")
gcp_vm_readiness_tool = VMReadinessValidator(description="Validates that VM is ready for deployment")
gcp_docker_tool = DockerInstallTool(description="Installs Docker on an existing GCP VM.")
kind_installer_tool = KindInstallerTool(description="Installs a Kind Tool for Kubernetes deployments.")
kind_k8s_creator_tool = KindClusterDeployerTool(description="Creates a Kubernetes cluster using Kind on a GCP VM.")

In [None]:
# CrewAI Agents

from crewai import Agent

## Agents
# Designs the data-store cluster, answering only from the PDF search tool.
# `{data_store}` and `{pdf_file}` are placeholders filled from the kickoff inputs.
architect = Agent(
  role="{data_store} System Architect",
  goal="Design optimal {data_store} cluster configurations tailored to user workloads and environments.",
  # Adjacent string literals are concatenated, so each piece ends with a space
  # to keep sentences from running together.
  backstory=("You are a principal system architect specializing in distributed data stores, particularly {data_store}. "
        "You are an expert in {data_store} internals and Kubernetes-ready deployments. "
        "Use the tool to get the guidance on redis configuration. Do not use external knowledge beyond what is contained in the pdf file {pdf_file} "
        "Pass your question as a plain string to the 'query' parameter — for example: "
        "{ \"query\": \"optimal Redis config for 100 ops/sec workload\" }. "
        "Do not include 'description' or 'type' keys. Do not wrap the string inside another dictionary. "
        "Just say, your capacity is limited to certain use cases only at the moment if the query is outside  the scope of the pdf."),
  verbose=False,
  tools=[pdf_rag_search_tool],
  allow_delegation=False
)


# Writes the generated manifest to disk with the file writer tool.
manifest_writer_agent = Agent(
  role="{data_store} manifest Writer",
  goal="Vets and writes yaml configurations to the target directory.",
  backstory=("You are a dedicated writer agent specializing in {data_store} configurations. "
        "You leverage the File Writer Tool to create and update configuration files as needed."),
  verbose=False,
  tools=[file_writer_tool],
  allow_delegation=False,
)

# Provisions the VM through GCPVMTool.
infra_vm_agent = Agent(
    role="GCP VM Creator ",
    goal="Provision a GCP VM using Terraform.",
    backstory=("You are an experienced infra admin responsible for setting up compute vm on Google Cloud Platform using terraforms. "
              "You ensure the vm is properly setup"
              ),
    verbose=False,
    tools=[gcp_vm_tool],
    allow_delegation=False,
    # `max_iter` is the Agent field that caps reasoning/tool iterations.
    max_iter=5
)

# Confirms the VM is running and reachable over SSH.
infra_vm_readiness_agent = Agent(
    role="GCP VM Readiness Agent ",
    goal="Checks the readiness of a GCP VM for ssh and future deployments",
    backstory=("You are an experienced infra admin responsible for validating the ssh readiness of a compute vm on Google Cloud Platform "
              "You ensure the vm is properly running and ready to ssh to install other tools"
              ),
    verbose=False,
    tools=[gcp_vm_readiness_tool],
    allow_delegation=False,
    max_iter=5
)

# Installs Docker on the VM.
infra_docker_agent = Agent(
    role="Docker Installer",
    goal="Provision Docker on a GCP VM.",
    backstory=("You are an experienced infra admin responsible for installing docker on a GCP VM. "
              "You ensure docker is installed which is important for kind k8s cluster to be installed later"
              ),
    verbose=False,
    tools=[gcp_docker_tool],
    allow_delegation=False
)


# Installs the kind binary on the VM.
infra_k8s_kind_agent = Agent(
  role="kind cluster installer",
  goal="Installs the kind cluster setup and ensure the environment is ready for K8s cluster creation deployment.",
  backstory=("You are an experienced infra admin responsible for setting up and managing the kind cluster. "
        "You ensure that the environment is properly configured and ready for deployment of {data_store} configurations."),
  verbose=False,
  tools=[kind_installer_tool],
  allow_delegation=False
)




In [None]:
from crewai import Task

## Tasks
# `{...}` placeholders in the descriptions are filled from the inputs passed to crew.kickoff.

# ----------------------------------------------------
# Define the architect's task to suggest a cluster design
# ----------------------------------------------------
# The description pins an exact response layout: summary, key decisions, then one YAML block.
design_cluster = Task(
    description=(
        "Design an optimal {data_store} architecture for this workload:\n"
        "{workload_description}\n\n"
        "Respond in this exact format:\n"
        "---\n"
        "Architecture Summary:\n<brief text>\n\n"
        "---\n"
        "Key Decisions:\n- <bullet1>\n- <bullet2>\n\n"
        "```yaml\n<valid Kubernetes YAML>\n```\n"
        "Do not add any extra commentary."
    ),
    expected_output="A summary, key decisions, and a valid Kubernetes YAML block.",
    agent=architect,
)

# ----------------------------------------------------
# Define the writer's task to write the manifest file
# ----------------------------------------------------
# Receives the architect's output as context and writes it to
# `{target_directory}/{data_store}.yaml`.
write_manifest = Task(
    description=(
        "Validate the Kubernetes YAML and write it to `{target_directory}/{data_store}.yaml`.\n"
        "Use the File Writer Tool.\n"
        "If the file exists, overwrite it.\n"
        "**Important**: The YAML must be valid and ready for `kubectl apply`."
    ),
    expected_output="Confirmation that the YAML was successfully written.",
    agent=manifest_writer_agent,
    context=[design_cluster]
)

# ----------------------------------------------------
# Define the Infra's setup task to create a vm
# ----------------------------------------------------
# NOTE(review): the description asks the agent to pass a directory string, but
# GCPVMTool._run ignores its input and always uses the "terraform" directory.
infra_vm_setup = Task(
    description=(
        "Set up a vm on gcp \n"
        "Use the infra_vm_agent Tool. Pass a directory as a string to store terraform configurations\n"
        "If already set up, confirm readiness.\n"
    ),
    expected_output="Confirmation that the vm is created and ready",
    agent=infra_vm_agent,
)

# ----------------------------------------------------
# Define the Infra's setup task to validate readiness of a vm
# ----------------------------------------------------
# Takes the VM setup task's output as context.
infra_vm_readiness_setup = Task(
    description=(
        "Check status of a vm on gcp as running and it should be able to ssh\n"
        "If set up correctly, confirm readiness.\n"
    ),
    expected_output="Confirmation that the vm is running , can ssh and ready for docker installation",
    agent=infra_vm_readiness_agent,
    context=[infra_vm_setup]
)

# ----------------------------------------------------
# Define the Infra's setup task to install docker on vm
# ----------------------------------------------------
# Takes the readiness task's output as context.
infra_docker_setup = Task(
    description=(
        "Install and validate docker on a gcp vm \n"
        "If already set up, confirm readiness. Check docker is ready after installation is complete\n"
    ),
    expected_output="Confirmation that the vm with docker is ready for kind deployment.",
    agent=infra_docker_agent,
    context=[infra_vm_readiness_setup]
)


# ----------------------------------------------------
# Define the Infra's setup task to install the kind tool
# ----------------------------------------------------
# Takes the Docker task's output as context. This task only installs the kind
# binary (via KindInstallerTool); it does not create a cluster.
infra_k8s_kind_setup = Task(
    description=(
        "Installs kind tool for deploying {data_store}.\n"
        "Use the Kind Installer Tool.\n"
        "If already set up, confirm readiness.\n"
        "Provide access instructions if setup is successful."
    ),
    expected_output="Confirmation that the kind is installed and ready for k8s deployment.",
    agent=infra_k8s_kind_agent,
    context=[infra_docker_setup]
)

In [None]:
from crewai import Crew, Process

## Crew
# Only the infrastructure pipeline is wired in: `architect`, `manifest_writer_agent`,
# `design_cluster` and `write_manifest` are defined above but are not part of this crew.
infra_agents = [
    infra_vm_agent,
    infra_vm_readiness_agent,
    infra_docker_agent,
    infra_k8s_kind_agent,
]

# Listed in pipeline order: VM creation, readiness check, Docker install, Kind install.
infra_tasks = [
    infra_vm_setup,
    infra_vm_readiness_setup,
    infra_docker_setup,
    infra_k8s_kind_setup,
]

crew = Crew(
    agents=infra_agents,
    tasks=infra_tasks,
    process=Process.sequential,
    verbose=True,
)

In [None]:
import os
from pathlib import Path
from termcolor import colored
import warnings

# Silence pydantic UserWarnings so they do not clutter the crew's output.
warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")

# Workload text substituted into the `{workload_description}` placeholder.
workload_description = "A low-volume website. " + "It handles 100 ops/sec with 80% reads, 20% writes. "

# Values substituted into the `{placeholder}` fields of the agent and task definitions.
inputs = dict(
    data_store="redis",
    workload_description=workload_description,
    target_directory="k8s_configs",
    target_file="redis.yaml",
    pdf_file="Redis.pdf",
    terraform_dir="terraform",
)

print("\nKick off the crew process with the provided inputs.\n")
print(colored("🚀 Crew execution started...\n", "yellow"))

# Run the crew's tasks; the return value is not kept.
crew.kickoff(inputs=inputs)

print(colored("\n🧠 All Tasks Finished  ", "green"))

## Validation Steps

In [None]:
import subprocess

def run_test_step(step_name: str, ssh_command: str, icon: str = "🔹",
                  vm_name: str = "docker-agent-vm", zone: str = "us-central1-a"):
    """Run one validation command on the VM over SSH and report the outcome.

    The remote command first echoes a banner (`icon` + `step_name`) and then runs
    `ssh_command`. Captured stdout is always printed; stderr is printed only on
    failure.

    Args:
        step_name: Human-readable label for the step.
        ssh_command: Shell command to execute on the VM (passed through as-is).
        icon: Emoji or marker printed in front of the label.
        vm_name: Compute Engine instance to connect to.
        zone: Zone of that instance.

    Raises:
        RuntimeError: If the gcloud/SSH command exits with a non-zero status.
    """
    # Local import: this cell only imports subprocess.
    import shlex

    # Quote the banner for the remote shell so a label containing an apostrophe or
    # other shell metacharacters cannot break (or alter) the remote command.
    banner = shlex.quote(f"{icon} {step_name}")
    full_command = [
        "gcloud", "compute", "ssh", vm_name,
        "--zone", zone,
        "--command", f"echo {banner}; {ssh_command}",
        "--quiet"
    ]
    print(f"\n{icon} Running: {step_name}")
    print("-" * 60)

    result = subprocess.run(full_command, capture_output=True, text=True)
    print(result.stdout)
    if result.returncode != 0:
        print(result.stderr)
        raise RuntimeError(f"❌ Step failed: {icon} {step_name}")
    else:
        print(f"✅ Step passed: {icon} {step_name}")
    print("-" * 60)
    print("\n")

# Validation steps as (label, remote command, icon), executed in this order.
# The sequence stops at the first failing step.
validation_steps = [
    ("Checking VM OS version", "uname -a", "💻"),
    ("Checking Docker version", "docker --version", "🐳"),
    ("Checking Docker status", "docker ps", "📦"),
    ("Checking Kind version", "kind --version", "☸️"),
    ("Running Redis container", "docker run -d -p 6379:6379 --name redis redis", "🔴"),
    ("Checking running containers (should include Redis)", "docker ps", "🔍"),
    ("Cleaning up Redis container", "docker stop redis && docker rm redis", "🧹"),
]

try:
    for label, remote_cmd, marker in validation_steps:
        run_test_step(label, remote_cmd, marker)

    print("\n🎉 All validation steps passed successfully! 🚀")
    print("You're ready to build with 🐳 Docker, ☸️ Kind, and 🔴 Redis on GCP 💻")

except RuntimeError as e:
    # run_test_step raises RuntimeError for a failed step; report it without a traceback.
    print(f"\n🚨 Validation failed:\n{e}")
    print("Please check the above output and rerun once resolved. ❗")


In [None]:
# 🔄 Optional: Remove the vm
# Deletes the docker-agent-vm instance directly with gcloud (not through Terraform);
# `--quiet` skips the confirmation prompt.
!gcloud compute instances delete docker-agent-vm --zone us-central1-a --quiet

In [None]:
# Create redis.yaml in the working directory if it does not exist yet (as an empty file).
# NOTE(review): presumably a placeholder for the file RedisDeployerTool uploads — confirm.
!touch redis.yaml