# Test Dask Kubernetes with Hugging Face Dataset

In [None]:
# Install required libraries
!pip install dask-kubernetes datasets dask

## Step 1: Import Required Libraries

In [1]:

from dask_kubernetes.operator import KubeCluster
from dask.distributed import Client
from datasets import load_dataset
import logging

# Logger used by every cell in this notebook, named after the current module.
logger = logging.getLogger(__name__)

# Configure the root logger at INFO level.
logging.basicConfig(level=logging.INFO)

  from dask_kubernetes import KubeCluster, make_pod_spec


## Step 2: Create Dask Kubernetes Cluster

In [None]:
# Resource requests and limits for the cluster's pods: 1 CPU and 1 GiB of
# memory, with requests equal to limits.
worker_resources = {
    "requests": {"cpu": "1", "memory": "1Gi"},
    "limits": {"cpu": "1", "memory": "1Gi"},
}

# Create a Dask Kubernetes cluster with 3 workers.
# The KubeCluster imported from dask_kubernetes.operator is configured through
# keyword arguments (name, image, resources, n_workers) instead of a pod spec
# built with make_pod_spec, which this notebook never imports.
cluster = KubeCluster(
    name="hf-dataset-test",
    namespace="default",
    image="my-custom-dask-image:latest",  # Replace with your Docker image
    resources=worker_resources,
    n_workers=3,
)

# Connect a Dask client to the cluster
client = Client(cluster)
logger.info(f"Dask scheduler dashboard: {client.dashboard_link}")

## Step 3: Load Hugging Face Dataset and Process in Batches

In [None]:
# Hugging Face dataset to load and the number of rows handled per batch
dataset_name = "imdb"
batch_size = 1000

logger.info(f"Loading dataset '{dataset_name}'...")
dataset = load_dataset(dataset_name, split="train")

# Walk the dataset in consecutive slices of at most `batch_size` rows.
# NOTE: this loop runs in the notebook process; it only selects each slice and
# logs its size, and does not submit any work through the Dask client.
logger.info(f"Processing dataset in batches of {batch_size} rows...")
total_rows = len(dataset)
batch_starts = range(0, total_rows, batch_size)
for batch_number, start in enumerate(batch_starts, start=1):
    # The final slice may be shorter than batch_size.
    stop = min(start + batch_size, total_rows)
    batch = dataset.select(range(start, stop))
    logger.info(f"Processing batch {batch_number}: {len(batch)} rows")

logger.info("Dataset processing completed.")

## Step 4: Clean Up Cluster

In [None]:
# Clean up the cluster.
# Close the client first, but always attempt to close the cluster as well:
# if client.close() raises, the cluster would otherwise be left running.
try:
    client.close()
finally:
    cluster.close()
logger.info("Cluster resources released.")