Skip to content
This repository was archived by the owner on Jan 29, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 16 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ CLUSTER_NAME ?= mycluster
PUBLIC_IP ?= 127.0.0.1
CI_MINIKUBE_VERSION ?= v0.25.1
CI_KUBECTL_VERSION ?= v1.9.4
NAMESPACE ?= default

AWS_ACCESS_KEY_ID ?= test
AWS_SECRET_ACCESS_KEY ?= test
Expand Down Expand Up @@ -99,6 +100,7 @@ docker-push:
# TODO: setup-registry

create-registry:
@kubectl config set-context $$(kubectl config current-context) --namespace=$(NAMESPACE)
@kubectl create secret docker-registry regcred --docker-server=${DOCKER_REPO} --docker-username="${DOCKER_REPO_USER}" --docker-password="${DOCKER_REPO_PASS}" --docker-email=unknown@docker.io ; \
cd ${DOCKER_REPO_DIR} ; \
docker-compose up -d
Expand All @@ -125,20 +127,21 @@ deploy-plugin:
done; \
fi;
@existingPlugin=$$(helm list | grep ibmcloud-object-storage-plugin | awk '{print $$1}' | head -n 1);
@kubectl config set-context $$(kubectl config current-context) --namespace=$(NAMESPACE)
@if [ "$(VM_TYPE)" = "dind" ]; then \
export FFDL_PATH=$$(pwd); \
./bin/s3_driver.sh; \
sleep 10; \
(if [ -z "$$existingPlugin" ]; then \
helm install --set dind=true,cloud=false storage-plugin; \
helm install --set dind=true,cloud=false,namespace=$(NAMESPACE) storage-plugin; \
else \
helm upgrade --set dind=true,cloud=false $$existingPlugin storage-plugin; \
helm upgrade --set dind=true,cloud=false,namespace=$(NAMESPACE) $$existingPlugin storage-plugin; \
fi) & pid=$$!; \
else \
(if [ -z "$$existingPlugin" ]; then \
helm install storage-plugin; \
helm install --set namespace=$(NAMESPACE) storage-plugin; \
else \
helm upgrade $$existingPlugin storage-plugin; \
helm upgrade --set namespace=$(NAMESPACE) $$existingPlugin storage-plugin; \
fi) & pid=$$!; \
fi;
@echo "Wait while kubectl get pvc shows static-volume-1 in state Pending"
Expand All @@ -148,6 +151,7 @@ deploy-plugin:

quickstart-deploy:
@echo "collecting existing pods"
@kubectl config set-context $$(kubectl config current-context) --namespace=$(NAMESPACE)
@while kubectl get pods --all-namespaces | \
grep -v RESTARTS | \
grep -v Running | \
Expand All @@ -161,11 +165,11 @@ quickstart-deploy:
existing=$$(helm list | grep ffdl | awk '{print $$1}' | head -n 1); \
(if [ -z "$$existing" ]; then \
echo "Deploying the stack via Helm. This will take a while."; \
helm install --set lcm.shared_volume_storage_class=$$SHARED_VOLUME_STORAGE_CLASS . ; \
helm install --set lcm.shared_volume_storage_class=$$SHARED_VOLUME_STORAGE_CLASS,namespace=$(NAMESPACE) . ; \
sleep 10; \
else \
echo "Upgrading existing Helm deployment ($$existing). This will take a while."; \
helm upgrade --set lcm.shared_volume_storage_class=$$SHARED_VOLUME_STORAGE_CLASS $$existing . ; \
helm upgrade --set lcm.shared_volume_storage_class=$$SHARED_VOLUME_STORAGE_CLASS,namespace=$(NAMESPACE) $$existing . ; \
fi) & pid=$$!; \
sleep 5; \
while kubectl get pods --all-namespaces | \
Expand Down Expand Up @@ -208,6 +212,7 @@ quickstart-deploy:
test-job-submit: ## Submit test training job
@# make sure the buckets with training data exist
@echo Downloading Docker images and test training data. This may take a while.
@kubectl config set-context $$(kubectl config current-context) --namespace=$(NAMESPACE)
@if [ "$(VM_TYPE)" = "minikube" ]; then \
eval $(minikube docker-env); docker images | grep tensorflow | grep latest > /dev/null || docker pull tensorflow/tensorflow > /dev/null; \
fi
Expand Down Expand Up @@ -253,6 +258,7 @@ deploy: ## Deploy the services to Kubernetes
sleep 3; \
fi;
@echo collecting existing pods
@kubectl config set-context $$(kubectl config current-context) --namespace=$(NAMESPACE)
@while kubectl get pods --all-namespaces | \
grep -v RESTARTS | \
grep -v Running | \
Expand All @@ -272,9 +278,9 @@ deploy: ## Deploy the services to Kubernetes
cp -rf Chart.yaml values.yaml templates ${HELM_DEPLOY_DIR}; \
existing=$$(helm list | grep ffdl | awk '{print $$1}' | head -n 1); \
if [ "$$CI" = "true" ]; then \
export helm_params='--set lcm.shared_volume_storage_class=${SHARED_VOLUME_STORAGE_CLASS},has_static_volumes=${HAS_STATIC_VOLUMES},prometheus.deploy=false,learner.docker_namespace=${DOCKER_NAMESPACE},docker.namespace=${DOCKER_NAMESPACE},learner.tag=${IMAGE_TAG},docker.pullPolicy=${DOCKER_PULL_POLICY},docker.registry=${DOCKER_REPO},trainer.version=${IMAGE_TAG},restapi.version=${IMAGE_TAG},lcm.version=${IMAGE_TAG},trainingdata.version=${IMAGE_TAG},databroker.tag=${IMAGE_TAG},databroker.version=${IMAGE_TAG},webui.version=${IMAGE_TAG}'; \
export helm_params='--set lcm.shared_volume_storage_class=${SHARED_VOLUME_STORAGE_CLASS},has_static_volumes=${HAS_STATIC_VOLUMES},namespace=${NAMESPACE},prometheus.deploy=false,learner.docker_namespace=${DOCKER_NAMESPACE},docker.namespace=${DOCKER_NAMESPACE},learner.tag=${IMAGE_TAG},docker.pullPolicy=${DOCKER_PULL_POLICY},docker.registry=${DOCKER_REPO},trainer.version=${IMAGE_TAG},restapi.version=${IMAGE_TAG},lcm.version=${IMAGE_TAG},trainingdata.version=${IMAGE_TAG},databroker.tag=${IMAGE_TAG},databroker.version=${IMAGE_TAG},webui.version=${IMAGE_TAG}'; \
else \
export helm_params='--set lcm.shared_volume_storage_class=${SHARED_VOLUME_STORAGE_CLASS},has_static_volumes=${HAS_STATIC_VOLUMES},learner.docker_namespace=${DOCKER_NAMESPACE},docker.namespace=${DOCKER_NAMESPACE},learner.tag=${IMAGE_TAG},docker.pullPolicy=${DOCKER_PULL_POLICY},docker.registry=${DOCKER_REPO},trainer.version=${IMAGE_TAG},restapi.version=${IMAGE_TAG},lcm.version=${IMAGE_TAG},trainingdata.version=${IMAGE_TAG},databroker.tag=${IMAGE_TAG},databroker.version=${IMAGE_TAG},webui.version=${IMAGE_TAG}'; \
export helm_params='--set lcm.shared_volume_storage_class=${SHARED_VOLUME_STORAGE_CLASS},has_static_volumes=${HAS_STATIC_VOLUMES},namespace=${NAMESPACE},learner.docker_namespace=${DOCKER_NAMESPACE},docker.namespace=${DOCKER_NAMESPACE},learner.tag=${IMAGE_TAG},docker.pullPolicy=${DOCKER_PULL_POLICY},docker.registry=${DOCKER_REPO},trainer.version=${IMAGE_TAG},restapi.version=${IMAGE_TAG},lcm.version=${IMAGE_TAG},trainingdata.version=${IMAGE_TAG},databroker.tag=${IMAGE_TAG},databroker.version=${IMAGE_TAG},webui.version=${IMAGE_TAG}'; \
fi; \
(if [ -z "$$existing" ]; then \
echo "Deploying the stack via Helm. This will take a while."; \
Expand Down Expand Up @@ -448,6 +454,7 @@ docker-build-logcollectors:
test-push-data-s3: ## Test
@# Pushes test data to S3 buckets.
@echo Pushing test data.
@kubectl config set-context $$(kubectl config current-context) --namespace=$(NAMESPACE)
@s3_ip=$$(make --no-print-directory kubernetes-ip); \
s3_port=$$(kubectl get service s3 -o jsonpath='{.spec.ports[0].nodePort}'); \
s3_url=http://$$s3_ip:$$s3_port; \
Expand Down Expand Up @@ -557,6 +564,7 @@ test-submit: ## Submit test training job
@if [ "$(VM_TYPE)" = "minikube" ]; then \
eval $(minikube docker-env); docker images | grep tensorflow | grep latest > /dev/null || docker pull tensorflow/tensorflow > /dev/null; \
fi
@kubectl config set-context $$(kubectl config current-context) --namespace=$(NAMESPACE)
@node_ip=$$(make --no-print-directory kubernetes-ip); \
s3_ip=$$(kubectl get po/storage-0 -o=jsonpath='{.status.hostIP}'); \
s3_port=$$(kubectl get service s3 -o jsonpath='{.spec.ports[0].nodePort}'); \
Expand Down
47 changes: 28 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,13 +71,21 @@ To know more about the architectural details, please read the [design document](

There are multiple installation paths for installing FfDL locally ("1-click-install") or into an existing Kubernetes cluster. You can visit [Step 5](#5-detailed-installation-instructions) for more details on the deployment instructions.

> If you are using a bash shell, you can modify the necessary environment variables in `env.txt` and export all of them with the following commands
> ```shell
> source env.txt
> export $(cut -d= -f1 env.txt)
> ```

### 1.1 Installation using Kubeadm-DIND

If you have [Kubeadm-DIND](https://github.com/kubernetes-sigs/kubeadm-dind-cluster#using-preconfigured-scripts) installed on your machine, use these commands to deploy the FfDL platform:
``` shell
export VM_TYPE=dind
export PUBLIC_IP=localhost
export SHARED_VOLUME_STORAGE_CLASS="";
export NAMESPACE=default # If your namespace does not exist yet, create it with `kubectl create namespace $NAMESPACE` before running the make commands below

make deploy-plugin
make quickstart-deploy
```
Expand All @@ -91,9 +99,11 @@ then deploy the platform services:
``` shell
export VM_TYPE=none
export PUBLIC_IP=<Cluster Public IP>
export NAMESPACE=default # If your namespace does not exist yet, create it with `kubectl create namespace $NAMESPACE` before running the make commands below

# Change the storage class to what's available on your Cloud Kubernetes Cluster.
export SHARED_VOLUME_STORAGE_CLASS="ibmc-file-gold";

make deploy-plugin
make quickstart-deploy
```
Expand Down Expand Up @@ -130,29 +140,42 @@ kubectl get pods --all-namespaces | grep tiller-deploy
```

2. Define the necessary environment variables.
> If you are using a bash shell, you can modify the necessary environment variables in `env.txt` and export all of them with the following commands
> ```shell
> source env.txt
> export $(cut -d= -f1 env.txt)
> ```

* 2.a. For Kubeadm-DIND Cluster only
```shell
export FFDL_PATH=$(pwd)
export SHARED_VOLUME_STORAGE_CLASS=""
export VM_TYPE=dind
export PUBLIC_IP=localhost
export NAMESPACE=default # If your namespace does not exist yet, create it with `kubectl create namespace $NAMESPACE` before proceeding to the next step
```

* 2.b. For Cloud Kubernetes Cluster
> Note: If you are using IBM Cloud Cluster, you can obtain your k8s public ip using `bx cs workers <cluster-name>`.

```shell
# Change the storage class to what's available on your Cloud Kubernetes Cluster.
export SHARED_VOLUME_STORAGE_CLASS="ibmc-file-gold"
export VM_TYPE=none
export PUBLIC_IP=<Cluster Public IP>
export NAMESPACE=default # If your namespace does not exist yet, create it with `kubectl create namespace $NAMESPACE` before proceeding to the next step
```

3. Install the Object Storage driver using helm install.
* 3.a. For Kubeadm-DIND Cluster only
```shell
export FFDL_PATH=$(pwd)
./bin/s3_driver.sh
helm install storage-plugin --set dind=true,cloud=false
helm install storage-plugin --set dind=true,cloud=false,namespace=$NAMESPACE
```

* 3.b. For Cloud Kubernetes Cluster
```shell
helm install storage-plugin
helm install storage-plugin --set namespace=$NAMESPACE
```

4. Create a static volume to store any metadata from FfDL.
Expand All @@ -168,13 +191,14 @@ popd
5. Now let's install all the necessary FfDL components using helm install.

``` shell
helm install . --set lcm.shared_volume_storage_class=$SHARED_VOLUME_STORAGE_CLASS
helm install . --set lcm.shared_volume_storage_class=$SHARED_VOLUME_STORAGE_CLASS,namespace=$NAMESPACE
```
> Note: If you want to upgrade an older version of FfDL, run
> `helm upgrade $(helm list | grep ffdl | awk '{print $1}' | head -n 1) .`

Make sure all the FfDL components are installed and running before moving to the next step.
``` shell
kubectl config set-context $(kubectl config current-context) --namespace=$NAMESPACE
kubectl get pods
# NAME READY STATUS RESTARTS AGE
# alertmanager-7cf6b988b9-h9q6q 1/1 Running 0 5h
Expand Down Expand Up @@ -207,22 +231,7 @@ s3_port=$(kubectl get service s3 -o jsonpath='{.spec.ports[0].nodePort}')
```

7. Run the following commands to configure Grafana for monitoring FfDL using the logging information from prometheus.
* 7.a. For Kubeadm-DIND Cluster only
```shell
export VM_TYPE=none
export PUBLIC_IP=localhost

./bin/grafana.init.sh
```


* 7.b. For Cloud Kubernetes Cluster.
> Note: If you are using IBM Cloud Cluster, you can obtain your k8s public ip using `bx cs workers <cluster-name>`.

``` shell
export VM_TYPE=none
export PUBLIC_IP=<Cluster Public IP>

./bin/grafana.init.sh
```

Expand Down
2 changes: 2 additions & 0 deletions bin/create_static_volumes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@
SHARED_VOLUME_STORAGE_CLASS="${SHARED_VOLUME_STORAGE_CLASS:-""}"

volumeNum=${1:-1}
# Honor the NAMESPACE env var used by every other FfDL component (env.txt, Makefile).
NAMESPACE=${NAMESPACE:-default}
# Keep the lowercase alias in sync for the PVC heredoc below that expands $Namespace.
Namespace=$NAMESPACE

echo "Creating persistent volume claim $volumeNum"
(kubectl apply -f - <<EOF
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: static-volume-$volumeNum
namespace: $Namespace
annotations:
volume.beta.kubernetes.io/storage-class: "$SHARED_VOLUME_STORAGE_CLASS"
labels:
Expand Down
15 changes: 8 additions & 7 deletions bin/create_static_volumes_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,18 @@

CONFIGMAP_NAME=static-volumes
volumeType=${1:-dlaas-static-volume}
NAMESPACE=${NAMESPACE:-default}

# Delete configmap
if kubectl get cm | grep static-volumes &> /dev/null; then kubectl delete configmap ${CONFIGMAP_NAME}; else echo "No need to delete ${CONFIGMAP_NAME} since it doesn't exist."; fi
if kubectl get cm -n ${NAMESPACE} | grep static-volumes &> /dev/null; then kubectl delete configmap ${CONFIGMAP_NAME} -n ${NAMESPACE}; else echo "No need to delete ${CONFIGMAP_NAME} since it doesn't exist."; fi

# Create new configmap
echo
echo "Using volumes with label type=$volumeType:"
kubectl get pvc --selector type=${volumeType}
kubectl get pvc --selector type=${volumeType} -n ${NAMESPACE}
echo
kubectl create configmap ${CONFIGMAP_NAME} --from-file=PVCs.yaml=<(
kubectl get pvc --selector type=${volumeType} -o yaml
kubectl create configmap ${CONFIGMAP_NAME} -n ${NAMESPACE} --from-file=PVCs.yaml=<(
kubectl get pvc --selector type=${volumeType} -n ${NAMESPACE} -o yaml
)

CONFIGMAP_NAME2=static-volumes-v2
Expand All @@ -30,9 +31,9 @@ CONFIGMAP_NAME2=static-volumes-v2
# Create new configmap
echo
echo "Using volumes with label type=$volumeType"
kubectl get pvc --selector type=${volumeType}
kubectl get pvc --selector type=${volumeType} -n ${NAMESPACE}
echo

kubectl get pvc --selector type="dlaas-static-volume" -o jsonpath='{"static-volumes-v2:"}{range .items[*]}{"\n - name: "}{.metadata.name}{"\n zlabel: "}{.metadata.name}{"\n status: active\n"}' > PVCs-v2.yaml
# Use the volumeType selected above (defaults to dlaas-static-volume) instead of a hardcoded label.
kubectl get pvc --selector type=${volumeType} -n ${NAMESPACE} -o jsonpath='{"static-volumes-v2:"}{range .items[*]}{"\n - name: "}{.metadata.name}{"\n zlabel: "}{.metadata.name}{"\n status: active\n"}' > PVCs-v2.yaml

kubectl create configmap ${CONFIGMAP_NAME2} --from-file=PVCs-v2.yaml
kubectl create configmap ${CONFIGMAP_NAME2} -n ${NAMESPACE} --from-file=PVCs-v2.yaml
3 changes: 2 additions & 1 deletion bin/grafana.init.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ fi

[ -z ${node_ip} ] && echo "Can't get node_ip for grafana, \$VM_TYPE == \"$VM_TYPE\""

grafana_port=$(kubectl get service grafana -o jsonpath='{.spec.ports[0].nodePort}')
NAMESPACE=${NAMESPACE:-default}
grafana_port=$(kubectl get service grafana -o jsonpath='{.spec.ports[0].nodePort}' -n $NAMESPACE)
grafana_url="http://$node_ip:$grafana_port"

echo "wait until the grafana service is up (grafana_url=${grafana_url})"
Expand Down
2 changes: 2 additions & 0 deletions docs/developer-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ export DOCKER_NAMESPACE=<NAMESPACE_ON_IBM_CLOUD> # Container Registry Namespace
export DOCKER_PULL_POLICY=Always # Keep IfNotPresent if not pushing to registry, e.g. for Minikube
export VM_TYPE=none
export HAS_STATIC_VOLUMES=True
export NAMESPACE=default # If your namespace does not exist yet, create it with `kubectl create namespace $NAMESPACE` before proceeding to the next step
```

Compile the code, generate certificates, and build the Docker images via:
Expand All @@ -50,6 +51,7 @@ make docker-push # Push built Docker images to registry, not used for Minikube
Make sure `kubectl` points to the right target context/namespace, then deploy the services to your Kubernetes
environment (using `helm`):
```shell
kubectl config set-context $(kubectl config current-context) --namespace=$NAMESPACE # Set your current-context to the FfDL namespace
make create-volumes # Create static volumes for sharing across pods
make deploy-plugin # Deploy S3 storage plugin
make deploy # Deploy FfDL
Expand Down
4 changes: 2 additions & 2 deletions docs/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ Need to adapt tensorflow version in manifest to what is specified on https://git

# DIND
## Deploy
* ffdl-lcm, ffdl-restapi, ffdl-trainer, ffdl-trainingdata and ffdl-ui pods show ImagePullBackOff: See if Kubernetes secret regcred exists via `kubectl get secret | grep regcred`. If it does not (output empty), create it with `kubectl create secret docker-registry regcred --docker-server=${DOCKER_REPO} --docker-username=${DOCKER_REPO_USER} --docker-password=${DOCKER_REPO_PASS} --docker-email=unknown@docker.io`.
* ffdl-lcm, ffdl-restapi, ffdl-trainer, ffdl-trainingdata and ffdl-ui pods show ImagePullBackOff: See if Kubernetes secret regcred exists via `kubectl get secret | grep regcred`. If it does not (output empty), create it with `kubectl create secret docker-registry regcred --docker-server=${DOCKER_REPO} --docker-username=${DOCKER_REPO_USER} --docker-password=${DOCKER_REPO_PASS} --docker-email=unknown@docker.io -n ${NAMESPACE}`.

## Training
* If you start a job and `lhelper` and `jobmonitor` pods get to `Running` state, but the corresponding `learner` remains stuck in `ContainerCreating`, please take a look at `kubectl describe pod <learner-pod>`. It is possible that your storage configuration in your manifest is invalid and if so, you should see events that point out the issues.
* If you start a job and `lhelper` and `jobmonitor` pods get to `Running` state, but the corresponding `learner` remains stuck in `ContainerCreating`, please take a look at `kubectl describe pod <learner-pod>`. It is possible that your storage configuration in your manifest is invalid and if so, you should see events that point out the issues.
5 changes: 5 additions & 0 deletions env.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
VM_TYPE=dind # Kubernetes deployment environment (dind or none)
PUBLIC_IP=localhost # Kubernetes External IP
SHARED_VOLUME_STORAGE_CLASS="" # Storage Class available on your Kubernetes Cluster
NAMESPACE=default # The namespace in which you want to deploy FfDL
FFDL_PATH=$(pwd) # Current path of your FfDL directory
1 change: 1 addition & 0 deletions storage-plugin/templates/volume.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ apiVersion: v1
kind: PersistentVolume
metadata:
name: local-volume-1
namespace: {{.Values.namespace}}
labels:
type: local
spec:
Expand Down
1 change: 1 addition & 0 deletions storage-plugin/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ image:
pullPolicy: IfNotPresent
dind: false
cloud: true
namespace: default
3 changes: 3 additions & 0 deletions templates/infrastructure/etcd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ apiVersion: v1
kind: Service
metadata:
name: etcd
namespace: {{.Values.namespace}}
spec:
ports:
- port: 2379
Expand All @@ -19,6 +20,7 @@ metadata:
app: etcd
etcd_node: etcd0
name: etcd0
namespace: {{.Values.namespace}}
spec:
containers:
- command:
Expand Down Expand Up @@ -57,6 +59,7 @@ metadata:
labels:
etcd_node: etcd0
name: etcd0
namespace: {{.Values.namespace}}
spec:
ports:
- name: client
Expand Down
2 changes: 2 additions & 0 deletions templates/infrastructure/mongo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ apiVersion: apps/v1beta1
kind: StatefulSet
metadata:
name: mongo
namespace: {{.Values.namespace}}
spec:
serviceName: mongo
replicas: 1
Expand Down Expand Up @@ -34,6 +35,7 @@ apiVersion: v1
kind: Service
metadata:
name: mongo
namespace: {{.Values.namespace}}
labels:
environment: local
spec:
Expand Down
3 changes: 3 additions & 0 deletions templates/infrastructure/storage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ apiVersion: v1
kind: Service
metadata:
name: s3
namespace: {{.Values.namespace}}
spec:
{{ if .Values.services.expose_node_port }}
type: NodePort
Expand All @@ -19,6 +20,7 @@ apiVersion: v1
kind: Service
metadata:
name: elasticsearch
namespace: {{.Values.namespace}}
labels:
component: elasticsearch
spec:
Expand All @@ -39,6 +41,7 @@ apiVersion: apps/v1beta1
kind: StatefulSet
metadata:
name: storage
namespace: {{.Values.namespace}}
spec:
serviceName: storage
replicas: 1
Expand Down
Loading