Skip to content
Permalink
Browse files

Sboag dlaas pull june14 (#97)

* Pointed travis testing to do hostmount minikube

* Debugging permissions error.

* Fix to mkdir problems.

* Fixed Makefile syntax.

* Printing debugging information about pods.

* Printing debugging information about pods.

* Printing debugging information about pods.

* Printing debugging information incl kubectl get pod.

* Enabled debug mode.

* Again.

* Set debug as default.

* tracing from the trainer to lcm

* more debugging

* added lower level logging

* dist: xenial

* Update .travis.yml

* fix typo

* Trying to fix Travis issue.

* Fixed Travis issue.

* Followed Tommy's request and increased resource limits to values from before. Might break CI.

* Parameterized memory values like Tommy requested.

* Attempt to fix CI.

* Removed excessive debug statements and cleaned comments. Probably breaks code.

* DLaaS pull june 14, with security mods

* fixed glide problem

* Added Image.go etc. files, deleted learner_test.go

* temporarily disable framework validation

* FIXME: Disable validation check for bucket until conditionalize for s3fs vs.  option.

* fixed two bugs related to volume mounting

* I think mostly just logging changes

* basic success

* Add FfDL.iml to .gitignore

* removed docker ref to csf_env.properties

* Test for mount_cos before attempting s3 validation

* fixed hostmount by pre-setup of model code in Makefile

* fixed missing import

* log HELM_DEPLOY_DIR, add a bunch of logging for the ci test

* Added create-volumes to jenkins file, more verbose docker build for ui

* Wound back Angular to 6.0.8

* Quiet docker-build-ui docker build

* merged bin/create_static_volumes_config2.sh into bin/create_static_volumes_config.sh
  • Loading branch information...
sboagibm committed Aug 1, 2018
1 parent 0b93b0a commit 0f053c1d0dd9d6a1540a306fca077a9f0992fc80
Showing with 3,133 additions and 1,396 deletions.
  1. +1 −0 .gitignore
  2. +5 −0 .helmignore
  3. +1 −0 .travis.yml
  4. +88 −72 Makefile
  5. +15 −0 bin/create_static_volumes_config.sh
  6. +2 −2 bin/dind_scripts/experimental_master.sh
  7. +4 −0 bin/dind_scripts/launch_kubernetes.sh
  8. +1 −0 bin/dind_scripts/s3_driver.sh
  9. +12 −0 bin/escape_for_sed.sh
  10. +6 −0 cli/main.go
  11. +25 −17 commons/config/config.go
  12. +86 −0 commons/framework/framework.go
  13. +2 −0 commons/logger/logger.go
  14. +4 −2 commons/metricsmon/metricspusher.go
  15. +122 −67 commons/service/lcm.pb.go
  16. +9 −0 commons/service/lcm.proto
  17. +2 −1 dashboard/Dockerfile
  18. +1 −0 docs/developer-guide.md
  19. +37 −0 etc/examples/tf-model/manifest-for-test.yml
  20. +2 −2 glide.lock
  21. +65 −103 jobmonitor/jobmonitor/jobmonitor.go
  22. +5 −12 jobmonitor/jobmonitor/pod_inspection.go
  23. +17 −11 jobmonitor/main.go
  24. +1 −1 lcm/controller/Dockerfile
  25. +7 −0 lcm/controller/src/controller.sh
  26. +45 −40 lcm/coord/coord.go
  27. +1 −1 lcm/lcmconfig/lcmconfig.go
  28. +2 −2 lcm/main.go
  29. +34 −29 lcm/service/lcm/constants.go
  30. +96 −53 lcm/service/lcm/container_helper.go
  31. +0 −10 lcm/service/lcm/helper/volumes.go
  32. +10 −6 lcm/service/lcm/job_monitor_deployment_helpers.go
  33. +63 −72 lcm/service/lcm/lcm_utils.go
  34. +25 −65 lcm/service/lcm/learner/container.go
  35. +15 −10 lcm/service/lcm/learner/envvars.go
  36. +46 −0 lcm/service/lcm/learner/image.go
  37. +48 −0 lcm/service/lcm/learner/image_test.go
  38. +72 −0 lcm/service/lcm/learner/imagesecrets.go
  39. +1 −4 lcm/service/lcm/learner/learner.go
  40. +0 −49 lcm/service/lcm/learner/learner_test.go
  41. +5 −1 lcm/service/lcm/learner/pod_test.go
  42. +5 −0 lcm/service/lcm/learner/secrets.go
  43. +13 −56 lcm/service/lcm/learner/volumes.go
  44. +44 −85 lcm/service/lcm/learner_deployment_helpers.go
  45. +7 −1 lcm/service/lcm/nonSplitTraining.go
  46. +64 −184 lcm/service/lcm/service_impl.go
  47. +68 −30 lcm/service/lcm/splitTraining.go
  48. +7 −8 lcm/service/lcm/volumes.go
  49. +4 −5 metrics/log_collectors/emetrics_file/Dockerfile
  50. +5 −1 metrics/log_collectors/emetrics_file/Makefile
  51. +27 −1 metrics/log_collectors/emetrics_file/src/run.sh
  52. +5 −6 metrics/log_collectors/regex_extractor/Dockerfile
  53. +4 −1 metrics/log_collectors/regex_extractor/Makefile
  54. +27 −1 metrics/log_collectors/regex_extractor/src/run.sh
  55. +5 −4 metrics/log_collectors/simple_log_collector/Dockerfile
  56. +4 −1 metrics/log_collectors/simple_log_collector/Makefile
  57. +27 −1 metrics/log_collectors/simple_log_collector/src/run.sh
  58. +7 −5 metrics/log_collectors/tensorboard/Dockerfile
  59. +1 −1 metrics/log_collectors/tensorboard/Makefile
  60. +2 −1 metrics/log_collectors/tensorboard/src/extract_tb.py
  61. +27 −0 metrics/log_collectors/tensorboard/src/run.sh
  62. +2 −1 metrics/log_collectors/training_data_service_client/scan_log_dirs.py
  63. +1 −8 metrics/log_collectors/training_data_service_client/states.py
  64. +5 −4 restapi/api_v1/server/configure_dlaas.go
  65. +24 −9 restapi/api_v1/server/models_impl.go
  66. +56 −0 restapi/api_v1/swagger-ui/buildui.sh
  67. +23 −0 restapi/api_v1/swagger-ui/manifest.yml
  68. +1,101 −0 restapi/api_v1/swagger/swagger.yml
  69. +5 −0 templates/services/lcm-deployment.yml
  70. +0 −2 trainer/Dockerfile
  71. +103 −0 trainer/instrumentation/call_logger.go
  72. +66 −9 trainer/storage/s3_object_store.go
  73. +17 −21 trainer/trainer/frameworks.go
  74. +234 −179 trainer/trainer/grpc_trainer_v2/trainer.pb.go
  75. +8 −0 trainer/trainer/grpc_trainer_v2/trainer.proto
  76. +10 −0 trainer/trainer/queue.go
  77. +11 −12 trainer/trainer/repository.go
  78. +131 −127 trainer/trainer/trainer_impl.go
@@ -25,3 +25,4 @@ bin/dind_scripts/config.json
bin/dind_scripts/go1.10.1.linux-amd64.tar.gz
bin/dind_scripts/helm-v2.8.2-linux-amd64.tar.gz
bin/dind_scripts/linux-amd64/
FfDL.iml
@@ -23,3 +23,8 @@ cli/
community/
persistentvol/
design/
demos/
list/
mybucket/
persistentvol/
storage-plugin/
@@ -43,6 +43,7 @@ script:
- make $MAKE_ARGS gen-certs
- make $MAKE_ARGS build
- make $MAKE_ARGS docker-build
- make $MAKE_ARGS create-volumes
# deploy services
- make $MAKE_ARGS deploy
# submit a test job
160 Makefile

Large diffs are not rendered by default.

Oops, something went wrong.
@@ -21,3 +21,18 @@ echo
kubectl create configmap ${CONFIGMAP_NAME} --from-file=PVCs.yaml=<(
kubectl get pvc --selector type=${volumeType} -o yaml
)

CONFIGMAP_NAME2=static-volumes-v2

# Delete configmap
#if kubectl get cm | grep static-volumes &> /dev/null; then kubectl delete configmap ${CONFIGMAP_NAME2}; else echo "No need to delete ${CONFIGMAP_NAME2} since it doesn't exist."; fi

# Create new configmap
echo
echo "Using volumes with label type=$volumeType"
kubectl get pvc --selector type=${volumeType}
echo

kubectl get pvc --selector type="dlaas-static-volume" -o jsonpath='{"static-volumes-v2:"}{range .items[*]}{"\n - name: "}{.metadata.name}{"\n zlabel: "}{.metadata.name}{"\n status: active\n"}' > PVCs-v2.yaml

kubectl create configmap ${CONFIGMAP_NAME2} --from-file=PVCs-v2.yaml
@@ -1,5 +1,5 @@
#!/bin/bash
sudo chown -R ffdlr /home/ffdlr
sudo chown -R ${USER} /home/${USER}
chmod +x build_ffdl.sh compile_s3fs.sh create_user.sh import_registry_certificates.sh install_docker.sh install_go.sh install_kubernetes.sh install_nfs.sh install_registry.sh launch_kubernetes.sh launch_registry.sh s3_driver.sh
echo "This script assumes that you have created a user, e.g. via create_user.sh, and are now logged in as that user."

@@ -21,7 +21,7 @@ export DOCKER_REPO_USER=$USER
export DOCKER_REPO_PASS=7312mInalM4n
./launch_registry.sh
./launch_kubernetes.sh
sudo chown -R ffdlr /home/ffdlr/.kube/
sudo chown -R ${USER} /home/${USER}/.kube/
./import_registry_certificates.sh
./s3_driver.sh

@@ -1,6 +1,10 @@
#!/bin/bash
sudo apt install -y jq
cd ~

wget https://cdn.rawgit.com/kubernetes-sigs/kubeadm-dind-cluster/master/fixed/dind-cluster-v1.9.sh
chmod +x dind-cluster-v1.9.sh

sudo ./dind-cluster-v1.9.sh clean
sudo ./dind-cluster-v1.9.sh up
# cd ~/go/src/github.com/IBM/FfDL/bin
@@ -1,6 +1,7 @@
#!/bin/bash

kubectl create secret docker-registry regcred --docker-server=${DOCKER_REPO} --docker-username=${DOCKER_REPO_USER} --docker-password=${DOCKER_REPO_PASS} --docker-email=unknown@docker.io
docker pull docker.io/ffdl/ibmcloud-object-storage-plugin
docker tag ibmcloud-object-storage-plugin ${DOCKER_REPO}/ibmcloud-object-storage-plugin
docker push ${DOCKER_REPO}/ibmcloud-object-storage-plugin

@@ -0,0 +1,12 @@
#!/usr/bin/env bash

# Prints the value of the AWS_URL environment variable with every "/" replaced
# by "\/", so the result can be spliced into a sed expression that uses "/" as
# its delimiter.

# Choose the sed binary. On Darwin this script uses "gsed" (presumably GNU sed,
# which has to be installed separately -- TODO confirm); everywhere else the
# plain "sed" is used. Defaulting to "sed" keeps CMD_SED from being empty on
# platforms that are neither Linux nor Darwin, which previously made the
# pipeline below try to execute "-r" as a command.
operating_system=$(uname)
if [[ "$operating_system" == 'Darwin' ]]; then
    CMD_SED=gsed
else
    CMD_SED=sed
fi

replacement_string=$(echo "$AWS_URL" | "$CMD_SED" -r 's/\//\\\//g')

# Quote the expansion so the escaped value is printed verbatim (no word
# splitting or glob expansion).
echo "$replacement_string"
@@ -107,6 +107,12 @@ func (c *DlaasPlugin) Run(context plugin.PluginContext, args []string) {
if er == nil {
os.Exit(exitCode)
}

// FIXME QuietPanic does not exist
//if err != terminal.QuietPanic {
// fmt.Printf("%v\n", err)
//}

os.Exit(1)
}
}()
@@ -18,7 +18,6 @@ package config

import (
"fmt"
"runtime"
"io/ioutil"
"os"
"path"
@@ -30,9 +29,9 @@ import (
"google.golang.org/grpc/grpclog"

log "github.com/sirupsen/logrus"

"github.com/spf13/viper"
v1core "k8s.io/api/core/v1"
"runtime"
)

const (
@@ -150,6 +149,7 @@ const (
DlaasResourceLimit = "resource.limit"
DlaasResourceLimitQuerySize = "resource.limit.query.size"

// FfDL Change: next 5 lines
ImagePullPolicy = "image_pull_policy"

SharedVolumeStorageClassKey = "shared_volume_storage_class"
@@ -171,12 +171,13 @@ func InitViper() {

viperInitOnce.Do(func() {

viper.SetEnvPrefix(envPrefix) // will be capitalized automatically
viper.SetEnvPrefix(envPrefix) // will be uppercased automatically
viper.SetConfigType("yaml")

// FfDL Change: make image pull policy default
viper.SetDefault(ImagePullPolicy, v1core.PullIfNotPresent)

// Most likely be "standard" in Minikube and "ibmc-s3fs-standard" in DIND, (other value is "default" or "")
// FfDL Change:Most likely be "standard" in Minikube and "ibmc-s3fs-standard" in DIND, (other value is "default" or "")
viper.SetDefault(SharedVolumeStorageClassKey, "")

// enable ENV vars and defaults
@@ -192,7 +193,10 @@ func InitViper() {
viper.SetDefault(LearnerTagKey, "prod")
viper.SetDefault(DataBrokerTagKey, "prod")
viper.SetDefault(LearnerRegistryKey, "docker.io/ffdl")

// FfDL Change: FIXME in DLaaS, this will need to be overridden with "bluemix-cr-ng"
viper.SetDefault(LearnerImagePullSecretKey, "regcred")
// viper.SetDefault(LearnerImagePullSecretKey, "bluemix-cr-ng")

// TLS defaults for microservices
viper.SetDefault(TLSKey, true)
@@ -288,7 +292,7 @@ func GetFileContents(filename string) string {
return contents
}

// IsTLSEnabled is true if the microservices should all use TLS for communication, otherwise
// IsTLSEnabled is true if the microservices should all use TLS for communication, otherwise
// it is false.
func IsTLSEnabled() bool {
return viper.GetBool(TLSKey)
@@ -427,10 +431,16 @@ func GetDataStoreConfig() map[string]string {
if val != "" {
m[DomainKey] = val
}
val = viper.GetString("objectstore." + UsernameKey)
val = viper.GetString("objectstore." + RegionKey)
if val != "" {
m[UsernameKey] = val
m[RegionKey] = val
}
// FfDL Change: This was in the older PR, supposing not needed?
//val = viper.GetString("objectstore." + UsernameKey)
//if val != "" {
// m[UsernameKey] = val
//}

val = viper.GetString("objectstore." + ProjectKey)
if val != "" {
m[ProjectKey] = val
@@ -479,7 +489,7 @@ func configKey2EnvVar(key string) string {

// setLogLevel sets the logging level based on the environment
func setLogLevel() {
viper.SetDefault(LogLevelKey, "debug") // FIXME Should be warn
viper.SetDefault(LogLevelKey, "warn")

env := viper.GetString(EnvKey)
if env == "dev" || env == "test" {
@@ -547,32 +557,32 @@ func GetMongoCertLocation() string {
return getFileAtLocation("/etc/certs/mongo/mongo.cert") //the file should have been mounted at this path as a part of secrets
}

// GetLearnerKubeURL returns the string that viper holds under the
// learnerKubeURLKey configuration key.
func GetLearnerKubeURL() string {
return viper.GetString(learnerKubeURLKey)
}

// GetLearnerKubeCAFile returns the string that viper holds under the
// learnerKubeCAFileKey configuration key.
func GetLearnerKubeCAFile() string {
return viper.GetString(learnerKubeCAFileKey)
}

// GetLearnerKubeToken returns the string that viper holds under the
// learnerKubeTokenKey configuration key.
func GetLearnerKubeToken() string {
return viper.GetString(learnerKubeTokenKey)
}

// GetLearnerKubeTokenFile returns the string that viper holds under the
// learnerKubeTokenFileKey configuration key.
func GetLearnerKubeTokenFile() string {
return viper.GetString(learnerKubeTokenFileKey)
}

// GetLearnerKubeKeyFile returns the string that viper holds under the
// learnerKubeKeyFileKey configuration key.
func GetLearnerKubeKeyFile() string {
return viper.GetString(learnerKubeKeyFileKey)
}

// GetLearnerKubeCertFile returns the string that viper holds under the
// learnerKubeCertFileKey configuration key.
func GetLearnerKubeCertFile() string {
return viper.GetString(learnerKubeCertFileKey)
}
@@ -626,9 +636,7 @@ func getFileAtLocation(location string) string {
log.Debugf("file was found at location %s", location)
return location
}
//log.Debugf("file certificate was missing at location %s", location)
//LogStackTrace()

log.Debugf("file certificate was missing at location %s", location)
return "" //empty location means that cert is not required
}

@@ -0,0 +1,86 @@
package framework

import (
"encoding/json"
"io/ioutil"
)

// Frameworks describes all frameworks supported and maintained by dlaas.
// GetFrameworks fills it by JSON-decoding the learner config file.
type Frameworks struct {
// Frameworks maps a framework name (the fwName argument used by the lookup
// functions in this package) to the list of versions of that framework.
Frameworks map[string]*DetailList
}

// DetailList is the list of versions available for a single framework.
type DetailList struct {
// Versions holds one entry per framework version; the lookup functions in
// this package scan it linearly for a matching Details.Version.
Versions []*Details
}

// Details holds the specific details of one framework version.
type Details struct {
// Version is the version string; it is compared for exact equality with
// the requested fwVersion.
Version string
// External is not read anywhere in this file.
// NOTE(review): its meaning is not visible here -- confirm against the
// learner config schema.
External bool
// Build is the build tag returned by GetImageBuildTagForFramework.
Build string
// PrevBuild is not read anywhere in this file; presumably the build tag
// that preceded Build -- TODO confirm.
PrevBuild string
}

// readFile loads the whole file at location into memory.
//
// On success it returns the file contents and a nil error. If the read fails
// it returns an empty (zero-length) byte slice together with the error
// reported by ioutil.ReadFile.
func readFile(location string) ([]byte, error) {
	contents, readErr := ioutil.ReadFile(location)
	if readErr == nil {
		return contents, nil
	}
	return []byte(""), readErr
}

// GetFrameworks reads the file at learnerConfigPath and JSON-decodes it into a
// Frameworks value.
//
// A non-nil error is returned when the file cannot be read or when its content
// cannot be decoded; in that case the returned Frameworks value must not be
// relied upon.
func GetFrameworks(learnerConfigPath string) (Frameworks, error) {
	var parsed Frameworks

	raw, err := readFile(learnerConfigPath)
	if err == nil {
		// Only attempt to decode when the file was read successfully.
		err = json.Unmarshal(raw, &parsed)
	}
	return parsed, err
}

// GetImageBuildTagForFramework returns the Build tag recorded for the given
// framework name and version in the learner config at learnerConfigPath.
//
// It returns the empty string when the config cannot be loaded, when fwName is
// not a known framework, or when no entry of that framework has a Version
// equal to fwVersion.
func GetImageBuildTagForFramework(fwName, fwVersion, learnerConfigPath string) string {
	frameworks, err := GetFrameworks(learnerConfigPath)
	if err != nil {
		return ""
	}

	// The map holds pointers, so an unknown framework name yields nil;
	// dereferencing it unconditionally would panic.
	detailList := frameworks.Frameworks[fwName]
	if detailList == nil {
		return ""
	}

	for _, details := range detailList.Versions {
		// A JSON null inside the versions array decodes to a nil entry.
		if details != nil && details.Version == fwVersion {
			return details.Build
		}
	}

	return ""
}

// CheckIfFrameworkExists reports whether the learner config at
// learnerConfigPath contains the framework fwName with a version entry whose
// Version equals fwVersion.
//
// It returns (false, err) when the config cannot be loaded, and (false, nil)
// when the framework or the version is not present.
func CheckIfFrameworkExists(fwName, fwVersion, learnerConfigPath string) (bool, error) {
	frameworks, err := GetFrameworks(learnerConfigPath)
	if err != nil {
		return false, err
	}

	detailList := frameworks.Frameworks[fwName]
	if detailList == nil {
		// Unknown framework name.
		return false, nil
	}

	for _, details := range detailList.Versions {
		// Skip nil entries (a JSON null in the versions array decodes to a
		// nil pointer) instead of panicking on them.
		if details != nil && details.Version == fwVersion {
			return true, nil
		}
	}
	return false, nil
}
@@ -131,6 +131,7 @@ func FileInfoFindGood() string {
if strings.Contains(file, "runtime/extern.go") {
continue
}
// FfDL Change: Not sure if this can be handled in a common way. Moot point once DLaaS is based on FfDL.
if strings.Contains(file, "logger/logger.go") {
continue
}
@@ -164,6 +165,7 @@ func FileInfoFindGood() string {
return fmt.Sprintf("%s:%d %s -", file, line, funcName)
}

// FfDL Change: Because it's useful
func LogStackTrace() {
pc := make([]uintptr, 30)
stackDepth := runtime.Callers(0, pc)
@@ -28,6 +28,7 @@ import (
"github.com/sony/gobreaker"
)

//StartMetricsPusher ...
func StartMetricsPusher(label string, interval time.Duration, url string) chan struct{} {
log.Info("Starting code to push out metrics")
quit := make(chan struct{})
@@ -74,10 +75,11 @@ func pushMetrics(job string, url string) error {
return nil
}

// StartStatsdMetricsPusher starts a goroutine running statsd.SendLoop, driven
// by a ticker that fires every pushInterval, with "udp" and
// "statsdexporter:9125" as the network and address arguments.
func StartStatsdMetricsPusher(statsd *statsd.Statsd, pushInterval time.Duration) {
	log.Info("Starting code to push out metrics via statsd")

	ticker := time.NewTicker(pushInterval)
	// TODO: the ticker is never stopped. A plain `defer ticker.Stop()` here
	// would run as soon as this function returns, i.e. right after the
	// goroutine below has been handed ticker.C.
	go statsd.SendLoop(ticker.C, "udp", "statsdexporter:9125")
}
Oops, something went wrong.

0 comments on commit 0f053c1

Please sign in to comment.
You can’t perform that action at this time.