# Train a movie recommendation engine with Watson Machine Learning Accelerator

### Notebook created by Samaya Madhavan, Kelvin Lui in Jan 2021

### In this notebook, you will learn how to use the Watson Machine Learning Accelerator (WML-A) API and accelerate the training of a movie recommendation model on GPU with Watson Machine Learning Accelerator.

This notebook uses TensorFlow to build a movie recommendation engine. The model is trained on both CPU and GPU to demonstrate that training on GPU hardware delivers results faster.


This notebook covers the following sections:

1. [Setup movie recommendation model using Tensorflow](#rbm-model)<br>

1. [Training the model on CPU](#cpu)<br>

1. [Training the model on GPU with Watson Machine Learning Accelerator](#gpu)<br>

<a id = "rbm-model"></a>
## Step 1 : Setup movie recommendation model using Tensorflow


#### Prepare directory and file for writing movie recommendation engine.

In [1]:
from pathlib import Path
# Directory that will hold the training script, and the file name the next
# cell writes it under.
model_dir = '/project_data/data_asset/samaya'
model_main = 'main_2.py'

# parents=True creates any missing intermediate directory instead of raising
# FileNotFoundError; exist_ok=True keeps the cell safe to re-run.
Path(model_dir).mkdir(parents=True, exist_ok=True)

#### Tensorflow code to build and train a restricted Boltzmann machine using collaborative filtering. The details of how the model is built can be found in this [tutorial](https://developer.ibm.com/technologies/deep-learning/tutorials/build-a-recommendation-engine-with-a-restricted-boltzmann-machine-using-tensorflow/). 

In [2]:
%%writefile {model_dir}/{model_main}


import sys
import subprocess
import os
import datetime


import numpy as np
import pandas as pd

# Print the pyarrow version and install location picked up by this environment.
import pyarrow as pa
print('pyarrow version :', pa.__version__)
path = os.path.abspath( pa.__file__)
print("pyarrow path : ", path)

import tensorflow as tf
print('Tensorflow version : ',tf.__version__)

import argparse
import os
from pathlib import PurePath


# K (the Keras backend) provides the session used to enumerate devices;
# device_lib provides the local device listing printed just below.
from tensorflow.python.keras import backend as K
from tensorflow.python.client import device_lib

print('List of available devices : ',device_lib.list_local_devices())

def _get_available_devices():
    """Return the names of the devices listed by the current Keras session."""
    session_devices = K.get_session().list_devices()
    device_names = []
    for device in session_devices:
        device_names.append(device.name)
    return device_names

def _normalize_device_name(name):
    name = '/' + ':'.join(name.lower().replace('/', '').split(':')[-2:])
    return name

def download_data(use_cuda):
    """Download and unpack the ``ml-latest-small.zip`` dataset archive.

    The archive is fetched into ``/gpfs/mydatafs/`` when ``use_cuda`` is true
    and into ``/project_data/data_asset/`` otherwise. It is extracted in
    place and the zip file is then deleted.

    Returns:
        The archive path with its ``.zip`` suffix removed.
    """
    import wget, os
    from zipfile import ZipFile,ZipInfo

    archive_name = 'ml-latest-small.zip'
    base_url = 'https://github.com/IBM/wmla-assets/raw/master/dli-learning-path/movie-recommendation-use-case/dataset/'

    target_dir = '/gpfs/mydatafs/' if use_cuda else '/project_data/data_asset/'

    wget.download(base_url + archive_name, out=target_dir)

    archive_path = os.path.join(target_dir, archive_name)
    with ZipFile(archive_path, 'r') as archive:
        archive.extractall(target_dir)
    os.remove(archive_path)

    # Drop the four-character ".zip" suffix.
    return archive_path[:-4]
    
def load_data(use_cuda):
    """Download the dataset and read ``movies.csv`` and ``ratings.csv``.

    With ``use_cuda`` false the files are read with pandas. With ``use_cuda``
    true the session's devices are listed first, a ``ValueError`` is raised
    if none of them is a GPU, and the files are read with cuDF.

    Returns:
        ``(movies_df, ratings_df)``.
    """
    data_dir = download_data(use_cuda)
    print('data path :', data_dir)

    movies_csv = data_dir + '/movies.csv'
    ratings_csv = data_dir + '/ratings.csv'

    print('movie path :', movies_csv)
    print('ratings path :', ratings_csv )

    # CPU path: plain pandas, no device probing needed.
    if not use_cuda:
        print('load_data CPU')
        return pd.read_csv(movies_csv), pd.read_csv(ratings_csv)

    print('load_data GPU')
    # support multiple gpu
    device_names = [_normalize_device_name(raw)
                    for raw in _get_available_devices()]
    print('available devices : ',device_names)
    gpu_names = [dev for dev in device_names if '/gpu:' in dev]
    print('gpu names = ',gpu_names)
    print("Let's use gpus: " + str(gpu_names))
    if len(gpu_names) <= 0:
        raise ValueError('Unable to find any gpu device ')

    # Imported here so the CPU path never requires cuDF.
    import cudf
    print('cudf version : ',cudf.__version__)

    movies_df = cudf.read_csv(movies_csv)
    ratings_df = cudf.read_csv(ratings_csv)
    return movies_df, ratings_df

def preprocess_data(movies_df, ratings_df, use_cuda=None):
    """Turn the raw movie and rating frames into a normalized user x movie matrix.

    Args:
        movies_df: frame with three columns; they are renamed in place to
            ``MovieID``, ``Title``, ``Genres``.
        ratings_df: frame with four columns; they are renamed in place to
            ``UserID``, ``MovieID``, ``Rating``, ``Timestamp``.
        use_cuda: whether the frames need converting with ``to_pandas()``
            before the values are extracted. When ``None`` (the default) this
            is inferred from whether the frame has a ``to_pandas`` method, so
            the function no longer depends on a module-level ``use_cuda``.

    Returns:
        ``(trX, n_columns)``: ``trX`` holds the values of the pivoted rating
        table with missing ratings set to 0 and all ratings divided by 5.0;
        ``n_columns`` is the number of columns of that table.
    """
    movies_df.columns = ['MovieID', 'Title', 'Genres']
    movies_df = movies_df.drop('Genres', axis=1)
    print('shape of movies data frame : ',movies_df.shape)

    ratings_df.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
    ratings_df = ratings_df.drop('Timestamp', axis=1)
    print('shape of ratings data frame : ',ratings_df.shape)

    # Only the first 200000 ratings are used.
    ratings_df = ratings_df.head(200000)

    # One row per user, one column per movie.
    user_rating_df = ratings_df.pivot(index='UserID', columns='MovieID')

    # Remove NaNs and normalize.
    user_rating_df = user_rating_df.fillna(0)
    norm_user_rating_df = user_rating_df / 5.0

    if use_cuda is None:
        # pandas frames have no to_pandas(); frames that do need converting.
        use_cuda = hasattr(norm_user_rating_df, 'to_pandas')

    if use_cuda:
        trX = norm_user_rating_df.to_pandas().values
    else:
        trX = norm_user_rating_df.values

    return trX, len(user_rating_df.columns)

class RBM_Model(tf.Module):
    """Restricted Boltzmann machine with one visible and one hidden layer.

    Attributes:
        vb: bias of the visible units, initialised to zeros.
        hb: bias of the hidden units, initialised to zeros.
        W: visible x hidden weight matrix, initialised to zeros.
        learning_rate: step size used by ``train``; ``None`` means "use the
            module-level ``alpha``".
    """

    def __init__(self, visibleUnits, hiddenUnits, learning_rate=None):
        """Create zero-initialised parameters.

        Args:
            visibleUnits: number of visible units (i.e. number of unique movies).
            hiddenUnits: number of hidden units (i.e. number of features to learn).
            learning_rate: optional step size for ``train``. When omitted,
                ``train`` reads the module-level ``alpha`` instead.
        """
        super().__init__()
        print('init')
        self.learning_rate = learning_rate
        # dtype must be passed by keyword: the second positional parameter of
        # tf.Variable is `trainable`, not the dtype.
        self.vb = tf.Variable(tf.zeros([visibleUnits]), dtype=tf.float32)
        self.hb = tf.Variable(tf.zeros([hiddenUnits]), dtype=tf.float32)
        self.W = tf.Variable(tf.zeros([visibleUnits, hiddenUnits]), dtype=tf.float32)

    def hidden_layer(self, v0_state, W, hb):
        """Sample binary hidden states for one visible vector.

        ``v0_state`` is wrapped in a list so it is multiplied as a one-row
        matrix; the result therefore has one row.
        """
        h0_prob = tf.nn.sigmoid(tf.matmul([v0_state], W) + hb)  # probabilities of the hidden units
        # sign() gives -1/0/1 and relu() clamps negatives to 0, so a unit is 1
        # where its probability exceeds the uniform draw and 0 otherwise.
        h0_state = tf.nn.relu(tf.sign(h0_prob - tf.random.uniform(tf.shape(h0_prob))))  # sample_h_given_X
        return h0_state

    def reconstructed_output(self, h0_state, W, vb):
        """Sample binary visible states from hidden states; return the first row."""
        v1_prob = tf.nn.sigmoid(tf.matmul(h0_state, tf.transpose(W)) + vb)
        v1_state = tf.nn.relu(tf.sign(v1_prob - tf.random.uniform(tf.shape(v1_prob))))  # sample_v_given_h
        return v1_state[0]

    def error(self, v0_state, v1_state):
        """Return the mean squared difference between the two visible vectors."""
        return tf.reduce_mean(tf.square(v0_state - v1_state))

    def train(self, v0_state, v1_state, h0_state, h1_state):
        """Update ``W``, ``vb`` and ``hb`` from one input/reconstruction pair.

        The weight step is ``v0^T h0 - v1^T h1`` scaled by the learning rate.
        The parameters are rebound to the resulting tensors, not assigned in
        place, so an earlier reference to ``W`` keeps its old values.
        """
        # Fall back to the global `alpha` for callers that do not pass
        # learning_rate to the constructor.
        lr = alpha if self.learning_rate is None else self.learning_rate
        delta_W = tf.matmul(tf.transpose([v0_state]), h0_state) - tf.matmul(tf.transpose([v1_state]), h1_state)
        self.W = self.W + lr * delta_W
        # NOTE(review): the visible states are indexed like vectors elsewhere
        # in this class; if they are 1-D here, reduce_mean over axis 0 yields
        # a scalar that is broadcast onto every visible bias -- confirm this
        # is intended.
        self.vb = self.vb + lr * tf.reduce_mean(v0_state - v1_state, 0)
        self.hb = self.hb + lr * tf.reduce_mean(h0_state - h1_state, 0)


if __name__ == '__main__':
    # Script entry point: parse options, load and preprocess the data, then
    # train the RBM one sample at a time and report the training time.

    parser = argparse.ArgumentParser(description='Tensorflow Movie Recommender Example')
    parser.add_argument('--batch-size', type=int, default=128, metavar='N',
                    help='input batch size for training (default: 128)')
    parser.add_argument('--epochs', type=int, default=2, metavar='N',
                    help='number of epochs to train (default: 2)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                    help='learning rate (default: 0.01)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')

    args = parser.parse_args()
    print(args)

    use_cuda = not args.no_cuda
    print('using cuda : ', use_cuda)

    ####  load data ####
    movies_df,ratings_df = load_data(use_cuda)

    ####  preprocess data ####
    trX,visibleUnits = preprocess_data(movies_df,ratings_df)

    hiddenUnits = 20
    epochs = args.epochs
    batchsize = args.batch_size
    errors = []
    weights = []
    # Number of sample/reconstruct/update passes per training sample. Not
    # named `K`, so the Keras backend alias imported at the top of the file
    # is not overwritten.
    k_steps = 1
    # Must stay a module-level name: RBM_Model.train reads `alpha`.
    alpha = args.lr
    train_ds = tf.data.Dataset.from_tensor_slices(np.float32(trX)).batch(batchsize)
    total_batches = len(trX) // batchsize

    rbm = RBM_Model(visibleUnits,hiddenUnits)

    _train_starttime = datetime.datetime.now()

    for epoch in range(epochs):
        for batch_number, batch_x in enumerate(train_ds):
            last_sample = len(batch_x) - 1

            for i_sample in range(len(batch_x)):
                v0_state = batch_x[i_sample]
                for _ in range(k_steps):
                    h0_state = rbm.hidden_layer(v0_state, rbm.W, rbm.hb)
                    v1_state = rbm.reconstructed_output(h0_state, rbm.W, rbm.vb)
                    h1_state = rbm.hidden_layer(v1_state, rbm.W, rbm.hb)
                    rbm.train(v0_state,v1_state,h0_state,h1_state)
                    v0_state = v1_state

                # Report once per batch, on its last sample. The index runs
                # from 0 to len(batch_x) - 1, so comparing against
                # len(batch_x) would never match.
                if i_sample == last_sample:
                    err = rbm.error(batch_x[i_sample], v1_state)
                    errors.append(err)
                    weights.append(rbm.W)
                    print ( 'Epoch: %d' % (epoch + 1),
                       "batch #: %i " % batch_number, "of %i" % total_batches,
                       "sample #: %i" % i_sample,
                       'reconstruction error: %f' % float(err))

    _train_endtime = datetime.datetime.now()

    # total_seconds() covers the whole interval; .seconds omits whole days.
    print("\n exclusive Training cost: ", int((_train_endtime - _train_starttime).total_seconds()), " seconds.")

Overwriting /project_data/data_asset/samaya/main_2.py



## Step 2 :  Training the model on CPU

#### Training was run from a Cloud Pak for Data Notebook utilizing a CPU kernel. 


In the custom environment that was created with **16vCPU** and **32GB**, it took **277 seconds** (or approximately **5 minutes**) to complete 1 EPOCH training.

In [3]:
import datetime

starttime = datetime.datetime.now()

! python /project_data/data_asset/samaya/main_2.py --no-cuda --epochs 5 --batch-size 10000

endtime = datetime.datetime.now()
print("Training cost: ", (endtime - starttime).seconds, " seconds.")


pyarrow version : 3.0.0
pyarrow path :  /opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages/pyarrow/__init__.py
Tensorflow version :  2.1.0
2021-03-25 17:54:34.017330: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
2021-03-25 17:54:34.043612: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2095074999 Hz
2021-03-25 17:54:34.046318: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x56038a78b470 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-03-25 17:54:34.046350: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
List of available devices :  [name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8380388092102805856
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 171798691

<a id = "gpu"></a>
## Step 3 : Training the model on GPU with Watson Machine Learning Accelerator

#### Prepare the model files for running on GPU:

In [4]:
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

from matplotlib import pyplot as plt
%pylab inline

import base64
import json
import time
import urllib

Populating the interactive namespace from numpy and matplotlib



#### Configuring your environment and project details

To set up your project details, provide your credentials in this cell. You must include your cluster URL, username, and password.

In [6]:
hostname='wmla-console-wmla.apps.cpd35-beta.cpolab.ibm.com'  # please enter Watson Machine Learning Accelerator host name
login='username:password' # please enter the login and password
# Base64 of "username:password", sent as an HTTP Basic Authorization header.
es = base64.b64encode(login.encode('utf-8')).decode("utf-8")

commonHeaders={'Authorization': 'Basic '+es}
req = requests.Session()
auth_url = 'https://{}/auth/v1/logon'.format(hostname)
print(auth_url)

# NOTE: verify=False turns off TLS certificate verification for this request.
a=requests.get(auth_url,headers=commonHeaders, verify=False)
# Stop here with the HTTP status if logon was rejected, instead of failing
# below with a KeyError on 'accessToken'.
a.raise_for_status()
access_token=a.json()['accessToken']


https://wmla-console-wmla.apps.cpd35-beta.cpolab.ibm.com/auth/v1/logon


In [7]:
# Base URL of the deep learning REST API on the configured host.
dl_rest_url = 'https://{}/platform/rest/deeplearning/v1'.format(hostname)
# From here on requests carry the access token instead of the Basic
# credentials used for logon.
commonHeaders={'accept': 'application/json', 'X-Auth-Token': access_token}
# Session object used by the later cells for GET requests.
req = requests.Session()

In [8]:
# Health check: request the service configuration and report a failure
# instead of discarding the response.
confUrl = 'https://{}/platform/rest/deeplearning/v1/conf'.format(hostname)
r = req.get(confUrl, headers=commonHeaders, verify=False)
if not r.ok:
    print('health check failed: code=%s, %s'%(r.status_code, r.content))


In [9]:
import tarfile
import tempfile
import os
import json
import pprint
import pandas as pd
from IPython.display import clear_output

def query_job_status(job_id,refresh_rate=3) :
    """Poll a submitted job and display its status until it stops running.

    Args:
        job_id: mapping with an ``'id'`` key identifying the job (the caller
            passes the JSON body returned by the submit request).
        refresh_rate: seconds to wait between two polls.

    Returns:
        The last ``requests`` response, i.e. the first one whose ``'state'``
        is not ``PENDING_CRD_SCHEDULER``, ``SUBMITTED`` or ``RUNNING``.
    """
    execURL = dl_rest_url  +'/execs/'+ job_id['id']
    pp = pprint.PrettyPrinter(indent=2)
    # Display option only needs setting once, not on every poll.
    pd.set_option('max_colwidth', 120)
    active_states = ('PENDING_CRD_SCHEDULER', 'SUBMITTED', 'RUNNING')

    while True:
        res = req.get(execURL, headers=commonHeaders, verify=False)
        status = res.json()  # parse the body once per poll
        clear_output()
        print("Refreshing every {} seconds".format(refresh_rate))
        display(pd.DataFrame(status, index=[0]))
        pp.pprint(status)
        if status['state'] not in active_states:
            # Return immediately: no extra sleep once the job has ended.
            return res
        time.sleep(refresh_rate)

# Read the script into memory so the file handle is closed straight away and
# the same payload can be submitted more than once. An open handle would be
# left at end-of-file after the first upload. The (filename, content) tuple
# keeps the uploaded file name as main_2.py.
with open("/project_data/data_asset/samaya/main_2.py", 'rb') as model_file:
    files = {'file': ('main_2.py', model_file.read())}

# Arguments passed to the job submission request as its `args` query value.
args = '--exec-start tensorflow --cs-datastore-meta type=fs \
                     --workerDeviceNum 1 \
                     --conda-env-name rapids-0.18-movie-recommendation  \
                     --model-main main_2.py  --epochs 5 --batch-size 10000'


In the conda environment that was created, it took 5 seconds to complete 5 EPOCH training.

In [10]:
starttime = datetime.datetime.now()

# Submit the training job: the script is uploaded as a multipart file and the
# job arguments are passed in the query string.
r = requests.post(dl_rest_url+'/execs?args='+args, files=files,
                  headers=commonHeaders, verify=False)
if not r.ok:
    print('submit job failed: code=%s, %s'%(r.status_code, r.content))
else:
    # Poll and report timing only for an accepted job; a failed submission
    # has no job to poll.
    job_status = query_job_status(r.json(),refresh_rate=5)

    endtime = datetime.datetime.now()

    # total_seconds() covers the whole interval; .seconds omits whole days.
    print("\nTraining cost: ", int((endtime - starttime).total_seconds()), " seconds.")

Refreshing every 5 seconds


Unnamed: 0,id,args,submissionId,creator,state,appId,schedulerUrl,modelFileOwnerName,workDir,appName,createTime,elastic,nameSpace,numWorker,framework
0,wmla-425,--exec-start tensorflow --cs-datastore-meta type=fs --workerDeviceNum 1 --...,wmla-425,dse_user,PENDING_CRD_SCHEDULER,wmla-425,https://wmla-mss:9080,wmla,/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code,SingleNodeTensorflowTrain,2021-03-25T18:05:22Z,False,wmla,1,tensorflow


{ 'appId': 'wmla-425',
  'appName': 'SingleNodeTensorflowTrain',
  'args': '--exec-start tensorflow --cs-datastore-meta '
          'type=fs                      --workerDeviceNum '
          '1                      --conda-env-name '
          'rapids-0.18-movie-recommendation                       --model-main '
          'main_2.py  --epochs 5 --batch-size 10000 ',
  'createTime': '2021-03-25T18:05:22Z',
  'creator': 'dse_user',
  'elastic': False,
  'framework': 'tensorflow',
  'id': 'wmla-425',
  'modelFileOwnerName': 'wmla',
  'nameSpace': 'wmla',
  'numWorker': 1,
  'schedulerUrl': 'https://wmla-mss:9080',
  'state': 'PENDING_CRD_SCHEDULER',
  'submissionId': 'wmla-425',
  'workDir': '/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code'}
Refreshing every 5 seconds


Unnamed: 0,id,args,submissionId,creator,state,appId,schedulerUrl,modelFileOwnerName,workDir,appName,createTime,elastic,nameSpace,numWorker,framework
0,wmla-425,--exec-start tensorflow --cs-datastore-meta type=fs --workerDeviceNum 1 --...,wmla-425,dse_user,PENDING_CRD_SCHEDULER,wmla-425,https://wmla-mss:9080,wmla,/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code,SingleNodeTensorflowTrain,2021-03-25T18:05:22Z,False,wmla,1,tensorflow


{ 'appId': 'wmla-425',
  'appName': 'SingleNodeTensorflowTrain',
  'args': '--exec-start tensorflow --cs-datastore-meta '
          'type=fs                      --workerDeviceNum '
          '1                      --conda-env-name '
          'rapids-0.18-movie-recommendation                       --model-main '
          'main_2.py  --epochs 5 --batch-size 10000 ',
  'createTime': '2021-03-25T18:05:22Z',
  'creator': 'dse_user',
  'elastic': False,
  'framework': 'tensorflow',
  'id': 'wmla-425',
  'modelFileOwnerName': 'wmla',
  'nameSpace': 'wmla',
  'numWorker': 1,
  'schedulerUrl': 'https://wmla-mss:9080',
  'state': 'PENDING_CRD_SCHEDULER',
  'submissionId': 'wmla-425',
  'workDir': '/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code'}
Refreshing every 5 seconds


Unnamed: 0,id,args,submissionId,creator,state,appId,schedulerUrl,modelFileOwnerName,workDir,appName,createTime,elastic,nameSpace,numWorker,framework
0,wmla-425,--exec-start tensorflow --cs-datastore-meta type=fs --workerDeviceNum 1 --...,wmla-425,dse_user,RUNNING,wmla-425,https://wmla-mss:9080,wmla,/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code,SingleNodeTensorflowTrain,2021-03-25T18:05:22Z,False,wmla,1,tensorflow


{ 'appId': 'wmla-425',
  'appName': 'SingleNodeTensorflowTrain',
  'args': '--exec-start tensorflow --cs-datastore-meta '
          'type=fs                      --workerDeviceNum '
          '1                      --conda-env-name '
          'rapids-0.18-movie-recommendation                       --model-main '
          'main_2.py  --epochs 5 --batch-size 10000 ',
  'createTime': '2021-03-25T18:05:22Z',
  'creator': 'dse_user',
  'elastic': False,
  'framework': 'tensorflow',
  'id': 'wmla-425',
  'modelFileOwnerName': 'wmla',
  'nameSpace': 'wmla',
  'numWorker': 1,
  'schedulerUrl': 'https://wmla-mss:9080',
  'state': 'RUNNING',
  'submissionId': 'wmla-425',
  'workDir': '/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code'}
Refreshing every 5 seconds


Unnamed: 0,id,args,submissionId,creator,state,appId,schedulerUrl,modelFileOwnerName,workDir,appName,createTime,elastic,nameSpace,numWorker,framework
0,wmla-425,--exec-start tensorflow --cs-datastore-meta type=fs --workerDeviceNum 1 --...,wmla-425,dse_user,RUNNING,wmla-425,https://wmla-mss:9080,wmla,/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code,SingleNodeTensorflowTrain,2021-03-25T18:05:22Z,False,wmla,1,tensorflow


{ 'appId': 'wmla-425',
  'appName': 'SingleNodeTensorflowTrain',
  'args': '--exec-start tensorflow --cs-datastore-meta '
          'type=fs                      --workerDeviceNum '
          '1                      --conda-env-name '
          'rapids-0.18-movie-recommendation                       --model-main '
          'main_2.py  --epochs 5 --batch-size 10000 ',
  'createTime': '2021-03-25T18:05:22Z',
  'creator': 'dse_user',
  'elastic': False,
  'framework': 'tensorflow',
  'id': 'wmla-425',
  'modelFileOwnerName': 'wmla',
  'nameSpace': 'wmla',
  'numWorker': 1,
  'schedulerUrl': 'https://wmla-mss:9080',
  'state': 'RUNNING',
  'submissionId': 'wmla-425',
  'workDir': '/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code'}
Refreshing every 5 seconds


Unnamed: 0,id,args,submissionId,creator,state,appId,schedulerUrl,modelFileOwnerName,workDir,appName,createTime,elastic,nameSpace,numWorker,framework
0,wmla-425,--exec-start tensorflow --cs-datastore-meta type=fs --workerDeviceNum 1 --...,wmla-425,dse_user,RUNNING,wmla-425,https://wmla-mss:9080,wmla,/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code,SingleNodeTensorflowTrain,2021-03-25T18:05:22Z,False,wmla,1,tensorflow


{ 'appId': 'wmla-425',
  'appName': 'SingleNodeTensorflowTrain',
  'args': '--exec-start tensorflow --cs-datastore-meta '
          'type=fs                      --workerDeviceNum '
          '1                      --conda-env-name '
          'rapids-0.18-movie-recommendation                       --model-main '
          'main_2.py  --epochs 5 --batch-size 10000 ',
  'createTime': '2021-03-25T18:05:22Z',
  'creator': 'dse_user',
  'elastic': False,
  'framework': 'tensorflow',
  'id': 'wmla-425',
  'modelFileOwnerName': 'wmla',
  'nameSpace': 'wmla',
  'numWorker': 1,
  'schedulerUrl': 'https://wmla-mss:9080',
  'state': 'RUNNING',
  'submissionId': 'wmla-425',
  'workDir': '/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code'}
Refreshing every 5 seconds


Unnamed: 0,id,args,submissionId,creator,state,appId,schedulerUrl,modelFileOwnerName,workDir,appName,createTime,elastic,nameSpace,numWorker,framework
0,wmla-425,--exec-start tensorflow --cs-datastore-meta type=fs --workerDeviceNum 1 --...,wmla-425,dse_user,RUNNING,wmla-425,https://wmla-mss:9080,wmla,/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code,SingleNodeTensorflowTrain,2021-03-25T18:05:22Z,False,wmla,1,tensorflow


{ 'appId': 'wmla-425',
  'appName': 'SingleNodeTensorflowTrain',
  'args': '--exec-start tensorflow --cs-datastore-meta '
          'type=fs                      --workerDeviceNum '
          '1                      --conda-env-name '
          'rapids-0.18-movie-recommendation                       --model-main '
          'main_2.py  --epochs 5 --batch-size 10000 ',
  'createTime': '2021-03-25T18:05:22Z',
  'creator': 'dse_user',
  'elastic': False,
  'framework': 'tensorflow',
  'id': 'wmla-425',
  'modelFileOwnerName': 'wmla',
  'nameSpace': 'wmla',
  'numWorker': 1,
  'schedulerUrl': 'https://wmla-mss:9080',
  'state': 'RUNNING',
  'submissionId': 'wmla-425',
  'workDir': '/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code'}
Refreshing every 5 seconds


Unnamed: 0,id,args,submissionId,creator,state,appId,schedulerUrl,modelFileOwnerName,workDir,appName,createTime,elastic,nameSpace,numWorker,framework
0,wmla-425,--exec-start tensorflow --cs-datastore-meta type=fs --workerDeviceNum 1 --...,wmla-425,dse_user,RUNNING,wmla-425,https://wmla-mss:9080,wmla,/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code,SingleNodeTensorflowTrain,2021-03-25T18:05:22Z,False,wmla,1,tensorflow


{ 'appId': 'wmla-425',
  'appName': 'SingleNodeTensorflowTrain',
  'args': '--exec-start tensorflow --cs-datastore-meta '
          'type=fs                      --workerDeviceNum '
          '1                      --conda-env-name '
          'rapids-0.18-movie-recommendation                       --model-main '
          'main_2.py  --epochs 5 --batch-size 10000 ',
  'createTime': '2021-03-25T18:05:22Z',
  'creator': 'dse_user',
  'elastic': False,
  'framework': 'tensorflow',
  'id': 'wmla-425',
  'modelFileOwnerName': 'wmla',
  'nameSpace': 'wmla',
  'numWorker': 1,
  'schedulerUrl': 'https://wmla-mss:9080',
  'state': 'RUNNING',
  'submissionId': 'wmla-425',
  'workDir': '/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code'}
Refreshing every 5 seconds


Unnamed: 0,id,args,submissionId,creator,state,appId,schedulerUrl,modelFileOwnerName,workDir,appName,createTime,elastic,nameSpace,numWorker,framework
0,wmla-425,--exec-start tensorflow --cs-datastore-meta type=fs --workerDeviceNum 1 --...,wmla-425,dse_user,RUNNING,wmla-425,https://wmla-mss:9080,wmla,/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code,SingleNodeTensorflowTrain,2021-03-25T18:05:22Z,False,wmla,1,tensorflow


{ 'appId': 'wmla-425',
  'appName': 'SingleNodeTensorflowTrain',
  'args': '--exec-start tensorflow --cs-datastore-meta '
          'type=fs                      --workerDeviceNum '
          '1                      --conda-env-name '
          'rapids-0.18-movie-recommendation                       --model-main '
          'main_2.py  --epochs 5 --batch-size 10000 ',
  'createTime': '2021-03-25T18:05:22Z',
  'creator': 'dse_user',
  'elastic': False,
  'framework': 'tensorflow',
  'id': 'wmla-425',
  'modelFileOwnerName': 'wmla',
  'nameSpace': 'wmla',
  'numWorker': 1,
  'schedulerUrl': 'https://wmla-mss:9080',
  'state': 'RUNNING',
  'submissionId': 'wmla-425',
  'workDir': '/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code'}
Refreshing every 5 seconds


Unnamed: 0,id,args,submissionId,creator,state,appId,schedulerUrl,modelFileOwnerName,workDir,appName,createTime,elastic,nameSpace,numWorker,framework
0,wmla-425,--exec-start tensorflow --cs-datastore-meta type=fs --workerDeviceNum 1 --...,wmla-425,dse_user,RUNNING,wmla-425,https://wmla-mss:9080,wmla,/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code,SingleNodeTensorflowTrain,2021-03-25T18:05:22Z,False,wmla,1,tensorflow


{ 'appId': 'wmla-425',
  'appName': 'SingleNodeTensorflowTrain',
  'args': '--exec-start tensorflow --cs-datastore-meta '
          'type=fs                      --workerDeviceNum '
          '1                      --conda-env-name '
          'rapids-0.18-movie-recommendation                       --model-main '
          'main_2.py  --epochs 5 --batch-size 10000 ',
  'createTime': '2021-03-25T18:05:22Z',
  'creator': 'dse_user',
  'elastic': False,
  'framework': 'tensorflow',
  'id': 'wmla-425',
  'modelFileOwnerName': 'wmla',
  'nameSpace': 'wmla',
  'numWorker': 1,
  'schedulerUrl': 'https://wmla-mss:9080',
  'state': 'RUNNING',
  'submissionId': 'wmla-425',
  'workDir': '/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code'}
Refreshing every 5 seconds


Unnamed: 0,id,args,submissionId,creator,state,appId,schedulerUrl,modelFileOwnerName,workDir,appName,createTime,elastic,nameSpace,numWorker,framework
0,wmla-425,--exec-start tensorflow --cs-datastore-meta type=fs --workerDeviceNum 1 --...,wmla-425,dse_user,RUNNING,wmla-425,https://wmla-mss:9080,wmla,/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code,SingleNodeTensorflowTrain,2021-03-25T18:05:22Z,False,wmla,1,tensorflow


{ 'appId': 'wmla-425',
  'appName': 'SingleNodeTensorflowTrain',
  'args': '--exec-start tensorflow --cs-datastore-meta '
          'type=fs                      --workerDeviceNum '
          '1                      --conda-env-name '
          'rapids-0.18-movie-recommendation                       --model-main '
          'main_2.py  --epochs 5 --batch-size 10000 ',
  'createTime': '2021-03-25T18:05:22Z',
  'creator': 'dse_user',
  'elastic': False,
  'framework': 'tensorflow',
  'id': 'wmla-425',
  'modelFileOwnerName': 'wmla',
  'nameSpace': 'wmla',
  'numWorker': 1,
  'schedulerUrl': 'https://wmla-mss:9080',
  'state': 'RUNNING',
  'submissionId': 'wmla-425',
  'workDir': '/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code'}
Refreshing every 5 seconds


Unnamed: 0,id,args,submissionId,creator,state,appId,schedulerUrl,modelFileOwnerName,workDir,appName,createTime,elastic,nameSpace,numWorker,framework
0,wmla-425,--exec-start tensorflow --cs-datastore-meta type=fs --workerDeviceNum 1 --...,wmla-425,dse_user,FINISHED,wmla-425,https://wmla-mss:9080,wmla,/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code,SingleNodeTensorflowTrain,2021-03-25T18:05:22Z,False,wmla,1,tensorflow


{ 'appId': 'wmla-425',
  'appName': 'SingleNodeTensorflowTrain',
  'args': '--exec-start tensorflow --cs-datastore-meta '
          'type=fs                      --workerDeviceNum '
          '1                      --conda-env-name '
          'rapids-0.18-movie-recommendation                       --model-main '
          'main_2.py  --epochs 5 --batch-size 10000 ',
  'createTime': '2021-03-25T18:05:22Z',
  'creator': 'dse_user',
  'elastic': False,
  'framework': 'tensorflow',
  'id': 'wmla-425',
  'modelFileOwnerName': 'wmla',
  'nameSpace': 'wmla',
  'numWorker': 1,
  'schedulerUrl': 'https://wmla-mss:9080',
  'state': 'FINISHED',
  'submissionId': 'wmla-425',
  'workDir': '/gpfs/myresultfs/dse_user/batchworkdir/wmla-425/_submitted_code'}

Training cost:  61  seconds.


##### 