## Packaging Champion Model (MLeap Flavor) for GCP deployment

### References

- https://docs.azuredatabricks.net/_static/notebooks/mleap-model-export-demo-scala.html
- https://cloud.google.com/dataproc/docs/tutorials/spark-scala

Let's create a simple MLflow project programmatically, in three steps:

1. Create a working directory

2. Create a Scala job

3. Create score.py

<!-- 3. Create the .sh to run the score.py


2. Create the ML project:
  - MLProject file
  - Conda environment
  - Basic machine learning script

3. Create the scoring script
4. Test the scoring script
5. Create the entrypoint file:
  - execute .sh (Create a Spark cluster, Install Mlflow, Run Batch Scoring Job based on score python code in cloud bucket) -->

## 1. Create a working Dir

In [4]:
# Root folder (DBFS path) of the scoring project; later cells reuse this name.
MLpackagePath = "/FileStore/ModelProjects/Boston_ML"
# Start from a clean slate: remove any previous copy of the folder
# (second argument True is presumably the recursive flag — see the dbutils.fs docs),
# then recreate it empty.
dbutils.fs.rm(MLpackagePath, True)
dbutils.fs.mkdirs(MLpackagePath)
# Last expression of the cell: list the folder that was just recreated.
dbutils.fs.ls(MLpackagePath)

In [5]:
# Prepare the environment
# The destination is derived from MLpackagePath (defined in the working-dir cell)
# instead of repeating the literal path, so the copies always land in the same
# folder that is created and listed through MLpackagePath.
project_dir = f"dbfs:{MLpackagePath}"
# Copy data to score
dbutils.fs.cp("dbfs:/data/boston_house_prices.csv", project_dir)
# Copy model to consume for scoring
dbutils.fs.cp("dbfs:/example/lrModel.zip", project_dir)
# Check the content
dbutils.fs.ls(MLpackagePath)

## 2. Download the folder programmatically

In [7]:
# Copy model to consume for scoring
# NOTE(review): the destination carries no scheme. If dbutils resolves it against
# DBFS, the archive lands in dbfs:/tmp rather than on the driver's local disk
# ("file:/tmp") — confirm which one this "download" step is meant to target.
dbutils.fs.cp("dbfs:/example/lrModel.zip","/tmp")
# Check the content
dbutils.fs.ls("/tmp")

## 2. Create score.py job

In [9]:
#!/usr/bin/python

import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import RegressionEvaluator


import os
import sys
import argparse
import tempfile
import warnings

# Read Data

# def read_data_csv(spark, inputPath_CSV):
  
#   """
#   Function to load data in the Spark Session 
#   :param spark: spark session 
#   :param inputPath: path to get the data 
#   :return: df
#   """
  
#   print('trying to read the data...')
  
#   try:
#     # define the schema
#     schema = StructType([
#       StructField('crim',DoubleType(),True),
#       StructField('zn',DoubleType(),True),
#       StructField('indus',DoubleType(),True),
#       StructField('chas',IntegerType(),True),
#       StructField('nox',DoubleType(),True),
#       StructField('rm',DoubleType(),True),
#       StructField('age',DoubleType(),True),
#       StructField('dis',DoubleType(),True),
#       StructField('rad',IntegerType(),True),
#       StructField('tax',IntegerType(),True),
#       StructField('ptratio',DoubleType(),True),
#       StructField('b',DoubleType(),True),
#       StructField('lstat',DoubleType(),True),
#       StructField('medv',DoubleType(),True)]
#     )

#     df = (spark.read
#           .option("HEADER", True)
#           .schema(schema)
#           .csv(datapath))
    
#   except ValueError:
#     print('At least, one variable format is wrong! \
#     Please check the data')
      
#   else:
#     print('Data to score have been read successfully!')
#     return df
  
# #Preprocessing

# def preprocessing(df):
  
#   """
#   Function to preprocess data 
#   :param df: A pyspark DataFrame 
#   :return: abt_to_score
#   """
  
#   print('Data preprocessing...')
  
#   features = df.schema.names[:-1]
#   assembler_features = VectorAssembler(inputCols=features, outputCol="features")
#   abt_to_score = assembler_features.transform(df)
#   return abt_to_score

# #Scoring
# def score_data(abt_to_score, modelPath):
  
#   """
#   Function to score data 
#   :param abt_to_score: A pyspark DataFrame to score
#   :param modelPath: The modelpath associated to .zip mleap flavor
#   :return: scoredData
#   """
  
#   print('Scoring process starts...')
  
#   deserializedPipeline = PipelineModel.deserializeFromBundle("jar:file:{}".format(modelpath))
#   scoredData = deserializedPipeline.transform(abt_to_score)
#   return scoredData  
  
# def write_output_csv(scoredData, outputPath_CSV):
  
#   """
#   Function to write predictions
#   :param scoredData: A pyspark DataFrame of predictions
#   :param outputPath: The path to write the ouput table
#   :return: scoredData
#   """

#   scoredData.toPandas().to_csv(outputPath_CSV, sep=',', index=False)
#   return outputDf.toPandas().to_dict()
  
def main(argv=None):
  """
  Entry point of the scoring job: parse the command line, start Spark and
  load the CSV file to score.

  :param argv: optional list of command-line arguments; None (the default)
               lets argparse read sys.argv[1:], exactly as a bare main() call did
  :return: 0 on success, so that sys.exit(main()) reports a clean exit
  """

  parser = argparse.ArgumentParser(description='Score')

  parser.add_argument('-i', dest="inputpath_CSV",
                        required=True, help='Provide the input path of data to score')

  args = parser.parse_args(argv)
  input_path_CSV = args.inputpath_CSV

  # Only `pyspark` and `SparkSession` are imported by this cell: the bare names
  # SparkContext / SQLContext used previously raised NameError, and `spark` was
  # never assigned. Building a SparkSession from the same configuration gives a
  # single, defined handle; getOrCreate() reuses a running session, so no
  # "already exists" error handling is required.
  conf = pyspark.SparkConf().setMaster("local").setAppName("My app")
  spark = SparkSession.builder.config(conf=conf).getOrCreate()
  print('Created a SparkContext')

  #Read data
  # read_data_csv() is commented out in this cell, so calling it would raise
  # NameError; read the CSV (with its header row) directly instead.
  # `df` is the hand-off point for the preprocessing / scoring steps that are
  # still commented out above.
  df = spark.read.option("header", True).csv(input_path_CSV)
  print('Data to score have been read successfully!')

  return 0


if __name__=="__main__":
  sys.exit(main())

## 3. Create a Spark Scala job to run on Cloud Dataproc for batch model deployment

In [11]:
# Write the scoring script to <MLpackagePath>/score.py.
# - The triple-quoted literal below IS the file content; .strip() removes the
#   leading/trailing blank lines so the shebang becomes the first line of the file.
# - The trailing True is the third positional argument of dbutils.fs.put
#   (presumably the overwrite flag — confirm against the dbutils.fs docs).
# - In this version of the script, main() only parses the -i argument; the
#   read / preprocess / score / write helpers are all commented out.
# - NOTE: the literal is not a raw string, so the backslash at the end of the
#   "At least, one variable format is wrong!" line swallows the newline and the
#   two lines are joined in the written file (harmless while that code stays
#   commented out).
dbutils.fs.put(f"{MLpackagePath}/score.py",
"""
#!/usr/bin/python

import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import RegressionEvaluator


import os
import sys
import argparse
import tempfile
import warnings

# Read Data

# def read_data_csv(spark, inputPath_CSV):
  
#   '''
#   Function to load data in the Spark Session 
#   :param spark: spark session 
#   :param inputPath: path to get the data 
#   :return: df
#   '''
  
#   print('trying to read the data...')
  
#   try:
#     # define the schema
#     schema = StructType([
#       StructField('crim',DoubleType(),True),
#       StructField('zn',DoubleType(),True),
#       StructField('indus',DoubleType(),True),
#       StructField('chas',IntegerType(),True),
#       StructField('nox',DoubleType(),True),
#       StructField('rm',DoubleType(),True),
#       StructField('age',DoubleType(),True),
#       StructField('dis',DoubleType(),True),
#       StructField('rad',IntegerType(),True),
#       StructField('tax',IntegerType(),True),
#       StructField('ptratio',DoubleType(),True),
#       StructField('b',DoubleType(),True),
#       StructField('lstat',DoubleType(),True),
#       StructField('medv',DoubleType(),True)]
#     )

#     df = (spark.read
#           .option("HEADER", True)
#           .schema(schema)
#           .csv(datapath))
    
#   except ValueError:
#     print('At least, one variable format is wrong! \
#     Please check the data')
      
#   else:
#     print('Data to score have been read successfully!')
#     return df
  
# #Preprocessing

# def preprocessing(df):
  
#   '''
#   Function to preprocess data 
#   :param df: A pyspark DataFrame 
#   :return: abt_to_score
#   '''
  
#   print('Data preprocessing...')
  
#   features = df.schema.names[:-1]
#   assembler_features = VectorAssembler(inputCols=features, outputCol="features")
#   abt_to_score = assembler_features.transform(df)
#   return abt_to_score

# #Scoring
# def score_data(abt_to_score, modelPath):
  
#   '''
#   Function to score data 
#   :param abt_to_score: A pyspark DataFrame to score
#   :param modelPath: The modelpath associated to .zip mleap flavor
#   :return: scoredData
#   '''
  
#   print('Scoring process starts...')
  
#   deserializedPipeline = PipelineModel.deserializeFromBundle("jar:file:{}".format(modelpath))
#   scoredData = deserializedPipeline.transform(abt_to_score)
#   return scoredData  
  
# def write_output_csv(scoredData, outputPath_CSV):
  
#   '''
#   Function to write predictions
#   :param scoredData: A pyspark DataFrame of predictions
#   :param outputPath: The path to write the ouput table
#   :return: scoredData
#   '''

#   scoredData.toPandas().to_csv(outputPath_CSV, sep=',', index=False)
#   return outputDf.toPandas().to_dict()
  
def main():

  parser = argparse.ArgumentParser(description='Score')
  
  parser.add_argument('-i', dest="inputpath_CSV",
                        required=True, help='Provide the input path of data to score')

  args = parser.parse_args()
  input_path_CSV = args.inputpath_CSV
  
#   #Create a Spark Session
#   spark = SparkSession.builder.appName('MyApp').config("spark.master", "local").getOrCreate()
  
  #Read data
  #read_data_csv(spark, input_path_CSV)
  
  
if __name__=="__main__":

  from pyspark import SparkContext
  from pyspark.sql import SQLContext

  try:
    conf = pyspark.SparkConf().setMaster("local").setAppName("My app")
    sc = SparkContext.getOrCreate(conf)
   # sqlContext = SQLContext.getOrCreate(sc)
#     sc = pyspark.SparkContext()
#     sc.setLogLevel("ERROR")
#     sqlContext = pyspark.sql.SQLContext(sc)
    print('Created a SparkContext')
      
  except ValueError:
      warnings.warn('SparkContext already exists in this scope')
      
  sys.exit(main())

""".strip(), True)

In [12]:
import subprocess

# Run the generated score.py in a child interpreter against the sample CSV.
# stderr is merged into stdout so that a failing run reports its traceback
# through CalledProcessError.output.
score_cmd = [
    "python",
    "/dbfs/FileStore/ModelProjects/Boston_ML/score.py",
    "-i",
    "/dbfs/FileStore/ModelProjects/Boston_ML/boston_house_prices.csv",
]
try:
    output = subprocess.check_output(
        score_cmd,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
except subprocess.CalledProcessError as failure:
    # Non-zero exit status: show the status code and whatever the script printed.
    print("Status : FAIL", failure.returncode, failure.output)
else:
    print("Output: \n{}\n".format(output))

## Test score.py

In [14]:
#!/usr/bin/python
import click


import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import RegressionEvaluator


import os
import argparse
import tempfile
import warnings

# Read Data

# The option is exposed as --datapath (the flag the CliRunner check in this
# notebook passes) and, for backward compatibility, as --inputPath_CSV. The
# explicit third name pins the Python parameter to `inputPath_CSV`; without it
# click derives a lower-cased parameter name that does not match the signature.
@click.command()
@click.option("--datapath", "--inputPath_CSV", "inputPath_CSV", type=str,
              required=True, help="Path of the CSV file to score")
def read_data_csv(spark=None, inputPath_CSV=None):

  """
  Function to load data in the Spark Session
  :param spark: spark session; optional because click cannot supply it from
                the command line - the active session is used (or one is
                created) when it is None
  :param inputPath_CSV: path to get the data
  :return: df, or None when the ValueError branch is taken
  """

  print('trying to read the data...')

  # click only passes inputPath_CSV, so fall back to the current session.
  if spark is None:
    spark = SparkSession.builder.getOrCreate()

  # define the schema
  schema = StructType([
    StructField('crim',DoubleType(),True),
    StructField('zn',DoubleType(),True),
    StructField('indus',DoubleType(),True),
    StructField('chas',IntegerType(),True),
    StructField('nox',DoubleType(),True),
    StructField('rm',DoubleType(),True),
    StructField('age',DoubleType(),True),
    StructField('dis',DoubleType(),True),
    StructField('rad',IntegerType(),True),
    StructField('tax',IntegerType(),True),
    StructField('ptratio',DoubleType(),True),
    StructField('b',DoubleType(),True),
    StructField('lstat',DoubleType(),True),
    StructField('medv',DoubleType(),True)]
  )

  try:
    # The path comes from the inputPath_CSV parameter; the previous body read an
    # undefined name (`datapath`) and failed with NameError.
    df = (spark.read
          .option("HEADER", True)
          .schema(schema)
          .csv(inputPath_CSV))

  except ValueError:
    print('At least, one variable format is wrong! \
    Please check the data')

  else:
    print('Data to score have been read successfully!')
    return df
  
# #Preprocessing

# def preprocessing(df):
  
#   """
#   Function to preprocess data 
#   :param df: A pyspark DataFrame 
#   :return: abt_to_score
#   """
  
#   print('Data preprocessing...')
  
#   features = df.schema.names[:-1]
#   assembler_features = VectorAssembler(inputCols=features, outputCol="features")
#   abt_to_score = assembler_features.transform(df)
#   return abt_to_score

# #Scoring
# def score_data(abt_to_score, modelPath):
  
#   """
#   Function to score data 
#   :param abt_to_score: A pyspark DataFrame to score
#   :param modelPath: The modelpath associated to .zip mleap flavor
#   :return: scoredData
#   """
  
#   print('Scoring process starts...')
  
#   deserializedPipeline = PipelineModel.deserializeFromBundle("jar:file:{}".format(modelpath))
#   scoredData = deserializedPipeline.transform(abt_to_score)
#   return scoredData  
  
# def write_output_csv(scoredData, outputPath_CSV):
  
#   """
#   Function to write predictions
#   :param scoredData: A pyspark DataFrame of predictions
#   :param outputPath: The path to write the ouput table
#   :return: scoredData
#   """

#   scoredData.toPandas().to_csv(outputPath_CSV, sep=',', index=False)
#   return outputDf.toPandas().to_dict()
  
# def main():

#   parser = argparse.ArgumentParser(description='Score')

#   parser.add_argument('-s', dest="Spark_Session",
#                       help='Provide the name of Spark Session')
  
#   parser.add_argument('-i', dest="inputpath_CSV",
#                         required=True, help='Provide the input path of data to score')

#   args = parser.parse_args()
#   spark_session = args.Spark_session
#   input_path_CSV = args.Input_path_CSV
  
#   #Create a Spark Session
#   spark = SparkSession.builder.appName(spark_session).getOrCreate()
  
#   #Read data
#   read_data_csv(spark, inputPath_CSV)
  
  
# if __name__=="__main__":
#   sys.exit(main())

In [15]:
from click.testing import CliRunner

# Invoke the click command in-process. catch_exceptions=True keeps an exception
# raised by the command from propagating: it is stored on the result instead.
runner = CliRunner()
result1 = runner.invoke(read_data_csv, ['--datapath', '/data/boston_house_prices.csv'], catch_exceptions=True)

# Check to see that it worked. Because the traceback is swallowed by the runner,
# the failure message carries the exit code, the stored exception and the
# captured output; a bare "Code failed" gave nothing to debug with.
assert result1.exit_code == 0, (
    f"Code failed (exit code {result1.exit_code}): "
    f"{result1.exception!r}\n{result1.output}"
)

print("Success!")

In [16]:
# Show what the click command wrote while it ran under the CliRunner.
print(result1.output)

In [17]:
# Replace score.py with a minimal stub (shebang plus a single print) to check
# that a script stored under the project folder can be executed at all.
# .strip() trims the surrounding blank lines so the shebang is the first line.
stub_script = """
#!/usr/bin/python

print('suca')

""".strip()

dbutils.fs.put(f"{MLpackagePath}/score.py", stub_script, True)
               

In [18]:
import subprocess

# Execute the stub script in a child interpreter. check_output raises
# CalledProcessError here if the script exits with a non-zero status.
stub_cmd = ["python", "/dbfs/FileStore/ModelProjects/Boston_ML/score.py"]
output = subprocess.check_output(stub_cmd, universal_newlines=True)

print(output)