This notebook walks through the Quick Start example from the MaLearn EngDocs:

https://engdocs.uberinternal.com/malearn/rst/user_guide/quick-start.html

## Create Spark Session

In [None]:
# Build operators and workflow service fat jars.
# ./buckw build //data/michelangelo/operators/jarfatjar:bin_main
# ./buckw build //data/michelangelo/workflow:bin_main

from pyspark.sql import SparkSession
# Get (or create) a Spark session with a local master and the operators fat
# jar registered under `spark.jars`. The jar must be built first (see the buck
# commands above).
spark = (
    SparkSession.builder
    .master('local[*]')
    .config('spark.jars', '../../../../../buck-out/gen/data/michelangelo/operators/jarfatjar/bin_main/bin_main.jar')
    .getOrCreate()
)

## Some Setup (not needed in DSW)

In [None]:
# Append the relative path '../' to the module search path.
# The membership guard keeps this cell idempotent: re-running it does not pile
# duplicate entries onto sys.path.
import sys
if '../' not in sys.path:
    sys.path.append('../')

import os
# Tunnel Muttley for production WFO GRPC
# os.system('ssh -MfN -L 5435:localhost:5435 adhoc20-dca1')
# Or start WFO service locally.

## Initialize MaLearn

In [None]:
import malearn
from malearn import MaLearnConfig

# Values a reader is likely to tune, named up front so they are easy to spot.
LOCAL_WORKSPACE_ROOT = '/tmp/michelangelo'  # Only for local.
WORKFLOW_SERVICE_PORT = 9877  # 9876 for OSS, 5435 for muttley

# Populate the configuration, then hand it to malearn.
config = MaLearnConfig()
config.running_mode = 'local'
config.workspace_override_root_dir = LOCAL_WORKSPACE_ROOT
config.workflow_service_port = WORKFLOW_SERVICE_PORT
malearn.init(config)

## Read Data
Load the Iris dataset.

In [None]:
from malearn.datasets import load_iris
# Load the Iris dataset through malearn's dataset loader.
df = load_iris()
# Show the DataFrame contents.
# NOTE(review): `.value` presumably exposes the underlying Spark DataFrame — confirm.
df.value.show()

## Model Selection
Split the data into a training set and a test set.

In [None]:
from malearn.model_selection import TrainTestSplit
# Configure the splitter with test_ratio=0.2.
# NOTE(review): no seed is passed here, so the split may differ between runs —
# check whether TrainTestSplit accepts one if reproducibility matters.
splitter = TrainTestSplit(test_ratio=0.2)
train_df, test_df = splitter.split(df)
# Show the row count of each split; as the cell's last expression, this tuple
# is what the notebook displays.
train_df.value.count(), test_df.value.count()

## Training
Define Decision Tree parameters using Params.

In [None]:
from malearn import Params
# Collect the decision tree settings on a Params object. These two values end
# up as the classifier's maxDepth / maxBins inside create_pipeline.
p = Params()
p.max_depth = 5
p.max_bins = 32
# NOTE(review): to_constant() presumably turns the Params into a constant
# value that can be passed to operators — confirm against the malearn docs.
params = p.to_constant()

Create a Spark ML Pipeline using a custom Python operator (`@python_op`).

In [None]:
from malearn import Params, python_op
from pyspark.sql import DataFrame
from pyspark.ml import Pipeline

@python_op
def create_pipeline(train_df: DataFrame, params: Params) -> Pipeline:
    """Build the Spark ML pipeline for the Iris decision tree classifier.

    Stages, in order: index the string label column, assemble the four
    measurement columns into a feature vector, classify with a decision tree,
    and map the predicted index back to its label string.

    Args:
        train_df: Training data. Used here only to fit the label indexer, so
            that its labels can be given to the final IndexToString stage.
        params: Hyperparameters; ``max_depth`` and ``max_bins`` are read.

    Returns:
        A Pipeline made of the four stages described above.
    """
    # Function-local imports — presumably so the op stays self-contained
    # wherever python_op executes it (TODO confirm).
    from pyspark.ml.classification import DecisionTreeClassifier
    from pyspark.ml.feature import IndexToString, StringIndexer, VectorAssembler
    from pyspark.ml import Pipeline

    # Fitted eagerly: the learned label list is needed below to translate
    # predicted indices back into label strings.
    label_string_indexer = StringIndexer(inputCol='target', outputCol='indexedLabel').fit(train_df)
    assembler = VectorAssembler(
        inputCols=['sepal_length_cm', 'sepal_width_cm', 'petal_length_cm', 'petal_width_cm'],
        outputCol='features')
    # Read the hyperparameters from the `params` argument rather than from a
    # notebook global, so the operator depends only on its declared inputs.
    dt = DecisionTreeClassifier(
        labelCol='indexedLabel',
        featuresCol='features',
        predictionCol='indexedPrediction',
        maxDepth=params.max_depth,
        maxBins=params.max_bins)
    label_index_to_string = IndexToString(
        inputCol='indexedPrediction',
        outputCol='prediction',
        labels=label_string_indexer.labels)
    return Pipeline(stages=[label_string_indexer, assembler, dt, label_index_to_string])

pipeline = create_pipeline(train_df, params)

Fit the model.

In [None]:
# Fit the pipeline on the training split; the fitted model is used for
# prediction in the next section.
pipeline_model = pipeline.fit(train_df)

## Prediction
Make predictions on the test data.

In [None]:
# Run the fitted model over the held-out test split.
predicted_test_df = pipeline_model.predict(test_df)
# Display the predictions.
predicted_test_df.value.show()

## Save Data
Save the predictions to a CSV file.

In [None]:
# Columns to leave out of the exported file.
# NOTE(review): these look like intermediate columns produced by the pipeline
# stages — confirm none of them is wanted in the export.
intermediate_columns = ['features', 'rawPrediction', 'indexedLabel', 'probability', 'indexedPrediction']
final_df = predicted_test_df.drop(intermediate_columns)
# Write the trimmed predictions out as CSV.
final_df.to_csv('/tmp/michelangelo/test.csv')