In [None]:
import os
import sys
# Point PySpark at the cluster's Spark install and at the Python interpreter
# it should use. These must be in the environment before shell.py runs.
os.environ["PYSPARK_SUBMIT_ARGS"]='--conf spark.sql.catalogImplementation=in-memory pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the pyspark and py4j sources bundled with the Spark install importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's interactive bootstrap script in this namespace.
# The script is read inside a `with` block so the file handle is closed
# promptly, and compiled with its real path so tracebacks name shell.py
# instead of "<string>".
shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_script) as shell_file:
    shell_source = shell_file.read()
exec(compile(shell_source, shell_script, 'exec'))

In [None]:
# Sanity check: echo the `spark` object that the bootstrap cell's shell.py
# is expected to have defined.
spark

[Transformer base class](https://github.com/apache/spark/blob/v2.4.0/python/pyspark/ml/base.py#L139)

[Estimator base class](https://github.com/apache/spark/blob/v2.4.0/python/pyspark/ml/base.py#L70)

### Let's create a basic transformer

In [None]:
from pyspark.ml import Transformer
import pyspark.sql.functions as F

In [None]:
class ConstTransformer(Transformer):
    """Transformer that appends one fixed-value column.

    The new column is always named ``mytransformer`` and holds the same
    string literal in every row.
    """

    def _transform(self, dataset):
        """Return ``dataset`` with the constant column added."""
        constant_value = F.lit("I am a constant")
        return dataset.withColumn("mytransformer", constant_value)

In [None]:
# Small single-partition demo DataFrame; later cells use its `id` column
# as the transformer input.
df = spark.range(0, 10, numPartitions=1)

In [None]:
transformer = ConstTransformer()

# transform() is the public entry point; it delegates to our _transform(),
# which adds the "mytransformer" column.
transformer.transform(df).show()

### How do we specify transformer parameters?

[HasOutputCol mixin](https://github.com/apache/spark/blob/v2.4.0/python/pyspark/ml/param/shared.py#L235)

In [None]:
from pyspark.ml.param.shared import HasOutputCol

In [None]:
class ConstTransformer(Transformer, HasOutputCol):
    """Constant-column transformer whose column name comes from ``outputCol``."""

    def __init__(self):
        # Only the base-class initialisers run here; outputCol is left unset
        # at construction time.
        super(ConstTransformer, self).__init__()

    def _transform(self, dataset):
        """Return ``dataset`` with the constant added under ``outputCol``."""
        target_name = self.getOutputCol()
        constant_value = F.lit("I am a constant")
        return dataset.withColumn(target_name, constant_value)

In [None]:
# No column name is supplied here, so the following cells show what the
# HasOutputCol mixin provides when outputCol is not set explicitly.
transformer = ConstTransformer()

In [None]:
# Inspect the param map attached to this instance.
transformer.extractParamMap()

In [None]:
# Show the column name that _transform() will write to.
transformer.getOutputCol()

In [None]:
# The constant column should appear under the name reported by getOutputCol().
transformer.transform(df).show()

In [None]:
from pyspark import keyword_only

In [None]:
class ConstTransformer(Transformer, HasOutputCol):
    """Constant-column transformer configurable through its constructor.

    ``outputCol`` may be given as a keyword argument; when it is omitted
    (``None``) the param is not set by the constructor.
    """

    @keyword_only
    def __init__(self, outputCol=None):
        super(ConstTransformer, self).__init__()
        # Guard clause: nothing to configure when no name was supplied.
        if outputCol is None:
            return
        self.setOutputCol(outputCol)

    def _transform(self, dataset):
        """Return ``dataset`` with the constant added under ``outputCol``."""
        target_name = self.getOutputCol()
        constant_value = F.lit("I am a constant")
        return dataset.withColumn(target_name, constant_value)

In [None]:
# @keyword_only rejects positional arguments, so this call is expected to
# fail. Catch and display the error so the notebook still survives
# "Restart Kernel -> Run All".
try:
    transformer = ConstTransformer("mycolumn")
except TypeError as err:
    print("TypeError: {}".format(err))

In [None]:
# Keyword form: the column name is passed explicitly as outputCol.
transformer = ConstTransformer(outputCol="myColumn")

In [None]:
# Should echo the name passed to the constructor.
transformer.getOutputCol()

In [None]:
# The constant column is expected under the configured name.
transformer.transform(df).show()

In [None]:
# Params can also be changed after construction through the mixin's setter.
transformer.setOutputCol("anotherColumn")

In [None]:
# Same instance, re-run to show the output under the new column name.
transformer.transform(df).show()

### Let's define a real-life transformer with input and output columns
[HasInputCol mixin](https://github.com/apache/spark/blob/v2.4.0/python/pyspark/ml/param/shared.py#L189)

In [None]:
from pyspark.ml.param.shared import HasInputCol

In [None]:
class HashTransformer(Transformer, HasInputCol, HasOutputCol):
    """Write the MD5 hash of ``inputCol`` (cast to string) into ``outputCol``."""

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        super(HashTransformer, self).__init__()
        # Apply each column name only when the caller actually supplied it.
        supplied = (
            (self.setInputCol, inputCol),
            (self.setOutputCol, outputCol),
        )
        for setter, value in supplied:
            if value is not None:
                setter(value)

    def _transform(self, dataset):
        """Return ``dataset`` with the hash column added."""
        target_name = self.getOutputCol()
        # md5 operates on string/binary input, so cast the source column first.
        source_as_text = F.col(self.getInputCol()).cast("string")
        return dataset.withColumn(target_name, F.md5(source_as_text))

In [None]:
# Hash the `id` column of df into a new `hash` column.
transformer = HashTransformer(inputCol="id", outputCol="hash")

In [None]:
# Show each `id` next to the MD5 of its string form.
transformer.transform(df).show()

### How do we define a custom parameter?
[Param](https://github.com/apache/spark/blob/v2.4.0/python/pyspark/ml/param/__init__.py#L37)

In [None]:
from pyspark.ml.param import Param, Params, TypeConverters

In [None]:
class HashTransformer(Transformer, HasInputCol, HasOutputCol):
    """Hash the string form of ``inputCol`` into ``outputCol``.

    Params:
        inputCol: name of the column to hash; it is cast to string first.
        outputCol: name of the column that receives the hash.
        algorithm: name of the ``pyspark.sql.functions`` hash function to
            apply, one of ``"md5"`` or ``"sha1"`` (default ``"md5"``).
    """

    # Allow-list of hash functions. Without it, any attribute of
    # pyspark.sql.functions (e.g. "lit") would be accepted as an "algorithm".
    _SUPPORTED_ALGORITHMS = ("md5", "sha1")

    algorithm = Param(Params._dummy(), "algorithm",
                      "hash function to use, must be one of (md5|sha1)",
                      typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, algorithm="md5"):
        super(HashTransformer, self).__init__()
        if inputCol is not None:
            self.setInputCol(inputCol)
        if outputCol is not None:
            self.setOutputCol(outputCol)
        self._set(algorithm=algorithm)

    def get_hash_function(self):
        """Return the ``pyspark.sql.functions`` callable for ``algorithm``.

        Raises:
            ValueError: if the configured algorithm is not md5 or sha1.
        """
        name = self.getOrDefault("algorithm")
        if name not in self._SUPPORTED_ALGORITHMS:
            raise ValueError("Unsupported algorithm {}".format(name))
        return getattr(F, name)

    def setAlgorithm(self, algorithm):
        """Set the ``algorithm`` param and return ``self`` for chaining."""
        return self._set(algorithm=algorithm)

    def _transform(self, dataset):
        """Return ``dataset`` with the hash column added."""
        hash_col = self.get_hash_function()
        # The hash functions operate on string/binary input, so cast first.
        source_as_text = F.col(self.getInputCol()).cast("string")
        return dataset.withColumn(self.getOutputCol(), hash_col(source_as_text))

In [None]:
# `algorithm` is omitted, so the constructor's default of "md5" applies.
transformer = HashTransformer(inputCol="id", outputCol="hash")

In [None]:
# explainParams() returns one string; print() it so that any line breaks
# are rendered instead of shown as escaped characters.
print(transformer.explainParams())

In [None]:
# Read back the current value of the custom `algorithm` param.
transformer.getOrDefault("algorithm")

In [None]:
# Hash column computed with the algorithm shown in the previous cell.
transformer.transform(df).show()

In [None]:
# Switch the hash function on the existing instance.
transformer.setAlgorithm("sha1")

In [None]:
# Confirm the param now holds the new value.
transformer.getOrDefault("algorithm")

In [None]:
# Re-run the transform; the `hash` column now comes from the sha1 function.
transformer.transform(df).show()

In [None]:
# Demo finished: shut down the Spark session.
spark.stop()