## Importing

In [1]:
# findspark has to run BEFORE the first `import pyspark`: it locates the Spark
# installation and adds its Python libraries to sys.path. Calling it after the
# import is too late -- if pyspark is not already importable, the import fails
# before findspark gets a chance to help.
import findspark

findspark.init()

import pyspark
from pyspark.sql import SparkSession

# getOrCreate() returns the already-running session if there is one, so
# re-running this cell does not start a second Spark application.
spark = SparkSession.builder.appName("scaling").getOrCreate()

In [2]:
from pyspark.ml.feature import Normalizer, VectorAssembler, StandardScaler, MinMaxScaler, RobustScaler

## Loading Data

In [3]:
# Read the semicolon-delimited CSV: the first row holds the column names and
# Spark infers each column's type from the data.
cars = (
    spark.read
    .format("csv")
    .options(sep=";", header=True, inferSchema=True)
    .load("../../data/Carros.csv")
)

# Preview the first two rows.
cars.show(n=2)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 2 rows



## Vectorizing

In [4]:
# Pack three numeric columns into a single vector column ("matrix"); the
# scaling cells further down read that column as their input.
vectas = VectorAssembler(
    outputCol="matrix",
    inputCols=['Consumo', 'Cilindros', 'Cilindradas'],
)

cars = vectas.transform(cars)

# Show the source columns side by side with the assembled vector.
(
    cars
    .select(*vectas.getInputCols(), vectas.getOutputCol())
    .show(2, truncate=False)
)

+-------+---------+-----------+----------------+
|Consumo|Cilindros|Cilindradas|matrix          |
+-------+---------+-----------+----------------+
|21     |6        |160        |[21.0,6.0,160.0]|
|21     |6        |160        |[21.0,6.0,160.0]|
+-------+---------+-----------+----------------+
only showing top 2 rows



## Normalizing

In [5]:
# Normalizer is a plain Transformer (no fit step): it rescales every row vector
# independently to unit p-norm. With p=1.0 each component is divided by the sum
# of the absolute values of that row, so the components of a row sum to 1.
scaler = Normalizer(
    inputCol="matrix",
    # Named exactly as this cell and the later cells select it ('matrixNorm').
    # The previous spelling "MatrixNorm" only resolved because Spark compares
    # column names case-insensitively by default (spark.sql.caseSensitive=false).
    outputCol="matrixNorm",
    p=1.0
)

cars = scaler.transform(cars)

cars.select('Consumo','Cilindros','Cilindradas', 'matrix', 'matrixNorm').show(2, truncate=False)

+-------+---------+-----------+----------------+------------------------------------------------------------+
|Consumo|Cilindros|Cilindradas|matrix          |matrixNorm                                                  |
+-------+---------+-----------+----------------+------------------------------------------------------------+
|21     |6        |160        |[21.0,6.0,160.0]|[0.11229946524064172,0.03208556149732621,0.8556149732620321]|
|21     |6        |160        |[21.0,6.0,160.0]|[0.11229946524064172,0.03208556149732621,0.8556149732620321]|
+-------+---------+-----------+----------------+------------------------------------------------------------+
only showing top 2 rows



## StandardScaler

PySpark's `StandardScaler` takes two important arguments: `withStd` and `withMean`.

- `withStd`: scale the data to unit standard deviation.
- `withMean`: center the data on the mean before scaling.

In [7]:
# Configure the estimator (scale to unit standard deviation, no mean
# centering) and fit it on the data in one expression; `scaler` is bound to
# the fitted model.
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="matrix",
    outputCol="MatrixSS",
).fit(cars)

# Append the standardized vector as a new column.
cars = scaler.transform(cars)

# Compare the raw vector with the two scaled versions computed so far.
cars.select(['matrix', 'matrixNorm', 'MatrixSS']).show(n=2, truncate=False)

+----------------+------------------------------------------------------------+-----------------------------------------------------------+
|matrix          |matrixNorm                                                  |MatrixSS                                                   |
+----------------+------------------------------------------------------------+-----------------------------------------------------------+
|[21.0,6.0,160.0]|[0.11229946524064172,0.03208556149732621,0.8556149732620321]|[0.24996122082808128,3.359609874407659,0.20137542427273997]|
|[21.0,6.0,160.0]|[0.11229946524064172,0.03208556149732621,0.8556149732620321]|[0.24996122082808128,3.359609874407659,0.20137542427273997]|
+----------------+------------------------------------------------------------+-----------------------------------------------------------+
only showing top 2 rows



## RobustScaler

Params:

- `lower`: lower quantile to calculate quantile range
- `upper`: upper quantile to calculate quantile range
- `withScaling`: whether to scale the data to the quantile range
- `withCentering`: whether to center data with median

In [9]:
# Configure the estimator to scale by the 0.25-0.75 quantile range without
# median centering, then fit it; `scaler` is bound to the fitted model.
scaler = RobustScaler(
    lower=0.25,
    upper=0.75,
    withCentering=False,
    withScaling=True,
    inputCol="matrix",
    outputCol="MatrixRS",
).fit(cars)

# Append the robust-scaled vector as a new column.
cars = scaler.transform(cars)

# Compare the raw vector with every scaled version computed so far.
cars.select(['matrix', 'matrixNorm', 'MatrixSS', 'MatrixRS']).show(n=2, truncate=False)

+----------------+------------------------------------------------------------+-----------------------------------------------------------+---------------------------------------------+
|matrix          |matrixNorm                                                  |MatrixSS                                                   |MatrixRS                                     |
+----------------+------------------------------------------------------------+-----------------------------------------------------------+---------------------------------------------+
|[21.0,6.0,160.0]|[0.11229946524064172,0.03208556149732621,0.8556149732620321]|[0.24996122082808128,3.359609874407659,0.20137542427273997]|[0.29166666666666663,1.5,0.16967126193001061]|
|[21.0,6.0,160.0]|[0.11229946524064172,0.03208556149732621,0.8556149732620321]|[0.24996122082808128,3.359609874407659,0.20137542427273997]|[0.29166666666666663,1.5,0.16967126193001061]|
+----------------+----------------------------------------------------

## MinMaxScaler

Params:

- `min`: lower bound of the output feature range
- `max`: upper bound of the output feature range

In [12]:
# Configure the estimator to rescale each feature into the range [0, 1] and
# fit it on the data; `scaler` is bound to the fitted model.
scaler = MinMaxScaler(
    min=0,
    max=1,
    inputCol="matrix",
    outputCol="MatrixMM",
).fit(cars)

# Append the min-max scaled vector as a new column.
cars = scaler.transform(cars)

# Compare the raw vector with all four scaled versions.
cars.select(['matrix', 'matrixNorm', 'MatrixSS', 'MatrixRS', 'MatrixMM']).show(n=2, truncate=False)

+----------------+------------------------------------------------------------+-----------------------------------------------------------+---------------------------------------------+-----------------------------------------------+
|matrix          |matrixNorm                                                  |MatrixSS                                                   |MatrixRS                                     |MatrixMM                                       |
+----------------+------------------------------------------------------------+-----------------------------------------------------------+---------------------------------------------+-----------------------------------------------+
|[21.0,6.0,160.0]|[0.11229946524064172,0.03208556149732621,0.8556149732620321]|[0.24996122082808128,3.359609874407659,0.20137542427273997]|[0.29166666666666663,1.5,0.16967126193001061]|[0.018518518518518517,0.5,0.030235162374020158]|
|[21.0,6.0,160.0]|[0.11229946524064172,0.03208556149732621,0.855