In [1]:
# Echo the SparkContext to confirm one is available before building on it.
# `sc` is not created anywhere in this notebook -- presumably it is injected
# by the PySpark kernel/shell at startup (confirm for your environment).
sc

<pyspark.context.SparkContext at 0x7f0366f625c0>

In [2]:
# Delete any leftover *.lck files under metastore_db/ before creating the SQL context.
# NOTE(review): presumably these are lock files left behind by an earlier session
# that would otherwise block the metastore -- confirm before reusing this elsewhere.
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# SQL entry point built on the existing SparkContext; later cells use it to create DataFrames.
sqlc = SQLContext(sc)

In [3]:
from pyspark.sql.functions import rand, randn
from pyspark.ml.feature import VectorAssembler

# Ten-row toy DataFrame: an "id" column (0-9) plus three seeded random columns.
# NOTE(review): in the recorded outputs further down, the "normal2" values of
# ids 0 and 1 reappear as the "normal1" values of ids 2 and 3, so these two
# columns are not independent. Presumably the adjacent seeds (10 and 11) collide
# once Spark offsets the seed per partition -- confirm, and pick seeds that are
# far apart if independent columns matter.
dfRandom = (
    sqlc.range(0, 10)
    .select("id")
    .withColumn("uniform", rand(10))
    .withColumn("normal1", randn(10))
    .withColumn("normal2", randn(11))
)

# Pack the three numeric columns into one vector column named "features",
# which is the input column every scaler below reads.
assembler = VectorAssembler(
    inputCols=["uniform", "normal1", "normal2"],
    outputCol="features",
)
dfVec = assembler.transform(dfRandom)

## Data Normalization

### Normalizer

In [4]:
from pyspark.ml.feature import Normalizer

In [5]:
# Row-wise normalizer with p=1.0: each "features" vector is rescaled so that the
# absolute values of its entries sum to 1. No fit step is needed for this one.
scaler1 = Normalizer(p=1.0, inputCol="features", outputCol="scaledFeat")

In [6]:
# Normalize the assembled vectors and print the first five rows.
(
    scaler1
    .transform(dfVec.select("id", "features"))
    .show(5)
)

+---+--------------------+--------------------+
| id|            features|          scaledFeat|
+---+--------------------+--------------------+
|  0|[0.41371264720975...|[0.32886636983701...|
|  1|[0.73117192818966...|[0.27877135762286...|
|  2|[0.19829196382083...|[0.20619308493718...|
|  3|[0.12714181165849...|[0.06801701322638...|
|  4|[0.76043181534066...|[0.54081735791552...|
+---+--------------------+--------------------+
only showing top 5 rows



### Standard Scaler

In [7]:
from pyspark.ml.feature import StandardScaler

In [8]:
# Column-wise standardization: withMean centers each feature, withStd scales it
# to unit standard deviation. Output goes to "scaledFeat".
scaler2 = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeat",
)

In [9]:
# Fit the standard scaler on the assembled vectors; the resulting model holds
# the statistics it applies in transform().
scaler2Model = scaler2.fit(
    dfVec.select("id", "features")
)

In [10]:
# Preview the first five standardized rows as a pandas DataFrame.
# limit(5) is applied on the Spark side so only five rows are collected to the
# driver, instead of converting the whole DataFrame and slicing afterwards.
scaler2Model.transform(dfVec.select("id", "features")).limit(5).toPandas()

Unnamed: 0,id,features,scaledFeat
0,0,"[0.41371264721, -0.587748239674, -0.256535324205]","[0.0943617151947, -0.962057764653, -0.34138993..."
1,1,"[0.73117192819, 1.57463277597, -0.317032643347]","[1.1083042573, 1.62893863751, -0.414211116325]"
2,2,"[0.198291963821, -0.256535324205, -0.506853671...","[-0.593676746506, -0.565193629184, -0.64270042..."
3,3,"[0.127141811658, -0.317032643347, 1.42509038959]","[-0.82092531184, -0.637682397399, 1.68279834122]"
4,4,"[0.760431815341, 0.497762942531, 0.147884030486]","[1.20175827215, 0.338617531349, 0.145413343926]"


### MinMax Scaler

In [11]:
from pyspark.ml.feature import MinMaxScaler

In [12]:
# Column-wise min-max rescaling of "features" into the range [-1.0, 1.0];
# output goes to "scaledFeat".
scaler3 = MinMaxScaler(
    min=-1.0,
    max=1.0,
    inputCol="features",
    outputCol="scaledFeat",
)

In [13]:
# Fit the min-max scaler on the assembled vectors; the resulting model holds
# the per-feature bounds it applies in transform().
scaler3Model = scaler3.fit(
    dfVec.select("id", "features")
)

In [14]:
# Preview the first five min-max-scaled rows as a pandas DataFrame.
# limit(5) is applied on the Spark side so only five rows are collected to the
# driver, instead of converting the whole DataFrame and slicing afterwards.
scaler3Model.transform(dfVec.select("id", "features")).limit(5).toPandas()

Unnamed: 0,id,features,scaledFeat
0,0,"[0.41371264721, -0.587748239674, -0.256535324205]","[-0.115963827879, -1.0, -0.275721153]"
1,1,"[0.73117192819, 1.57463277597, -0.317032643347]","[0.628048217155, 1.0, -0.321615854237]"
2,2,"[0.198291963821, -0.256535324205, -0.506853671...","[-0.620833563287, -0.693659060941, -0.46561858..."
3,3,"[0.127141811658, -0.317032643347, 1.42509038959]","[-0.78758430389, -0.749613417461, 1.0]"
4,4,"[0.760431815341, 0.497762942531, 0.147884030486]","[0.696623022747, 0.00399621930629, 0.031080961..."
