In [3]:
import findspark

# Initialise findspark with the HDP Spark 2 client location before pyspark is imported.
SPARK_HOME = '/usr/hdp/current/spark2-client'
findspark.init(SPARK_HOME)
# Last expression of the cell: echo the Spark home findspark reports.
findspark.find()

'/usr/hdp/current/spark2-client'

In [4]:
import pyspark
from pyspark.sql import SparkSession
# Build (or reuse) the YARN-backed session used by every example in this notebook.
spark = (
    SparkSession.builder
    .master("yarn")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "3g")
    .appName("ch25_MLPreprocessing")
    .getOrCreate()
)

# Optional executor sizing for the builder above (left disabled):
# .config("spark.executor.cores", "5")\
# .config("spark.executor.instances", "10")\


# For reference, an excerpt that appears to come from the spark-env.sh template:
# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.

# Options read in YARN client mode
#SPARK_EXECUTOR_INSTANCES="2" #Number of workers to start (Default: 2)
#SPARK_EXECUTOR_CORES="1" #Number of cores for the workers (Default: 1).
#SPARK_EXECUTOR_MEMORY="1G" #Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
#SPARK_DRIVER_MEMORY="512M" #Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb)

In [5]:
# Keep a handle on the SparkContext that backs the session.
sc = spark.sparkContext

In [6]:
# Print every configuration value that contains "/proxy/" (in the recorded
# output this is the YARN proxy URL of the running application).
# Uses the public SparkContext.getConf() accessor rather than the private
# `sc._conf` attribute; getAll() yields (key, value) pairs.
for confKey, confValue in sc.getConf().getAll():
    if "/proxy/" in confValue:
        print(confValue)

http://rm01.itversity.com:19288/proxy/application_1533622723243_15480


In [7]:
# Load all per-day retail CSV files (header row, inferred schema), coalesce the
# result to 5 partitions and keep only rows that have a Description.
sales = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/user/kranthidr/dataSets/spark-guide/retail-data/by-day/*.csv")
    .coalesce(5)
    .where("Description IS NOT NULL")
)

In [8]:
# Parquet dataset with the integer columns used by the VectorAssembler example.
fakeIntDF = spark.read.format("parquet").load("/user/kranthidr/dataSets/spark-guide/simple-ml-integers")

In [9]:
# JSON dataset (color, lab, value1, value2) used by the RFormula and indexer examples.
simpleDF = spark.read.format("json").load("/user/kranthidr/dataSets/spark-guide/simple-ml")

In [10]:
# Parquet dataset with an id and a `features` vector column, used by the scaling examples.
scaleDF = spark.read.format("parquet").load("/user/kranthidr/dataSets/spark-guide/simple-ml-scaling")

In [11]:
# COMMAND ----------

from pyspark.ml.feature import RFormula

# Formula: label `lab`, all other columns as features, plus color:value1 and color:value2 interactions.
supervised = RFormula().setFormula("lab ~ . + color:value1 + color:value2")

In [12]:
# Fit the formula on simpleDF, then show the first 5 transformed rows untruncated.
supervisedModel = supervised.fit(simpleDF)
supervisedModel.transform(simpleDF).show(5, truncate=False)

+-----+----+------+------------------+----------------------------------------------------------------------+-----+
|color|lab |value1|value2            |features                                                              |label|
+-----+----+------+------------------+----------------------------------------------------------------------+-----+
|green|good|1     |14.386294994851129|(10,[1,2,3,5,8],[1.0,1.0,14.386294994851129,1.0,14.386294994851129])  |1.0  |
|blue |bad |8     |14.386294994851129|(10,[2,3,6,9],[8.0,14.386294994851129,8.0,14.386294994851129])        |0.0  |
|blue |bad |12    |14.386294994851129|(10,[2,3,6,9],[12.0,14.386294994851129,12.0,14.386294994851129])      |0.0  |
|green|good|15    |38.97187133755819 |(10,[1,2,3,5,8],[1.0,15.0,38.97187133755819,15.0,38.97187133755819])  |1.0  |
|green|good|12    |14.386294994851129|(10,[1,2,3,5,8],[1.0,12.0,14.386294994851129,12.0,14.386294994851129])|1.0  |
+-----+----+------+------------------+----------------------------------

In [13]:
# Same label, but without the interaction terms, for comparison with the previous formula.
supervised1 = RFormula().setFormula("lab ~ .")
supervised1.fit(simpleDF).transform(simpleDF).show(5, truncate=False)

+-----+----+------+------------------+---------------------------------+-----+
|color|lab |value1|value2            |features                         |label|
+-----+----+------+------------------+---------------------------------+-----+
|green|good|1     |14.386294994851129|[0.0,1.0,1.0,14.386294994851129] |1.0  |
|blue |bad |8     |14.386294994851129|[0.0,0.0,8.0,14.386294994851129] |0.0  |
|blue |bad |12    |14.386294994851129|[0.0,0.0,12.0,14.386294994851129]|0.0  |
|green|good|15    |38.97187133755819 |[0.0,1.0,15.0,38.97187133755819] |1.0  |
|green|good|12    |14.386294994851129|[0.0,1.0,12.0,14.386294994851129]|1.0  |
+-----+----+------+------------------+---------------------------------+-----+
only showing top 5 rows



In [14]:
# COMMAND ----------

from pyspark.ml.feature import SQLTransformer

# SQL-based transformer: per-customer quantity total and row count.
# __THIS__ is the placeholder in the statement for the DataFrame being transformed.
basicTransformation = SQLTransformer(statement="""
    SELECT sum(Quantity), count(*), CustomerID
    FROM __THIS__
    GROUP BY CustomerID
  """)

In [15]:
# Apply the SQL aggregation to the sales data and display the first 10 result rows.
basicTransformation.transform(sales).show(10)

+-------------+--------+----------+
|sum(Quantity)|count(1)|CustomerID|
+-------------+--------+----------+
|          290|      98|   13607.0|
|         1542|      30|   13094.0|
|          541|      27|   14285.0|
|           34|       6|   14768.0|
|           97|      12|   16596.0|
|          630|      72|   17633.0|
|          244|      31|   16561.0|
|          493|      64|   16629.0|
|          159|      38|   17267.0|
|          206|      23|   12493.0|
+-------------+--------+----------+
only showing top 10 rows



In [16]:
# COMMAND ----------

from pyspark.ml.feature import VectorAssembler
# Combine the three integer columns into a single vector column; no output
# column is set, so the result column carries an auto-generated name.
va = VectorAssembler(inputCols=["int1", "int2", "int3"])
va.transform(fakeIntDF).show()

+----+----+----+--------------------------------------------+
|int1|int2|int3|VectorAssembler_4e3c8994744c6a3cd411__output|
+----+----+----+--------------------------------------------+
|   1|   2|   3|                               [1.0,2.0,3.0]|
|   4|   5|   6|                               [4.0,5.0,6.0]|
|   7|   8|   9|                               [7.0,8.0,9.0]|
+----+----+----+--------------------------------------------+



In [17]:
# COMMAND ----------

# Continuous test column: ids 0..19 cast to double (the column is still named `id`).
contDF = spark.range(20).selectExpr("cast(id as double)")

In [18]:
# Preview the first 5 rows of the continuous column.
contDF.show(5)

+---+
| id|
+---+
|0.0|
|1.0|
|2.0|
|3.0|
|4.0|
+---+
only showing top 5 rows



In [19]:
# COMMAND ----------

from pyspark.ml.feature import Bucketizer
# Explicit split points; each `id` is mapped to the index of the bucket it falls in.
bucketBorders = [-1.0, 5.0, 10.0, 250.0, 600.0]
bucketer = Bucketizer(splits=bucketBorders, inputCol="id")
bucketer.transform(contDF).show()

+----+---------------------------------------+
|  id|Bucketizer_43b99bb609f5a474846f__output|
+----+---------------------------------------+
| 0.0|                                    0.0|
| 1.0|                                    0.0|
| 2.0|                                    0.0|
| 3.0|                                    0.0|
| 4.0|                                    0.0|
| 5.0|                                    1.0|
| 6.0|                                    1.0|
| 7.0|                                    1.0|
| 8.0|                                    1.0|
| 9.0|                                    1.0|
|10.0|                                    2.0|
|11.0|                                    2.0|
|12.0|                                    2.0|
|13.0|                                    2.0|
|14.0|                                    2.0|
|15.0|                                    2.0|
|16.0|                                    2.0|
|17.0|                                    2.0|
|18.0|       

In [20]:
# COMMAND ----------

from pyspark.ml.feature import QuantileDiscretizer
# Quantile-based bucketing into 5 buckets. Note: this rebinds `bucketer`,
# which previously held the Bucketizer from the cell above.
bucketer = QuantileDiscretizer(numBuckets=5, inputCol="id")
fittedBucketer = bucketer.fit(contDF)
fittedBucketer.transform(contDF).show()

+----+------------------------------------------------+
|  id|QuantileDiscretizer_401dafe89f7b762b31c4__output|
+----+------------------------------------------------+
| 0.0|                                             0.0|
| 1.0|                                             0.0|
| 2.0|                                             0.0|
| 3.0|                                             1.0|
| 4.0|                                             1.0|
| 5.0|                                             1.0|
| 6.0|                                             1.0|
| 7.0|                                             2.0|
| 8.0|                                             2.0|
| 9.0|                                             2.0|
|10.0|                                             2.0|
|11.0|                                             2.0|
|12.0|                                             3.0|
|13.0|                                             3.0|
|14.0|                                          

In [21]:
# COMMAND ----------

from pyspark.ml.feature import StandardScaler
# Fit a StandardScaler on the `features` column and show the scaled vectors untruncated.
sScaler = StandardScaler(inputCol="features")
sScaler.fit(scaleDF).transform(scaleDF).show(10, truncate=False)

+---+--------------+------------------------------------------------------------+
|id |features      |StandardScaler_4e2e9d6499c1f481920f__output                 |
+---+--------------+------------------------------------------------------------+
|0  |[1.0,0.1,-1.0]|[1.1952286093343936,0.02337622911060922,-0.5976143046671968]|
|1  |[2.0,1.1,1.0] |[2.390457218668787,0.2571385202167014,0.5976143046671968]   |
|0  |[1.0,0.1,-1.0]|[1.1952286093343936,0.02337622911060922,-0.5976143046671968]|
|1  |[2.0,1.1,1.0] |[2.390457218668787,0.2571385202167014,0.5976143046671968]   |
|1  |[3.0,10.1,3.0]|[3.5856858280031805,2.3609991401715313,1.7928429140015902]  |
+---+--------------+------------------------------------------------------------+



In [22]:
# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler
# Rescale every feature into the range [5, 10].
minMax = MinMaxScaler(min=5, max=10, inputCol="features")
fittedminMax = minMax.fit(scaleDF)
fittedminMax.transform(scaleDF).show(10, truncate=False)

+---+--------------+-----------------------------------------+
|id |features      |MinMaxScaler_424a83ef26ce22fbe448__output|
+---+--------------+-----------------------------------------+
|0  |[1.0,0.1,-1.0]|[5.0,5.0,5.0]                            |
|1  |[2.0,1.1,1.0] |[7.5,5.5,7.5]                            |
|0  |[1.0,0.1,-1.0]|[5.0,5.0,5.0]                            |
|1  |[2.0,1.1,1.0] |[7.5,5.5,7.5]                            |
|1  |[3.0,10.1,3.0]|[10.0,10.0,10.0]                         |
+---+--------------+-----------------------------------------+



In [23]:
# COMMAND ----------

from pyspark.ml.feature import MaxAbsScaler
# Fit a MaxAbsScaler on the `features` column and show the scaled vectors untruncated.
maScaler = MaxAbsScaler(inputCol="features")
fittedmaScaler = maScaler.fit(scaleDF)
fittedmaScaler.transform(scaleDF).show(10, truncate=False)

+---+--------------+-------------------------------------------------------------+
|id |features      |MaxAbsScaler_45d88e4a8c0993c983b8__output                    |
+---+--------------+-------------------------------------------------------------+
|0  |[1.0,0.1,-1.0]|[0.3333333333333333,0.009900990099009901,-0.3333333333333333]|
|1  |[2.0,1.1,1.0] |[0.6666666666666666,0.10891089108910892,0.3333333333333333]  |
|0  |[1.0,0.1,-1.0]|[0.3333333333333333,0.009900990099009901,-0.3333333333333333]|
|1  |[2.0,1.1,1.0] |[0.6666666666666666,0.10891089108910892,0.3333333333333333]  |
|1  |[3.0,10.1,3.0]|[1.0,1.0,1.0]                                                |
+---+--------------+-------------------------------------------------------------+



In [24]:
# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors

In [25]:
# Per-element multipliers applied to the `features` vector.
scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct(scalingVec=scaleUpVec, inputCol="features")

In [26]:
# Multiply each feature vector element-wise by scaleUpVec and display the result.
scalingUp.transform(scaleDF).show()

+---+--------------+-----------------------------------------------+
| id|      features|ElementwiseProduct_4f0ca9f6be0eb8db3677__output|
+---+--------------+-----------------------------------------------+
|  0|[1.0,0.1,-1.0]|                               [10.0,1.5,-20.0]|
|  1| [2.0,1.1,1.0]|                               [20.0,16.5,20.0]|
|  0|[1.0,0.1,-1.0]|                               [10.0,1.5,-20.0]|
|  1| [2.0,1.1,1.0]|                               [20.0,16.5,20.0]|
|  1|[3.0,10.1,3.0]|                              [30.0,151.5,60.0]|
+---+--------------+-----------------------------------------------+



In [27]:
# COMMAND ----------

from pyspark.ml.feature import Normalizer
# Normalise each feature vector with p = 1 and show the result untruncated.
manhattanDistance = Normalizer(p=1, inputCol="features")
manhattanDistance.transform(scaleDF).show(20, False)

+---+--------------+---------------------------------------------------------------+
|id |features      |Normalizer_4f569092b31f528b583d__output                        |
+---+--------------+---------------------------------------------------------------+
|0  |[1.0,0.1,-1.0]|[0.47619047619047616,0.047619047619047616,-0.47619047619047616]|
|1  |[2.0,1.1,1.0] |[0.48780487804878053,0.26829268292682934,0.24390243902439027]  |
|0  |[1.0,0.1,-1.0]|[0.47619047619047616,0.047619047619047616,-0.47619047619047616]|
|1  |[2.0,1.1,1.0] |[0.48780487804878053,0.26829268292682934,0.24390243902439027]  |
|1  |[3.0,10.1,3.0]|[0.18633540372670807,0.6273291925465838,0.18633540372670807]   |
+---+--------------+---------------------------------------------------------------+



In [28]:
# COMMAND ----------

from pyspark.ml.feature import StringIndexer
# Index the string label `lab` into the numeric column `labelInd`; idxRes is
# reused later by the IndexToString example.
lblIndxr = StringIndexer(inputCol="lab", outputCol="labelInd")
idxRes = lblIndxr.fit(simpleDF).transform(simpleDF)
idxRes.show()

+-----+----+------+------------------+--------+
|color| lab|value1|            value2|labelInd|
+-----+----+------+------------------+--------+
|green|good|     1|14.386294994851129|     1.0|
| blue| bad|     8|14.386294994851129|     0.0|
| blue| bad|    12|14.386294994851129|     0.0|
|green|good|    15| 38.97187133755819|     1.0|
|green|good|    12|14.386294994851129|     1.0|
|green| bad|    16|14.386294994851129|     0.0|
|  red|good|    35|14.386294994851129|     1.0|
|  red| bad|     1| 38.97187133755819|     0.0|
|  red| bad|     2|14.386294994851129|     0.0|
|  red| bad|    16|14.386294994851129|     0.0|
|  red|good|    45| 38.97187133755819|     1.0|
|green|good|     1|14.386294994851129|     1.0|
| blue| bad|     8|14.386294994851129|     0.0|
| blue| bad|    12|14.386294994851129|     0.0|
|green|good|    15| 38.97187133755819|     1.0|
|green|good|    12|14.386294994851129|     1.0|
|green| bad|    16|14.386294994851129|     0.0|
|  red|good|    35|14.386294994851129|  

In [29]:
# COMMAND ----------

# StringIndexer applied to the numeric column `value1`, producing `valueInd`.
valIndexer = StringIndexer(inputCol="value1", outputCol="valueInd")
valIndexer.fit(simpleDF).transform(simpleDF).show()

+-----+----+------+------------------+--------+
|color| lab|value1|            value2|valueInd|
+-----+----+------+------------------+--------+
|green|good|     1|14.386294994851129|     2.0|
| blue| bad|     8|14.386294994851129|     4.0|
| blue| bad|    12|14.386294994851129|     0.0|
|green|good|    15| 38.97187133755819|     5.0|
|green|good|    12|14.386294994851129|     0.0|
|green| bad|    16|14.386294994851129|     1.0|
|  red|good|    35|14.386294994851129|     6.0|
|  red| bad|     1| 38.97187133755819|     2.0|
|  red| bad|     2|14.386294994851129|     7.0|
|  red| bad|    16|14.386294994851129|     1.0|
|  red|good|    45| 38.97187133755819|     3.0|
|green|good|     1|14.386294994851129|     2.0|
| blue| bad|     8|14.386294994851129|     4.0|
| blue| bad|    12|14.386294994851129|     0.0|
|green|good|    15| 38.97187133755819|     5.0|
|green|good|    12|14.386294994851129|     0.0|
|green| bad|    16|14.386294994851129|     1.0|
|  red|good|    35|14.386294994851129|  

In [30]:
# COMMAND ----------

from pyspark.ml.feature import IndexToString
# Map the numeric `labelInd` column of idxRes back to strings (the recorded
# output shows the original good/bad values being recovered).
labelReverse = IndexToString(inputCol="labelInd")
labelReverse.transform(idxRes).show()

+-----+----+------+------------------+--------+------------------------------------------+
|color| lab|value1|            value2|labelInd|IndexToString_44b9aa81a342b03af7c2__output|
+-----+----+------+------------------+--------+------------------------------------------+
|green|good|     1|14.386294994851129|     1.0|                                      good|
| blue| bad|     8|14.386294994851129|     0.0|                                       bad|
| blue| bad|    12|14.386294994851129|     0.0|                                       bad|
|green|good|    15| 38.97187133755819|     1.0|                                      good|
|green|good|    12|14.386294994851129|     1.0|                                      good|
|green| bad|    16|14.386294994851129|     0.0|                                       bad|
|  red|good|    35|14.386294994851129|     1.0|                                      good|
|  red| bad|     1| 38.97187133755819|     0.0|                                       bad|

In [31]:
# COMMAND ----------

from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors

In [32]:
# Three (feature vector, label) rows for the VectorIndexer example; the column
# names are supplied directly to createDataFrame.
idxIn = spark.createDataFrame(
    [
        (Vectors.dense(1, 2, 3), 1),
        (Vectors.dense(2, 5, 6), 2),
        (Vectors.dense(1, 8, 9), 3),
    ],
    ["features", "label"],
)

In [33]:
# Display the VectorIndexer input frame.
idxIn.show()

+-------------+-----+
|     features|label|
+-------------+-----+
|[1.0,2.0,3.0]|    1|
|[2.0,5.0,6.0]|    2|
|[1.0,8.0,9.0]|    3|
+-------------+-----+



In [34]:
# Scratch cell: echo the same row literals as a plain Python list
# (no DataFrame is created here; the list is only displayed).
[
  (Vectors.dense(1, 2, 3),1),
  (Vectors.dense(2, 5, 6),2),
  (Vectors.dense(1, 8, 9),3)
]

[(DenseVector([1.0, 2.0, 3.0]), 1),
 (DenseVector([2.0, 5.0, 6.0]), 2),
 (DenseVector([1.0, 8.0, 9.0]), 3)]

In [35]:
# Index vector elements of `features` into `idxed`, with maxCategories set to 2.
indxr = VectorIndexer(maxCategories=2, inputCol="features", outputCol="idxed")

In [36]:
# Fit the VectorIndexer on idxIn and display the indexed vectors next to the originals.
indxr.fit(idxIn).transform(idxIn).show()

+-------------+-----+-------------+
|     features|label|        idxed|
+-------------+-----+-------------+
|[1.0,2.0,3.0]|    1|[0.0,2.0,3.0]|
|[2.0,5.0,6.0]|    2|[1.0,5.0,6.0]|
|[1.0,8.0,9.0]|    3|[0.0,8.0,9.0]|
+-------------+-----+-------------+



In [37]:
# COMMAND ----------
from pyspark.ml.feature import OneHotEncoder, StringIndexer

In [38]:
# Index the `color` column. Note: this rebinds `lblIndxr`, which earlier held
# the indexer for the `lab` column.
lblIndxr = StringIndexer(inputCol="color", outputCol="colorInd")
colorOnly = simpleDF.select("color")
colorLab = lblIndxr.fit(simpleDF).transform(colorOnly)

In [39]:
# Preview the first 5 colour/index pairs.
colorLab.show(5)

+-----+--------+
|color|colorInd|
+-----+--------+
|green|     1.0|
| blue|     2.0|
| blue|     2.0|
|green|     1.0|
|green|     1.0|
+-----+--------+
only showing top 5 rows



In [40]:
# One-hot encode the numeric colour index and preview 5 rows.
ohe = OneHotEncoder(inputCol="colorInd")
ohe.transform(colorLab).show(5)

+-----+--------+------------------------------------------+
|color|colorInd|OneHotEncoder_4ace9dd77f85339340be__output|
+-----+--------+------------------------------------------+
|green|     1.0|                             (2,[1],[1.0])|
| blue|     2.0|                                 (2,[],[])|
| blue|     2.0|                                 (2,[],[])|
|green|     1.0|                             (2,[1],[1.0])|
|green|     1.0|                             (2,[1],[1.0])|
+-----+--------+------------------------------------------+
only showing top 5 rows



In [41]:
# COMMAND ----------
from pyspark.ml.feature import Tokenizer

In [42]:
# Tokenise the product descriptions into `DescOut`; `tokenized` feeds the
# stop-word, n-gram, count-vectoriser and TF-IDF examples below.
tkn = Tokenizer(inputCol="Description", outputCol="DescOut")
tokenized = tkn.transform(sales.select("Description"))

In [43]:
# Show 20 descriptions with their tokens, untruncated.
tokenized.show(20, False)

+-----------------------------------+------------------------------------------+
|Description                        |DescOut                                   |
+-----------------------------------+------------------------------------------+
|RABBIT NIGHT LIGHT                 |[rabbit, night, light]                    |
|DOUGHNUT LIP GLOSS                 |[doughnut, lip, gloss]                    |
|12 MESSAGE CARDS WITH ENVELOPES    |[12, message, cards, with, envelopes]     |
|BLUE HARMONICA IN BOX              |[blue, harmonica, in, box]                |
|GUMBALL COAT RACK                  |[gumball, coat, rack]                     |
|SKULLS  WATER TRANSFER TATTOOS     |[skulls, , water, transfer, tattoos]      |
|FELTCRAFT GIRL AMELIE KIT          |[feltcraft, girl, amelie, kit]            |
|CAMOUFLAGE LED TORCH               |[camouflage, led, torch]                  |
|WHITE SKULL HOT WATER BOTTLE       |[white, skull, hot, water, bottle]        |
|ENGLISH ROSE HOT WATER BOTT

In [44]:
# COMMAND ----------
from pyspark.ml.feature import RegexTokenizer

In [45]:
# Regex tokenizer using a single space as the pattern, with lower-casing enabled.
rt = RegexTokenizer(
    inputCol="Description",
    outputCol="DescOut",
    pattern=" ",
    toLowercase=True,
)

In [46]:
# Apply the regex tokenizer to the descriptions and show 20 rows untruncated.
rt.transform(sales.select("Description")).show(20, False)

+-----------------------------------+------------------------------------------+
|Description                        |DescOut                                   |
+-----------------------------------+------------------------------------------+
|RABBIT NIGHT LIGHT                 |[rabbit, night, light]                    |
|DOUGHNUT LIP GLOSS                 |[doughnut, lip, gloss]                    |
|12 MESSAGE CARDS WITH ENVELOPES    |[12, message, cards, with, envelopes]     |
|BLUE HARMONICA IN BOX              |[blue, harmonica, in, box]                |
|GUMBALL COAT RACK                  |[gumball, coat, rack]                     |
|SKULLS  WATER TRANSFER TATTOOS     |[skulls, water, transfer, tattoos]        |
|FELTCRAFT GIRL AMELIE KIT          |[feltcraft, girl, amelie, kit]            |
|CAMOUFLAGE LED TORCH               |[camouflage, led, torch]                  |
|WHITE SKULL HOT WATER BOTTLE       |[white, skull, hot, water, bottle]        |
|ENGLISH ROSE HOT WATER BOTT

In [47]:
# COMMAND ----------
from pyspark.ml.feature import RegexTokenizer

In [48]:
# Same tokenizer but with gaps disabled: the pattern now describes the tokens
# themselves, which is why the recorded output contains only single spaces.
# This rebinds `rt` from the previous example.
rt = RegexTokenizer(
    inputCol="Description",
    outputCol="DescOut",
    pattern=" ",
    gaps=False,
    toLowercase=True,
)

In [49]:
# Apply the gaps-disabled tokenizer and show 20 rows untruncated.
rt.transform(sales.select("Description")).show(20, False)

+-----------------------------------+------------------+
|Description                        |DescOut           |
+-----------------------------------+------------------+
|RABBIT NIGHT LIGHT                 |[ ,  ]            |
|DOUGHNUT LIP GLOSS                 |[ ,  ,  ]         |
|12 MESSAGE CARDS WITH ENVELOPES    |[ ,  ,  ,  ]      |
|BLUE HARMONICA IN BOX              |[ ,  ,  ,  ]      |
|GUMBALL COAT RACK                  |[ ,  ]            |
|SKULLS  WATER TRANSFER TATTOOS     |[ ,  ,  ,  ,  ]   |
|FELTCRAFT GIRL AMELIE KIT          |[ ,  ,  ]         |
|CAMOUFLAGE LED TORCH               |[ ,  ]            |
|WHITE SKULL HOT WATER BOTTLE       |[ ,  ,  ,  ,  ]   |
|ENGLISH ROSE HOT WATER BOTTLE      |[ ,  ,  ,  ]      |
|HOT WATER BOTTLE KEEP CALM         |[ ,  ,  ,  ]      |
|SCOTTIE DOG HOT WATER BOTTLE       |[ ,  ,  ,  ]      |
|ROSE CARAVAN DOORSTOP              |[ ,  ]            |
|GINGHAM HEART  DOORSTOP RED        |[ ,  ,  ,  ]      |
|STORAGE TIN VINTAGE LEAF      

In [50]:
# COMMAND ----------
from pyspark.ml.feature import StopWordsRemover

# Remove the default English stop words from the tokenised descriptions.
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover(stopWords=englishStopWords, inputCol="DescOut")

In [51]:
from pyspark.sql.functions import col
# Boolean column expression selecting two specific product descriptions.
descCol = col("Description")
condi = (descCol == 'SET OF 4 KNICK KNACK TINS POPPIES') | (descCol == 'GROW A FLYTRAP OR SUNFLOWER IN TIN')

In [52]:
# Display the Column expression built above (nothing is evaluated against data here).
condi

Column<((Description = SET OF 4 KNICK KNACK TINS POPPIES) OR (Description = GROW A FLYTRAP OR SUNFLOWER IN TIN))>

In [53]:
# Show stop-word removal for the two selected descriptions only.
# The rows are filtered directly on `condi` instead of first materialising it
# as a temporary "filtered" column and dropping it again; the displayed
# columns (DescOut and the remover's output) are unchanged.
(
    stops.transform(tokenized)
    .where(condi)
    .drop("Description")
    .show(10, False)
)

+------------------------------------------+---------------------------------------------+
|DescOut                                   |StopWordsRemover_4a2bbde8faf8b270ceac__output|
+------------------------------------------+---------------------------------------------+
|[set, of, 4, knick, knack, tins, poppies] |[set, 4, knick, knack, tins, poppies]        |
|[grow, a, flytrap, or, sunflower, in, tin]|[grow, flytrap, sunflower, tin]              |
|[set, of, 4, knick, knack, tins, poppies] |[set, 4, knick, knack, tins, poppies]        |
|[set, of, 4, knick, knack, tins, poppies] |[set, 4, knick, knack, tins, poppies]        |
|[set, of, 4, knick, knack, tins, poppies] |[set, 4, knick, knack, tins, poppies]        |
|[set, of, 4, knick, knack, tins, poppies] |[set, 4, knick, knack, tins, poppies]        |
|[set, of, 4, knick, knack, tins, poppies] |[set, 4, knick, knack, tins, poppies]        |
|[grow, a, flytrap, or, sunflower, in, tin]|[grow, flytrap, sunflower, tin]              |

In [54]:
# COMMAND ----------
from pyspark.ml.feature import NGram

# N-gram transformers over the token column: n=1 (unigrams) and n=2 (bigrams).
unigram = NGram(n=1, inputCol="DescOut")
bigram = NGram(n=2, inputCol="DescOut")

In [55]:
# Unigrams: show 10 rows of tokens next to their 1-grams, untruncated.
unigram.transform(tokenized.select("DescOut")).show(10,False)

+-------------------------------------+-------------------------------------+
|DescOut                              |NGram_4969b5c9149c0f2fb385__output   |
+-------------------------------------+-------------------------------------+
|[rabbit, night, light]               |[rabbit, night, light]               |
|[doughnut, lip, gloss]               |[doughnut, lip, gloss]               |
|[12, message, cards, with, envelopes]|[12, message, cards, with, envelopes]|
|[blue, harmonica, in, box]           |[blue, harmonica, in, box]           |
|[gumball, coat, rack]                |[gumball, coat, rack]                |
|[skulls, , water, transfer, tattoos] |[skulls, , water, transfer, tattoos] |
|[feltcraft, girl, amelie, kit]       |[feltcraft, girl, amelie, kit]       |
|[camouflage, led, torch]             |[camouflage, led, torch]             |
|[white, skull, hot, water, bottle]   |[white, skull, hot, water, bottle]   |
|[english, rose, hot, water, bottle]  |[english, rose, hot, wate

In [56]:
# Bigrams: show 10 rows of tokens next to their 2-grams, untruncated.
bigram.transform(tokenized.select("DescOut")).show(10,False)

+-------------------------------------+-------------------------------------------------------+
|DescOut                              |NGram_4c60ad3e57de70c4af3d__output                     |
+-------------------------------------+-------------------------------------------------------+
|[rabbit, night, light]               |[rabbit night, night light]                            |
|[doughnut, lip, gloss]               |[doughnut lip, lip gloss]                              |
|[12, message, cards, with, envelopes]|[12 message, message cards, cards with, with envelopes]|
|[blue, harmonica, in, box]           |[blue harmonica, harmonica in, in box]                 |
|[gumball, coat, rack]                |[gumball coat, coat rack]                              |
|[skulls, , water, transfer, tattoos] |[skulls ,  water, water transfer, transfer tattoos]    |
|[feltcraft, girl, amelie, kit]       |[feltcraft girl, girl amelie, amelie kit]              |
|[camouflage, led, torch]             |[

In [57]:
# COMMAND ----------
from pyspark.ml.feature import CountVectorizer

In [58]:
# Count vectoriser over the tokens: vocabulary capped at 500 terms, minTF = 1, minDF = 2.
cv = CountVectorizer(
    inputCol="DescOut",
    outputCol="countVec",
    vocabSize=500,
    minTF=1,
    minDF=2,
)

In [59]:
# Fit the vocabulary on the tokenised descriptions, then show 20 count vectors untruncated.
fittedCV = cv.fit(tokenized)
countVectors = fittedCV.transform(tokenized).drop("Description")
countVectors.show(20, truncate=False)

+------------------------------------------+---------------------------------------------------+
|DescOut                                   |countVec                                           |
+------------------------------------------+---------------------------------------------------+
|[rabbit, night, light]                    |(500,[149,185,212],[1.0,1.0,1.0])                  |
|[doughnut, lip, gloss]                    |(500,[462,463,491],[1.0,1.0,1.0])                  |
|[12, message, cards, with, envelopes]     |(500,[35,41,166],[1.0,1.0,1.0])                    |
|[blue, harmonica, in, box]                |(500,[10,16,36,352],[1.0,1.0,1.0,1.0])             |
|[gumball, coat, rack]                     |(500,[228,281,407],[1.0,1.0,1.0])                  |
|[skulls, , water, transfer, tattoos]      |(500,[11,40,133],[1.0,1.0,1.0])                    |
|[feltcraft, girl, amelie, kit]            |(500,[60,64,69],[1.0,1.0,1.0])                     |
|[camouflage, led, torch]     

In [60]:
# COMMAND ----------
# TF-IDF input: up to 10 token lists that contain the word 'red'.
tfIdfIn = (
    tokenized
    .where("array_contains(DescOut, 'red')")
    .select("DescOut")
    .limit(10)
)

In [61]:
# Display the TF-IDF input rows untruncated.
tfIdfIn.show(10, False)

+--------------------------------------+
|DescOut                               |
+--------------------------------------+
|[wooden, advent, calendar, red]       |
|[wrap, red, vintage, doily]           |
|[alarm, clock, bakelike, red]         |
|[alarm, clock, bakelike, red]         |
|[red, diner, wall, clock]             |
|[red, apples, chopping, board]        |
|[red, kitchen, scales]                |
|[airline, bag, vintage, jet, set, red]|
|[red, retrospot, charlotte, bag]      |
|[pin, cushion, babushka, red]         |
+--------------------------------------+



In [62]:
# COMMAND ----------
from pyspark.ml.feature import HashingTF, IDF

# Hash the tokens into a 10000-dimensional term-frequency vector.
tf = HashingTF(numFeatures=10000, inputCol="DescOut", outputCol="TFOut")
    

In [63]:
# IDF weighting of the hashed term frequencies, with minDocFreq set to 2.
idf = IDF(minDocFreq=2, inputCol="TFOut", outputCol="IDFOut")

In [64]:
# COMMAND ----------
# Compute the hashed term frequencies once and reuse them for both fitting the
# IDF model and applying it. The input comes from an unordered limit(10), so
# the frame is cached to keep the fit and the transform on the same rows and
# to avoid writing (and planning) the TF step twice.
tfOut = tf.transform(tfIdfIn).cache()
idf.fit(tfOut).transform(tfOut).drop("DescOut").show(10, False)

+----------------------------------------------------------------+----------------------------------------------------------------+
|TFOut                                                           |IDFOut                                                          |
+----------------------------------------------------------------+----------------------------------------------------------------+
|(10000,[829,2311,4291,8242],[1.0,1.0,1.0,1.0])                  |(10000,[829,2311,4291,8242],[0.0,0.0,0.0,0.0])                  |
|(10000,[792,4291,8150,9749],[1.0,1.0,1.0,1.0])                  |(10000,[792,4291,8150,9749],[0.0,0.0,0.0,0.0])                  |
|(10000,[4291,4852,4995,9668],[1.0,1.0,1.0,1.0])                 |(10000,[4291,4852,4995,9668],[0.0,0.0,0.0,0.0])                 |
|(10000,[4291,4852,4995,9668],[1.0,1.0,1.0,1.0])                 |(10000,[4291,4852,4995,9668],[0.0,0.0,0.0,0.0])                 |
|(10000,[4291,6404,8873,9668],[1.0,1.0,1.0,1.0])                 |(10000,[42

In [65]:
# COMMAND ----------
from pyspark.ml.feature import Word2Vec

In [66]:
# Input data: one row per sentence, each holding the sentence's words as a
# list in a single "text" column.
sentences = [
    "Hi I heard about Spark",
    "I wish Java could use case classes",
    "Logistic regression models are neat",
]
documentDF = spark.createDataFrame(
    [(sentence.split(" "), ) for sentence in sentences],
    ["text"],
)

In [67]:
# Display the word lists untruncated.
documentDF.show(5, False)

+------------------------------------------+
|text                                      |
+------------------------------------------+
|[Hi, I, heard, about, Spark]              |
|[I, wish, Java, could, use, case, classes]|
|[Logistic, regression, models, are, neat] |
+------------------------------------------+



In [68]:
# Learn a word -> vector mapping (3-dimensional, minCount 0) from the "text"
# column, then add one "result" vector per document.
word2Vec = (
    Word2Vec()
    .setVectorSize(3)
    .setMinCount(0)
    .setInputCol("text")
    .setOutputCol("result")
)
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)

In [69]:
# Display each document next to its Word2Vec result vector, untruncated.
result.show(10, False)

+------------------------------------------+-----------------------------------------------------------------+
|text                                      |result                                                           |
+------------------------------------------+-----------------------------------------------------------------+
|[Hi, I, heard, about, Spark]              |[0.007668816111981869,-0.03736278116703034,0.017676191776990893] |
|[I, wish, Java, could, use, case, classes]|[-0.017213788947888782,0.030650552528511198,0.046941662672907114]|
|[Logistic, regression, models, are, neat] |[0.10106341168284416,-0.04297380540519953,0.005813928321003914]  |
+------------------------------------------+-----------------------------------------------------------------+



In [70]:
# Bring the rows to the driver and print each document with its vector.
# Each row has exactly two fields (text, result), so it unpacks directly.
for text, vector in result.collect():
    joinedWords = ", ".join(text)
    print("Text: [%s] => \nVector: %s\n" % (joinedWords, str(vector)))

Text: [Hi, I, heard, about, Spark] => 
Vector: [0.00766881611198,-0.037362781167,0.017676191777]

Text: [I, wish, Java, could, use, case, classes] => 
Vector: [-0.0172137889479,0.0306505525285,0.0469416626729]

Text: [Logistic, regression, models, are, neat] => 
Vector: [0.101063411683,-0.0429738054052,0.005813928321]



In [71]:
# COMMAND ----------
from pyspark.ml.feature import PCA

# PCA over the "features" vector column with k=2 components.
# No outputCol is set, so the output column gets an auto-generated name
# (visible in the table header below).
pca = PCA(k=2, inputCol="features")
pca.fit(scaleDF).transform(scaleDF).show(n=20, truncate=False)

+---+--------------+------------------------------------------+
|id |features      |PCA_4a8195a7120ed05eba92__output          |
+---+--------------+------------------------------------------+
|0  |[1.0,0.1,-1.0]|[0.07137194992484153,-0.45266548881478463]|
|1  |[2.0,1.1,1.0] |[-1.6804946984073725,1.2593401322219144]  |
|0  |[1.0,0.1,-1.0]|[0.07137194992484153,-0.45266548881478463]|
|1  |[2.0,1.1,1.0] |[-1.6804946984073725,1.2593401322219144]  |
|1  |[3.0,10.1,3.0]|[-10.872398139848944,0.030962697060149758]|
+---+--------------+------------------------------------------+



In [72]:
# COMMAND ----------

from pyspark.ml.feature import PolynomialExpansion

# Degree-2 polynomial expansion of the "features" vector column.
# PolynomialExpansion is a plain transformer, so there is no fit step.
pe = PolynomialExpansion(degree=2, inputCol="features")
pe.transform(scaleDF).show(n=10, truncate=False)

+---+--------------+-----------------------------------------------------------------------------------+
|id |features      |PolynomialExpansion_425abd60463c5192118f__output                                   |
+---+--------------+-----------------------------------------------------------------------------------+
|0  |[1.0,0.1,-1.0]|[1.0,1.0,0.1,0.1,0.010000000000000002,-1.0,-1.0,-0.1,1.0]                          |
|1  |[2.0,1.1,1.0] |[2.0,4.0,1.1,2.2,1.2100000000000002,1.0,2.0,1.1,1.0]                               |
|0  |[1.0,0.1,-1.0]|[1.0,1.0,0.1,0.1,0.010000000000000002,-1.0,-1.0,-0.1,1.0]                          |
|1  |[2.0,1.1,1.0] |[2.0,4.0,1.1,2.2,1.2100000000000002,1.0,2.0,1.1,1.0]                               |
|1  |[3.0,10.1,3.0]|[3.0,9.0,10.1,30.299999999999997,102.00999999999999,3.0,9.0,30.299999999999997,9.0]|
+---+--------------+-----------------------------------------------------------------------------------+



In [73]:
# COMMAND ----------
from pyspark.ml.feature import ChiSqSelector, Tokenizer

In [74]:
# Tokenizer that reads "Description" and writes the tokens to "DescOut".
tkn = Tokenizer(inputCol="Description", outputCol="DescOut")

In [75]:
# Tokenize only the two columns needed downstream, then keep rows that
# have a customer id.
tokenized = (
    tkn.transform(sales.select("Description", "CustomerId"))
    .filter("CustomerId IS NOT NULL")
)

In [81]:
# Peek at the first few tokenized rows without truncating the columns.
tokenized.show(n=3, truncate=False)

+-------------------------------+----------+-------------------------------------+
|Description                    |CustomerId|DescOut                              |
+-------------------------------+----------+-------------------------------------+
|RABBIT NIGHT LIGHT             |14075.0   |[rabbit, night, light]               |
|DOUGHNUT LIP GLOSS             |14075.0   |[doughnut, lip, gloss]               |
|12 MESSAGE CARDS WITH ENVELOPES|14075.0   |[12, message, cards, with, envelopes]|
+-------------------------------+----------+-------------------------------------+
only showing top 3 rows



In [76]:
# Apply the previously fitted model `fittedCV` (defined earlier in the
# notebook) to the tokenized rows and keep rows with a customer id.
# NOTE(review): `tokenized` is already filtered on CustomerId when it is
# built, so this filter looks redundant -- confirm before removing.
prechi = (
    fittedCV.transform(tokenized)
    .filter("CustomerId IS NOT NULL")
)

In [82]:
# Peek at the rows that will be fed to the chi-squared selector.
prechi.show(n=3, truncate=False)

+-------------------------------+----------+-------------------------------------+---------------------------------+
|Description                    |CustomerId|DescOut                              |countVec                         |
+-------------------------------+----------+-------------------------------------+---------------------------------+
|RABBIT NIGHT LIGHT             |14075.0   |[rabbit, night, light]               |(500,[149,185,212],[1.0,1.0,1.0])|
|DOUGHNUT LIP GLOSS             |14075.0   |[doughnut, lip, gloss]               |(500,[462,463,491],[1.0,1.0,1.0])|
|12 MESSAGE CARDS WITH ENVELOPES|14075.0   |[12, message, cards, with, envelopes]|(500,[35,41,166],[1.0,1.0,1.0])  |
+-------------------------------+----------+-------------------------------------+---------------------------------+
only showing top 3 rows



In [77]:
# Chi-squared feature selector: features come from "countVec", the label
# is "CustomerId", and only the top 2 features are kept.
chisq = ChiSqSelector(
    numTopFeatures=2,
    featuresCol="countVec",
    labelCol="CustomerId",
)

In [78]:
 chisq.fit(prechi).transform(prechi)\
    .drop("customerId", "Description", "DescOut").show()

# Py4JJavaError: An error occurred while calling o929.fit.
# : java.lang.OutOfMemoryError: GC overhead limit exceeded
# .config("spark.driver.memory", "8g")\
# .config("spark.executor.memory", "3g")\

+--------------------+------------------------------------------+
|            countVec|ChiSqSelector_45949a22610dcebde5eb__output|
+--------------------+------------------------------------------+
|(500,[149,185,212...|                                 (2,[],[])|
|(500,[462,463,491...|                                 (2,[],[])|
|(500,[35,41,166],...|                                 (2,[],[])|
|(500,[10,16,36,35...|                                 (2,[],[])|
|(500,[228,281,407...|                                 (2,[],[])|
|(500,[11,40,133],...|                                 (2,[],[])|
|(500,[60,64,69],[...|                                 (2,[],[])|
|   (500,[264],[1.0])|                                 (2,[],[])|
|(500,[15,34,39,40...|                                 (2,[],[])|
|(500,[34,39,40,46...|                                 (2,[],[])|
|(500,[34,39,40,14...|                                 (2,[],[])|
|(500,[34,39,40,14...|                                 (2,[],[])|
|(500,[46,

In [79]:
# COMMAND ----------
# Fit the PCA estimator configured earlier and persist the fitted model,
# replacing anything already saved at the target path.
fittedPCAPath = "/user/kranthidr/savedModels/fittedPCA"
fittedPCA = pca.fit(scaleDF)
fittedPCA.write().overwrite().save(fittedPCAPath)

In [80]:
# COMMAND ----------

from pyspark.ml.feature import PCAModel

# Reload the saved PCA model from storage and apply it to scaleDF.
loadedPCA = PCAModel.load("/user/kranthidr/savedModels/fittedPCA")
loadedPCA.transform(scaleDF).show(n=5, truncate=False)

+---+--------------+------------------------------------------+
|id |features      |PCA_4a8195a7120ed05eba92__output          |
+---+--------------+------------------------------------------+
|0  |[1.0,0.1,-1.0]|[0.07137194992484153,-0.45266548881478463]|
|1  |[2.0,1.1,1.0] |[-1.6804946984073725,1.2593401322219144]  |
|0  |[1.0,0.1,-1.0]|[0.07137194992484153,-0.45266548881478463]|
|1  |[2.0,1.1,1.0] |[-1.6804946984073725,1.2593401322219144]  |
|1  |[3.0,10.1,3.0]|[-10.872398139848944,0.030962697060149758]|
+---+--------------+------------------------------------------+

