# Spark-ml Preprocessing

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, StandardScaler, Bucketizer, \
                                Tokenizer, HashingTF, IDF

In [2]:
# Attach to the already-running Spark session, or start a new one if none exists.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Five rows of (id, dense 3-dimensional feature vector). The three vector
# components live on very different scales, which the scalers below address.
features_df = spark.createDataFrame(
    [
        (row_id, Vectors.dense(values))
        for row_id, values in enumerate(
            [
                [10.0, 10000.0, 1.0],
                [15.0, 30000.0, 2.0],
                [30.0, 40000.0, 3.0],
                [20.0, 35000.0, 1.0],
                [40.0, 20000.0, 2.0],
            ],
            start=1,  # ids run 1..5
        )
    ],
    schema=['id', 'features'],
)

features_df.show()

+---+------------------+
| id|          features|
+---+------------------+
|  1|[10.0,10000.0,1.0]|
|  2|[15.0,30000.0,2.0]|
|  3|[30.0,40000.0,3.0]|
|  4|[20.0,35000.0,1.0]|
|  5|[40.0,20000.0,2.0]|
+---+------------------+



### Min-Max-Scaling

In [4]:
# Fit on features_df to learn each feature's minimum and maximum, then apply the
# learned rescaling to the same rows; the result is added as column 'sfeatures'.
mmx_scaler = MinMaxScaler(
    inputCol='features',
    outputCol='sfeatures',
)
mmx_model = mmx_scaler.fit(features_df)
mmx_output = mmx_model.transform(features_df)

In [5]:
# Per-feature maximum observed in features_df during fit; it maps to 1.0 after scaling.
mmx_model.originalMax

DenseVector([40.0, 40000.0, 3.0])

In [6]:
# Per-feature minimum observed in features_df during fit; it maps to 0.0 after scaling.
mmx_model.originalMin

DenseVector([10.0, 10000.0, 1.0])

In [7]:
# Original vectors next to their min-max scaled versions. show() truncates long
# cells by default, so only the start of each scaled vector is visible.
mmx_output.show()

+---+------------------+--------------------+
| id|          features|           sfeatures|
+---+------------------+--------------------+
|  1|[10.0,10000.0,1.0]|       [0.0,0.0,0.0]|
|  2|[15.0,30000.0,2.0]|[0.16666666666666...|
|  3|[30.0,40000.0,3.0]|[0.66666666666666...|
|  4|[20.0,35000.0,1.0]|[0.33333333333333...|
|  5|[40.0,20000.0,2.0]|[1.0,0.3333333333...|
+---+------------------+--------------------+



### Standardization

In [8]:
# Standardize every feature: centre it on its mean (withMean) and divide by its
# standard deviation (withStd). Statistics are learned from features_df and then
# applied to the same rows; the result is added as column 'sfeatures'.
std_scaler = StandardScaler(
    inputCol='features',
    outputCol='sfeatures',
    withMean=True,
    withStd=True,
)
std_model = std_scaler.fit(features_df)
std_output = std_model.transform(features_df)

In [9]:
# Per-feature mean learned during fit.
std_model.mean

DenseVector([23.0, 27000.0, 1.8])

In [10]:
# Per-feature standard deviation learned during fit; the values shown below match
# the sample (n - 1) formula applied to the five input rows.
std_model.std

DenseVector([12.0416, 12041.5946, 0.8367])

In [11]:
# Original vectors next to their standardized versions. show() truncates long
# cells by default, so only the first component of each scaled vector is visible.
std_output.show()

+---+------------------+--------------------+
| id|          features|           sfeatures|
+---+------------------+--------------------+
|  1|[10.0,10000.0,1.0]|[-1.0795912380986...|
|  2|[15.0,30000.0,2.0]|[-0.6643638388299...|
|  3|[30.0,40000.0,3.0]|[0.58131835897617...|
|  4|[20.0,35000.0,1.0]|[-0.2491364395612...|
|  5|[40.0,20000.0,2.0]|[1.41177315751357...|
+---+------------------+--------------------+



### Bucketing

In [12]:
# Bucket boundaries: five split points delimit four buckets. The infinite end
# points ensure that every finite value falls into some bucket.
splits = [float("-inf"), -10, 0.0, 10.0, float("inf")]

# One-column DataFrame holding the values to be bucketed (one 1-tuple per row).
b_data = [(value,) for value in (-435.0, -53.0, -8.0, 2.0, 6.0, 27.0, 368.0)]
b_df = spark.createDataFrame(b_data, schema=["features"])
b_df.show()

+--------+
|features|
+--------+
|  -435.0|
|   -53.0|
|    -8.0|
|     2.0|
|     6.0|
|    27.0|
|   368.0|
+--------+



In [13]:
# Replace each value in 'features' by the index of the interval (as delimited by
# `splits`) that contains it; the index is written to the new column 'bfeatures'.
bck = Bucketizer(
    splits=splits,
    inputCol='features',
    outputCol='bfeatures',
)
bck_output = bck.transform(b_df)

In [14]:
# Each input value alongside the index of the bucket it was assigned to.
bck_output.show()

+--------+---------+
|features|bfeatures|
+--------+---------+
|  -435.0|      0.0|
|   -53.0|      0.0|
|    -8.0|      1.0|
|     2.0|      2.0|
|     6.0|      2.0|
|    27.0|      3.0|
|   368.0|      3.0|
+--------+---------+



### Tokenizer
Converts the input string to lowercase and then splits it on whitespace. Punctuation is not removed, so a token such as `hello,` keeps its trailing comma.

In [15]:
# Small text corpus: one (id, sentence) row per document, with ids running 1..5.
sentences_df = spark.createDataFrame(
    list(
        enumerate(
            [
                "This is an introduction to Spark ML",
                "The cat is on the table",
                "The dog is under the table",
                "The quick brown fox jumps over the lazy dog",
                "Hello, this is a spark dataframe",
            ],
            start=1,
        )
    ),
    schema=['id', 'sentence'],
)

sentences_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  1|This is an introd...|
|  2|The cat is on the...|
|  3|The dog is under ...|
|  4|The quick brown f...|
|  5|Hello, this is a ...|
+---+--------------------+



In [16]:
# Split every sentence into a list of lowercase words, stored in column 'words'.
# Tokenizer needs no fitting, so transform() is called on it directly.
token = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)
tokenizer_out = token.transform(sentences_df)

In [17]:
# Sentences next to their token lists. Note that punctuation stays attached to
# the token (see "hello," in the last row).
tokenizer_out.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  1|This is an introd...|[this, is, an, in...|
|  2|The cat is on the...|[the, cat, is, on...|
|  3|The dog is under ...|[the, dog, is, un...|
|  4|The quick brown f...|[the, quick, brow...|
|  5|Hello, this is a ...|[hello,, this, is...|
+---+--------------------+--------------------+



### TF-IDF
Term frequency–inverse document frequency (TF-IDF) is a numerical statistic intended to reflect how important a word is to a document in a collection or corpus. The TF-IDF value increases proportionally to the number of times a word appears in the document and is offset by the number of documents in the corpus that contain the word, which helps to adjust for the fact that some words appear more frequently in general. Below it is computed in two steps: `HashingTF` produces the raw term-frequency vectors, and `IDF` rescales them.

In [18]:
# Term frequencies via the hashing trick: every word is hashed to one of 32
# indices and occurrences per index are counted. With so few indices, distinct
# words may end up sharing one.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=32,
)
hashTF_df = hashingTF.transform(tokenizer_out)
hashTF_df.show()

+---+--------------------+--------------------+--------------------+
| id|            sentence|               words|         rawFeatures|
+---+--------------------+--------------------+--------------------+
|  1|This is an introd...|[this, is, an, in...|(32,[1,4,8,17,20,...|
|  2|The cat is on the...|[the, cat, is, on...|(32,[2,16,17,29,3...|
|  3|The dog is under ...|[the, dog, is, un...|(32,[15,17,29,30,...|
|  4|The quick brown f...|[the, quick, brow...|(32,[0,5,15,16,19...|
|  5|Hello, this is a ...|[hello,, this, is...|(32,[1,17,18,25,2...|
+---+--------------------+--------------------+--------------------+



In [19]:
# IDF is an estimator: fit() learns the inverse-document-frequency weights from
# the raw term-frequency vectors of the corpus, and transform() rescales those
# vectors with the learned weights into column 'idf_features'.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="idf_features",
)
idf_model = idf.fit(hashTF_df)
idf_out = idf_model.transform(hashTF_df)
idf_out.show()

+---+--------------------+--------------------+--------------------+--------------------+
| id|            sentence|               words|         rawFeatures|        idf_features|
+---+--------------------+--------------------+--------------------+--------------------+
|  1|This is an introd...|[this, is, an, in...|(32,[1,4,8,17,20,...|(32,[1,4,8,17,20,...|
|  2|The cat is on the...|[the, cat, is, on...|(32,[2,16,17,29,3...|(32,[2,16,17,29,3...|
|  3|The dog is under ...|[the, dog, is, un...|(32,[15,17,29,30,...|(32,[15,17,29,30,...|
|  4|The quick brown f...|[the, quick, brow...|(32,[0,5,15,16,19...|(32,[0,5,15,16,19...|
|  5|Hello, this is a ...|[hello,, this, is...|(32,[1,17,18,25,2...|(32,[1,17,18,25,2...|
+---+--------------------+--------------------+--------------------+--------------------+

