## <center>Indexing - StringIndexer</center>   字符串索引器

In [1]:
from pyspark.sql import SparkSession   # 与spark 进行连接
from pyspark.ml.feature import StringIndexer

In [2]:
#固定的写法，创建一个 spark session 代表跟spark 的连接
spark = SparkSession.builder.appName("index_vectorize").getOrCreate()

In [3]:
#创建一个表 spark.createDataFrame
df = spark.createDataFrame( 
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "category"])
df.show()

+---+--------+
| id|category|
+---+--------+
|  0|       a|
|  1|       b|
|  2|       c|
|  3|       a|
|  4|       a|
|  5|       c|
+---+--------+



In [10]:
# step1. create indexer object 
#  inputCol 输入变量是 category这一列    outputCol输出变量命名为  categoryIndex

# string 到index 的变换
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")

StringIndexer_be43b1c30803

In [6]:
# step2. fit to get indexer model - calculate what categorical value should be mapped to what index
# can be based on frequencey from high to low, or low to high
# another option alphabetic order from high to low, or low to high
# fit, in essence, is calculation. 

model_index = indexer.fit(df)  # fit()  背后的工作，扫描全表，然后排序就直接进行对应
                               #这一步的作用：扫描indexer全表，然后排序 与 0 1 2 3 直接进行对应

In [7]:
# step3. transform origin dataframe to indexed dataframe
df_indexed = model_index.transform(df) 

In [8]:
df_indexed.show()  #  扫描全表，然后排序就直接进行对应

+---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
|  0|       a|          0.0|
|  1|       b|          2.0|
|  2|       c|          1.0|
|  3|       a|          0.0|
|  4|       a|          0.0|
|  5|       c|          1.0|
+---+--------+-------------+



In [11]:
#把index 再变回去
from pyspark.ml.feature import IndexToString
i2s = IndexToString().setInputCol('categoryIndex')
res = i2s.transform(df_indexed)

In [12]:
res.show()

+---+--------+-------------+----------------------------------+
| id|category|categoryIndex|IndexToString_2dd48f059282__output|
+---+--------+-------------+----------------------------------+
|  0|       a|          0.0|                                 a|
|  1|       b|          2.0|                                 b|
|  2|       c|          1.0|                                 c|
|  3|       a|          0.0|                                 a|
|  4|       a|          0.0|                                 a|
|  5|       c|          1.0|                                 c|
+---+--------+-------------+----------------------------------+



## <center>Encoder - OneHotEncoder</center>

In [14]:
from pyspark.ml.feature import OneHotEncoderEstimator  # 一个类 当你把列款占城多个列的时候只有一个地方是 1

In [15]:
# step1. create encoder object
encoder = OneHotEncoderEstimator(inputCols=['categoryIndex'], outputCols=['categoryVec'])

In [16]:
# step2. fit to get encoder model
model_encoder = encoder.fit(df_indexed)

In [17]:
# step3. transform indexed dataframe to vectorized dataframe
df_encoded = model_encoder.transform(df_indexed)

In [19]:
df_encoded.show()   # 稀疏矩阵 (2,[0],[1.0]） 有两位  第0位有值 值为 1.0  [1.0, 0]

+---+--------+-------------+-------------+
| id|category|categoryIndex|  categoryVec|
+---+--------+-------------+-------------+
|  0|       a|          0.0|(2,[0],[1.0])|
|  1|       b|          2.0|    (2,[],[])|
|  2|       c|          1.0|(2,[1],[1.0])|
|  3|       a|          0.0|(2,[0],[1.0])|
|  4|       a|          0.0|(2,[0],[1.0])|
|  5|       c|          1.0|(2,[1],[1.0])|
+---+--------+-------------+-------------+



In [21]:
# 0.0 -> [1.0, 0]
# 1.0 -> [0, 1.0]
# 2.0 -> [0.0, 0.0]

## <center>Vectorization - VectorAssembler</center>

In [22]:
from pyspark.ml.feature import VectorAssembler  # 从spark 把这个类导入，这个类里面自动进行转化了

In [23]:
vectorizer = VectorAssembler(inputCols=['categoryVec'], outputCol="features")

In [24]:
df_vectorized = vectorizer.transform(df_encoded)

In [25]:
df_vectorized.show()  #  转变为一个  list  

+---+--------+-------------+-------------+---------+
| id|category|categoryIndex|  categoryVec| features|
+---+--------+-------------+-------------+---------+
|  0|       a|          0.0|(2,[0],[1.0])|[1.0,0.0]|
|  1|       b|          2.0|    (2,[],[])|(2,[],[])|
|  2|       c|          1.0|(2,[1],[1.0])|[0.0,1.0]|
|  3|       a|          0.0|(2,[0],[1.0])|[1.0,0.0]|
|  4|       a|          0.0|(2,[0],[1.0])|[1.0,0.0]|
|  5|       c|          1.0|(2,[1],[1.0])|[0.0,1.0]|
+---+--------+-------------+-------------+---------+



## <center>Try One More Feature</center>

In [26]:
df2 = spark.createDataFrame(
    [(0, "H"), (1, "M"), (2, "L"), (3, "H"), (4, "H"), (5, "L")],
    ["id", "category2"])
df2.show()

+---+---------+
| id|category2|
+---+---------+
|  0|        H|
|  1|        M|
|  2|        L|
|  3|        H|
|  4|        H|
|  5|        L|
+---+---------+



In [28]:
df_vectorized.show()

+---+--------+-------------+-------------+---------+
| id|category|categoryIndex|  categoryVec| features|
+---+--------+-------------+-------------+---------+
|  0|       a|          0.0|(2,[0],[1.0])|[1.0,0.0]|
|  1|       b|          2.0|    (2,[],[])|(2,[],[])|
|  2|       c|          1.0|(2,[1],[1.0])|[0.0,1.0]|
|  3|       a|          0.0|(2,[0],[1.0])|[1.0,0.0]|
|  4|       a|          0.0|(2,[0],[1.0])|[1.0,0.0]|
|  5|       c|          1.0|(2,[1],[1.0])|[0.0,1.0]|
+---+--------+-------------+-------------+---------+



In [29]:
df2 = df_vectorized.join(df2, 'id')

In [30]:
df2.show()

+---+--------+-------------+-------------+---------+---------+
| id|category|categoryIndex|  categoryVec| features|category2|
+---+--------+-------------+-------------+---------+---------+
|  0|       a|          0.0|(2,[0],[1.0])|[1.0,0.0]|        H|
|  5|       c|          1.0|(2,[1],[1.0])|[0.0,1.0]|        L|
|  1|       b|          2.0|    (2,[],[])|(2,[],[])|        M|
|  3|       a|          0.0|(2,[0],[1.0])|[1.0,0.0]|        H|
|  2|       c|          1.0|(2,[1],[1.0])|[0.0,1.0]|        L|
|  4|       a|          0.0|(2,[0],[1.0])|[1.0,0.0]|        H|
+---+--------+-------------+-------------+---------+---------+



In [33]:
# index
indexer2 = StringIndexer(inputCol="category2", outputCol="categoryIndex2")
model_index2 = indexer2.fit(df2)
df_indexed2 = model_index2.transform(df2)

In [34]:
df_indexed2.show()

+---+--------+-------------+-------------+---------+---------+--------------+
| id|category|categoryIndex|  categoryVec| features|category2|categoryIndex2|
+---+--------+-------------+-------------+---------+---------+--------------+
|  0|       a|          0.0|(2,[0],[1.0])|[1.0,0.0]|        H|           0.0|
|  5|       c|          1.0|(2,[1],[1.0])|[0.0,1.0]|        L|           1.0|
|  1|       b|          2.0|    (2,[],[])|(2,[],[])|        M|           2.0|
|  3|       a|          0.0|(2,[0],[1.0])|[1.0,0.0]|        H|           0.0|
|  2|       c|          1.0|(2,[1],[1.0])|[0.0,1.0]|        L|           1.0|
|  4|       a|          0.0|(2,[0],[1.0])|[1.0,0.0]|        H|           0.0|
+---+--------+-------------+-------------+---------+---------+--------------+



In [37]:
# encode 把categoryIndex2变成 稀疏矩阵
encoder2 = OneHotEncoderEstimator(inputCols=['categoryIndex2'], outputCols=['categoryVec2'])

model_encoder2 = encoder2.fit(df_indexed2)
df_encoded2 = model_encoder2.transform(df_indexed2)

In [38]:
df_encoded2.show()

+---+--------+-------------+-------------+---------+---------+--------------+-------------+
| id|category|categoryIndex|  categoryVec| features|category2|categoryIndex2| categoryVec2|
+---+--------+-------------+-------------+---------+---------+--------------+-------------+
|  0|       a|          0.0|(2,[0],[1.0])|[1.0,0.0]|        H|           0.0|(2,[0],[1.0])|
|  5|       c|          1.0|(2,[1],[1.0])|[0.0,1.0]|        L|           1.0|(2,[1],[1.0])|
|  1|       b|          2.0|    (2,[],[])|(2,[],[])|        M|           2.0|    (2,[],[])|
|  3|       a|          0.0|(2,[0],[1.0])|[1.0,0.0]|        H|           0.0|(2,[0],[1.0])|
|  2|       c|          1.0|(2,[1],[1.0])|[0.0,1.0]|        L|           1.0|(2,[1],[1.0])|
|  4|       a|          0.0|(2,[0],[1.0])|[1.0,0.0]|        H|           0.0|(2,[0],[1.0])|
+---+--------+-------------+-------------+---------+---------+--------------+-------------+



In [40]:
# vectorize features2代表了前面所有的表示
vectorizer2 = VectorAssembler(inputCols=['categoryVec', 'categoryVec2'], outputCol="features2")
df_vectorized2 = vectorizer2.transform(df_encoded2)

In [41]:
df_vectorized2.show()  # feature 这里的feature 跟之前一样放在 所有的值放在一个list 里面然后进行并行运算

+---+--------+-------------+-------------+---------+---------+--------------+-------------+-----------------+
| id|category|categoryIndex|  categoryVec| features|category2|categoryIndex2| categoryVec2|        features2|
+---+--------+-------------+-------------+---------+---------+--------------+-------------+-----------------+
|  0|       a|          0.0|(2,[0],[1.0])|[1.0,0.0]|        H|           0.0|(2,[0],[1.0])|[1.0,0.0,1.0,0.0]|
|  5|       c|          1.0|(2,[1],[1.0])|[0.0,1.0]|        L|           1.0|(2,[1],[1.0])|[0.0,1.0,0.0,1.0]|
|  1|       b|          2.0|    (2,[],[])|(2,[],[])|        M|           2.0|    (2,[],[])|        (4,[],[])|
|  3|       a|          0.0|(2,[0],[1.0])|[1.0,0.0]|        H|           0.0|(2,[0],[1.0])|[1.0,0.0,1.0,0.0]|
|  2|       c|          1.0|(2,[1],[1.0])|[0.0,1.0]|        L|           1.0|(2,[1],[1.0])|[0.0,1.0,0.0,1.0]|
|  4|       a|          0.0|(2,[0],[1.0])|[1.0,0.0]|        H|           0.0|(2,[0],[1.0])|[1.0,0.0,1.0,0.0]|
+---+-----