In [None]:
from pyspark.ml.feature import StringIndexer

In [2]:
df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "category"])

df.show()

+---+--------+
| id|category|
+---+--------+
|  0|       a|
|  1|       b|
|  2|       c|
|  3|       a|
|  4|       a|
|  5|       c|
+---+--------+



In [4]:
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
indexed = indexer.fit(df)\
        .transform(df)
indexed.show()

+---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
|  0|       a|          0.0|
|  1|       b|          2.0|
|  2|       c|          1.0|
|  3|       a|          0.0|
|  4|       a|          0.0|
|  5|       c|          1.0|
+---+--------+-------------+



In [11]:
from pyspark.ml.feature import OneHotEncoder

In [13]:
OneHotEncoder?

In [23]:
df = spark.createDataFrame([
    (0.0, 1.0),
    (1.0, 0.0),
    (2.0, 1.0),
    (0.0, 2.0),
    (0.0, 1.0),
    (2.0, 0.0)
], ["categoryIndex1", "categoryIndex2"])

encoder = OneHotEncoder(inputCol="categoryIndex1",
                        outputCol="categoryVec1",
                        dropLast=False
                       )
encoded = encoder.transform(df)
encoded.show()

+--------------+--------------+-------------+
|categoryIndex1|categoryIndex2| categoryVec1|
+--------------+--------------+-------------+
|           0.0|           1.0|(3,[0],[1.0])|
|           1.0|           0.0|(3,[1],[1.0])|
|           2.0|           1.0|(3,[2],[1.0])|
|           0.0|           2.0|(3,[0],[1.0])|
|           0.0|           1.0|(3,[0],[1.0])|
|           2.0|           0.0|(3,[2],[1.0])|
+--------------+--------------+-------------+



In [24]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [26]:
dataset = spark.createDataFrame(
    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
    ["id", "hour", "mobile", "userFeatures", "clicked"])

assembler = VectorAssembler(
    inputCols=["hour", "mobile", "userFeatures"],
    outputCol="features")

output = assembler.transform(dataset)
output.show(truncate=False)

+---+----+------+--------------+-------+-----------------------+
|id |hour|mobile|userFeatures  |clicked|features               |
+---+----+------+--------------+-------+-----------------------+
|0  |18  |1.0   |[0.0,10.0,0.5]|1.0    |[18.0,1.0,0.0,10.0,0.5]|
+---+----+------+--------------+-------+-----------------------+



In [28]:
output.take(1)

[Row(id=0, hour=18, mobile=1.0, userFeatures=DenseVector([0.0, 10.0, 0.5]), clicked=1.0, features=DenseVector([18.0, 1.0, 0.0, 10.0, 0.5]))]

In [27]:
output.printSchema()

root
 |-- id: long (nullable = true)
 |-- hour: long (nullable = true)
 |-- mobile: double (nullable = true)
 |-- userFeatures: vector (nullable = true)
 |-- clicked: double (nullable = true)
 |-- features: vector (nullable = true)



In [30]:
from pyspark.ml.feature import MinMaxScaler

In [33]:
dataFrame = spark.createDataFrame([
    (0, Vectors.dense([1.0, 0.1, -1.0]),),
    (1, Vectors.dense([2.0, 1.1, 1.0]),),
    (2, Vectors.dense([3.0, 10.1, 3.0]),)
], ["id", "features"])

scaler = MinMaxScaler(inputCol="features", 
                      outputCol="scaledFeatures")

scalerModel = scaler.fit(dataFrame)
scaledData = scalerModel.transform(dataFrame)

scaledData.select("features", "scaledFeatures").show()

+--------------+--------------+
|      features|scaledFeatures|
+--------------+--------------+
|[1.0,0.1,-1.0]| [0.0,0.0,0.0]|
| [2.0,1.1,1.0]| [0.5,0.1,0.5]|
|[3.0,10.1,3.0]| [1.0,1.0,1.0]|
+--------------+--------------+



In [34]:
scaledData.take(1)

[Row(id=0, features=DenseVector([1.0, 0.1, -1.0]), scaledFeatures=DenseVector([0.0, 0.0, 0.0]))]

In [48]:
spark.stop

<bound method SparkSession.stop of <pyspark.sql.session.SparkSession object at 0x7f9f2fbb46a0>>

In [49]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession

In [54]:
spark = SparkSession\
    .builder\
    .appName('Pipeline-ML')\
    .getOrCreate()

In [55]:
spark

In [58]:
pwd = !pwd
data_path = "file://" + list(pwd)[0] + '/data/ai/'
print(data_path)

file:///root/pyspark-book/data/ai/


In [126]:
df = spark.read.csv(data_path+"bank.csv", 
                    header=True, 
                    inferSchema=True)

In [113]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- deposit: string (nullable = true)



In [127]:
df = df.drop('day','month')
train_df, test_df = df.randomSplit([0.7, 0.3], seed = 666)

In [128]:
print("Train Dataset Num :", train_df.count())
print("Test Dataset Num :", test_df.count())

Train Dataset Num : 7821
Test Dataset Num : 3341


In [129]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler,OneHotEncoderEstimator
from pyspark.ml.classification import LogisticRegression 

In [101]:
# 运用StringIndexer 和 OneHotEncoderEstimator处理categorical columns 
# 声明stages的顺序和内容

In [102]:
categoricalColumns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
numericCols = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']

In [132]:
stages = []

for col in categoricalColumns:
    stringIndexer = StringIndexer(inputCol = col, outputCol = col + 'Index')
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[col + "classVec"])    
    stages += [stringIndexer, encoder]

label_stringIdx = StringIndexer(inputCol = 'deposit', outputCol = 'label')
stages += [label_stringIdx]

assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
stages += [lr]

In [133]:
pipeline = Pipeline(stages = stages)

In [135]:
pipeline.getStages()

[StringIndexer_8b3b68275586,
 OneHotEncoderEstimator_c64a03f5f9d3,
 StringIndexer_a0f4bc18c094,
 OneHotEncoderEstimator_f43d92145819,
 StringIndexer_aa394a87cc39,
 OneHotEncoderEstimator_688ee7027d18,
 StringIndexer_f1626192a44f,
 OneHotEncoderEstimator_19b52f34d58a,
 StringIndexer_3d6190b7cff4,
 OneHotEncoderEstimator_e2f68d5e2e42,
 StringIndexer_aa6f2bb4b051,
 OneHotEncoderEstimator_d1345f720427,
 StringIndexer_587b696899a9,
 OneHotEncoderEstimator_4506790eb53a,
 StringIndexer_6db25f74f00e,
 OneHotEncoderEstimator_4ab28a628c95,
 StringIndexer_96c5a7321a68,
 VectorAssembler_5a8be98fd15d,
 LogisticRegression_db2ac13a3ab6]

In [136]:
pipelineModel = pipeline.fit(train_df)

In [137]:
pipelineModel.stages[-1]

LogisticRegressionModel: uid = LogisticRegression_db2ac13a3ab6, numClasses = 2, numFeatures = 30

In [117]:
df_out = pipelineModel.transform(test_df)

In [122]:
df_out.select('features', 'label', 'rawPrediction', 'prediction', 'probability').show(10)

+--------------------+-----+--------------------+----------+--------------------+
|            features|label|       rawPrediction|prediction|         probability|
+--------------------+-----+--------------------+----------+--------------------+
|(30,[7,12,16,17,1...|  1.0|[-0.6298499099748...|       1.0|[0.34754457104364...|
|(30,[7,12,13,16,1...|  1.0|[-1.0056204029849...|       1.0|[0.26783781916319...|
|(30,[7,12,16,17,1...|  0.0|[-1.0679101456932...|       1.0|[0.25580071979005...|
|(30,[7,12,16,17,1...|  1.0|[-3.8353272081235...|       1.0|[0.02113781576069...|
|(30,[7,12,13,16,1...|  1.0|[-0.6550647423974...|       1.0|[0.34184911852098...|
|(30,[1,12,13,16,1...|  0.0|[-1.7658964151801...|       1.0|[0.14605339240666...|
|(30,[7,12,15,16,1...|  1.0|[-2.9785282464856...|       1.0|[0.04840537602580...|
|(30,[7,12,13,16,1...|  1.0|[-3.7014900436566...|       1.0|[0.02409196334164...|
|(30,[7,12,13,16,1...|  1.0|[-0.5852110096154...|       1.0|[0.35773442391132...|
|(30,[7,12,13,16

In [123]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [124]:
evaluator = BinaryClassificationEvaluator()

In [125]:
print('AUC: Logistic Regression   : ', evaluator.evaluate(df_out))

AUC: Logistic Regression   :  0.8835721328251214


In [140]:
BinaryClassificationEvaluator?

In [None]:
rawPredictionCol='rawPrediction', labelCol='label', metricName='areaUnderROC'