In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

# Labelled toy corpus used to fit the text-classification pipeline.
# Each row is (id, text, label); label is 1.0 or 0.0.
training = spark.createDataFrame(
    data=[
        (0, "a b c d e spark duo fuxi this is cool mike", 1.0),
        (1, "b d f i hate mike hate data ", 0.0),
        (2, "spark i love spark ddd example hadoop", 1.0),
        (3, "i love coding and ml", 0.0),
        (4, "i want to move fast", 1.0),
        (5, "Mike like student to ask question", 1.0),
        (6, "Mike hate people to ask hwo to import spark URL", 0.0),
    ],
    schema=["id", "text", "label"],
)

In [0]:
# Render the training DataFrame as a table to eyeball the rows before fitting.
display(training)

id,text,label
0,a b c d e spark duo fuxi this is cool mike,1.0
1,b d f i hate mike hate data,0.0
2,spark i love spark ddd example hadoop,1.0
3,i love coding and ml,0.0
4,i want to move fast,1.0
5,Mike like student to ask question,1.0
6,Mike hate people to ask hwo to import spark URL,0.0


In [0]:
# Stage 1: turn the raw "text" column into a list of tokens in "words".
tokenizer = Tokenizer(inputCol="text", outputCol="words")
# Stage 2: hash the tokens from stage 1 into a term-frequency vector in "features".
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
# Stage 3: logistic regression classifier, limited to 10 iterations with
# a regularization parameter of 0.001.
lr = LogisticRegression(maxIter=10, regParam=0.001)

# Build a pipeline that chains the three stages in order.
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
# The unfitted pipeline can be persisted and restored. Note the API is
# save()/load(), not saveAs(), and load() is called on the Pipeline class:
# pipeline.write().overwrite().save("location")
# pipeline = Pipeline.load("location")

In [0]:
# Fit only the feature-engineering stages (tokenizer + hashingTF) so that this
# model matches its name and the logistic regression is not trained a second
# time; the full pipeline, classifier included, is fitted separately as `model`.
# Transforming with this model yields the "words" and "features" columns only.
preprocess_model = Pipeline(stages=[tokenizer, hashingTF]).fit(training)

In [0]:
# Fit every stage of `pipeline` on the labelled training data; the result is
# the fitted model used below to score new documents.
model = pipeline.fit(training)
# To persist the fitted model use save() (there is no saveAs()):
# model.write().overwrite().save(directory)

In [0]:
# Unlabelled documents to score with the fitted pipeline: (id, text) only.
test = spark.createDataFrame(
    data=[
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "spark hadoop spark"),
        (7, "apache hadoop"),
    ],
    schema=["id", "text"],
)

In [0]:
# Render the unlabelled test DataFrame as a table.
display(test)

id,text
4,spark i j k
5,l m n
6,spark hadoop spark
7,apache hadoop


In [0]:
# Score the test documents with the fitted pipeline, then keep only the
# identifying columns plus the classifier's probability vector and label.
prediction = model.transform(test)
selected = prediction.select(
    ["id", "text", "probability", "prediction"]
)

In [0]:
# Run the test documents through `preprocess_model`; the output carries the
# intermediate "words" (tokenizer) and "features" (hashingTF) columns.
hash_results = preprocess_model.transform(test)

In [0]:
# Render the per-document probability vector and predicted label as a table.
display(selected)

id,text,probability,prediction
4,spark i j k,"Map(vectorType -> dense, length -> 2, values -> List(0.15933224223332482, 0.8406677577666752))",1.0
5,l m n,"Map(vectorType -> dense, length -> 2, values -> List(0.17275995885020773, 0.8272400411497922))",1.0
6,spark hadoop spark,"Map(vectorType -> dense, length -> 2, values -> List(0.01575278226492635, 0.9842472177350736))",1.0
7,apache hadoop,"Map(vectorType -> dense, length -> 2, values -> List(0.04224348512947865, 0.9577565148705214))",1.0


In [0]:
# Print one line per scored document.
# The loop target is named `predicted_label` rather than `prediction` so the
# loop does not overwrite the `prediction` DataFrame with a float; with the old
# name, re-running the cell that builds `selected` from `prediction` would fail
# after this cell had executed. The printed text is unchanged.
for rid, text, prob, predicted_label in selected.collect():
    print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), predicted_label))

(4, spark i j k) --> prob=[0.15933224223332482,0.8406677577666752], prediction=1.000000
(5, l m n) --> prob=[0.17275995885020773,0.8272400411497922], prediction=1.000000
(6, spark hadoop spark) --> prob=[0.01575278226492635,0.9842472177350736], prediction=1.000000
(7, apache hadoop) --> prob=[0.04224348512947865,0.9577565148705214], prediction=1.000000


In [0]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

In [0]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer
 
# Small frame with a single categorical column to demonstrate label indexing.
df = spark.createDataFrame(
    data=[(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    schema=["id", "category"],
)

# Map each distinct category string to a numeric index in "categoryIndex".
# In the output shown for this data: a -> 0.0, c -> 1.0, b -> 2.0.
stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
# NOTE(review): this rebinds `model`, which earlier held the fitted text
# pipeline; cells that score text with `model` will not work if re-run after
# this one. Consider a distinct name once downstream usage is confirmed.
model = stringIndexer.fit(df)
indexed = model.transform(df)
indexed.show()

# One-hot encoder configured to read the index column produced above; it is
# only constructed here, not applied.
encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(), outputCol="categoryVec")

+---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
|  0|       a|          0.0|
|  1|       b|          2.0|
|  2|       c|          1.0|
|  3|       a|          0.0|
|  4|       a|          0.0|
|  5|       c|          1.0|
+---+--------+-------------+

