# PySpark Pipelines

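With a `Pipeline`, we first instantiate each stage (the feature transformers and the model) and then let the pipeline apply them in order: `fit` trains every stage on the training data, and `transform` runs the fitted stages on new data.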

## Importing

In [1]:
# findspark has to run *before* the first `import pyspark`: it locates the
# Spark installation and adds its Python libraries to sys.path, which is what
# makes the pyspark imports below resolvable.
import findspark

findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse an already-running session if one exists, otherwise start a new one
# registered under the application name "pipeline".
spark = SparkSession.builder.appName("pipeline").getOrCreate()

In [2]:
from pyspark.ml.feature    import RFormula, VectorAssembler, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml import Pipeline

## Loading Data

In [3]:
# Read the iris CSV: comma-separated, first row holds the column names,
# and column types are inferred from the values.
data = (
    spark.read
    .format("csv")
    .option("sep", ",")
    .option("header", True)
    .option("inferSchema", True)
    .load("../../data/iris.csv")
)

# Roughly 70 % of the rows go to training and 30 % to testing; the split is seeded.
dataTrain, dataTest = data.randomSplit([0.7, 0.3], seed=11)

## Preparing Data

Combine the four feature columns into a single vector column, which is the input format Spark ML estimators expect.

In [4]:
# Stage 1: pack the four measurement columns into one vector column named "features".
asb = (
    VectorAssembler()
    .setInputCols(["sepallength", "sepalwidth", "petallength", "petalwidth"])
    .setOutputCol("features")
)

In [5]:
# Stage 2: encode the string label column "class" as a numeric index column "target".
ind = (
    StringIndexer()
    .setInputCol("class")
    .setOutputCol("target")
)

## Model Development

In [6]:
# Stage 3: feed-forward neural network classifier.
# `layers` lists the layer sizes from input to output: 4 inputs (one per
# assembled feature), two hidden layers of 5 and 4 units, and 3 outputs
# (one per class in the label column).
clf = MultilayerPerceptronClassifier(
    featuresCol="features",
    labelCol="target",
    maxIter=1000,
    layers=[4, 5, 4, 3],
    # Fix the seed used for weight initialisation so that re-running the
    # notebook trains the same model; same value as the train/test split seed.
    seed=11,
)

## Running the Pipeline

In [7]:
# Chain the stages in execution order:
# assemble features -> index labels -> train the classifier.
pipeline = Pipeline().setStages([asb, ind, clf])

# Fit every stage on the training split; the result is the fitted pipeline model.
model = pipeline.fit(dataTrain)

## Predicting on Test Set

In [8]:
# Run the fitted pipeline on the held-out split.
prediction = model.transform(dataTest)

# Preview the first five rows with full (untruncated) column contents.
prediction.select(["features", "class", "probability"]).show(n=5, truncate=False)

+-----------------+-----------+----------------------------------------------------+
|features         |class      |probability                                         |
+-----------------+-----------+----------------------------------------------------+
|[4.6,3.1,1.5,0.2]|Iris-setosa|[5.6229320802346665E-136,9.185091099379931E-188,1.0]|
|[4.6,3.2,1.4,0.2]|Iris-setosa|[5.6229320802346665E-136,9.185091099379931E-188,1.0]|
|[4.8,3.0,1.4,0.1]|Iris-setosa|[5.6229320802346665E-136,9.185091099379931E-188,1.0]|
|[4.8,3.0,1.4,0.3]|Iris-setosa|[5.6229320802346665E-136,9.185091099379931E-188,1.0]|
|[4.8,3.1,1.6,0.2]|Iris-setosa|[5.6229320802346665E-136,9.185091099379931E-188,1.0]|
+-----------------+-----------+----------------------------------------------------+
only showing top 5 rows



## Model Evaluation

In [9]:
# Evaluator that compares the indexed label ("target") with the model's
# "prediction" column and reports plain accuracy.
performance = (
    MulticlassClassificationEvaluator()
    .setLabelCol("target")
    .setPredictionCol("prediction")
    .setMetricName("accuracy")
)

# Accuracy of the fitted pipeline on the test-set predictions.
acc = performance.evaluate(prediction)

In [10]:
# Show the test-set accuracy computed by the evaluator above.
print(acc)

1.0
