In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session named "IrisClassification", then read the
# iris CSV, treating the first row as a header and inferring column types.
spark = (
    SparkSession.builder
    .appName("IrisClassification")
    .getOrCreate()
)
df_iris = spark.read.options(header=True, inferSchema=True).csv("data/iris.csv")


In [None]:
# Encode the string target column "species" as a numeric "label" column.
# Any "label" column left over from a previous run of this cell is dropped
# first (drop() is a no-op when the column is absent), so the cell can be
# re-run without StringIndexer failing on an already-existing output column.
indexer = StringIndexer(inputCol="species", outputCol="label")
df_iris_unlabeled = df_iris.drop("label")
df_iris = indexer.fit(df_iris_unlabeled).transform(df_iris_unlabeled)
df_iris.show()

In [None]:
# Pack the four measurement columns into a single "features" vector column and
# configure a logistic-regression classifier that reads "features" and is
# trained against the "label" column.
feature_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
classifier = LogisticRegression(featuresCol="features", labelCol="label")

In [None]:
# Chain feature assembly and classification into one two-stage pipeline so
# both steps are fitted and applied together.
pipeline_stages = [assembler, classifier]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Split the data into training (~70%) and test (~30%) sets.
# A fixed seed is passed so that the split -- and therefore the accuracy
# reported further down -- is repeatable across "Restart & Run All" runs
# instead of changing on every execution.
SPLIT_SEED = 42
trainingData, testData = df_iris.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [None]:
# Train every pipeline stage on the training split; the result is a fitted
# pipeline model that can transform new rows.
model = pipeline.fit(dataset=trainingData)

In [None]:
# Run the fitted pipeline over the held-out test split to obtain predictions.
predictions = model.transform(dataset=testData)

In [None]:
# Measure accuracy on the test-set predictions by comparing the "prediction"
# column against the true "label" column.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

In [None]:
# Classify one hand-entered flower measurement and print its species name.
single = spark.createDataFrame(
    [(5.7, 3.8, 1.7, 0.3)],
    ["sepal_length", "sepal_width", "petal_length", "petal_width"],
)
singlePred = model.transform(single).select("prediction").first()[0]

# Translate the predicted class index back to a species name using the label
# list that StringIndexer stored in the "label" column's metadata, rather than
# a hand-written index -> name table. The old table misspelled two species and
# depended on the indexer happening to assign indices in a particular order.
# The printed name is the species string exactly as it appears in the dataset.
species_labels = df_iris.schema["label"].metadata["ml_attr"]["vals"]
predict = species_labels[int(singlePred)]
print("Prediction:", predict)


In [None]:
# Stop the Spark session that was created at the top of the notebook.
spark.stop()