In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!tar xf spark-3.1.1-bin-hadoop2.7.tgz

In [None]:
!pip install -q findspark
!pip install pyspark



In [None]:
import os

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"

In [None]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.sql("select 'spark' as hello ")
df.show()

+-----+
|hello|
+-----+
|spark|
+-----+



In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.python.pyspark.shell import spark

# read input and show
df = spark.read.options(delimiter=',', header=True).csv('/content/cars.csv')
df = df.withColumn("WE", df["weight"] - 0).withColumn("label", df['origin'] - 0).withColumn("ACC", df["acceleration"] - 0).\
    withColumn("MOY", df["model year"] - 0).withColumn("HP", df["horsepower"] - 0)
df.show(5)

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.1.1
      /_/

Using Python version 3.7.10 (default, Feb 20 2021 21:17:23)
Spark context Web UI available at http://f8580c6550f5:4040
Spark context available as 'sc' (master = local[*], app id = local-1618845892932).
SparkSession available as 'spark'.
+---+---------+------------+----------+------+------------+----------+------+--------------------+------+-----+----+----+-----+
|mpg|cylinders|displacement|horsepower|weight|acceleration|model year|origin|            car name|    WE|label| ACC| MOY|   HP|
+---+---------+------------+----------+------+------------+----------+------+--------------------+------+-----+----+----+-----+
| 18|        8|         307|       130|  3504|          12|        70|     1|chevrolet chevell...|3504.0|  1.0|12.0|70.0|130.0|
| 15|        8|         350|       165|  3693|        11.5|        70|     1|   buick skylark 320|36

In [None]:
# combine column label and distance to new column name features
# https://spark.apache.org/docs/latest/ml-features#vectorindexer
merge_col = VectorAssembler(inputCols=["label", "MOY"], outputCol='features')
df = merge_col.transform(df)
df.select("features").show(5)

+----------+
|  features|
+----------+
|[1.0,70.0]|
|[1.0,70.0]|
|[1.0,70.0]|
|[1.0,70.0]|
|[1.0,70.0]|
+----------+
only showing top 5 rows



In [None]:
# create a column indexedLabel mark label with most frequency start from 0.0 (most frequency) to n(least frequency)
labelIndexerFrequency = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(df)
# use transform(df) to show, not use when run
# labelIndexerFrequency = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(df).transform(df)
# labelIndexerFrequency.show(5)

In [None]:
featureIndexerFrequency = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(df)

In [None]:
# https://spark.apache.org/docs/latest/ml-classification-regression.html#multilayer-perceptron-classifier
# split to 70-30
(trainingData, testData) = df.randomSplit([0.7, 0.3])
# specify layers for the neural network:
# input layer of size 2 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [2, 5, 4, 3]
# Train a RandomForest model.
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, labelCol="indexedLabel", featuresCol="indexedFeatures", blockSize=128, seed=1234)
# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=labelIndexerFrequency.labels)
# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexerFrequency, featureIndexerFrequency, trainer, labelConverter])
# Train model. This also runs the indexers.
model = pipeline.fit(trainingData)
# Make predictions.
predictions = model.transform(testData)
# Select example rows to display.
predictions.select("predictedLabel", "label", "features").show(5)
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Multilayer Perceptron - Test Accuracy = %g" % (accuracy))
print("Multilayer Perceptron - Test Error = %g" % (1.0 - accuracy))
mpModel = model.stages[2]
print(mpModel) # summary only

+--------------+-----+----------+
|predictedLabel|label|  features|
+--------------+-----+----------+
|           1.0|  1.0|[1.0,70.0]|
|           1.0|  1.0|[1.0,70.0]|
|           1.0|  1.0|[1.0,72.0]|
|           1.0|  1.0|[1.0,73.0]|
|           1.0|  1.0|[1.0,75.0]|
+--------------+-----+----------+
only showing top 5 rows

Multilayer Perceptron - Test Accuracy = 0.838983
Multilayer Perceptron - Test Error = 0.161017
MultilayerPerceptronClassificationModel: uid=MultilayerPerceptronClassifier_b6e84083ec14, numLayers=4, numClasses=3, numFeatures=2
