In [1]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import DataFrameReader, SQLContext
from sqlalchemy import create_engine
import pandas as pd
spark= SparkSession.builder.getOrCreate()
spark
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

In [2]:
import urllib.request
url = 'https://github.com/LeonardoPalaciosPando1996/Database/blob/Master/a2.parquet?raw=true'
filename = 'a2.parquet'
urllib.request.urlretrieve(url, filename)

('a2.parquet', <http.client.HTTPMessage at 0x203e79f6f10>)

In [3]:
df = spark.read.parquet('a2.parquet')
df.createOrReplaceTempView('df')
spark.sql("SELECT * from df").show()

+-----+-----------+-------------------+-------------------+-------------------+
|CLASS|   SENSORID|                  X|                  Y|                  Z|
+-----+-----------+-------------------+-------------------+-------------------+
|    0|         26| 380.66434005495194| -139.3470983812975|-247.93697521077704|
|    0|         29| 104.74324299209692| -32.27421440203938|-25.105013725863852|
|    0| 8589934658| 118.11469236129976| 45.916682927433534| -87.97203782706572|
|    0|34359738398| 246.55394030642543|-0.6122810693132044|-398.18662513951506|
|    0|17179869241|-190.32584900181487|  234.7849657520335|-206.34483804019288|
|    0|25769803830| 178.62396382387422| -47.07529438881511|  84.38310769821979|
|    0|25769803831|  85.03128805189493|-4.3024316644854546|-1.1841857567516714|
|    0|34359738411| 26.786262674736566| -46.33193951911338| 20.880756008396055|
|    0| 8589934592|-16.203752396859194| 51.080957032176954| -96.80526656416971|
|    0|25769803852|   47.2048142440404| 

In [4]:
splits = df.randomSplit([0.8, 0.2])
df_train = splits[0]
df_test = splits[1]

Now it’s time to have a look at the recorded sensor data. You should see data similar to the one exemplified below….


Please create a VectorAssembler which consumes columns X, Y and Z and produces a column “features”


In [5]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler

indexer = StringIndexer(inputCol="CLASS", outputCol="label")
vectorAssembler = VectorAssembler(inputCols=["X","Y","Z"],
                                  outputCol="features")

normalizer = MinMaxScaler(inputCol="features", outputCol="features_norm")

Please instantiate a classifier from the SparkML package and assign it to the classifier variable. Make sure to either
1.	Rename the “CLASS” column to “label” or
2.	Specify the label-column correctly to be “CLASS”


In [6]:
from pyspark.ml.classification import LogisticRegression
classifier = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

Let’s train and evaluate…


In [7]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[indexer,vectorAssembler, normalizer,classifier])

In [8]:
model = pipeline.fit(df_train)

In [9]:
prediction = model.transform(df_train)

In [10]:
prediction.show()

+-----+--------+-------------------+-------------------+-------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|CLASS|SENSORID|                  X|                  Y|                  Z|label|            features|       features_norm|       rawPrediction|         probability|prediction|
+-----+--------+-------------------+-------------------+-------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|    0|       0|-104.60513286125266|  -54.0262751326433| 345.25976930323344|  0.0|[-104.60513286125...|[0.29984225435296...|[0.15662190434040...|[0.53907563025210...|       0.0|
|    0|       0|  56.54732521345129| -7.980106018032676|  95.05162719436447|  0.0|[56.5473252134512...|[0.53235665727429...|[0.15662190434040...|[0.53907563025210...|       0.0|
|    0|       1| 15.798748332829806| -86.21159407546875|   85.2514617870864|  0.0|[15.7987483328298...|[0.4735

In [11]:
prediction.printSchema()

root
 |-- CLASS: integer (nullable = true)
 |-- SENSORID: long (nullable = true)
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- Z: double (nullable = true)
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- features_norm: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [12]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
binEval = MulticlassClassificationEvaluator().setMetricName("accuracy") .setPredictionCol("prediction").setLabelCol("label")
binEval.evaluate(prediction)

0.5390756302521008

Are you happy with the result? I’m happy with > 0.55.