In [1]:
# Bootstrap Spark for this kernel: findspark.init() is called before any
# pyspark import in the following cell.
# NOTE(review): per the findspark docs, init() locates the Spark installation
# (e.g. via SPARK_HOME) and adds its pyspark to sys.path — confirm for this environment.
import findspark
findspark.init()

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.python.pyspark.shell import spark

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.2.1
      /_/

Using Python version 3.6.5 (default, Mar 29 2018 13:32:41)
SparkSession available as 'spark'.


In [4]:
# Load the weather observations. No schema is supplied and schema inference is
# not enabled, so every CSV column arrives as a string.
weather_raw = spark.read.load('weatherdata.csv', format='csv', header=True, delimiter=',')

# Columns that are assembled into the feature vector later on; they must be numeric.
NUMERIC_COLUMNS = [
    'dewPoint', 'humidity', 'windSpeed', 'cloudCover',
    'temperatureMin', 'temperatureMax', 'uvIndex',
]

# Cast explicitly to double rather than relying on the implicit string -> double
# coercion triggered by `column - 0`. The resulting values are the same, but the
# intent is visible and the column list lives in one place.
# Values that cannot be parsed as numbers become null.
data = weather_raw
for column_name in NUMERIC_COLUMNS:
    data = data.withColumn(column_name, data[column_name].cast('double'))

In [5]:
# Pack the numeric weather columns into a single vector column named 'features'.
feature_columns = [
    'dewPoint', 'humidity', 'windSpeed', 'cloudCover',
    'temperatureMin', 'temperatureMax', 'uvIndex',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
data = assembler.transform(data)

# Fit the label indexer (precipType -> indexedLabel) on the assembled data.
label_indexer_stage = StringIndexer(inputCol='precipType', outputCol='indexedLabel')
labelIndexer = label_indexer_stage.fit(data)

# Fit the feature indexer (features -> indexedFeatures) on the same data.
# maxCategories=4: per the Spark docs, features with at most that many distinct
# values are treated as categorical; the rest stay continuous.
feature_indexer_stage = VectorIndexer(
    inputCol='features', outputCol='indexedFeatures', maxCategories=4)
featureIndexer = feature_indexer_stage.fit(data)

In [6]:
# Fixed seed so the train/test split (and therefore the reported accuracy) is
# the same on every Restart & Run All instead of changing from run to run.
SEED = 42

# Hold out 30% of the rows for evaluation; train on the remaining 70%.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=SEED)

dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", seed=SEED)

# Chain the already-fitted indexers and the tree in a Pipeline.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Train the model on the training split only.
model = pipeline.fit(trainingData)

# Make predictions on the held-out split.
predictions = model.transform(testData)

# Show a few example rows: predicted class index, true class index, raw features.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Compare prediction against the true indexed label and compute accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(predictions)

# The fitted decision tree is the third pipeline stage (index 2).
treeModel = model.stages[2]

# Summary only: tree description, then accuracy and its complement.
print(treeModel)
print("Decision Tree - Test Accuracy = %g" % (accuracy))
print("Decision Tree - Test Error = %g" % (1.0 - accuracy))

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         0.0|[18.85,0.69,4.63,...|
|       0.0|         0.0|[20.47,0.71,2.83,...|
|       0.0|         0.0|[20.27,0.71,2.64,...|
|       0.0|         0.0|[20.46,0.7,5.39,0...|
|       0.0|         0.0|[20.65,0.71,5.52,...|
+----------+------------+--------------------+
only showing top 5 rows

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_427a94bd6066ecf2f5c9) of depth 5 with 61 nodes
Decision Tree - Test Accuracy = 0.7447
Decision Tree - Test Error = 0.2553
