In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.linalg import SparseVector
from pyspark.sql import SparkSession

In [3]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [4]:
#read weather data
spark = SparkSession.builder.getOrCreate()
data = spark.read.load('weatherdata.csv', format = 'csv', header = True, delimiter = ',')
#convert some columns's type string to numeric
data = data.withColumn('dewPoint', data['dewPoint'] - 0).withColumn('humidity', data['humidity'] - 0).\
withColumn('windSpeed', data['windSpeed'] - 0).withColumn('cloudCover', data['cloudCover'] - 0).\
withColumn('temperatureMin', data['temperatureMin'] - 0).withColumn('temperatureMax', data['temperatureMax'] - 0).\
withColumn('uvIndex', data['uvIndex'] - 0).withColumn('label', data['precipType'] - 0)
#data.show(5)

In [5]:
#get some column to vector
assembler = VectorAssembler(inputCols = ['dewPoint', 'humidity', 'windSpeed', 'cloudCover', \
                                         'temperatureMin', 'temperatureMax', 'uvIndex'], outputCol = 'features')
data = assembler.transform(data)

In [6]:
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# create the trainer and set its parameters
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

# train the model
model = nb.fit(trainingData)

# select example rows to display.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

# summary only
print("Decision Tree - Test Accuracy = %g" % (accuracy))
print("Decision Tree - Test Error = %g" % (1.0 - accuracy))

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       1.0|  1.0|[19.93,0.73,2.0,0...|
|       1.0|  1.0|[18.85,0.69,4.63,...|
|       1.0|  1.0|[19.61,0.7,1.74,0...|
|       1.0|  1.0|[21.85,0.78,2.33,...|
|       1.0|  1.0|[20.47,0.71,2.83,...|
+----------+-----+--------------------+
only showing top 5 rows

Decision Tree - Test Accuracy = 0.690909
Decision Tree - Test Error = 0.309091
