In [1]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('GuangzhouPM_Explore').getOrCreate()

In [2]:
# Let's read in the data. Note that it's in the format of csv.
GuangzhouPM = spark.read.csv('i4_Dataset/GuangzhouPM20100101_20151231.csv', inferSchema=True, header=True)

In [3]:
from pyspark.sql.functions import col
GuangzhouPM.describe().filter(col("summary") == "count").show()


+-------+-----+-----+-----+-----+-----+------+---------------+--------------------+----------+-----+-----+-----+-----+-----+-----+-------------+-----+
|summary|   No| year|month|  day| hour|season|PM_City Station|PM_5th Middle School|PM_US Post| DEWP| HUMI| PRES| TEMP| cbwd|  Iws|precipitation|Iprec|
+-------+-----+-----+-----+-----+-----+------+---------------+--------------------+----------+-----+-----+-----+-----+-----+-----+-------------+-----+
|  count|52584|52584|52584|52584|52584| 52583|          32352|               21095|     32352|52583|52583|52583|52583|52583|52583|        52583|52583|
+-------+-----+-----+-----+-----+-----+------+---------------+--------------------+----------+-----+-----+-----+-----+-----+-----+-------------+-----+



In [4]:
GuangzhouPM_F = GuangzhouPM.filter(GuangzhouPM['year'] > 2010)

In [5]:
GuangzhouPM_RemoveNA = GuangzhouPM_F.na.drop()

In [6]:
GuangzhouPM_RemoveNA.count()

20074

In [7]:
from pyspark.sql.types import IntegerType
changedTypeGuangzhouPM_RemoveNA = GuangzhouPM_RemoveNA.withColumn("dayTime", GuangzhouPM_RemoveNA["year"].cast(IntegerType()))

In [8]:
changedTypeGuangzhouPM_RemoveNA.printSchema()

root
 |-- No: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- season: integer (nullable = true)
 |-- PM_City Station: integer (nullable = true)
 |-- PM_5th Middle School: integer (nullable = true)
 |-- PM_US Post: integer (nullable = true)
 |-- DEWP: double (nullable = true)
 |-- HUMI: integer (nullable = true)
 |-- PRES: double (nullable = true)
 |-- TEMP: double (nullable = true)
 |-- cbwd: string (nullable = true)
 |-- Iws: double (nullable = true)
 |-- precipitation: double (nullable = true)
 |-- Iprec: double (nullable = true)
 |-- dayTime: integer (nullable = true)



In [9]:
from pyspark.sql import functions as F

In [10]:
GuangzhouPM2 = changedTypeGuangzhouPM_RemoveNA.withColumn('dayTime', 

                    F.concat(F.col('year'),F.lit('_'), F.col('month'),F.lit('_'),F.col('day'),F.lit('_'),F.col('hour')))

In [11]:
GuangzhouPM3 = GuangzhouPM2.withColumn('OutdoorSafeIndex', 
                                       F.when((F.col("PM_City Station")<=26), 'Good')\
                                       .when((F.col('PM_City Station')> 40), 'Bad')\
                                       .otherwise ('Fair')
                                      )


In [12]:
GuangzhouPM4 = GuangzhouPM3.sort("year","month","day","hour","dayTime")

In [13]:
GuangzhouPM5 = GuangzhouPM4.drop("No")

In [14]:
GuangzhouPM6 = GuangzhouPM5.drop('PM_5th Middle School','PM_US Post')

In [15]:
GuangzhouPM6_log = GuangzhouPM6.withColumn('PM_City Station_Log', F.log(GuangzhouPM6['PM_City Station']))

In [16]:
# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [17]:
assembler = VectorAssembler(
    inputCols=["season", "DEWP", 
               "HUMI",'PRES','TEMP','Iws','precipitation','Iprec'],
    outputCol="features")

In [18]:
output = assembler.transform(GuangzhouPM6_log)

In [19]:
final_GuangzhouPM = output.select("features",'PM_City Station')
final_GuangzhouPM.show()

+--------------------+---------------+
|            features|PM_City Station|
+--------------------+---------------+
|[4.0,3.7,91.0,101...|             83|
|[4.0,4.2,88.0,101...|             95|
|[4.0,3.5,76.0,101...|             55|
|[4.0,2.7,69.0,101...|             60|
|[4.0,1.5,62.0,101...|             41|
|[4.0,2.3,66.0,101...|             42|
|[4.0,2.9,69.0,101...|             40|
|[4.0,1.7,61.0,101...|             40|
|[4.0,0.4,51.0,101...|             35|
|[4.0,1.3,52.0,101...|             42|
|[4.0,0.7,46.0,101...|             48|
|[4.0,0.1,42.0,101...|             62|
|[4.0,1.1,44.0,101...|             51|
|[4.0,1.1,43.0,101...|             49|
|[4.0,1.6,46.0,101...|             57|
|[4.0,3.0,52.0,101...|             60|
|[4.0,5.1,62.0,101...|             61|
|[4.0,6.3,72.0,101...|             72|
|[4.0,6.7,76.0,101...|             53|
|[4.0,6.5,75.0,101...|             50|
+--------------------+---------------+
only showing top 20 rows



In [20]:
train_GuangzhouPM,test_GuangzhouPM = final_GuangzhouPM.randomSplit([0.7,0.3])
train_GuangzhouPM.show()

+--------------------+---------------+
|            features|PM_City Station|
+--------------------+---------------+
|[4.0,0.4,51.0,101...|             35|
|[4.0,0.7,46.0,101...|             48|
|[4.0,1.1,44.0,101...|             51|
|[4.0,1.6,70.0,101...|             29|
|[4.0,1.7,61.0,101...|             40|
|[4.0,1.7,69.0,101...|             39|
|[4.0,1.8,72.0,101...|             26|
|[4.0,1.8,73.0,101...|             30|
|[4.0,2.1,74.0,101...|             31|
|[4.0,2.3,66.0,101...|             42|
|[4.0,2.3,69.0,101...|             37|
|[4.0,2.4,75.0,101...|             33|
|[4.0,2.5,68.0,101...|             43|
|[4.0,2.5,70.0,101...|             37|
|[4.0,2.7,68.0,101...|             33|
|[4.0,2.7,68.0,101...|             48|
|[4.0,2.7,69.0,101...|             60|
|[4.0,2.7,70.0,101...|             26|
|[4.0,2.8,70.0,101...|             38|
|[4.0,2.9,67.0,101...|             47|
+--------------------+---------------+
only showing top 20 rows



In [21]:
final_GuangzhouPM.count()

20074

In [22]:
from pyspark.ml.regression import LinearRegression

In [23]:
lr = LinearRegression(labelCol='PM_City Station')

In [24]:
lrModel = lr.fit(train_GuangzhouPM)

In [25]:
print("Coefficients: {} Intercept: {}".format(lrModel.coefficients,lrModel.intercept))

Coefficients: [2.6213634242252555,-3.5771885011375284,0.8189846030581893,0.388534677881508,2.4386266821970897,-0.3400465489846502,-0.1571365426607032,-0.5963570064208119] Intercept: -402.2745868254232


In [26]:
test_results = lrModel.evaluate(test_GuangzhouPM)

In [27]:
test_results.residuals.show()
print("RSME: {}".format(test_results.rootMeanSquaredError))

+-------------------+
|          residuals|
+-------------------+
| -4.141205213049091|
|-14.185905247534947|
| -24.59009659596535|
|-24.098045006398365|
|-5.1685839222430445|
|-37.951890225581224|
| -33.31947671776851|
| -28.67964554094732|
| -5.718337189385011|
| -33.80857190155251|
| -18.76679739122949|
|-27.754327300454406|
| -33.89669425966662|
| -31.22663616786741|
| -53.22172124787403|
|-13.186276986346968|
| -58.56327298958723|
| 11.154553191083494|
| -18.06865172423562|
|  20.81855000029293|
+-------------------+
only showing top 20 rows

RSME: 31.699241821385712


In [28]:
print("R2: {}".format(test_results.r2))

R2: 0.1352948625074547


In [29]:
final_GuangzhouPM.describe().show()

+-------+------------------+
|summary|   PM_City Station|
+-------+------------------+
|  count|             20074|
|   mean|47.056241904951676|
| stddev| 33.89550223308404|
|    min|                 1|
|    max|               526|
+-------+------------------+



In [30]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)

In [31]:
cbwdIndex = StringIndexer(inputCol='cbwd',outputCol='cbwdIndex')

# Now we can one hot encode these numbers. This converts the various outputs into a single vector.
# This makes it easier to process when you have multiple classes.
cbwdEncoder = OneHotEncoder(inputCol='cbwdIndex',outputCol='cbwdVec')

In [32]:
#Logistic regression modelling
assembler_L1 = VectorAssembler(inputCols=['cbwdVec', 'season','DEWP',
 'HUMI',
 'TEMP',
 'PRES',
 'precipitation', 'Iws'],outputCol='features')

In [33]:
from pyspark.ml.classification import LogisticRegression

In [34]:
from pyspark.ml import Pipeline

In [35]:
log_reg_GuangzhouPM6 = LogisticRegression(featuresCol='features',labelCol='PM_City Station')

In [36]:
# Lists everything we want to do. Index data, encode data, assemble data and then pass in the actual model.
pipeline = Pipeline(stages=[cbwdIndex, cbwdEncoder,assembler_L1,log_reg_GuangzhouPM6])

In [37]:
train_Guangzhou_L, test_Guangzhou_L = GuangzhouPM6_log.randomSplit([0.7,.3])

In [38]:
fit_model1 = pipeline.fit(train_Guangzhou_L)

In [39]:
L_results1 = fit_model1.transform(test_Guangzhou_L)

In [40]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='PM_City Station')

In [41]:
AUC = my_eval.evaluate(L_results1)

AUC

1.0

In [42]:
totalResults = L_results1.select('PM_City Station','prediction')

correctResults = totalResults.filter(totalResults['PM_City Station'] == totalResults['prediction'])

countTR = totalResults.count()
print("Correct: " + str(countTR))

countTC = correctResults.count()
print("Total Correct: " + str(countTC))

Correct: 6089
Total Correct: 152


In [43]:
assembler_L2 = VectorAssembler(inputCols=['cbwdVec', 'season',
 'HUMI',
 'TEMP',
 'precipitation', 'Iws'],outputCol='features')

In [44]:
log_reg2 = LogisticRegression(featuresCol='features',labelCol='PM_City Station')

In [45]:
pipeline2 = Pipeline(stages=[cbwdIndex, cbwdEncoder,assembler_L1,log_reg_GuangzhouPM6])

In [46]:
fit_model2 = pipeline2.fit(train_Guangzhou_L)

In [48]:
L_results2 = fit_model2.transform(test_Guangzhou_L)

In [49]:
my_eval2 = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='PM_City Station')

In [50]:
L_results2.select('PM_City Station','prediction').show()

+---------------+----------+
|PM_City Station|prediction|
+---------------+----------+
|             60|      48.0|
|             40|      57.0|
|             35|      57.0|
|             57|      57.0|
|             60|      48.0|
|             50|      48.0|
|             55|      48.0|
|             86|      48.0|
|             82|      48.0|
|             69|      48.0|
|             59|      48.0|
|             92|      48.0|
|             53|      36.0|
|             49|      36.0|
|             43|      48.0|
|             64|      39.0|
|             58|      39.0|
|             48|      39.0|
|             45|      48.0|
|             55|      48.0|
+---------------+----------+
only showing top 20 rows



In [51]:
AUC2 = my_eval2.evaluate(L_results2)

AUC2

1.0

In [52]:
totalResults = L_results2.select('PM_City Station','prediction')

correctResults = totalResults.filter(totalResults['PM_City Station'] == totalResults['prediction'])

countTR = totalResults.count()
print("Correct: " + str(countTR))

countTC = correctResults.count()
print("Total Correct: " + str(countTC))

Correct: 6103
Total Correct: 142


In [54]:
#random forest decision tree modeling 
GuangzhouPM6_log.columns

['year',
 'month',
 'day',
 'hour',
 'season',
 'PM_City Station',
 'DEWP',
 'HUMI',
 'PRES',
 'TEMP',
 'cbwd',
 'Iws',
 'precipitation',
 'Iprec',
 'dayTime',
 'OutdoorSafeIndex',
 'PM_City Station_Log']

In [60]:
GuangzhouPM6_log.printSchema()

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- season: integer (nullable = true)
 |-- PM_City Station: integer (nullable = true)
 |-- DEWP: double (nullable = true)
 |-- HUMI: integer (nullable = true)
 |-- PRES: double (nullable = true)
 |-- TEMP: double (nullable = true)
 |-- cbwd: string (nullable = true)
 |-- Iws: double (nullable = true)
 |-- precipitation: double (nullable = true)
 |-- Iprec: double (nullable = true)
 |-- dayTime: string (nullable = true)
 |-- OutdoorSafeIndex: string (nullable = false)
 |-- PM_City Station_Log: double (nullable = true)



In [61]:
# Combine all features into one vector named features.
assembler_DT1 = VectorAssembler(
  inputCols=['season',
             'PM_City Station',
             'DEWP',
             'HUMI',
             'PRES',
             'TEMP',
             'Iws',
             'precipitation',
             'Iprec'],
              outputCol="features")

In [62]:
output_DT1 = assembler_DT1.transform(GuangzhouPM6_log)

In [63]:
final_dataDT1 = output_DT1.select("features",'OutdoorSafeIndex')
final_dataDT1.show()

+--------------------+----------------+
|            features|OutdoorSafeIndex|
+--------------------+----------------+
|[4.0,83.0,3.7,91....|             Bad|
|[4.0,95.0,4.2,88....|             Bad|
|[4.0,55.0,3.5,76....|             Bad|
|[4.0,60.0,2.7,69....|             Bad|
|[4.0,41.0,1.5,62....|             Bad|
|[4.0,42.0,2.3,66....|             Bad|
|[4.0,40.0,2.9,69....|            Fair|
|[4.0,40.0,1.7,61....|            Fair|
|[4.0,35.0,0.4,51....|            Fair|
|[4.0,42.0,1.3,52....|             Bad|
|[4.0,48.0,0.7,46....|             Bad|
|[4.0,62.0,0.1,42....|             Bad|
|[4.0,51.0,1.1,44....|             Bad|
|[4.0,49.0,1.1,43....|             Bad|
|[4.0,57.0,1.6,46....|             Bad|
|[4.0,60.0,3.0,52....|             Bad|
|[4.0,61.0,5.1,62....|             Bad|
|[4.0,72.0,6.3,72....|             Bad|
|[4.0,53.0,6.7,76....|             Bad|
|[4.0,50.0,6.5,75....|             Bad|
+--------------------+----------------+
only showing top 20 rows



In [64]:
from pyspark.ml.classification import DecisionTreeClassifier,RandomForestClassifier
from pyspark.ml import Pipeline

In [65]:
from pyspark.ml.feature import StringIndexer

In [66]:
indexer = StringIndexer(inputCol="OutdoorSafeIndex", outputCol="OutdoorSafeIndex output")
output_fixed = indexer.fit(output_DT1).transform(output_DT1)

In [67]:
final_dataDT1 = output_fixed.select('OutdoorSafeIndex output',"features")
final_dataDT1.show()

+-----------------------+--------------------+
|OutdoorSafeIndex output|            features|
+-----------------------+--------------------+
|                    0.0|[4.0,83.0,3.7,91....|
|                    0.0|[4.0,95.0,4.2,88....|
|                    0.0|[4.0,55.0,3.5,76....|
|                    0.0|[4.0,60.0,2.7,69....|
|                    0.0|[4.0,41.0,1.5,62....|
|                    0.0|[4.0,42.0,2.3,66....|
|                    2.0|[4.0,40.0,2.9,69....|
|                    2.0|[4.0,40.0,1.7,61....|
|                    2.0|[4.0,35.0,0.4,51....|
|                    0.0|[4.0,42.0,1.3,52....|
|                    0.0|[4.0,48.0,0.7,46....|
|                    0.0|[4.0,62.0,0.1,42....|
|                    0.0|[4.0,51.0,1.1,44....|
|                    0.0|[4.0,49.0,1.1,43....|
|                    0.0|[4.0,57.0,1.6,46....|
|                    0.0|[4.0,60.0,3.0,52....|
|                    0.0|[4.0,61.0,5.1,62....|
|                    0.0|[4.0,72.0,6.3,72....|
|            

In [68]:
train_dataDT1,test_dataDT1 = final_dataDT1.randomSplit([0.7,0.3])

In [69]:
train_dataDT1.printSchema()

root
 |-- OutdoorSafeIndex output: double (nullable = true)
 |-- features: vector (nullable = true)



In [70]:
dtc = DecisionTreeClassifier(labelCol='OutdoorSafeIndex output',featuresCol='features')
rfc = RandomForestClassifier(labelCol='OutdoorSafeIndex output',featuresCol='features')

In [71]:
dtc_model = dtc.fit(train_dataDT1)
rfc_model = rfc.fit(train_dataDT1)

In [72]:
#Compare model accuracy 
prediction_dtc = dtc_model.transform(test_dataDT1)
prediction_rfc = rfc_model.transform(test_dataDT1)

In [73]:
prediction_dtc.show(10)

+-----------------------+--------------------+----------------+-------------+----------+
|OutdoorSafeIndex output|            features|   rawPrediction|  probability|prediction|
+-----------------------+--------------------+----------------+-------------+----------+
|                    0.0|[4.0,42.0,2.4,77....|[6572.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|                    0.0|[4.0,42.0,2.5,77....|[6572.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|                    0.0|[4.0,45.0,5.4,66....|[6572.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|                    0.0|[4.0,46.0,4.6,68....|[6572.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|                    0.0|[4.0,47.0,2.9,67....|[6572.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|                    0.0|[4.0,48.0,2.7,68....|[6572.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|                    0.0|[4.0,48.0,5.7,61....|[6572.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|                    0.0|[4.0,49.0,1.1,43....|[6572.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|                    

In [74]:
prediction_rfc.show(10)

+-----------------------+--------------------+--------------------+--------------------+----------+
|OutdoorSafeIndex output|            features|       rawPrediction|         probability|prediction|
+-----------------------+--------------------+--------------------+--------------------+----------+
|                    0.0|[4.0,42.0,2.4,77....|[19.0766281446299...|[0.95383140723149...|       0.0|
|                    0.0|[4.0,42.0,2.5,77....|[19.0766281446299...|[0.95383140723149...|       0.0|
|                    0.0|[4.0,45.0,5.4,66....|[19.0766281446299...|[0.95383140723149...|       0.0|
|                    0.0|[4.0,46.0,4.6,68....|[19.0766281446299...|[0.95383140723149...|       0.0|
|                    0.0|[4.0,47.0,2.9,67....|[19.0766281446299...|[0.95383140723149...|       0.0|
|                    0.0|[4.0,48.0,2.7,68....|[19.0766281446299...|[0.95383140723149...|       0.0|
|                    0.0|[4.0,48.0,5.7,61....|[19.4510583595203...|[0.97255291797601...|       0.0|


In [76]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [77]:
acc_evaluator = MulticlassClassificationEvaluator(labelCol="OutdoorSafeIndex output", predictionCol="prediction", metricName="accuracy")

In [78]:
dtc_acc = acc_evaluator.evaluate(prediction_dtc)
rfc_acc = acc_evaluator.evaluate(prediction_rfc)

In [79]:
print("Here are the results!")
print('-'*40)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(dtc_acc*100))
print('-'*40)
print('A random forest ensemble has an accuracy of: {0:2.2f}%'.format(rfc_acc*100))
print('-'*40)

Here are the results!
----------------------------------------
A single decision tree has an accuracy of: 98.35%
----------------------------------------
A random forest ensemble has an accuracy of: 98.45%
----------------------------------------


In [80]:
assembler_DT2 = VectorAssembler(
  inputCols=['season',
             'PM_City Station',
             'HUMI',
             'Iws',
             'precipitation'],
              outputCol="features")

In [81]:
output_DT2 = assembler_DT2.transform(GuangzhouPM6_log)

In [82]:
final_dataDT2 = output_DT2.select("features",'OutdoorSafeIndex')
final_dataDT2.show()

+--------------------+----------------+
|            features|OutdoorSafeIndex|
+--------------------+----------------+
|[4.0,83.0,91.0,1....|             Bad|
|[4.0,95.0,88.0,3....|             Bad|
|[4.0,55.0,76.0,5....|             Bad|
|[4.0,60.0,69.0,8....|             Bad|
|[4.0,41.0,62.0,9....|             Bad|
|[4.0,42.0,66.0,11...|             Bad|
|[4.0,40.0,69.0,2....|            Fair|
|[4.0,40.0,61.0,4....|            Fair|
|[4.0,35.0,51.0,6....|            Fair|
|[4.0,42.0,52.0,1....|             Bad|
|[4.0,48.0,46.0,4....|             Bad|
|[4.0,62.0,42.0,1....|             Bad|
|[4.0,51.0,44.0,3....|             Bad|
|[4.0,49.0,43.0,5....|             Bad|
|[4.0,57.0,46.0,7....|             Bad|
|[4.0,60.0,52.0,8....|             Bad|
|[4.0,61.0,62.0,0....|             Bad|
|[4.0,72.0,72.0,1....|             Bad|
|[4.0,53.0,76.0,1....|             Bad|
|[4.0,50.0,75.0,1....|             Bad|
+--------------------+----------------+
only showing top 20 rows



In [83]:
indexer2 = StringIndexer(inputCol="OutdoorSafeIndex", outputCol="OutdoorSafeIndex output")
output_fixed2 = indexer.fit(output_DT2).transform(output_DT2)

In [84]:
final_dataDT2 = output_fixed2.select('OutdoorSafeIndex output',"features")
final_dataDT2.show()

+-----------------------+--------------------+
|OutdoorSafeIndex output|            features|
+-----------------------+--------------------+
|                    0.0|[4.0,83.0,91.0,1....|
|                    0.0|[4.0,95.0,88.0,3....|
|                    0.0|[4.0,55.0,76.0,5....|
|                    0.0|[4.0,60.0,69.0,8....|
|                    0.0|[4.0,41.0,62.0,9....|
|                    0.0|[4.0,42.0,66.0,11...|
|                    2.0|[4.0,40.0,69.0,2....|
|                    2.0|[4.0,40.0,61.0,4....|
|                    2.0|[4.0,35.0,51.0,6....|
|                    0.0|[4.0,42.0,52.0,1....|
|                    0.0|[4.0,48.0,46.0,4....|
|                    0.0|[4.0,62.0,42.0,1....|
|                    0.0|[4.0,51.0,44.0,3....|
|                    0.0|[4.0,49.0,43.0,5....|
|                    0.0|[4.0,57.0,46.0,7....|
|                    0.0|[4.0,60.0,52.0,8....|
|                    0.0|[4.0,61.0,62.0,0....|
|                    0.0|[4.0,72.0,72.0,1....|
|            

In [85]:
train_dataDT2,test_dataDT2 = final_dataDT2.randomSplit([0.7,0.3])

In [86]:
dtc2 = DecisionTreeClassifier(labelCol='OutdoorSafeIndex output',featuresCol='features')
rfc2 = RandomForestClassifier(labelCol='OutdoorSafeIndex output',featuresCol='features')

In [87]:
dtc2_model = dtc.fit(train_dataDT2)
rfc2_model = rfc.fit(train_dataDT2)

In [88]:
prediction_dtc2 = dtc_model.transform(test_dataDT2)
prediction_rfc2 = rfc_model.transform(test_dataDT2)

In [89]:
acc2_evaluator = MulticlassClassificationEvaluator(labelCol="OutdoorSafeIndex output", predictionCol="prediction", metricName="accuracy")

In [90]:
dtc2_acc = acc2_evaluator.evaluate(prediction_dtc)
rfc2_acc = acc2_evaluator.evaluate(prediction_rfc)

In [91]:
print("Here are the results!")
print('-'*40)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(dtc2_acc*100))
print('-'*40)
print('A random forest ensemble has an accuracy of: {0:2.2f}%'.format(rfc2_acc*100))
print('-'*40)

Here are the results!
----------------------------------------
A single decision tree has an accuracy of: 98.49%
----------------------------------------
A random forest ensemble has an accuracy of: 98.30%
----------------------------------------
