# About this notebook

This notebook explores classifiers for fire occurrence using PySpark.

In [1]:
# Local-only Spark bootstrap. Skip this cell on AWS EMR, where a ready-made
# "spark" context is already provided.
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# Submit arguments: bind the driver to localhost and request the hadoop-aws package.
os.environ['PYSPARK_SUBMIT_ARGS'] = """\
    --conf spark.driver.bindAddress=localhost \
    --conf spark.driver.host=localhost \
    --packages "org.apache.hadoop:hadoop-aws:2.7.3" pyspark-shell"""
os.environ['SPARK_MASTER_HOST'] = 'localhost'

# Local master with 12 worker threads, 14G for executor and driver, and
# snappy-compressed Parquet output.
conf = (
    SparkConf()
    .setMaster("local[12]")
    .setAppName("fire-occurrence-classifer")
    .set("spark.executor.memory", "14G")
    .set("spark.driver.memory", "14G")
    .set("spark.sql.parquet.compression.codec", "snappy")
)

sc = SparkContext(conf=conf)
# NOTE: `spark` is a SQLContext here, not a SparkSession.
spark = SQLContext(sc)

In [2]:
# The raw file has a column name that pyspark rejects, so load it with pandas,
# replace spaces in every column name with underscores, and write a renamed copy.
import pandas as pd

df = pd.read_parquet("integratedData.parquet.gz")

# Old -> new column names (only spaces are substituted).
cols = list(df.columns)
renamed = [name.replace(" ", "_") for name in cols]

# Store the date as a plain number and the label as an integer.
df["date"] = pd.to_numeric(df["date"])
df["fire_occurred"] = df["fire_occurred"].astype(int)

df = df.rename(columns=dict(zip(cols, renamed)))
print(df.head(1))
df.to_parquet("integratedData.renamed.parquet.gz", compression="gzip")

                 date  precipitation_amount_mm  relative_humidity_%  \
0  946684800000000000                      0.0                 40.5   

   specific_humidity_kg/kg  surface_downwelling_shortwave_flux_in_air_W_m-2  \
0                    0.006                                            139.7   

   wind_from_direction_Degrees_Clockwise_from_north  wind_speed_m/s  \
0                                             222.0             2.1   

   max_air_temperature_K  min_air_temperature_K  burning_index_g_Unitless  \
0                  292.0                  282.2                      31.0   

   dead_fuel_moisture_100hr_Percent  dead_fuel_moisture_1000hr_Percent  \
0                              12.3                               12.1   

   energy_release_component-g_Unitless  potential_evapotranspiration_mm  \
0                                 48.0                              1.8   

   mean_vapor_pressure_deficit_kPa  fire_occurred  acres_burned fire_name  \
0                      

In [3]:
# Load the renamed Parquet copy with Spark and expose it to Spark SQL as the
# temporary view "integratedData".
df = spark.read.parquet("integratedData.renamed.parquet.gz")

# DataFrame.createOrReplaceTempView is the supported replacement for the
# deprecated SQLContext.registerDataFrameAsTable; the view name is unchanged.
df.createOrReplaceTempView("integratedData")

# Peek at two rows where a fire occurred.
spark.sql("SELECT * FROM integratedData WHERE fire_occurred=1 LIMIT 2").collect()

[Row(date=978480000000000000, precipitation_amount_mm=0.0, relative_humidity_%=9.9, specific_humidity_kg/kg=0.0019100000000000002, surface_downwelling_shortwave_flux_in_air_W_m-2=147.20000000000002, wind_from_direction_Degrees_Clockwise_from_north=37.0, wind_speed_m/s=4.4, max_air_temperature_K=297.0, min_air_temperature_K=281.2, burning_index_g_Unitless=68.0, dead_fuel_moisture_100hr_Percent=6.9, dead_fuel_moisture_1000hr_Percent=9.5, energy_release_component-g_Unitless=72.0, potential_evapotranspiration_mm=4.800000000000001, mean_vapor_pressure_deficit_kPa=1.73, fire_occurred=1, acres_burned=10438.01953125, fire_name='VIEJAS', longitude=-116.76666663333334, latitude=32.81666666666667, __index_level_0__=513149),
 Row(date=978480000000000000, precipitation_amount_mm=0.0, relative_humidity_%=9.8, specific_humidity_kg/kg=0.0019500000000000001, surface_downwelling_shortwave_flux_in_air_W_m-2=145.1, wind_from_direction_Degrees_Clockwise_from_north=54.0, wind_speed_m/s=4.9, max_air_temperat

In [4]:
# Class balance check: number of rows for each fire_occurred label.
label_counts = df.groupBy("fire_occurred").count()
label_counts.show()

+-------------+-------+
|fire_occurred|  count|
+-------------+-------+
|            1|   1743|
|            0|6499707|
+-------------+-------+



In [5]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler

# Predictor columns fed to the model. The label (fire_occurred) and the
# acres_burned / fire_name columns are deliberately left out.
cols = [
    'date',
    'precipitation_amount_mm',
    'relative_humidity_%',
    'specific_humidity_kg/kg',
    'surface_downwelling_shortwave_flux_in_air_W_m-2',
    'wind_from_direction_Degrees_Clockwise_from_north',
    'wind_speed_m/s',
    'max_air_temperature_K',
    'min_air_temperature_K',
    'burning_index_g_Unitless',
    'dead_fuel_moisture_100hr_Percent',
    'dead_fuel_moisture_1000hr_Percent',
    'energy_release_component-g_Unitless',
    'potential_evapotranspiration_mm',
    'mean_vapor_pressure_deficit_kPa',
    'longitude',
    'latitude',
]

# Pack the predictor columns into a single "features" vector column.
assembler = VectorAssembler(inputCols=cols, outputCol="features")
output = assembler.transform(df)

# Random forest configuration: 20 trees of depth <= 8, fixed seed.
rf = RandomForestClassifier(
    labelCol="fire_occurred",
    numTrees=20,
    maxDepth=8,
    seed=42,
)

In [6]:
%%time
# Train the random forest on the assembled feature vectors (the whole of
# `output`, no hold-out split), then display the per-feature importances.
# Vector positions follow the order of the `cols` list given to the assembler.
model = rf.fit(output)
model.featureImportances

Wall time: 2min 50s


SparseVector(17, {0: 0.1743, 1: 0.0047, 2: 0.0557, 3: 0.0479, 4: 0.0822, 5: 0.0424, 6: 0.0725, 7: 0.051, 8: 0.0549, 9: 0.0145, 10: 0.0528, 11: 0.058, 12: 0.0393, 13: 0.0561, 14: 0.0532, 15: 0.0867, 16: 0.0539})

In [7]:
# Dense array of importances, one entry per assembled feature column.
# toArray() is used instead of .values because .values on a SparseVector holds
# only the stored entries; if any importance were not stored, the array would be
# shorter than the feature list and no longer line up with it by position.
model.featureImportances.toArray()

array([0.1742823 , 0.00466235, 0.05568691, 0.04789484, 0.08224204,
       0.04237697, 0.07246099, 0.05098685, 0.05488177, 0.01450643,
       0.0528353 , 0.05801185, 0.03932865, 0.05612418, 0.05318839,
       0.08667671, 0.05385347])

In [8]:
# Cross-tabulate actual labels against predictions. Note these are the same
# rows the model was fitted on, so this is a training-set view.
scored = model.transform(output)
scored.groupBy("fire_occurred", "prediction").count().show()

+-------------+----------+-------+
|fire_occurred|prediction|  count|
+-------------+----------+-------+
|            1|       0.0|   1743|
|            0|       0.0|6499707|
+-------------+----------+-------+



In [9]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Predictor columns. The label (fire_occurred) and the acres_burned / fire_name
# columns are deliberately excluded.
cols = [
    'date',
    'precipitation_amount_mm',
    'relative_humidity_%',
    'specific_humidity_kg/kg',
    'surface_downwelling_shortwave_flux_in_air_W_m-2',
    'wind_from_direction_Degrees_Clockwise_from_north',
    'wind_speed_m/s',
    'max_air_temperature_K',
    'min_air_temperature_K',
    'burning_index_g_Unitless',
    'dead_fuel_moisture_100hr_Percent',
    'dead_fuel_moisture_1000hr_Percent',
    'energy_release_component-g_Unitless',
    'potential_evapotranspiration_mm',
    'mean_vapor_pressure_deficit_kPa',
    'longitude',
    'latitude',
]

assembler = VectorAssembler(
    inputCols=cols,
    outputCol="features")

rf = RandomForestClassifier(
    numTrees=20,
    maxDepth=8,
    labelCol="fire_occurred",
    seed=42)

# Assembling inside the pipeline lets CrossValidator be fitted on the raw DataFrame.
pipeline = Pipeline(stages=[assembler, rf])

# No hyper-parameters are searched yet: an empty builder yields a single
# (default) parameter map, so CrossValidator only scores the pipeline as
# configured above. Add e.g. .addGrid(rf.numTrees, [...]) to run a real search.
paramGrid = ParamGridBuilder() \
    .build()

# areaUnderPR needs a continuous score to sweep thresholds over. Scoring the
# hard 0/1 `prediction` column collapses the curve to a single operating point,
# so evaluate the classifier's `rawPrediction` column instead.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='fire_occurred',
    metricName='areaUnderPR')

# Seed the fold assignment so repeated runs split the data identically.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3,  # use 3+ folds in practice
                          seed=42)

In [10]:
%%time
# Fit the cross-validator (random-forest pipeline) on the full Spark DataFrame;
# the fitted result is kept in `cvModel` for the cells that follow.
cvModel = crossval.fit(df)

Wall time: 11min 36s


In [11]:
# Show the true label next to the predicted class for ten rows.
predictions = cvModel.transform(df)
predictions.select("fire_occurred", "prediction").limit(10).show()

+-------------+----------+
|fire_occurred|prediction|
+-------------+----------+
|            0|       0.0|
|            0|       0.0|
|            0|       0.0|
|            0|       0.0|
|            0|       0.0|
|            0|       0.0|
|            0|       0.0|
|            0|       0.0|
|            0|       0.0|
|            0|       0.0|
+-------------+----------+



In [12]:
# Cross-tabulate actual labels against the cross-validated model's predictions
# over the full DataFrame.
predictions = cvModel.transform(df)
predictions.groupBy("fire_occurred", "prediction").count().show()

+-------------+----------+-------+
|fire_occurred|prediction|  count|
+-------------+----------+-------+
|            1|       0.0|   1743|
|            0|       0.0|6499707|
+-------------+----------+-------+



In [13]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
# Imported here as well so this cell does not silently depend on an earlier one.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Predictor columns. The label (fire_occurred) and the acres_burned / fire_name
# columns are deliberately excluded.
cols = [
    'date',
    'precipitation_amount_mm',
    'relative_humidity_%',
    'specific_humidity_kg/kg',
    'surface_downwelling_shortwave_flux_in_air_W_m-2',
    'wind_from_direction_Degrees_Clockwise_from_north',
    'wind_speed_m/s',
    'max_air_temperature_K',
    'min_air_temperature_K',
    'burning_index_g_Unitless',
    'dead_fuel_moisture_100hr_Percent',
    'dead_fuel_moisture_1000hr_Percent',
    'energy_release_component-g_Unitless',
    'potential_evapotranspiration_mm',
    'mean_vapor_pressure_deficit_kPa',
    'longitude',
    'latitude',
]

assembler = VectorAssembler(
    inputCols=cols,
    outputCol="features")

gbt = GBTClassifier(
    maxIter=10,
    labelCol="fire_occurred",
    seed=42)

# Assembling inside the pipeline lets CrossValidator be fitted on the raw DataFrame.
pipeline = Pipeline(stages=[assembler, gbt])

# No hyper-parameters are searched yet: an empty builder yields a single
# (default) parameter map, so CrossValidator only scores the pipeline as
# configured above. Add e.g. .addGrid(gbt.maxDepth, [...]) to run a real search.
paramGrid = ParamGridBuilder() \
    .build()

# areaUnderPR needs a continuous score to sweep thresholds over. Scoring the
# hard 0/1 `prediction` column collapses the curve to a single operating point,
# so evaluate the classifier's `rawPrediction` column instead.
# NOTE(review): assumes the installed Spark's GBTClassifier emits a
# `rawPrediction` column (believed to be Spark >= 2.2) - confirm.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='fire_occurred',
    metricName='areaUnderPR')

# Seed the fold assignment so repeated runs split the data identically.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3,  # use 3+ folds in practice
                          seed=42)

In [14]:
%%time
# Fit the cross-validator (gradient-boosted-tree pipeline) on the full Spark
# DataFrame. This rebinds `cvModel`, replacing the random-forest result.
cvModel = crossval.fit(df)

Wall time: 8min 42s


In [16]:
# Cross-tabulate actual labels against the cross-validated GBT model's
# predictions over the full DataFrame.
predictions = cvModel.transform(df)
predictions.groupBy("fire_occurred", "prediction").count().show()

+-------------+----------+-------+
|fire_occurred|prediction|  count|
+-------------+----------+-------+
|            1|       0.0|   1742|
|            0|       0.0|6499707|
|            1|       1.0|      1|
+-------------+----------+-------+

