In [1]:
# Load the PCA-projected dataset from S3 and register it with Spark SQL
# under the name `pca` so later cells can join against it.
s3_url = 's3a://dse-cohort5-group5/wildfire_capstone/integratedData.pca.parquet.gz'
pca_df = spark.read.format('parquet').load(s3_url)
pca_df.createOrReplaceTempView('pca')

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
3,application_1586972758394_0004,pyspark3,idle,,,✔


SparkSession available as 'spark'.


In [2]:
# Print the column names, types and nullability of the PCA dataset.
pca_df.printSchema()

root
 |-- date: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- pcaFeatures: vector (nullable = true)

In [3]:
# Preview the first 3 rows of the PCA dataset (long values are truncated in the table).
pca_df.show(3)

+------------------+------------------+-------------------+--------------------+
|              date|          latitude|          longitude|         pcaFeatures|
+------------------+------------------+-------------------+--------------------+
|915235200000000000|32.608333333333334|-116.93333330000002|[-18.953460978093...|
|915235200000000000|32.775000000000006|-117.26666663333334|[-27.538244860325...|
|915235200000000000|32.775000000000006|-116.14166663333334|[-3.5834800101858...|
+------------------+------------------+-------------------+--------------------+
only showing top 3 rows

In [4]:
# Load the integrated (renamed-columns) dataset from S3 and register it with
# Spark SQL under the name `base`.
# Note: this rebinds `s3_url`, which previously pointed at the PCA dataset.
s3_url = 's3a://dse-cohort5-group5/wildfire_capstone/integratedData.renamed.parquet.gz'
base_df = spark.read.format('parquet').load(s3_url)
base_df.createOrReplaceTempView('base')

In [5]:
# Print the column names, types and nullability of the integrated base dataset.
base_df.printSchema()

root
 |-- date: long (nullable = true)
 |-- precipitation_amount_mm: double (nullable = true)
 |-- relative_humidity_%: double (nullable = true)
 |-- specific_humidity_kg/kg: double (nullable = true)
 |-- surface_downwelling_shortwave_flux_in_air_W_m-2: double (nullable = true)
 |-- wind_from_direction_Degrees_Clockwise_from_north: double (nullable = true)
 |-- wind_speed_m/s: double (nullable = true)
 |-- max_air_temperature_K: double (nullable = true)
 |-- min_air_temperature_K: double (nullable = true)
 |-- burning_index_g_Unitless: double (nullable = true)
 |-- dead_fuel_moisture_100hr_Percent: double (nullable = true)
 |-- dead_fuel_moisture_1000hr_Percent: double (nullable = true)
 |-- energy_release_component-g_Unitless: double (nullable = true)
 |-- potential_evapotranspiration_mm: double (nullable = true)
 |-- mean_vapor_pressure_deficit_kPa: double (nullable = true)
 |-- fire_occurred: integer (nullable = true)
 |-- acres_burned: double (nullable = true)
 |-- fire_name: strin

In [6]:
# Preview the first 3 base rows; vertical=True prints one field per line,
# which keeps the many wide columns readable.
base_df.show(3, vertical=True)

-RECORD 0----------------------------------------------------------------
 date                                             | 915148800000000000   
 precipitation_amount_mm                          | 0.0                  
 relative_humidity_%                              | 40.300000000000004   
 specific_humidity_kg/kg                          | 0.00589              
 surface_downwelling_shortwave_flux_in_air_W_m-2  | 138.0                
 wind_from_direction_Degrees_Clockwise_from_north | 123.0                
 wind_speed_m/s                                   | 1.6                  
 max_air_temperature_K                            | 293.1                
 min_air_temperature_K                            | 281.1                
 burning_index_g_Unitless                         | 24.0                 
 dead_fuel_moisture_100hr_Percent                 | 16.0                 
 dead_fuel_moisture_1000hr_Percent                | 15.5                 
 energy_release_component-g_Unitless  

In [7]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, DoubleType

def to_array(col):
    """Convert a vector column into an ``array<double>`` column.

    Args:
        col: Column whose values expose ``toArray()`` (used in this notebook
            on the ``pcaFeatures`` vector column).

    Returns:
        A Column of type ``ArrayType(DoubleType())``. A null input vector
        yields a null array instead of failing the job.
    """
    def to_array_(v):
        # The vector column is nullable; pass nulls through rather than
        # raising AttributeError on an executor.
        if v is None:
            return None
        return v.toArray().tolist()
    # Important: asNondeterministic requires Spark 2.3 or later
    # It can be safely removed i.e.
    # return udf(to_array_, ArrayType(DoubleType()))(col)
    # but at the cost of decreased performance
    # NOTE(review): presumably a deterministic UDF may be re-evaluated for every
    # element extracted from its result - confirm against the Spark version in use.
    return udf(to_array_, ArrayType(DoubleType())).asNondeterministic()(col)

# Draw a 10% sample of the rows where no fire occurred. The seed is fixed so
# that the sample, and every histogram / bucket edge derived from it in later
# cells, is the same on each run.
# NOTE(review): reproducibility also assumes the parquet input is partitioned
# the same way on each read - confirm.
without_fires_spark_df = spark.sql("""
SELECT * FROM base WHERE fire_occurred = 0
""").sample(withReplacement=False, fraction=0.1, seed=42)
without_fires_spark_df.createOrReplaceTempView("without_fires")

# Attach the PCA feature vector to each sampled row by matching on
# (date, latitude, longitude).
without_fires_df = spark.sql("""
SELECT pca.date, pca.latitude, pca.longitude, fire_occurred, fire_name, acres_burned, pcaFeatures
FROM pca, without_fires
WHERE pca.date = without_fires.date AND pca.latitude = without_fires.latitude AND pca.longitude = without_fires.longitude
""")
# Cache the joined rows; the histogram cells below re-read this data repeatedly.
without_fires_df.cache()

# Expand the vector into one scalar column per component, named
# "pcaFeaturesArr[i]".
# NOTE(review): 30 is presumably the PCA output dimension - TODO confirm.
without_fires_df = without_fires_df.withColumn("pcaFeaturesArr", to_array(col("pcaFeatures")))\
                     .select(["pca.date", "pca.latitude", "pca.longitude"]
                             + [col("pcaFeaturesArr")[i] for i in range(30)])

In [8]:
# Preview 2 no-fire rows with the PCA vector expanded into per-component columns.
without_fires_df.show(2, vertical=True)

-RECORD 0---------------------------------
 date               | 915408000000000000  
 latitude           | 33.483333333333334  
 longitude          | -116.76666663333334 
 pcaFeaturesArr[0]  | -7.557582163752993  
 pcaFeaturesArr[1]  | -19.930267307015963 
 pcaFeaturesArr[2]  | -22.28836908357233  
 pcaFeaturesArr[3]  | 53.183717083208506  
 pcaFeaturesArr[4]  | -37.18876502467     
 pcaFeaturesArr[5]  | -1.927539693353312  
 pcaFeaturesArr[6]  | 16.148205718631456  
 pcaFeaturesArr[7]  | 10.490691767796232  
 pcaFeaturesArr[8]  | -3.692035581901515  
 pcaFeaturesArr[9]  | -1.3707094010202956 
 pcaFeaturesArr[10] | 7.158356008318716   
 pcaFeaturesArr[11] | -3.1545045833958882 
 pcaFeaturesArr[12] | -2.126304145547572  
 pcaFeaturesArr[13] | -13.709169728260003 
 pcaFeaturesArr[14] | 11.469667564993532  
 pcaFeaturesArr[15] | -17.993715040888503 
 pcaFeaturesArr[16] | -3.0831337178721716 
 pcaFeaturesArr[17] | -9.311531199676976  
 pcaFeaturesArr[18] | -13.722424265038184 
 pcaFeature

In [13]:
# Number of leading PCA components to build histograms for.
n_components = 20

# For each component, pull its column out as an RDD of scalars and let Spark
# compute a 50-bucket histogram. Each entry is a (bucket_edges, counts) pair.
# This launches one Spark job per component.
without_fire_histograms = [
    (
        without_fires_df
        .select(col("pcaFeaturesArr[{}]".format(component)))
        .rdd
        .map(lambda record: record[0])
        .histogram(50)
    )
    for component in range(n_components)
]
without_fire_histograms

[([-36.59438167567295, -35.02753783066697, -33.460693985661, -31.89385014065503, -30.327006295649056, -28.760162450643083, -27.19331860563711, -25.626474760631133, -24.059630915625164, -22.492787070619187, -20.925943225613217, -19.35909938060724, -17.792255535601267, -16.225411690595294, -14.658567845589321, -13.091724000583348, -11.524880155577375, -9.958036310571401, -8.391192465565428, -6.824348620559455, -5.257504775553482, -3.6906609305475087, -2.123817085541532, -0.5569732405355623, 1.0098706044704144, 2.576714449476384, 4.143558294482361, 5.71040213948833, 7.277245984494307, 8.844089829500277, 10.410933674506254, 11.977777519512223, 13.5446213645182, 15.111465209524177, 16.678309054530146, 18.245152899536123, 19.811996744542093, 21.37884058954807, 22.94568443455404, 24.512528279560016, 26.079372124565985, 27.646215969571955, 29.21305981457793, 30.77990365958391, 32.346747504589885, 33.91359134959585, 35.480435194601824, 37.0472790396078, 38.61412288461378, 40.180966729619755, 41

In [14]:
# All rows where a fire occurred (no sampling here), exposed to Spark SQL
# as `with_fires`.
with_fires_spark_df = spark.sql("""
SELECT * FROM base WHERE fire_occurred = 1
""")
with_fires_spark_df.createOrReplaceTempView("with_fires")

# Attach the PCA feature vector to each fire row by matching on
# (date, latitude, longitude).
with_fires_df = spark.sql("""
SELECT pca.date, pca.latitude, pca.longitude, fire_occurred, fire_name, acres_burned, pcaFeatures
FROM pca, with_fires
WHERE pca.date = with_fires.date AND pca.latitude = with_fires.latitude AND pca.longitude = with_fires.longitude
""")
with_fires_df.cache()

# Expand the vector into one scalar column per component, named
# "pcaFeaturesArr[i]", keeping only the join keys alongside them.
with_fires_df = (
    with_fires_df
    .withColumn("pcaFeaturesArr", to_array(col("pcaFeatures")))
    .select(
        ["pca.date", "pca.latitude", "pca.longitude"]
        + [col("pcaFeaturesArr")[component] for component in range(30)]
    )
)

In [15]:
# Histogram each component of the fire rows using the bucket edges computed
# for the no-fire sample, so the two sets of counts share the same bins.
# NOTE(review): RDD.histogram appears to skip values outside the supplied
# edges, so fire rows beyond the no-fire range would go uncounted - confirm.
with_fire_histograms = [
    (
        with_fires_df
        .select(col("pcaFeaturesArr[{}]".format(component)))
        .rdd
        .map(lambda record: record[0])
        .histogram(without_fire_histograms[component][0])
    )
    for component in range(n_components)
]
with_fire_histograms

[([-36.59438167567295, -35.02753783066697, -33.460693985661, -31.89385014065503, -30.327006295649056, -28.760162450643083, -27.19331860563711, -25.626474760631133, -24.059630915625164, -22.492787070619187, -20.925943225613217, -19.35909938060724, -17.792255535601267, -16.225411690595294, -14.658567845589321, -13.091724000583348, -11.524880155577375, -9.958036310571401, -8.391192465565428, -6.824348620559455, -5.257504775553482, -3.6906609305475087, -2.123817085541532, -0.5569732405355623, 1.0098706044704144, 2.576714449476384, 4.143558294482361, 5.71040213948833, 7.277245984494307, 8.844089829500277, 10.410933674506254, 11.977777519512223, 13.5446213645182, 15.111465209524177, 16.678309054530146, 18.245152899536123, 19.811996744542093, 21.37884058954807, 22.94568443455404, 24.512528279560016, 26.079372124565985, 27.646215969571955, 29.21305981457793, 30.77990365958391, 32.346747504589885, 33.91359134959585, 35.480435194601824, 37.0472790396078, 38.61412288461378, 40.180966729619755, 41