In [1]:
s3_url = 's3a://dse-cohort5-group5/wildfire_capstone/integratedData.pca.parquet.gz'
pca_df = spark.read.parquet(s3_url)
pca_df.createOrReplaceTempView('pca')
pca_df.count()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,application_1589074996094_0002,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.
6825410

In [2]:
base_df = spark.read.parquet('s3a://dse-cohort5-group5/wildfire_capstone/integratedData.renamed.parquet.gz')
base_df.createOrReplaceTempView("fire_occurrences")
base_df.count()

6826300

In [3]:
base_df.printSchema()

root
 |-- date: long (nullable = true)
 |-- precipitation_amount_mm: double (nullable = true)
 |-- relative_humidity_%: double (nullable = true)
 |-- specific_humidity_kg/kg: double (nullable = true)
 |-- surface_downwelling_shortwave_flux_in_air_W_m-2: double (nullable = true)
 |-- wind_from_direction_Degrees_Clockwise_from_north: double (nullable = true)
 |-- wind_speed_m/s: double (nullable = true)
 |-- max_air_temperature_K: double (nullable = true)
 |-- min_air_temperature_K: double (nullable = true)
 |-- burning_index_g_Unitless: double (nullable = true)
 |-- dead_fuel_moisture_100hr_Percent: double (nullable = true)
 |-- dead_fuel_moisture_1000hr_Percent: double (nullable = true)
 |-- energy_release_component-g_Unitless: double (nullable = true)
 |-- potential_evapotranspiration_mm: double (nullable = true)
 |-- mean_vapor_pressure_deficit_kPa: double (nullable = true)
 |-- fire_occurred: integer (nullable = true)
 |-- acres_burned: double (nullable = true)
 |-- fire_name: strin

In [4]:
join_query = """
SELECT fire_occurrences.date,          fire_occurrences.latitude,     fire_occurrences.longitude,
       fire_occurrences.fire_occurred, fire_occurrences.acres_burned, pca.pcaFeatures,
       fire_occurrences.fire_name,
       from_unixtime(cast(fire_occurrences.date/1e9 as long), 'yyyy') as year,
       from_unixtime(cast(fire_occurrences.date/1e9 as long), 'MM') as month,
       from_unixtime(cast(fire_occurrences.date/1e9 as long), 'dd') as day
FROM fire_occurrences, pca
WHERE pca.date      = fire_occurrences.date
  AND pca.latitude  = fire_occurrences.latitude
  AND pca.longitude = fire_occurrences.longitude
"""

In [5]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, DoubleType

N_FEATURES_TO_KEEP = 40

def to_array(col):
    def to_array_(v):
        return v.toArray().tolist()
    # Important: asNondeterministic requires Spark 2.3 or later
    # It can be safely removed i.e.
    # return udf(to_array_, ArrayType(DoubleType()))(col)
    # but at the cost of decreased performance
    return udf(to_array_, ArrayType(DoubleType())).asNondeterministic()(col)

joined_df = spark.sql(join_query)
joined_df = joined_df.withColumn("pcaFeaturesArr", to_array(col("pcaFeatures")))\
                     .select(["fire_occurrences.date", "fire_occurrences.latitude", "fire_occurrences.longitude",
                              "fire_occurrences.fire_occurred", "fire_occurrences.acres_burned",
                              "year", "month", "day", "fire_occurrences.fire_name"]
                             + [col("pcaFeaturesArr")[i] for i in range(N_FEATURES_TO_KEEP)])

In [6]:
joined_df.cache()
joined_df.count()

6825410

In [7]:
joined_df.createOrReplaceTempView("joined")

In [8]:
joined_df.printSchema()

root
 |-- date: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- fire_occurred: integer (nullable = true)
 |-- acres_burned: double (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- fire_name: string (nullable = true)
 |-- pcaFeaturesArr[0]: double (nullable = true)
 |-- pcaFeaturesArr[1]: double (nullable = true)
 |-- pcaFeaturesArr[2]: double (nullable = true)
 |-- pcaFeaturesArr[3]: double (nullable = true)
 |-- pcaFeaturesArr[4]: double (nullable = true)
 |-- pcaFeaturesArr[5]: double (nullable = true)
 |-- pcaFeaturesArr[6]: double (nullable = true)
 |-- pcaFeaturesArr[7]: double (nullable = true)
 |-- pcaFeaturesArr[8]: double (nullable = true)
 |-- pcaFeaturesArr[9]: double (nullable = true)
 |-- pcaFeaturesArr[10]: double (nullable = true)
 |-- pcaFeaturesArr[11]: double (nullable = true)
 |-- pcaFeaturesArr[12]: double (nullable = true)
 

In [9]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

def evaluate_weighted_logistic_regression(fires_weight=1, no_fires_weight=1):
    # Add class weights for the Logistic Regression classifier below
    with_fires_df_tmp = spark.sql("""
    SELECT *, row_number() OVER (PARTITION BY fire_name, year, month, day
                                 ORDER BY     fire_name, year, month, day) AS rnum
    FROM joined WHERE fire_occurred = 1
    """)
    with_fires_df_tmp.createOrReplaceTempView("with_fires_tmp")
    
    with_fires_df = spark.sql("""
    SELECT *, {weight} as weight
    FROM with_fires_tmp
    WHERE rnum = 1
    """.format(weight=fires_weight)).drop("rnum")
    
    without_fires_df = spark.sql("""
    SELECT *, {weight} as weight
    FROM joined
    WHERE joined.fire_occurred = 0
    """.format(pca_features=", ".join("pcaFeaturesArr[{}]".format(i) for i in range(0, 40)),
               weight=no_fires_weight))
    
    with_fires_train,    with_fires_test    = with_fires_df.randomSplit([0.8, 0.2], seed=42)
    without_fires_train, without_fires_test = without_fires_df.randomSplit([0.8, 0.2], seed=42)
    
    print("Training fires:", len(with_fires_train.collect()))
    print("Test fires:", len(with_fires_test.collect()))
    
    train_df = with_fires_train.union(without_fires_train).sample(fraction=1.0, seed=42)
    test_df  = with_fires_test.union(without_fires_test).sample(fraction=1.0, seed=42)
    train_df.cache()
    test_df.cache()
    print("Train count, test count:", train_df.count(), test_df.count())


    assembler = VectorAssembler(
        inputCols=["pcaFeaturesArr[{}]".format(i) for i in range(0, 40)],
        outputCol="features")

    lr = LogisticRegression(
        featuresCol='features', 
        labelCol='fire_occurred',
        weightCol='weight',
        family="binomial")

    pipeline = Pipeline(stages=[assembler, lr])

    model = pipeline.fit(train_df)

    predictions = model.transform(test_df)
    predictions.createOrReplaceTempView('predictions')

    evaluator = MulticlassClassificationEvaluator(labelCol="fire_occurred", predictionCol="prediction",
                                                  metricName="f1")

    print("Test results for fire/no fire weights", fires_weight, no_fires_weight)

    f1 = evaluator.evaluate(predictions)
    print("Test set f1 score = " + str(f1))

    true_positive = spark.sql("""
    SELECT * FROM predictions WHERE fire_occurred = 1 AND  prediction = 1""")
    true_positive = true_positive.count()

    false_negative = spark.sql("""
    SELECT * FROM predictions WHERE fire_occurred = 1 AND  prediction = 0""")
    false_negative = false_negative.count()

    true_negative = spark.sql("""
    SELECT * FROM predictions WHERE fire_occurred = 0 AND  prediction = 0""")
    true_negative = true_negative.count()

    false_positive = spark.sql("""
    SELECT * FROM predictions WHERE fire_occurred = 0 AND  prediction = 1""")
    false_positive = false_positive.count()
    
    print("TP:", true_positive)
    print("FP:", false_positive)
    print("TN:", true_negative)
    print("FN:", false_negative)

    print("% precision for fires: {:%}".format(true_positive/(true_positive + false_positive + 1e-3)))
    print("% of fires recalled: {:%}".format(true_positive/(true_positive + false_negative + 1e-3)))
    print("Accuracy for non-fires: {:%}".format(true_negative/(true_negative + false_positive + 1e-3)))
    
    print("*" * 80)

In [10]:
for n in [0, 3, 4, 5, 6]:
    evaluate_weighted_logistic_regression(10**n, 1)

Training fires: 331
Test fires: 83
Train count, test count: 5460075 1363582
Test results for fire/no fire weights 1 1
Test set f1 score = 0.9999086972863181
TP: 0
FP: 0
TN: 1363499
FN: 83
% precision for fires: 0.000000%
% of fires recalled: 0.000000%
Accuracy for non-fires: 100.000000%
********************************************************************************
Training fires: 331
Test fires: 83
Train count, test count: 5460075 1363582
Test results for fire/no fire weights 1000 1
Test set f1 score = 0.9983044338574267
TP: 13
FP: 4382
TN: 1359117
FN: 70
% precision for fires: 0.295791%
% of fires recalled: 15.662462%
Accuracy for non-fires: 99.678621%
********************************************************************************
Training fires: 331
Test fires: 83
Train count, test count: 5460075 1363582
Test results for fire/no fire weights 10000 1
Test set f1 score = 0.907733948712437
TP: 66
FP: 230215
TN: 1133284
FN: 17
% precision for fires: 0.028661%
% of fires recalled: 79.5

In [11]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

def evaluate_weighted_logistic_regression_random_fires(fires_weight=1, no_fires_weight=1):
    # Add class weights for the Logistic Regression classifier below
    # Use the same number of fire rows as above, but select randomly
    with_fires_df = spark.sql("""
    SELECT *, {weight} as weight, rand(42) as rnd
    FROM joined
    WHERE joined.fire_occurred = 1
    ORDER BY rnd
    LIMIT 133
    """.format(weight=fires_weight)).drop("rnd")
    
    without_fires_df = spark.sql("""
    SELECT *, {weight} as weight
    FROM joined
    WHERE joined.fire_occurred = 0
    """.format(pca_features=", ".join("pcaFeaturesArr[{}]".format(i) for i in range(0, 40)),
               weight=no_fires_weight))
    
    # Had to adjust random split numbers to get closer to first split, for some reason I can't determine
    with_fires_train,    with_fires_test    = with_fires_df.randomSplit([0.7, 0.3], seed=42)
    without_fires_train, without_fires_test = without_fires_df.randomSplit([0.8, 0.2], seed=42)
    
    print("Training fires:", len(with_fires_train.collect()))
    print("Test fires:", len(with_fires_test.collect()))
    
    train_df = with_fires_train.union(without_fires_train).sample(fraction=1.0, seed=42)
    test_df  = with_fires_test.union(without_fires_test).sample(fraction=1.0, seed=42)
    train_df.cache()
    test_df.cache()
    print("Train count, test count:", train_df.count(), test_df.count())


    assembler = VectorAssembler(
        inputCols=["pcaFeaturesArr[{}]".format(i) for i in range(0, 40)],
        outputCol="features")

    lr = LogisticRegression(
        featuresCol='features', 
        labelCol='fire_occurred',
        weightCol='weight',
        family="binomial")

    pipeline = Pipeline(stages=[assembler, lr])

    model = pipeline.fit(train_df)

    predictions = model.transform(test_df)
    predictions.createOrReplaceTempView('predictions')

    evaluator = MulticlassClassificationEvaluator(labelCol="fire_occurred", predictionCol="prediction",
                                                  metricName="f1")

    print("Test results for fire/no fire weights", fires_weight, no_fires_weight)

    f1 = evaluator.evaluate(predictions)
    print("Test set f1 score = " + str(f1))

    true_positive = spark.sql("""
    SELECT * FROM predictions WHERE fire_occurred = 1 AND  prediction = 1""")
    true_positive = true_positive.count()

    false_negative = spark.sql("""
    SELECT * FROM predictions WHERE fire_occurred = 1 AND  prediction = 0""")
    false_negative = false_negative.count()

    true_negative = spark.sql("""
    SELECT * FROM predictions WHERE fire_occurred = 0 AND  prediction = 0""")
    true_negative = true_negative.count()

    false_positive = spark.sql("""
    SELECT * FROM predictions WHERE fire_occurred = 0 AND  prediction = 1""")
    false_positive = false_positive.count()
    
    print("TP:", true_positive)
    print("FP:", false_positive)
    print("TN:", true_negative)
    print("FN:", false_negative)
    
    print("% precision for fires: {:%}".format(true_positive/(true_positive + false_positive + 1e-3)))
    print("% of fires recalled: {:%}".format(true_positive/(true_positive + false_negative + 1e-3)))
    print("Accuracy for non-fires: {:%}".format(true_negative/(true_negative + false_positive + 1e-3)))
    
    print("*" * 80)

In [12]:
for n in [0, 3, 4, 5, 6]:
    evaluate_weighted_logistic_regression_random_fires(10**n, 1)

Training fires: 104
Test fires: 29
Train count, test count: 5459848 1363528
Test results for fire/no fire weights 1 1
Test set f1 score = 0.9999680975778988
TP: 0
FP: 0
TN: 1363499
FN: 29
% precision for fires: 0.000000%
% of fires recalled: 0.000000%
Accuracy for non-fires: 100.000000%
********************************************************************************
Training fires: 104
Test fires: 29
Train count, test count: 5459848 1363528
Test results for fire/no fire weights 1000 1
Test set f1 score = 0.9992479408186572
TP: 9
FP: 1972
TN: 1361527
FN: 20
% precision for fires: 0.454316%
% of fires recalled: 31.033413%
Accuracy for non-fires: 99.855372%
********************************************************************************
Training fires: 104
Test fires: 29
Train count, test count: 5459848 1363528
Test results for fire/no fire weights 10000 1
Test set f1 score = 0.9833462956040537
TP: 15
FP: 44602
TN: 1318897
FN: 14
% precision for fires: 0.033619%
% of fires recalled: 51.72