In [1]:
# Load the PCA parquet dataset from S3 and register it as the `pca` Spark SQL view
# (the join query reads its `pcaFeatures`, `date`, `latitude` and `longitude` columns).
s3_url = 's3a://dse-cohort5-group5/wildfire_capstone/integratedData.pca.parquet.gz'
pca_df = spark.read.parquet(s3_url)
pca_df.createOrReplaceTempView('pca')
# Row count of the PCA dataset, displayed as the cell output.
pca_df.count()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
2,application_1587935937072_0003,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.
6825410

In [2]:
# Load the base (renamed-columns) parquet dataset from S3 and register it as the
# `fire_occurrences` Spark SQL view that the join query reads from.
base_df = spark.read.parquet('s3a://dse-cohort5-group5/wildfire_capstone/integratedData.renamed.parquet.gz')
base_df.createOrReplaceTempView("fire_occurrences")
# Row count of the base dataset, displayed as the cell output.
base_df.count()

6826300

In [3]:
# Inner join of the `fire_occurrences` and `pca` views on (date, latitude, longitude).
# Keeps the label columns from `fire_occurrences` plus `pca.pcaFeatures`, and derives
# string `year`/`month`/`day` columns from `date`.
# NOTE(review): `date` is divided by 1e9 before from_unixtime, so it is presumably
# epoch nanoseconds -- confirm against the upstream data preparation.
join_query = """
SELECT fire_occurrences.date,          fire_occurrences.latitude,     fire_occurrences.longitude,
       fire_occurrences.fire_occurred, fire_occurrences.acres_burned, pca.pcaFeatures,
       from_unixtime(cast(fire_occurrences.date/1e9 as long), 'yyyy') as year,
       from_unixtime(cast(fire_occurrences.date/1e9 as long), 'MM') as month,
       from_unixtime(cast(fire_occurrences.date/1e9 as long), 'dd') as day
FROM fire_occurrences, pca
WHERE pca.date      = fire_occurrences.date
  AND pca.latitude  = fire_occurrences.latitude
  AND pca.longitude = fire_occurrences.longitude
"""

In [4]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, DoubleType

# Number of leading entries of the PCA feature array that are expanded into
# individual `pcaFeaturesArr[i]` columns when building `joined_df`.
N_FEATURES_TO_KEEP = 40

def to_array(col):
    """Return a column holding the vector in ``col`` as an array of doubles.

    The conversion is done by a Python UDF that calls ``toArray().tolist()``
    on each value, so every value in ``col`` must expose ``toArray()``
    (presumably a Spark ML vector -- the caller passes ``pcaFeatures``).

    Args:
        col: the vector column to convert.

    Returns:
        A Column of type ``ArrayType(DoubleType())``.
    """
    def to_array_(v):
        values = v.toArray()
        return values.tolist()

    element_type = DoubleType()
    vector_to_array_udf = udf(to_array_, ArrayType(element_type))
    # Important: asNondeterministic() requires Spark 2.3 or later.
    # It can be dropped safely (apply `vector_to_array_udf` to `col` directly),
    # but at the cost of decreased performance.
    nondeterministic_udf = vector_to_array_udf.asNondeterministic()
    return nondeterministic_udf(col)

# Run the join, turn the PCA vector into an array column, then keep the label and
# date columns plus one scalar column per retained PCA component
# (pcaFeaturesArr[0] .. pcaFeaturesArr[N_FEATURES_TO_KEEP - 1]).
joined_df = (
    spark.sql(join_query)
    .withColumn("pcaFeaturesArr", to_array(col("pcaFeatures")))
    .select(
        "fire_occurrences.date",
        "fire_occurrences.latitude",
        "fire_occurrences.longitude",
        "fire_occurrences.fire_occurred",
        "fire_occurrences.acres_burned",
        "year",
        "month",
        "day",
        *[col("pcaFeaturesArr")[idx] for idx in range(N_FEATURES_TO_KEEP)]
    )
)

In [5]:
# Mark the joined table for caching; the count() below is an action, so it runs the
# join and populates the cache for the queries that follow.
joined_df.cache()
joined_df.count()

6825410

In [6]:
# Expose the joined table to Spark SQL as the `joined` view, which
# evaluate_weighted_logistic_regression queries.
joined_df.createOrReplaceTempView("joined")

In [7]:
# Print the column names and types of the joined table.
joined_df.printSchema()

root
 |-- date: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- fire_occurred: integer (nullable = true)
 |-- acres_burned: double (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- pcaFeaturesArr[0]: double (nullable = true)
 |-- pcaFeaturesArr[1]: double (nullable = true)
 |-- pcaFeaturesArr[2]: double (nullable = true)
 |-- pcaFeaturesArr[3]: double (nullable = true)
 |-- pcaFeaturesArr[4]: double (nullable = true)
 |-- pcaFeaturesArr[5]: double (nullable = true)
 |-- pcaFeaturesArr[6]: double (nullable = true)
 |-- pcaFeaturesArr[7]: double (nullable = true)
 |-- pcaFeaturesArr[8]: double (nullable = true)
 |-- pcaFeaturesArr[9]: double (nullable = true)
 |-- pcaFeaturesArr[10]: double (nullable = true)
 |-- pcaFeaturesArr[11]: double (nullable = true)
 |-- pcaFeaturesArr[12]: double (nullable = true)
 |-- pcaFeaturesArr[13]: double (nullable 

In [68]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


def _confusion_counts(predictions):
    """Count rows per (fire_occurred, prediction) pair with a single Spark job.

    Returns a dict keyed by (label, prediction); pairs that never occur are absent.
    """
    grouped = predictions.groupBy("fire_occurred", "prediction").count().collect()
    return {(row["fire_occurred"], row["prediction"]): row["count"] for row in grouped}


def evaluate_weighted_logistic_regression(fires_weight=1, no_fires_weight=1):
    """Train and evaluate a class-weighted logistic regression on the `joined` view.

    Rows of the `joined` temp view are split 80/20 (seed 42) into train and test
    sets separately for each class (fire_occurred = 1 and fire_occurred = 0) and
    then unioned, so the class ratio is kept in both sets. A pipeline of
    VectorAssembler + binomial LogisticRegression is fitted on the train set using
    the per-row `weight` column, and the test-set F1 score, confusion-matrix
    counts, fire recall and non-fire accuracy are printed.

    Args:
        fires_weight: weight given to every row with fire_occurred = 1.
        no_fires_weight: weight given to every row with fire_occurred = 0.

    Returns:
        None. All results are printed.

    Side effects:
        Registers (or replaces) the temp view `predictions` with the test-set
        predictions. The train/test DataFrames are cached only for the duration
        of the call and unpersisted before returning.
    """
    # Add class weights for the Logistic Regression classifier below
    with_fires_df = spark.sql("""
    SELECT *, {} as weight FROM joined WHERE joined.fire_occurred = 1
    """.format(fires_weight))
    without_fires_df = spark.sql("""
    SELECT *, {} as weight FROM joined WHERE joined.fire_occurred = 0
    """.format(no_fires_weight))

    # Split each class on its own so both train and test contain fire rows.
    with_fires_train,    with_fires_test    = with_fires_df.randomSplit([0.8, 0.2], seed=42)
    without_fires_train, without_fires_test = without_fires_df.randomSplit([0.8, 0.2], seed=42)

    # NOTE(review): sample(fraction=1.0) without replacement looks like an attempted
    # shuffle; it appears to keep every row (train + test counts add up to the full
    # joined count) -- confirm whether it is still needed.
    train_df = with_fires_train.union(without_fires_train).sample(fraction=1.0, seed=42)
    test_df  = with_fires_test.union(without_fires_test).sample(fraction=1.0, seed=42)
    train_df.cache()
    test_df.cache()
    try:
        print("Train count, test count:", train_df.count(), test_df.count())

        # Use every expanded PCA component column present in the data instead of a
        # hard-coded count, so this stays in sync with how `joined` was built.
        feature_cols = [name for name in train_df.columns
                        if name.startswith("pcaFeaturesArr[")]
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

        lr = LogisticRegression(
            featuresCol='features',
            labelCol='fire_occurred',
            weightCol='weight',
            family="binomial")

        pipeline = Pipeline(stages=[assembler, lr])

        model = pipeline.fit(train_df)

        predictions = model.transform(test_df)
        predictions.createOrReplaceTempView('predictions')

        # The evaluator's "f1" is averaged over both classes weighted by class
        # frequency, so on this imbalanced data it is dominated by the non-fire
        # class; read it together with the per-class numbers printed below.
        evaluator = MulticlassClassificationEvaluator(labelCol="fire_occurred", predictionCol="prediction",
                                                      metricName="f1")

        print("Test results for fire/no fire weights", fires_weight, no_fires_weight)

        f1 = evaluator.evaluate(predictions)
        print("Test set f1 score = " + str(f1))

        # One aggregation instead of four separate count jobs. The dict keys are
        # (int label, float prediction); Python treats 1 and 1.0 as the same key,
        # so integer lookups work.
        confusion = _confusion_counts(predictions)
        true_positive = confusion.get((1, 1), 0)
        false_negative = confusion.get((1, 0), 0)
        true_negative = confusion.get((0, 0), 0)
        false_positive = confusion.get((0, 1), 0)

        print("TP:", true_positive)
        print("FP:", false_positive)
        print("TN:", true_negative)
        print("FN:", false_negative)

        # Guard the ratios so a test set lacking one class prints nan% instead of
        # raising ZeroDivisionError.
        print("% of fires recalled: {:%}".format(
            _safe_ratio(true_positive, true_positive + false_negative)))
        print("Accuracy for non-fires: {:%}".format(
            _safe_ratio(true_negative, true_negative + false_positive)))

        print("*" * 80)
    finally:
        # Each call caches differently-weighted copies of the data; release them so
        # repeated calls do not accumulate cached DataFrames on the cluster.
        train_df.unpersist()
        test_df.unpersist()

In [69]:
# Baseline: fire and non-fire rows weighted equally.
evaluate_weighted_logistic_regression(1, 1)

Train count, test count: 5461464 1363946
Test results for fire/no fire weights 1 1
Test set f1 score = 0.9996621919227574
TP: 46
FP: 24
TN: 1363556
FN: 320
% of fires recalled: 12.568306%
Accuracy for non-fires: 99.998240%
********************************************************************************

In [70]:
# Fire rows weighted 1,000x relative to non-fire rows.
evaluate_weighted_logistic_regression(1e3, 1)

Train count, test count: 5461464 1363946
Test results for fire/no fire weights 1000.0 1
Test set f1 score = 0.9838473118140173
TP: 308
FP: 42607
TN: 1320973
FN: 58
% of fires recalled: 84.153005%
Accuracy for non-fires: 96.875358%
********************************************************************************

In [71]:
# Fire rows weighted 10,000x relative to non-fire rows.
evaluate_weighted_logistic_regression(1e4, 1)

Train count, test count: 5461464 1363946
Test results for fire/no fire weights 10000.0 1
Test set f1 score = 0.9194332456547004
TP: 352
FP: 202750
TN: 1160830
FN: 14
% of fires recalled: 96.174863%
Accuracy for non-fires: 85.131052%
********************************************************************************

In [72]:
# Fire rows weighted 100,000x relative to non-fire rows.
evaluate_weighted_logistic_regression(1e5, 1)

Train count, test count: 5461464 1363946
Test results for fire/no fire weights 100000.0 1
Test set f1 score = 0.8250901631433544
TP: 366
FP: 405558
TN: 958022
FN: 0
% of fires recalled: 100.000000%
Accuracy for non-fires: 70.257851%
********************************************************************************

In [73]:
# Fire rows weighted 1,000,000x relative to non-fire rows.
evaluate_weighted_logistic_regression(1e6, 1)

Train count, test count: 5461464 1363946
Test results for fire/no fire weights 1000000.0 1
Test set f1 score = 0.7718372685731948
TP: 366
FP: 506266
TN: 857314
FN: 0
% of fires recalled: 100.000000%
Accuracy for non-fires: 62.872292%
********************************************************************************