In [1]:
s3_url = 's3a://dse-cohort5-group5/wildfire_capstone/integratedData.pca.parquet.gz'
pca_df = spark.read.parquet(s3_url)
pca_df.createOrReplaceTempView('pca')
pca_df.count()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
3,application_1589570987803_0004,pyspark3,idle,,,✔


SparkSession available as 'spark'.
6825410

In [2]:
base_df = spark.read.parquet('s3a://dse-cohort5-group5/wildfire_capstone/integratedData.renamed.parquet.gz')
base_df.createOrReplaceTempView("fire_occurrences")
base_df.count()

6826300

In [3]:
base_df.printSchema()

root
 |-- date: long (nullable = true)
 |-- precipitation_amount_mm: double (nullable = true)
 |-- relative_humidity_%: double (nullable = true)
 |-- specific_humidity_kg/kg: double (nullable = true)
 |-- surface_downwelling_shortwave_flux_in_air_W_m-2: double (nullable = true)
 |-- wind_from_direction_Degrees_Clockwise_from_north: double (nullable = true)
 |-- wind_speed_m/s: double (nullable = true)
 |-- max_air_temperature_K: double (nullable = true)
 |-- min_air_temperature_K: double (nullable = true)
 |-- burning_index_g_Unitless: double (nullable = true)
 |-- dead_fuel_moisture_100hr_Percent: double (nullable = true)
 |-- dead_fuel_moisture_1000hr_Percent: double (nullable = true)
 |-- energy_release_component-g_Unitless: double (nullable = true)
 |-- potential_evapotranspiration_mm: double (nullable = true)
 |-- mean_vapor_pressure_deficit_kPa: double (nullable = true)
 |-- fire_occurred: integer (nullable = true)
 |-- acres_burned: double (nullable = true)
 |-- fire_name: strin

In [4]:
join_query = """
SELECT fire_occurrences.date,          fire_occurrences.latitude,     fire_occurrences.longitude,
       fire_occurrences.fire_occurred, fire_occurrences.acres_burned, pca.pcaFeatures,
       fire_occurrences.fire_name,
       from_unixtime(cast(fire_occurrences.date/1e9 as long), 'yyyy') as year,
       from_unixtime(cast(fire_occurrences.date/1e9 as long), 'MM') as month,
       from_unixtime(cast(fire_occurrences.date/1e9 as long), 'dd') as day
FROM fire_occurrences, pca
WHERE pca.date      = fire_occurrences.date
  AND pca.latitude  = fire_occurrences.latitude
  AND pca.longitude = fire_occurrences.longitude
"""

In [5]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, DoubleType

N_FEATURES_TO_KEEP = 40

def to_array(col):
    def to_array_(v):
        return v.toArray().tolist()
    # Important: asNondeterministic requires Spark 2.3 or later
    # It can be safely removed i.e.
    # return udf(to_array_, ArrayType(DoubleType()))(col)
    # but at the cost of decreased performance
    return udf(to_array_, ArrayType(DoubleType())).asNondeterministic()(col)

joined_df = spark.sql(join_query)
joined_df = joined_df.withColumn("pcaFeaturesArr", to_array(col("pcaFeatures")))\
                     .select(["fire_occurrences.date", "fire_occurrences.latitude", "fire_occurrences.longitude",
                              "fire_occurrences.fire_occurred", "fire_occurrences.acres_burned",
                              "year", "month", "day", "fire_occurrences.fire_name"]
                             + [col("pcaFeaturesArr")[i] for i in range(N_FEATURES_TO_KEEP)])

In [6]:
joined_df.cache()
joined_df.createOrReplaceTempView("joined")
joined_df.count()

6825410

In [7]:
joined_df.printSchema()

root
 |-- date: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- fire_occurred: integer (nullable = true)
 |-- acres_burned: double (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- fire_name: string (nullable = true)
 |-- pcaFeaturesArr[0]: double (nullable = true)
 |-- pcaFeaturesArr[1]: double (nullable = true)
 |-- pcaFeaturesArr[2]: double (nullable = true)
 |-- pcaFeaturesArr[3]: double (nullable = true)
 |-- pcaFeaturesArr[4]: double (nullable = true)
 |-- pcaFeaturesArr[5]: double (nullable = true)
 |-- pcaFeaturesArr[6]: double (nullable = true)
 |-- pcaFeaturesArr[7]: double (nullable = true)
 |-- pcaFeaturesArr[8]: double (nullable = true)
 |-- pcaFeaturesArr[9]: double (nullable = true)
 |-- pcaFeaturesArr[10]: double (nullable = true)
 |-- pcaFeaturesArr[11]: double (nullable = true)
 |-- pcaFeaturesArr[12]: double (nullable = true)
 

In [8]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

def evaluate_weighted_logistic_regression(
    fires_weight=10000, # Chosen for good recall (About 80%) and decent accuracy (around 83%) on validation set.
    no_fires_weight=1,
    predict_year=2019):
    
    # Add class weights for the Logistic Regression classifier below
        # Add class weights for the Logistic Regression classifier below
    with_fires_df = spark.sql("""
    SELECT *, {} as weight FROM joined WHERE joined.fire_occurred = 1
    """.format(fires_weight))
    without_fires_df = spark.sql("""
    SELECT *, {} as weight FROM joined WHERE joined.fire_occurred = 0
    """.format(no_fires_weight))

    with_fires_train    = with_fires_df.filter("year != {}".format(predict_year))
    with_fires_test     = with_fires_df.filter("year  = {}".format(predict_year))
    without_fires_train = without_fires_df.filter("year != {}".format(predict_year))
    without_fires_test  = without_fires_df.filter("year  = {}".format(predict_year))
    
    train_df = with_fires_train.union(without_fires_train)
    test_df  = with_fires_test.union(without_fires_test)
    train_df.cache()
    test_df.cache()
    print("Train count, test count:", train_df.count(), test_df.count())

    assembler = VectorAssembler(
        inputCols=["pcaFeaturesArr[{}]".format(i) for i in range(0, 40)],
        outputCol="features")

    lr = LogisticRegression(
        featuresCol='features', 
        labelCol='fire_occurred',
        weightCol='weight',
        family="binomial")

    pipeline = Pipeline(stages=[assembler, lr])

    model = pipeline.fit(train_df)

    predictions = model.transform(test_df)
    predictions.createOrReplaceTempView('predictions')

    evaluator = MulticlassClassificationEvaluator(labelCol="fire_occurred", predictionCol="prediction",
                                                  metricName="f1")

    print("Test results for year={}, when trained on other years".format(predict_year), fires_weight, no_fires_weight)

    f1 = evaluator.evaluate(predictions)
    print("Test set f1 score = " + str(f1))

    true_positive = spark.sql("""
    SELECT * FROM predictions WHERE fire_occurred = 1 AND  prediction = 1""")
    true_positive = true_positive.count()

    false_negative = spark.sql("""
    SELECT * FROM predictions WHERE fire_occurred = 1 AND  prediction = 0""")
    false_negative = false_negative.count()

    true_negative = spark.sql("""
    SELECT * FROM predictions WHERE fire_occurred = 0 AND  prediction = 0""")
    true_negative = true_negative.count()

    false_positive = spark.sql("""
    SELECT * FROM predictions WHERE fire_occurred = 0 AND  prediction = 1""")
    false_positive = false_positive.count()
    
    print("True positives (fires predicted correctly):", true_positive)
    print("False positives (fire predicted incorrectly):", false_positive)
    print("True negatives (no fire predicted correctly):", true_negative)
    print("False negatives (no fire predicted incorrectly):", false_negative)

    print("% precision for fires: {:%}".format(true_positive/(true_positive + false_positive + 1e-3)))
    print("% of fires recalled: {:%}".format(true_positive/(true_positive + false_negative + 1e-3)))
    
    keep_cols = ["date", "latitude", "longitude", "fire_occurred",
                 "acres_burned", "year", "month", "day", "prediction"]
    s3_url = "s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/{}/predictions.parquet.gz"\
             .format(predict_year)
    print("Saving predictions to {}".format(s3_url))
    predictions.select(*keep_cols).write.parquet(s3_url, mode="overwrite")
    
    print("*" * 80)

In [9]:
evaluate_weighted_logistic_regression(predict_year=2000)

Train count, test count: 6499670 325740
Test results for year=2000, when trained on other years 10000 1
Test set f1 score = 0.8267413925377382
True positives (fires predicted correctly): 0
False positives (fire predicted incorrectly): 96206
True negatives (no fire predicted correctly): 229534
False negatives (no fire predicted incorrectly): 0
% precision for fires: 0.000000%
% of fires recalled: 0.000000%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2000/predictions.parquet.gz
********************************************************************************

In [10]:
evaluate_weighted_logistic_regression(predict_year=2001)

Train count, test count: 6500560 324850
Test results for year=2001, when trained on other years 10000 1
Test set f1 score = 0.8672770438549545
True positives (fires predicted correctly): 6
False positives (fire predicted incorrectly): 76091
True negatives (no fire predicted correctly): 248742
False negatives (no fire predicted incorrectly): 11
% precision for fires: 0.007885%
% of fires recalled: 35.292042%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2001/predictions.parquet.gz
********************************************************************************

In [11]:
evaluate_weighted_logistic_regression(predict_year=2002)

Train count, test count: 6500560 324850
Test results for year=2002, when trained on other years 10000 1
Test set f1 score = 0.7484321281656211
True positives (fires predicted correctly): 66
False positives (fire predicted incorrectly): 130484
True negatives (no fire predicted correctly): 194291
False negatives (no fire predicted incorrectly): 9
% precision for fires: 0.050555%
% of fires recalled: 87.998827%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2002/predictions.parquet.gz
********************************************************************************

In [12]:
evaluate_weighted_logistic_regression(predict_year=2003)

Train count, test count: 6500560 324850
Test results for year=2003, when trained on other years 10000 1
Test set f1 score = 0.8256508211129582
True positives (fires predicted correctly): 161
False positives (fire predicted incorrectly): 96189
True negatives (no fire predicted correctly): 228487
False negatives (no fire predicted incorrectly): 13
% precision for fires: 0.167099%
% of fires recalled: 92.528204%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2003/predictions.parquet.gz
********************************************************************************

In [13]:
evaluate_weighted_logistic_regression(predict_year=2004)

Train count, test count: 6499670 325740
Test results for year=2004, when trained on other years 10000 1
Test set f1 score = 0.8229588271618705
True positives (fires predicted correctly): 45
False positives (fire predicted incorrectly): 97906
True negatives (no fire predicted correctly): 227781
False negatives (no fire predicted incorrectly): 8
% precision for fires: 0.045941%
% of fires recalled: 84.904058%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2004/predictions.parquet.gz
********************************************************************************

In [14]:
evaluate_weighted_logistic_regression(predict_year=2005)

Train count, test count: 6500560 324850
Test results for year=2005, when trained on other years 10000 1
Test set f1 score = 0.785567240077933
True positives (fires predicted correctly): 54
False positives (fire predicted incorrectly): 114635
True negatives (no fire predicted correctly): 210158
False negatives (no fire predicted incorrectly): 3
% precision for fires: 0.047084%
% of fires recalled: 94.735180%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2005/predictions.parquet.gz
********************************************************************************

In [15]:
evaluate_weighted_logistic_regression(predict_year=2006)

Train count, test count: 6500560 324850
Test results for year=2006, when trained on other years 10000 1
Test set f1 score = 0.7803596447116296
True positives (fires predicted correctly): 68
False positives (fire predicted incorrectly): 116896
True negatives (no fire predicted correctly): 207881
False negatives (no fire predicted incorrectly): 5
% precision for fires: 0.058138%
% of fires recalled: 93.149409%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2006/predictions.parquet.gz
********************************************************************************

In [16]:
evaluate_weighted_logistic_regression(predict_year=2007)

Train count, test count: 6500560 324850
Test results for year=2007, when trained on other years 10000 1
Test set f1 score = 0.74468685026667
True positives (fires predicted correctly): 1046
False positives (fire predicted incorrectly): 130685
True negatives (no fire predicted correctly): 193088
False negatives (no fire predicted incorrectly): 31
% precision for fires: 0.794042%
% of fires recalled: 97.121544%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2007/predictions.parquet.gz
********************************************************************************

In [17]:
evaluate_weighted_logistic_regression(predict_year=2008)

Train count, test count: 6499670 325740
Test results for year=2008, when trained on other years 10000 1
Test set f1 score = 0.6982687156906009
True positives (fires predicted correctly): 28
False positives (fire predicted incorrectly): 150972
True negatives (no fire predicted correctly): 174740
False negatives (no fire predicted incorrectly): 0
% precision for fires: 0.018543%
% of fires recalled: 99.996429%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2008/predictions.parquet.gz
********************************************************************************

In [18]:
evaluate_weighted_logistic_regression(predict_year=2009)

Train count, test count: 6500560 324850
Test results for year=2009, when trained on other years 10000 1
Test set f1 score = 0.8192554031478546
True positives (fires predicted correctly): 11
False positives (fire predicted incorrectly): 99429
True negatives (no fire predicted correctly): 225406
False negatives (no fire predicted incorrectly): 4
% precision for fires: 0.011062%
% of fires recalled: 73.328445%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2009/predictions.parquet.gz
********************************************************************************

In [19]:
evaluate_weighted_logistic_regression(predict_year=2010)

Train count, test count: 6500560 324850
Test results for year=2010, when trained on other years 10000 1
Test set f1 score = 0.8734497810887527
True positives (fires predicted correctly): 22
False positives (fire predicted incorrectly): 72927
True negatives (no fire predicted correctly): 251892
False negatives (no fire predicted incorrectly): 9
% precision for fires: 0.030158%
% of fires recalled: 70.965453%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2010/predictions.parquet.gz
********************************************************************************

In [20]:
evaluate_weighted_logistic_regression(predict_year=2011)

Train count, test count: 6500560 324850
Test results for year=2011, when trained on other years 10000 1
Test set f1 score = 0.7703350520454142
True positives (fires predicted correctly): 81
False positives (fire predicted incorrectly): 121228
True negatives (no fire predicted correctly): 203539
False negatives (no fire predicted incorrectly): 2
% precision for fires: 0.066772%
% of fires recalled: 97.589186%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2011/predictions.parquet.gz
********************************************************************************

In [21]:
evaluate_weighted_logistic_regression(predict_year=2012)

Train count, test count: 6499670 325740
Test results for year=2012, when trained on other years 10000 1
Test set f1 score = 0.8288866314066394
True positives (fires predicted correctly): 76
False positives (fire predicted incorrectly): 94916
True negatives (no fire predicted correctly): 230676
False negatives (no fire predicted incorrectly): 72
% precision for fires: 0.080007%
% of fires recalled: 51.351004%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2012/predictions.parquet.gz
********************************************************************************

In [22]:
evaluate_weighted_logistic_regression(predict_year=2013)

Train count, test count: 6500560 324850
Test results for year=2013, when trained on other years 10000 1
Test set f1 score = 0.8064859454055139
True positives (fires predicted correctly): 79
False positives (fire predicted incorrectly): 105207
True negatives (no fire predicted correctly): 219555
False negatives (no fire predicted incorrectly): 9
% precision for fires: 0.075034%
% of fires recalled: 89.771707%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2013/predictions.parquet.gz
********************************************************************************

In [23]:
evaluate_weighted_logistic_regression(predict_year=2014)

Train count, test count: 6500560 324850
Test results for year=2014, when trained on other years 10000 1
Test set f1 score = 0.8025908929894952
True positives (fires predicted correctly): 125
False positives (fire predicted incorrectly): 106912
True negatives (no fire predicted correctly): 217804
False negatives (no fire predicted incorrectly): 9
% precision for fires: 0.116782%
% of fires recalled: 93.282886%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2014/predictions.parquet.gz
********************************************************************************

In [24]:
evaluate_weighted_logistic_regression(predict_year=2015)

Train count, test count: 6500560 324850
Test results for year=2015, when trained on other years 10000 1
Test set f1 score = 0.9229317988956491
True positives (fires predicted correctly): 11
False positives (fire predicted incorrectly): 46459
True negatives (no fire predicted correctly): 278376
False negatives (no fire predicted incorrectly): 4
% precision for fires: 0.023671%
% of fires recalled: 73.328445%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2015/predictions.parquet.gz
********************************************************************************

In [25]:
evaluate_weighted_logistic_regression(predict_year=2016)

Train count, test count: 6499670 325740
Test results for year=2016, when trained on other years 10000 1
Test set f1 score = 0.9059215974544884
True positives (fires predicted correctly): 46
False positives (fire predicted incorrectly): 55940
True negatives (no fire predicted correctly): 269753
False negatives (no fire predicted incorrectly): 1
% precision for fires: 0.082163%
% of fires recalled: 97.870258%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2016/predictions.parquet.gz
********************************************************************************

In [26]:
evaluate_weighted_logistic_regression(predict_year=2017)

Train count, test count: 6500560 324850
Test results for year=2017, when trained on other years 10000 1
Test set f1 score = 0.9600284322357391
True positives (fires predicted correctly): 5
False positives (fire predicted incorrectly): 24865
True negatives (no fire predicted correctly): 299945
False negatives (no fire predicted incorrectly): 35
% precision for fires: 0.020105%
% of fires recalled: 12.499688%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2017/predictions.parquet.gz
********************************************************************************

In [27]:
evaluate_weighted_logistic_regression(predict_year=2018)

Train count, test count: 6500560 324850
Test results for year=2018, when trained on other years 10000 1
Test set f1 score = 0.9351630502915388
True positives (fires predicted correctly): 7
False positives (fire predicted incorrectly): 39534
True negatives (no fire predicted correctly): 285304
False negatives (no fire predicted incorrectly): 5
% precision for fires: 0.017703%
% of fires recalled: 58.328473%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2018/predictions.parquet.gz
********************************************************************************

In [28]:
evaluate_weighted_logistic_regression(predict_year=2019)

Train count, test count: 6500560 324850
Test results for year=2019, when trained on other years 10000 1
Test set f1 score = 0.8855114776707756
True positives (fires predicted correctly): 0
False positives (fire predicted incorrectly): 66742
True negatives (no fire predicted correctly): 258108
False negatives (no fire predicted incorrectly): 0
% precision for fires: 0.000000%
% of fires recalled: 0.000000%
Saving predictions to s3a://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/2019/predictions.parquet.gz
********************************************************************************