In [1]:
s3_url = 's3a://dse-cohort5-group5/wildfire_capstone/integratedData.pca.parquet.gz'
pca_df = spark.read.parquet(s3_url)
pca_df.createOrReplaceTempView('pca')
pca_df.count()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,application_1590254739326_0002,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.
6825410

In [2]:
base_df = spark.read.parquet('s3a://dse-cohort5-group5/wildfire_capstone/integratedData.renamed.parquet.gz')
base_df.createOrReplaceTempView("fire_occurrences")
base_df.count()

6826300

In [3]:
base_df.printSchema()

root
 |-- date: long (nullable = true)
 |-- precipitation_amount_mm: double (nullable = true)
 |-- relative_humidity_%: double (nullable = true)
 |-- specific_humidity_kg/kg: double (nullable = true)
 |-- surface_downwelling_shortwave_flux_in_air_W_m-2: double (nullable = true)
 |-- wind_from_direction_Degrees_Clockwise_from_north: double (nullable = true)
 |-- wind_speed_m/s: double (nullable = true)
 |-- max_air_temperature_K: double (nullable = true)
 |-- min_air_temperature_K: double (nullable = true)
 |-- burning_index_g_Unitless: double (nullable = true)
 |-- dead_fuel_moisture_100hr_Percent: double (nullable = true)
 |-- dead_fuel_moisture_1000hr_Percent: double (nullable = true)
 |-- energy_release_component-g_Unitless: double (nullable = true)
 |-- potential_evapotranspiration_mm: double (nullable = true)
 |-- mean_vapor_pressure_deficit_kPa: double (nullable = true)
 |-- fire_occurred: integer (nullable = true)
 |-- acres_burned: double (nullable = true)
 |-- fire_name: strin

In [4]:
pca_df.printSchema()

root
 |-- date: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- pcaFeatures: vector (nullable = true)

In [5]:
join_query = """
SELECT fire_occurrences.date,          fire_occurrences.latitude,     fire_occurrences.longitude,
       fire_occurrences.fire_occurred, fire_occurrences.acres_burned, pca.pcaFeatures,
       fire_occurrences.fire_name,
       from_unixtime(cast(fire_occurrences.date/1e9 as long), 'yyyy') as year,
       from_unixtime(cast(fire_occurrences.date/1e9 as long), 'MM') as month,
       from_unixtime(cast(fire_occurrences.date/1e9 as long), 'dd') as day
FROM fire_occurrences, pca
WHERE pca.date      = fire_occurrences.date
  AND pca.latitude  = fire_occurrences.latitude
  AND pca.longitude = fire_occurrences.longitude
"""

In [6]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, DoubleType

N_FEATURES_TO_KEEP = 40

def to_array(col):
    def to_array_(v):
        return v.toArray().tolist()
    # Important: asNondeterministic requires Spark 2.3 or later
    # It can be safely removed i.e.
    # return udf(to_array_, ArrayType(DoubleType()))(col)
    # but at the cost of decreased performance
    return udf(to_array_, ArrayType(DoubleType())).asNondeterministic()(col)

joined_df = spark.sql(join_query)
joined_df = joined_df.withColumn("pcaFeaturesArr", to_array(col("pcaFeatures")))\
                     .select(["fire_occurrences.date", "fire_occurrences.latitude", "fire_occurrences.longitude",
                              "fire_occurrences.fire_occurred", "fire_occurrences.acres_burned",
                              "year", "month", "day", "fire_occurrences.fire_name"]
                             + [col("pcaFeaturesArr")[i] for i in range(N_FEATURES_TO_KEEP)])

In [7]:
joined_df.cache()
joined_df.createOrReplaceTempView("joined")
joined_df.count()

6825410

In [8]:
joined_df.printSchema()

root
 |-- date: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- fire_occurred: integer (nullable = true)
 |-- acres_burned: double (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- fire_name: string (nullable = true)
 |-- pcaFeaturesArr[0]: double (nullable = true)
 |-- pcaFeaturesArr[1]: double (nullable = true)
 |-- pcaFeaturesArr[2]: double (nullable = true)
 |-- pcaFeaturesArr[3]: double (nullable = true)
 |-- pcaFeaturesArr[4]: double (nullable = true)
 |-- pcaFeaturesArr[5]: double (nullable = true)
 |-- pcaFeaturesArr[6]: double (nullable = true)
 |-- pcaFeaturesArr[7]: double (nullable = true)
 |-- pcaFeaturesArr[8]: double (nullable = true)
 |-- pcaFeaturesArr[9]: double (nullable = true)
 |-- pcaFeaturesArr[10]: double (nullable = true)
 |-- pcaFeaturesArr[11]: double (nullable = true)
 |-- pcaFeaturesArr[12]: double (nullable = true)
 

In [9]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

def generate_predictions_with_logistic_regression(
    fires_weight=10000, # Chosen for good recall (About 80%) on validation set with 0.5 threshold.
    no_fires_weight=1,
    predict_year=2019):
    
    # Add class weights for the Logistic Regression classifier below
        # Add class weights for the Logistic Regression classifier below
    with_fires_df = spark.sql("""
    SELECT *, {} as weight FROM joined WHERE joined.fire_occurred = 1
    """.format(fires_weight))
    without_fires_df = spark.sql("""
    SELECT *, {} as weight FROM joined WHERE joined.fire_occurred = 0
    """.format(no_fires_weight))

    with_fires_train    = with_fires_df.filter("year != {}".format(predict_year))
    with_fires_test     = with_fires_df.filter("year  = {}".format(predict_year))
    without_fires_train = without_fires_df.filter("year != {}".format(predict_year))
    without_fires_test  = without_fires_df.filter("year  = {}".format(predict_year))
    
    train_df = with_fires_train.union(without_fires_train)
    test_df  = with_fires_test.union(without_fires_test)
    train_df.cache()
    test_df.cache()
    print("Train count, test count:", train_df.count(), test_df.count())

    assembler = VectorAssembler(
        inputCols=["pcaFeaturesArr[{}]".format(i) for i in range(0, 40)],
        outputCol="features")

    lr = LogisticRegression(
        featuresCol='features',
        probabilityCol='probability',
        labelCol='fire_occurred',
        weightCol='weight',
        family="binomial")

    pipeline = Pipeline(stages=[assembler, lr])

    model = pipeline.fit(train_df)

    predictions = model.transform(test_df)
    predictions.createOrReplaceTempView('predictions')

    print("Test results for year={}, when trained on other years".format(predict_year), fires_weight, no_fires_weight)

    true_positive = spark.sql("""
    SELECT * FROM predictions WHERE fire_occurred = 1 AND  prediction = 1""")
    true_positive = true_positive.count()

    false_negative = spark.sql("""
    SELECT * FROM predictions WHERE fire_occurred = 1 AND  prediction = 0""")
    false_negative = false_negative.count()

    true_negative = spark.sql("""
    SELECT * FROM predictions WHERE fire_occurred = 0 AND  prediction = 0""")
    true_negative = true_negative.count()

    false_positive = spark.sql("""
    SELECT * FROM predictions WHERE fire_occurred = 0 AND  prediction = 1""")
    false_positive = false_positive.count()
    
    print("True positives (fires predicted correctly):", true_positive)
    print("False positives (fire predicted incorrectly):", false_positive)
    print("True negatives (no fire predicted correctly):", true_negative)
    print("False negatives (no fire predicted incorrectly):", false_negative)

    precision = true_positive/(true_positive + false_positive + 1e-3)
    recall = true_positive/(true_positive + false_negative + 1e-3)
    print("% precision for fires: {:%}".format(precision))
    print("% of fires recalled: {:%}".format(recall))
    
    f1 = 2 * precision * recall / (precision + recall + 1e-3)
    print("Test set f1 score = " + str(f1))
    
    keep_cols = ["date", "latitude", "longitude", "fire_occurred",
                 "acres_burned", "year", "month", "day", "probability"]
    s3_url = "s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/"\
             "year={year}/weight={weight}"\
             .format(year=predict_year, weight=fires_weight)
    print("Saving predictions to {}".format(s3_url))
    predictions.select(*keep_cols).write.parquet(s3_url,
                                                 partitionBy=["month"],
                                                 compression="gzip",
                                                 mode="overwrite")
    
    print("*" * 80)

In [10]:
generate_predictions_with_logistic_regression(fires_weight=10000, predict_year=2001)

Train count, test count: 6500560 324850
Test results for year=2001, when trained on other years 10000 1
True positives (fires predicted correctly): 6
False positives (fire predicted incorrectly): 76091
True negatives (no fire predicted correctly): 248742
False negatives (no fire predicted incorrectly): 11
% precision for fires: 0.007885%
% of fires recalled: 35.292042%
Test set f1 score = 0.00015721288212330195
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2001/weight=10000
********************************************************************************

In [11]:
generate_predictions_with_logistic_regression(fires_weight=1000, predict_year=2001)

Train count, test count: 6500560 324850
Test results for year=2001, when trained on other years 1000 1
True positives (fires predicted correctly): 0
False positives (fire predicted incorrectly): 9122
True negatives (no fire predicted correctly): 315711
False negatives (no fire predicted incorrectly): 17
% precision for fires: 0.000000%
% of fires recalled: 0.000000%
Test set f1 score = 0.0
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2001/weight=1000
********************************************************************************

In [12]:
generate_predictions_with_logistic_regression(fires_weight=10000, predict_year=2002)

Train count, test count: 6500560 324850
Test results for year=2002, when trained on other years 10000 1
True positives (fires predicted correctly): 66
False positives (fire predicted incorrectly): 130484
True negatives (no fire predicted correctly): 194291
False negatives (no fire predicted incorrectly): 9
% precision for fires: 0.050555%
% of fires recalled: 87.998827%
Test set f1 score = 0.0010093799209813297
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2002/weight=10000
********************************************************************************

In [13]:
generate_predictions_with_logistic_regression(fires_weight=1000, predict_year=2002)

Train count, test count: 6500560 324850
Test results for year=2002, when trained on other years 1000 1
True positives (fires predicted correctly): 37
False positives (fire predicted incorrectly): 33461
True negatives (no fire predicted correctly): 291314
False negatives (no fire predicted incorrectly): 38
% precision for fires: 0.110454%
% of fires recalled: 49.332676%
Test set f1 score = 0.0021997030588378767
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2002/weight=1000
********************************************************************************

In [14]:
generate_predictions_with_logistic_regression(fires_weight=10000, predict_year=2003)

Train count, test count: 6500560 324850
Test results for year=2003, when trained on other years 10000 1
True positives (fires predicted correctly): 161
False positives (fire predicted incorrectly): 96189
True negatives (no fire predicted correctly): 228487
False negatives (no fire predicted incorrectly): 13
% precision for fires: 0.167099%
% of fires recalled: 92.528204%
Test set f1 score = 0.00333236286326111
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2003/weight=10000
********************************************************************************

In [15]:
generate_predictions_with_logistic_regression(fires_weight=1000, predict_year=2003)

Train count, test count: 6500560 324850
Test results for year=2003, when trained on other years 1000 1
True positives (fires predicted correctly): 92
False positives (fire predicted incorrectly): 16618
True negatives (no fire predicted correctly): 308058
False negatives (no fire predicted incorrectly): 82
% precision for fires: 0.550568%
% of fires recalled: 52.873259%
Test set f1 score = 0.010877529384066357
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2003/weight=1000
********************************************************************************

In [16]:
generate_predictions_with_logistic_regression(fires_weight=10000, predict_year=2004)

Train count, test count: 6499670 325740
Test results for year=2004, when trained on other years 10000 1
True positives (fires predicted correctly): 45
False positives (fire predicted incorrectly): 97906
True negatives (no fire predicted correctly): 227781
False negatives (no fire predicted incorrectly): 8
% precision for fires: 0.045941%
% of fires recalled: 84.904058%
Test set f1 score = 0.0009172500924387839
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2004/weight=10000
********************************************************************************

In [17]:
generate_predictions_with_logistic_regression(fires_weight=1000, predict_year=2004)

Train count, test count: 6499670 325740
Test results for year=2004, when trained on other years 1000 1
True positives (fires predicted correctly): 23
False positives (fire predicted incorrectly): 18412
True negatives (no fire predicted correctly): 307275
False negatives (no fire predicted incorrectly): 30
% precision for fires: 0.124763%
% of fires recalled: 43.395408%
Test set f1 score = 0.0024823961074315905
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2004/weight=1000
********************************************************************************

In [18]:
generate_predictions_with_logistic_regression(fires_weight=10000, predict_year=2005)

Train count, test count: 6500560 324850
Test results for year=2005, when trained on other years 10000 1
True positives (fires predicted correctly): 54
False positives (fire predicted incorrectly): 114635
True negatives (no fire predicted correctly): 210158
False negatives (no fire predicted incorrectly): 3
% precision for fires: 0.047084%
% of fires recalled: 94.735180%
Test set f1 score = 0.0009402172872457612
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2005/weight=10000
********************************************************************************

In [19]:
generate_predictions_with_logistic_regression(fires_weight=1000, predict_year=2005)

Train count, test count: 6500560 324850
Test results for year=2005, when trained on other years 1000 1
True positives (fires predicted correctly): 27
False positives (fire predicted incorrectly): 31356
True negatives (no fire predicted correctly): 293437
False negatives (no fire predicted incorrectly): 30
% precision for fires: 0.086034%
% of fires recalled: 47.367590%
Test set f1 score = 0.0017139453103583984
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2005/weight=1000
********************************************************************************

In [20]:
generate_predictions_with_logistic_regression(fires_weight=10000, predict_year=2006)

Train count, test count: 6500560 324850
Test results for year=2006, when trained on other years 10000 1
True positives (fires predicted correctly): 68
False positives (fire predicted incorrectly): 116896
True negatives (no fire predicted correctly): 207881
False negatives (no fire predicted incorrectly): 5
% precision for fires: 0.058138%
% of fires recalled: 93.149409%
Test set f1 score = 0.0011607802927492857
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2006/weight=10000
********************************************************************************

In [21]:
generate_predictions_with_logistic_regression(fires_weight=1000, predict_year=2006)

Train count, test count: 6500560 324850
Test results for year=2006, when trained on other years 1000 1
True positives (fires predicted correctly): 26
False positives (fire predicted incorrectly): 24286
True negatives (no fire predicted correctly): 300491
False negatives (no fire predicted incorrectly): 47
% precision for fires: 0.106943%
% of fires recalled: 35.615950%
Test set f1 score = 0.0021265055229426778
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2006/weight=1000
********************************************************************************

In [22]:
generate_predictions_with_logistic_regression(fires_weight=10000, predict_year=2007)

Train count, test count: 6500560 324850
Test results for year=2007, when trained on other years 10000 1
True positives (fires predicted correctly): 1046
False positives (fire predicted incorrectly): 130685
True negatives (no fire predicted correctly): 193088
False negatives (no fire predicted incorrectly): 31
% precision for fires: 0.794042%
% of fires recalled: 97.121544%
Test set f1 score = 0.015735991913954143
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2007/weight=10000
********************************************************************************

In [23]:
generate_predictions_with_logistic_regression(fires_weight=1000, predict_year=2007)

Train count, test count: 6500560 324850
Test results for year=2007, when trained on other years 1000 1
True positives (fires predicted correctly): 622
False positives (fire predicted incorrectly): 16762
True negatives (no fire predicted correctly): 307011
False negatives (no fire predicted incorrectly): 455
% precision for fires: 3.578003%
% of fires recalled: 57.752964%
Test set f1 score = 0.06727559872853058
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2007/weight=1000
********************************************************************************

In [24]:
generate_predictions_with_logistic_regression(fires_weight=10000, predict_year=2008)

Train count, test count: 6499670 325740
Test results for year=2008, when trained on other years 10000 1
True positives (fires predicted correctly): 28
False positives (fire predicted incorrectly): 150972
True negatives (no fire predicted correctly): 174740
False negatives (no fire predicted incorrectly): 0
% precision for fires: 0.018543%
% of fires recalled: 99.996429%
Test set f1 score = 0.0003704217997284948
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2008/weight=10000
********************************************************************************

In [25]:
generate_predictions_with_logistic_regression(fires_weight=1000, predict_year=2008)

Train count, test count: 6499670 325740
Test results for year=2008, when trained on other years 1000 1
True positives (fires predicted correctly): 11
False positives (fire predicted incorrectly): 57701
True negatives (no fire predicted correctly): 268011
False negatives (no fire predicted incorrectly): 17
% precision for fires: 0.019060%
% of fires recalled: 39.284311%
Test set f1 score = 0.0003800513760655432
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2008/weight=1000
********************************************************************************

In [26]:
generate_predictions_with_logistic_regression(fires_weight=10000, predict_year=2009)

Train count, test count: 6500560 324850
Test results for year=2009, when trained on other years 10000 1
True positives (fires predicted correctly): 11
False positives (fire predicted incorrectly): 99429
True negatives (no fire predicted correctly): 225406
False negatives (no fire predicted incorrectly): 4
% precision for fires: 0.011062%
% of fires recalled: 73.328445%
Test set f1 score = 0.00022090435805100715
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2009/weight=10000
********************************************************************************

In [27]:
generate_predictions_with_logistic_regression(fires_weight=1000, predict_year=2009)

Train count, test count: 6500560 324850
Test results for year=2009, when trained on other years 1000 1
True positives (fires predicted correctly): 1
False positives (fire predicted incorrectly): 23173
True negatives (no fire predicted correctly): 301662
False negatives (no fire predicted incorrectly): 14
% precision for fires: 0.004315%
% of fires recalled: 6.666222%
Test set f1 score = 8.497391338820717e-05
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2009/weight=1000
********************************************************************************

In [28]:
generate_predictions_with_logistic_regression(fires_weight=10000, predict_year=2010)

Train count, test count: 6500560 324850
Test results for year=2010, when trained on other years 10000 1
True positives (fires predicted correctly): 22
False positives (fire predicted incorrectly): 72927
True negatives (no fire predicted correctly): 251892
False negatives (no fire predicted incorrectly): 9
% precision for fires: 0.030158%
% of fires recalled: 70.965453%
Test set f1 score = 0.0006020568690852767
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2010/weight=10000
********************************************************************************

In [29]:
generate_predictions_with_logistic_regression(fires_weight=1000, predict_year=2010)

Train count, test count: 6500560 324850
Test results for year=2010, when trained on other years 1000 1
True positives (fires predicted correctly): 1
False positives (fire predicted incorrectly): 14788
True negatives (no fire predicted correctly): 310031
False negatives (no fire predicted incorrectly): 30
% precision for fires: 0.006762%
% of fires recalled: 3.225702%
Test set f1 score = 0.00013090310994122903
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2010/weight=1000
********************************************************************************

In [30]:
generate_predictions_with_logistic_regression(fires_weight=10000, predict_year=2011)

Train count, test count: 6500560 324850
Test results for year=2011, when trained on other years 10000 1
True positives (fires predicted correctly): 81
False positives (fire predicted incorrectly): 121228
True negatives (no fire predicted correctly): 203539
False negatives (no fire predicted incorrectly): 2
% precision for fires: 0.066772%
% of fires recalled: 97.589186%
Test set f1 score = 0.001333154396772857
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2011/weight=10000
********************************************************************************

In [31]:
generate_predictions_with_logistic_regression(fires_weight=1000, predict_year=2011)

Train count, test count: 6500560 324850
Test results for year=2011, when trained on other years 1000 1
True positives (fires predicted correctly): 57
False positives (fire predicted incorrectly): 26006
True negatives (no fire predicted correctly): 298761
False negatives (no fire predicted incorrectly): 26
% precision for fires: 0.218701%
% of fires recalled: 68.673871%
Test set f1 score = 0.004353811524275209
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2011/weight=1000
********************************************************************************

In [32]:
generate_predictions_with_logistic_regression(fires_weight=10000, predict_year=2012)

Train count, test count: 6499670 325740
Test results for year=2012, when trained on other years 10000 1
True positives (fires predicted correctly): 76
False positives (fire predicted incorrectly): 94916
True negatives (no fire predicted correctly): 230676
False negatives (no fire predicted incorrectly): 72
% precision for fires: 0.080007%
% of fires recalled: 51.351004%
Test set f1 score = 0.0015945451839090093
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2012/weight=10000
********************************************************************************

In [33]:
generate_predictions_with_logistic_regression(fires_weight=1000, predict_year=2012)

Train count, test count: 6499670 325740
Test results for year=2012, when trained on other years 1000 1
True positives (fires predicted correctly): 8
False positives (fire predicted incorrectly): 12104
True negatives (no fire predicted correctly): 313488
False negatives (no fire predicted incorrectly): 140
% precision for fires: 0.066050%
% of fires recalled: 5.405369%
Test set f1 score = 0.001281632745520225
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2012/weight=1000
********************************************************************************

In [34]:
generate_predictions_with_logistic_regression(fires_weight=10000, predict_year=2013)

Train count, test count: 6500560 324850
Test results for year=2013, when trained on other years 10000 1
True positives (fires predicted correctly): 79
False positives (fire predicted incorrectly): 105207
True negatives (no fire predicted correctly): 219555
False negatives (no fire predicted incorrectly): 9
% precision for fires: 0.075034%
% of fires recalled: 89.771707%
Test set f1 score = 0.0014977540712055783
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2013/weight=10000
********************************************************************************

In [35]:
generate_predictions_with_logistic_regression(fires_weight=1000, predict_year=2013)

Train count, test count: 6500560 324850
Test results for year=2013, when trained on other years 1000 1
True positives (fires predicted correctly): 0
False positives (fire predicted incorrectly): 5246
True negatives (no fire predicted correctly): 319516
False negatives (no fire predicted incorrectly): 88
% precision for fires: 0.000000%
% of fires recalled: 0.000000%
Test set f1 score = 0.0
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2013/weight=1000
********************************************************************************

In [36]:
generate_predictions_with_logistic_regression(fires_weight=10000, predict_year=2014)

Train count, test count: 6500560 324850
Test results for year=2014, when trained on other years 10000 1
True positives (fires predicted correctly): 125
False positives (fire predicted incorrectly): 106912
True negatives (no fire predicted correctly): 217804
False negatives (no fire predicted incorrectly): 9
% precision for fires: 0.116782%
% of fires recalled: 93.282886%
Test set f1 score = 0.002330225664694496
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2014/weight=10000
********************************************************************************

In [37]:
generate_predictions_with_logistic_regression(fires_weight=1000, predict_year=2014)

Train count, test count: 6500560 324850
Test results for year=2014, when trained on other years 1000 1
True positives (fires predicted correctly): 89
False positives (fire predicted incorrectly): 19242
True negatives (no fire predicted correctly): 305474
False negatives (no fire predicted incorrectly): 45
% precision for fires: 0.460400%
% of fires recalled: 66.417415%
Test set f1 score = 0.009130964402710533
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2014/weight=1000
********************************************************************************

In [38]:
generate_predictions_with_logistic_regression(fires_weight=10000, predict_year=2015)

Train count, test count: 6500560 324850
Test results for year=2015, when trained on other years 10000 1
True positives (fires predicted correctly): 11
False positives (fire predicted incorrectly): 46459
True negatives (no fire predicted correctly): 278376
False negatives (no fire predicted incorrectly): 4
% precision for fires: 0.023671%
% of fires recalled: 73.328445%
Test set f1 score = 0.0004726266014640426
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2015/weight=10000
********************************************************************************

In [39]:
generate_predictions_with_logistic_regression(fires_weight=1000, predict_year=2015)

Train count, test count: 6500560 324850
Test results for year=2015, when trained on other years 1000 1
True positives (fires predicted correctly): 0
False positives (fire predicted incorrectly): 1727
True negatives (no fire predicted correctly): 323108
False negatives (no fire predicted incorrectly): 15
% precision for fires: 0.000000%
% of fires recalled: 0.000000%
Test set f1 score = 0.0
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2015/weight=1000
********************************************************************************

In [40]:
generate_predictions_with_logistic_regression(fires_weight=10000, predict_year=2016)

Train count, test count: 6499670 325740
Test results for year=2016, when trained on other years 10000 1
True positives (fires predicted correctly): 46
False positives (fire predicted incorrectly): 55940
True negatives (no fire predicted correctly): 269753
False negatives (no fire predicted incorrectly): 1
% precision for fires: 0.082163%
% of fires recalled: 97.870258%
Test set f1 score = 0.0016402150403873968
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2016/weight=10000
********************************************************************************

In [41]:
generate_predictions_with_logistic_regression(fires_weight=1000, predict_year=2016)

Train count, test count: 6499670 325740
Test results for year=2016, when trained on other years 1000 1
True positives (fires predicted correctly): 37
False positives (fire predicted incorrectly): 6746
True negatives (no fire predicted correctly): 318947
False negatives (no fire predicted incorrectly): 10
% precision for fires: 0.545481%
% of fires recalled: 78.721729%
Test set f1 score = 0.010820899101311057
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2016/weight=1000
********************************************************************************

In [42]:
generate_predictions_with_logistic_regression(fires_weight=10000, predict_year=2017)

Train count, test count: 6500560 324850
Test results for year=2017, when trained on other years 10000 1
True positives (fires predicted correctly): 5
False positives (fire predicted incorrectly): 24865
True negatives (no fire predicted correctly): 299945
False negatives (no fire predicted incorrectly): 35
% precision for fires: 0.020105%
% of fires recalled: 12.499688%
Test set f1 score = 0.00039826409454314367
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2017/weight=10000
********************************************************************************

In [43]:
generate_predictions_with_logistic_regression(fires_weight=1000, predict_year=2017)

Train count, test count: 6500560 324850
Test results for year=2017, when trained on other years 1000 1
True positives (fires predicted correctly): 0
False positives (fire predicted incorrectly): 1747
True negatives (no fire predicted correctly): 323063
False negatives (no fire predicted incorrectly): 40
% precision for fires: 0.000000%
% of fires recalled: 0.000000%
Test set f1 score = 0.0
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2017/weight=1000
********************************************************************************

In [44]:
generate_predictions_with_logistic_regression(fires_weight=10000, predict_year=2018)

Train count, test count: 6500560 324850
Test results for year=2018, when trained on other years 10000 1
True positives (fires predicted correctly): 7
False positives (fire predicted incorrectly): 39534
True negatives (no fire predicted correctly): 285304
False negatives (no fire predicted incorrectly): 5
% precision for fires: 0.017703%
% of fires recalled: 58.328473%
Test set f1 score = 0.00035334982505162357
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2018/weight=10000
********************************************************************************

In [45]:
generate_predictions_with_logistic_regression(fires_weight=1000, predict_year=2018)

Train count, test count: 6500560 324850
Test results for year=2018, when trained on other years 1000 1
True positives (fires predicted correctly): 4
False positives (fire predicted incorrectly): 5356
True negatives (no fire predicted correctly): 319482
False negatives (no fire predicted incorrectly): 8
% precision for fires: 0.074627%
% of fires recalled: 33.330556%
Test set f1 score = 0.0014847580281758142
Saving predictions to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/with_probs/year=2018/weight=1000
********************************************************************************