In [1]:
# S3 locations of the two parquet inputs: the PCA-reduced features and the
# base fire-occurrence table.
s3_url = 's3a://dse-cohort5-group5/wildfire_capstone/integratedData.pca.parquet.gz'
base_s3_url = 's3a://dse-cohort5-group5/wildfire_capstone/integratedData.renamed.parquet.gz'

# Load both inputs, then expose each one to Spark SQL under a temp-view name.
pca_df = spark.read.parquet(s3_url)
base_df = spark.read.parquet(base_s3_url)

pca_df.createOrReplaceTempView('pca')
base_df.createOrReplaceTempView("fire_occurrences")

# Joins each fire_occurrences row to its PCA feature vector on the
# (date, latitude, longitude) key and derives string year/month/day columns.
# NOTE(review): `date` is divided by 1e9 before from_unixtime, so it is
# presumably an epoch timestamp in nanoseconds -- confirm against the schema.
join_query = """
SELECT fire_occurrences.date,          fire_occurrences.latitude,     fire_occurrences.longitude,
       fire_occurrences.fire_occurred, fire_occurrences.acres_burned, pca.pcaFeatures,
       fire_occurrences.fire_name,
       from_unixtime(cast(fire_occurrences.date/1e9 as long), 'yyyy') as year,
       from_unixtime(cast(fire_occurrences.date/1e9 as long), 'MM') as month,
       from_unixtime(cast(fire_occurrences.date/1e9 as long), 'dd') as day
FROM fire_occurrences, pca
WHERE pca.date      = fire_occurrences.date
  AND pca.latitude  = fire_occurrences.latitude
  AND pca.longitude = fire_occurrences.longitude
"""

from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, DoubleType

N_FEATURES_TO_KEEP = 40

def to_array(col):
    """Convert a vector-valued column into an array-of-doubles column.

    Args:
        col: Column whose values expose ``toArray()`` (e.g. the ``pcaFeatures``
            vectors), so that individual components can be indexed afterwards.

    Returns:
        A Column produced by a Python UDF declared as ``ArrayType(DoubleType())``.
        Null input values yield a null array instead of raising inside the UDF.
    """
    def to_array_(v):
        # Spark hands SQL NULL to Python UDFs as None; without this guard a
        # single null vector raises AttributeError on the executor.
        if v is None:
            return None
        return v.toArray().tolist()
    # Important: asNondeterministic requires Spark 2.3 or later
    # It can be safely removed i.e.
    # return udf(to_array_, ArrayType(DoubleType()))(col)
    # but at the cost of decreased performance
    return udf(to_array_, ArrayType(DoubleType())).asNondeterministic()(col)

# Columns carried through unchanged from the join result.
passthrough_columns = [
    "fire_occurrences.date", "fire_occurrences.latitude", "fire_occurrences.longitude",
    "fire_occurrences.fire_occurred", "fire_occurrences.acres_burned",
    "year", "month", "day", "fire_occurrences.fire_name",
]
# One scalar column per retained PCA component, taken by index from the array.
pca_component_columns = [col("pcaFeaturesArr")[i] for i in range(N_FEATURES_TO_KEEP)]

# Run the join, unpack the PCA vector into an array column, then keep only the
# passthrough columns plus the first N_FEATURES_TO_KEEP components.
joined_with_array_df = spark.sql(join_query).withColumn(
    "pcaFeaturesArr", to_array(col("pcaFeatures")))
joined_df = joined_with_array_df.select(passthrough_columns + pca_component_columns)

# Cache and register as the `joined` view queried by the training function.
joined_df.cache()
joined_df.createOrReplaceTempView("joined")

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

def train_logistic_regression_and_save_model(
    fires_weight=10000, # Chosen for good recall (About 80%) on validation set with 0.5 threshold.
    no_fires_weight=1,
    predict_year=2019):
    """Fit a class-weighted logistic regression on the `joined` view and save it to S3.

    Rows whose `year` equals `predict_year` are held out as the test split; all
    other rows form the training split. The fitted pipeline (VectorAssembler +
    LogisticRegression) is written under a path keyed by `predict_year` and
    `fires_weight`.

    Args:
        fires_weight: Sample weight given to rows with fire_occurred = 1.
        no_fires_weight: Sample weight given to rows with fire_occurred = 0.
        predict_year: Year excluded from training.

    Returns:
        None. Prints the split sizes and the save location.

    Note:
        `model.save` is left without overwrite, so an existing path is never
        clobbered silently. NOTE(review): re-running for an already-saved
        year/weight is therefore expected to fail at the save step -- confirm
        this is the desired behaviour.
    """
    # Add class weights for the Logistic Regression classifier below
    with_fires_df = spark.sql("""
    SELECT *, {} as weight FROM joined WHERE joined.fire_occurred = 1
    """.format(fires_weight))
    without_fires_df = spark.sql("""
    SELECT *, {} as weight FROM joined WHERE joined.fire_occurred = 0
    """.format(no_fires_weight))

    with_fires_train    = with_fires_df.filter("year != {}".format(predict_year))
    with_fires_test     = with_fires_df.filter("year  = {}".format(predict_year))
    without_fires_train = without_fires_df.filter("year != {}".format(predict_year))
    without_fires_test  = without_fires_df.filter("year  = {}".format(predict_year))

    train_df = with_fires_train.union(without_fires_train)
    test_df  = with_fires_test.union(without_fires_test)
    train_df.cache()
    test_df.cache()
    try:
        print("Train count, test count:", train_df.count(), test_df.count())

        # Use the same component count that was selected into `joined`, so the
        # assembler cannot drift from the available pcaFeaturesArr[i] columns.
        assembler = VectorAssembler(
            inputCols=["pcaFeaturesArr[{}]".format(i) for i in range(N_FEATURES_TO_KEEP)],
            outputCol="features")

        lr = LogisticRegression(
            featuresCol='features',
            probabilityCol='probability',
            labelCol='fire_occurred',
            weightCol='weight',
            family="binomial")

        pipeline = Pipeline(stages=[assembler, lr])

        model = pipeline.fit(train_df)
    finally:
        # Release the per-call caches; otherwise every invocation leaves
        # another cached train/test split behind.
        train_df.unpersist()
        test_df.unpersist()

    model_s3_url = "s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/models/"\
             "year={year}/weight={weight}/model.sparkobject"\
             .format(year=predict_year, weight=fires_weight)
    print("saving model to", model_s3_url)
    model.save(model_s3_url)

    print("*" * 80)


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,application_1590806483524_0002,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
# Train with 2007 held out (rows where year = 2007 are excluded from the
# training split) and save the fitted pipeline to S3.
train_logistic_regression_and_save_model(fires_weight=10000, predict_year=2007)

Train count, test count: 6500560 324850
saving model to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/models/year=2007/weight=10000/model.sparkobject
********************************************************************************

In [3]:
# Train with 2010 held out (rows where year = 2010 are excluded from the
# training split) and save the fitted pipeline to S3.
train_logistic_regression_and_save_model(fires_weight=10000, predict_year=2010)

Train count, test count: 6500560 324850
saving model to s3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/models/year=2010/weight=10000/model.sparkobject
********************************************************************************

In [11]:
from pyspark.ml import PipelineModel
# Held-out year whose saved pipeline is inspected in this cell.
year = 2010
model_path = (
    's3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/models/'
    'year={}/weight=10000/model.sparkobject'
).format(year)
pipeline_model = PipelineModel.load(model_path)

# The last pipeline stage carries the fitted coefficients and intercept.
lr_model = pipeline_model.stages[-1]
print("Coefficients for year {}".format(year))
print("Coefficients:", lr_model.coefficients)
print("Intercept:", lr_model.intercept)

Coefficients for year 2010
Coefficients: [0.08881099235164998,0.13632301177193015,0.11020134395069991,0.0003229175303864513,0.06504448428938507,-0.30694896684238937,-0.1613960726706734,0.11372871376090295,0.38890873919913177,0.2634028293911873,-0.17968386306572232,-0.23232996389199795,-0.22480312207042655,-0.21059037746396092,0.22663742494342698,-0.07903999097628758,-0.5092802827324341,-0.05380340971227158,-0.27770262442585214,0.2260845067438809,0.0529373860383622,-0.005954282211155398,-0.10153784772257698,0.1920697089745939,0.16544992364723882,-0.002088028040021257,-0.027605722592341096,-0.1639381703633287,0.399023898350511,-0.1660266922838218,-0.08063762572915308,0.012606607927062531,-0.20326068628548385,0.11967534081476812,0.27269747774308745,-0.00862799542542835,0.0062055509264404025,0.1860452797761952,-0.20145328956293393,-0.12190226857159799]
Intercept: -2.300353214326727

In [10]:
from pyspark.ml import PipelineModel
# Held-out year whose saved pipeline is inspected in this cell.
year = 2007
model_path = (
    's3://dse-cohort5-group5/wildfire_capstone/logistic_regression_predictions/models/'
    'year={}/weight=10000/model.sparkobject'
).format(year)
pipeline_model = PipelineModel.load(model_path)

# The last pipeline stage carries the fitted coefficients and intercept.
lr_model = pipeline_model.stages[-1]
print("Coefficients for year {}".format(year))
print("Coefficients:", lr_model.coefficients)
print("Intercept:", lr_model.intercept)

Coefficients for year 2007
Coefficients: [0.0751749927532972,0.11931195800526559,0.11870575796818895,0.03175585137308058,0.06355862698396718,-0.26557461018751777,-0.09921473261382105,0.0788782349877773,0.28265808743749854,0.21409965783457982,-0.21541297743912155,-0.15445135855073544,-0.2206521720803125,-0.23769853504149094,0.12532247029072785,-0.09515398917180073,-0.41648255172840287,-0.07249897205843178,-0.1698772074989707,0.13681748574973437,-0.05007667848693983,0.09347101189083838,0.012020792785904785,0.16518160717803462,0.2128146979168255,-0.08861551642551775,-0.0944462656740906,-0.1526228187362223,0.3310517170743802,-0.1688107702386254,-0.07990266124895551,0.021978865991899185,-0.21819743729719543,0.0911705688664554,0.33186866877577725,-0.016854531432428706,-0.16668388215972868,0.08715283470992505,-0.27109544502467553,-0.1802844294983804]
Intercept: -2.0240806207467372