# Imports

In [1]:
import requests
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
spark = (SparkSession
         .builder
         .appName('Wildfire data pipeline')
         .master("local[*]")
         .getOrCreate())
spark.sparkContext.setLogLevel('INFO')
spark

# Loading Data

In [3]:
modis_file = "MODIS_C6_Global_24h.csv"
viirs_file = "VNP14IMGTDL_NRT_Global_24h.csv"

def download(url, file_name):
        r = requests.get(url)
        with open(file_name,'wb') as f:
                f.write(r.content)

def download_files():
        base_url = "https://firms.modaps.eosdis.nasa.gov/data/active_fire/"
        # Download the MODIS data file
        download(base_url + "c6/csv/" + modis_file, 'data/' + modis_file)
        # Download the VIIRS data file
        download(base_url + "viirs/csv/" + viirs_file, 'data/' + viirs_file)

In [4]:
download_files()

In [6]:
%ls data

MODIS_C6_Global_24h.csv         VNP14IMGTDL_NRT_Global_24h.csv


# Data Pipeline

In [7]:
viirs_df = (spark
           .read
           .format("csv")
           .option("header", True)
           .option("inferSchema", True)
           .load('data/' + viirs_file))
viirs_df.show(10)

+---------+---------+----------+----+-----+----------+--------+---------+----------+-------+----------+---+--------+
| latitude|longitude|bright_ti4|scan|track|  acq_date|acq_time|satellite|confidence|version|bright_ti5|frp|daynight|
+---------+---------+----------+----+-----+----------+--------+---------+----------+-------+----------+---+--------+
|-30.53877| 28.98748|     326.5|0.63| 0.72|2020-10-18|       6|        N|   nominal| 1.0NRT|     286.0|4.7|       N|
|-29.63855|  17.8575|     313.3|0.44| 0.39|2020-10-18|       6|        N|   nominal| 1.0NRT|     285.4|1.1|       N|
|-30.74109| 29.99058|     295.7|0.71| 0.75|2020-10-18|       6|        N|   nominal| 1.0NRT|     284.7|1.1|       N|
|-30.74395| 29.99385|     298.4|0.72| 0.75|2020-10-18|       6|        N|   nominal| 1.0NRT|     284.9|1.1|       N|
|-30.99165|  29.4018|     296.8|0.66| 0.73|2020-10-18|       6|        N|   nominal| 1.0NRT|     283.8|0.6|       N|
|-31.00266| 29.02282|     303.3|0.63| 0.72|2020-10-18|       6| 

In [8]:
viirs_df.printSchema()

root
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- bright_ti4: double (nullable = true)
 |-- scan: double (nullable = true)
 |-- track: double (nullable = true)
 |-- acq_date: string (nullable = true)
 |-- acq_time: integer (nullable = true)
 |-- satellite: string (nullable = true)
 |-- confidence: string (nullable = true)
 |-- version: string (nullable = true)
 |-- bright_ti5: double (nullable = true)
 |-- frp: double (nullable = true)
 |-- daynight: string (nullable = true)



In [9]:
viirs_df.count()

92144

In [10]:
viirs_df2 = (viirs_df
             .withColumn("acq_time_min", F.expr("acq_time % 100"))
             .withColumn("acq_time_hr", F.expr("int(acq_time / 100)"))
             .withColumn("acq_time2", F.unix_timestamp(F.col("acq_date"), 'yyyy-MM-dd'))
             .withColumn("acq_time3", F.expr("acq_time2 + acq_time_min * 60 + acq_time_hr * 3600"))
             .withColumn("acq_datetime", F.from_unixtime(F.col("acq_time3")))
             .drop("acq_date", "acq_time", "acq_time_min", "acq_time_hr", "acq_time2", "acq_time3")
             .withColumnRenamed("confidence", "confidence_level")
             .withColumn("brightness", F.lit(None))
             .withColumn("bright_t31", F.lit(None)))
viirs_df2.show(10)

+---------+---------+----------+----+-----+---------+----------------+-------+----------+---+--------+-------------------+----------+----------+
| latitude|longitude|bright_ti4|scan|track|satellite|confidence_level|version|bright_ti5|frp|daynight|       acq_datetime|brightness|bright_t31|
+---------+---------+----------+----+-----+---------+----------------+-------+----------+---+--------+-------------------+----------+----------+
|-30.53877| 28.98748|     326.5|0.63| 0.72|        N|         nominal| 1.0NRT|     286.0|4.7|       N|2020-10-18 00:06:00|      null|      null|
|-29.63855|  17.8575|     313.3|0.44| 0.39|        N|         nominal| 1.0NRT|     285.4|1.1|       N|2020-10-18 00:06:00|      null|      null|
|-30.74109| 29.99058|     295.7|0.71| 0.75|        N|         nominal| 1.0NRT|     284.7|1.1|       N|2020-10-18 00:06:00|      null|      null|
|-30.74395| 29.99385|     298.4|0.72| 0.75|        N|         nominal| 1.0NRT|     284.9|1.1|       N|2020-10-18 00:06:00|      nu

In [11]:
viirs_df2.printSchema()

root
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- bright_ti4: double (nullable = true)
 |-- scan: double (nullable = true)
 |-- track: double (nullable = true)
 |-- satellite: string (nullable = true)
 |-- confidence_level: string (nullable = true)
 |-- version: string (nullable = true)
 |-- bright_ti5: double (nullable = true)
 |-- frp: double (nullable = true)
 |-- daynight: string (nullable = true)
 |-- acq_datetime: string (nullable = true)
 |-- brightness: null (nullable = true)
 |-- bright_t31: null (nullable = true)



In [12]:
viirs_df2.rdd.getNumPartitions()

2

In [13]:
df = viirs_df2.groupby('confidence_level').count()
count = viirs_df2.count()
df = df.withColumn('%', F.round(F.expr(f"100 / {count} * count"), 2))
df.show()

+----------------+-----+-----+
|confidence_level|count|    %|
+----------------+-----+-----+
|         nominal|65279|70.84|
|             low|18829|20.43|
|            high| 8036| 8.72|
+----------------+-----+-----+



In [15]:
low = 40
high = 100

modis_df = spark.read.format("csv") \
        .option("header", True) \
        .option("inferSchema", True) \
        .load('data/' + modis_file) \
        .withColumn("acq_time_min", F.expr("acq_time % 100")) \
        .withColumn("acq_time_hr", F.expr("int(acq_time / 100)")) \
        .withColumn("acq_time2", F.unix_timestamp(F.col("acq_date"))) \
        .withColumn("acq_time3", F.expr("acq_time2 + acq_time_min * 60 + acq_time_hr * 3600")) \
        .withColumn("acq_datetime", F.from_unixtime(F.col("acq_time3"))) \
        .drop("acq_date", "acq_time", "acq_time_min", "acq_time_hr", "acq_time2", "acq_time3") \
        .withColumn("confidence_level", F.when(F.col("confidence") <= low, "low")
                    .when((F.col("confidence") > low) & (F.col("confidence") < high), "nominal")
                    .when(F.isnull(F.col("confidence")), "high")
                    .otherwise(F.col("confidence"))) \
        .drop("confidence") \
        .withColumn("bright_ti4", F.lit(None)) \
        .withColumn("bright_ti5", F.lit(None))

modis_df.show()
modis_df.printSchema()

+--------+---------+----------+----+-----+---------+-------+----------+----+--------+------------+----------------+----------+----------+
|latitude|longitude|brightness|scan|track|satellite|version|bright_t31| frp|daynight|acq_datetime|confidence_level|bright_ti4|bright_ti5|
+--------+---------+----------+----+-----+---------+-------+----------+----+--------+------------+----------------+----------+----------+
| -12.073|   26.406|     301.3| 2.4|  1.5|        A| 6.0NRT|     283.5|23.3|       N|        null|             low|      null|      null|
| -15.123|   22.926|     301.0| 1.5|  1.2|        A| 6.0NRT|     284.7|11.4|       N|        null|             low|      null|      null|
| -17.516|   18.274|     301.6| 1.0|  1.0|        A| 6.0NRT|     291.0| 4.7|       N|        null|         nominal|      null|      null|
| -18.863|   16.885|     315.5| 1.0|  1.0|        A| 6.0NRT|     294.9|12.4|       N|        null|             low|      null|      null|
| -18.865|   16.894|     325.3| 1.

In [16]:
df = modis_df.groupBy("confidence_level").count()
count = modis_df.count()
df = df.withColumn("%", F.round(F.expr("100 / {} * count".format(count)), 2))
df.show()

+----------------+-----+-----+
|confidence_level|count|    %|
+----------------+-----+-----+
|         nominal|15963|83.26|
|             low| 1949|10.17|
|             100| 1261| 6.58|
+----------------+-----+-----+



In [17]:
combined_df = viirs_df2.unionByName(modis_df)
combined_df.show()

+---------+---------+----------+----+-----+---------+----------------+-------+----------+---+--------+-------------------+----------+----------+
| latitude|longitude|bright_ti4|scan|track|satellite|confidence_level|version|bright_ti5|frp|daynight|       acq_datetime|brightness|bright_t31|
+---------+---------+----------+----+-----+---------+----------------+-------+----------+---+--------+-------------------+----------+----------+
|-30.53877| 28.98748|     326.5|0.63| 0.72|        N|         nominal| 1.0NRT|     286.0|4.7|       N|2020-10-18 00:06:00|      null|      null|
|-29.63855|  17.8575|     313.3|0.44| 0.39|        N|         nominal| 1.0NRT|     285.4|1.1|       N|2020-10-18 00:06:00|      null|      null|
|-30.74109| 29.99058|     295.7|0.71| 0.75|        N|         nominal| 1.0NRT|     284.7|1.1|       N|2020-10-18 00:06:00|      null|      null|
|-30.74395| 29.99385|     298.4|0.72| 0.75|        N|         nominal| 1.0NRT|     284.9|1.1|       N|2020-10-18 00:06:00|      nu

In [18]:
combined_df.count()

111317

In [19]:
count

19173

In [20]:
combined_df.rdd.getNumPartitions()

3

In [21]:
combined_df.write.format("parquet") \
        .mode("overwrite") \
        .save("data/fires_parquet")

output_df = combined_df.filter("confidence_level = 'high'") \
 .repartition(1)

output_df.write.format("csv") \
        .option("header", True) \
        .mode("overwrite") \
        .save("data/high_confidence_fires_csv")