In [3]:
%%configure -f
{
    "conf": {
        "spark.jars.packages": "org.apache.spark:spark-sql-kafka-0-10_2.11:2.2.0,harsha2010:magellan:1.0.5-s_2.11,com.esri.geometry:esri-geometry-api:1.2.1,commons-io:commons-io:2.6,org.apache.spark:spark-streaming_2.11:2.2.0,org.apache.spark:spark-sql_2.11:2.2.0",
        "spark.jars.excludes": "org.scala-lang:scala-reflect,org.apache.spark:spark-tags_2.11",
        "spark.dynamicAllocation.enabled": false
    }
}

In [4]:
/**
 * @Description: a spatial join based on the filter-and-refine approach
 *               (originally written for NYC taxicab data; adapted in this notebook to aerosol point data)
 * @author: Isam Al Jawarneh
 * @date: 02/02/2019
 * last update: 14/04/2021
 */

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1619184318710_0004,spark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [5]:
sc.version

res3: String = 2.2.0.2.6.3.84-1

In [6]:
import util.control.Breaks._
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.util.random.XORShiftRandom
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types._
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.apache.spark.sql.SQLImplicits
import org.apache.spark.sql.functions.from_json
import org.apache.spark.sql.functions._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.ForeachWriter
import magellan._
import magellan.index.ZOrderCurve
import magellan.{Point, Polygon}

import org.apache.spark.sql.magellan.dsl.expressions._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{
  DoubleType,
  StringType,
  StructField,
  StructType
}
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.functions.{collect_list, collect_set}
import org.apache.spark.sql.SQLContext
import org.apache.log4j.{Level, Logger}
import scala.collection.mutable
import scala.concurrent.duration.Duration
import java.io.{BufferedWriter, FileWriter}
import org.apache.commons.io.FileUtils
import java.io.File
import scala.collection.mutable.ListBuffer
import java.time.Instant
import org.apache.spark.util.CollectionAccumulator
import org.apache.spark.sql.DataFrame

import org.apache.spark.sql.DataFrame

In [7]:
/////////////////////////////
/// Definition of schemas ///
/////////////////////////////

In [9]:
// Explicit schema for the aerosol CSV files: the coordinates and the `Value` column are
// doubles, while the date, time and short name are kept as plain strings.
val aerosolDataSchema = (new StructType()
    .add("Latitude", DoubleType, nullable = false)
    .add("Longitude", DoubleType, nullable = false)
    .add("Value", DoubleType, nullable = false)
    .add("dataDate", StringType, nullable = false)
    .add("time", StringType, nullable = false)
    .add("shortName", StringType, nullable = false))

aerosolDataSchema: org.apache.spark.sql.types.StructType = StructType(StructField(Latitude,DoubleType,false), StructField(Longitude,DoubleType,false), StructField(Value,DoubleType,false), StructField(dataDate,StringType,false), StructField(time,StringType,false), StructField(shortName,StringType,false))

In [10]:
/////////////////////////////
///// Import Dataframes /////
/////////////////////////////

In [17]:
//"wasb[s]://<BlobStorageContainerName>@<StorageAccountName>.blob.core.windows.net/<path>"
// Load every file under cams_data/ with the explicit schema, then:
//   1. combine dataDate and time into a single "timestamp" column (pattern "yyyyMMdd HHmm"),
//   2. build a magellan point from (Longitude, Latitude),
//   3. drop the raw columns that are no longer needed.
// NOTE(review): the printed schema shows "timestamp" as nullable — presumably values that do
// not match the pattern end up null; confirm against the raw `time` values.
val aerosolData = {
    // `.csv(...)` already selects the CSV format, so no separate `.format("csv")` is needed.
    val rawAerosolCsv = (spark.read
        .option("header", "true")
        .schema(aerosolDataSchema)
        .csv("wasbs://sspark-2021-04-23t13-18-44-008z@ssparkhdistorage.blob.core.windows.net/cams_data/*"))

    val dateAndTime = concat($"dataDate", lit(" "), $"time")

    (rawAerosolCsv
        .withColumn("timestamp", to_timestamp(dateAndTime, "yyyyMMdd HHmm"))
        .withColumn("Point", point($"Longitude", $"Latitude"))
        .drop("Longitude", "Latitude", "dataDate", "time"))
}

aerosolData: org.apache.spark.sql.DataFrame = [Value: double, shortName: string ... 2 more fields]

In [19]:
// aerosolData.show()

root
 |-- Value: double (nullable = true)
 |-- shortName: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- Point: point (nullable = false)

In [12]:
//////////////////
/// Geohashing ///
//////////////////

In [13]:
// User-defined function that maps a sequence of Z-order curve cells to their base-32
// string form (the geohash of each cell), keeping the input order.
val geohashUDF = udf((curves: Seq[ZOrderCurve]) => curves.map(cell => cell.toBase32()))

geohashUDF: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,ArrayType(StringType,true),Some(List(ArrayType(org.apache.spark.sql.types.ZOrderCurveUDT@3cec181,true))))

In [14]:
// Precision passed to magellan's `index` when the geometries are indexed on the Z-order curve.
val precision: Int = 30

precision: Int = 30

In [15]:
// Index every aerosol point on the Z-order curve at the configured precision and attach,
// as an array column, the base-32 geohash of each resulting curve cell.
val geohashedAerosolData = (aerosolData
                         .withColumn("index", $"point" index  precision)
                         .withColumn("geohashArray1", geohashUDF($"index.curve")))
// Produce one row per geohash in a new "geohash" column, keeping all existing columns.
// Uses functions.explode instead of Dataset.explode, which is deprecated since Spark 2.0.
val explodedGeohashedAerosolData = (geohashedAerosolData
                                 .withColumn("geohash", explode($"geohashArray1")))

explodedGeohashedAerosolData: org.apache.spark.sql.DataFrame = [Value: double, dataDate: date ... 6 more fields]

In [16]:
// Preview the first two rows without truncating long cell values.
explodedGeohashedAerosolData.show(numRows = 2, truncate = false)

+---------------+----------+-------------------+---------+------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+-------+
|Value          |dataDate  |time               |shortName|Point             |index                                                                                                                                                 |geohashArray1|geohash|
+---------------+----------+-------------------+---------+------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+-------+
|8.0796069835E-9|2020-01-01|1970-01-01 12:00:00|pm10     |Point(43.0, 11.5) |[[ZOrderCurve(42.989501953125, 11.4971923828125, 43.00048828125, 11.502685546875, 30, -4348063485599416320, 110000111010100010010011010011),Contains]]|[sfn96m]     |sfn96

In [None]:
// To be substituted with the city we choose for the presentation

// val rawCity= (spark.read.format("magellan")
//                   .option("type", "geojson")
//                  // .load("wasbs://sspark-2021-04-17t10-30-16-344z@ssparkhdistorage.blob.core.windows.net/bologna/")
//                   .select($"polygon", $"metadata"("NOME_COM").as("City_Name")).cache()
//                   )
// val city = (rawCity
//                .withColumn("index", $"polygon" index  precision)
//                .select($"polygon", $"index", $"City_Name")
//                .cache())
// val zorderIndexedCity = (city
//                             .withColumn("index", explode($"index"))
//                             .select("polygon", "index.curve", "index.relation","City_Name")
//                           )
// val geohashedCity = city.withColumn("geohashArray", geohashUDF($"index.curve"))
// val explodedGeohashedCity = geohashedCity.explode("geohashArray", "geohash") { a: mutable.WrappedArray[String] => a }
// explodedGeohashedCity.count()

In [None]:
// Filter-and-refine spatial join of the geohashed aerosol points with the exploded,
// geohashed city polygons:
//   - filter: equi-join on the "geohash" column that both sides share;
//   - refine: `$"point" within $"polygon"` keeps only points that really fall inside the
//     polygon, removing edge cases / false positives left by the geohash match (described by
//     the original author as the brute-force ray-casting step).
// Joining through Seq("geohash") yields a single "geohash" column in the result; joining on
// an equality expression would keep one copy per side and make later references ambiguous.
// NOTE(review): explodedGeohashedCity is currently only defined in commented-out code; it must
// be defined before this cell can run.
// NOTE(review): both inputs appear to carry an "index" column as well, which stays duplicated
// in the result — drop or rename one side if it is needed downstream.
val aerosolDataInCity = (explodedGeohashedCity
                         .join(explodedGeohashedAerosolData, Seq("geohash"))
                         .where($"point" within $"polygon")
                        )
aerosolDataInCity.show(3)

In [None]:
aerosolDataInCity.columns

In [None]:
// val airDataWithParameters = (airData
//                              .join(
//                                  parametersRegistry,
//                                  airData("Parameter_Id") === parametersRegistry("Parameter_Id")))
// airDataWithParameters.show(2)

In [None]:
// val finalTable = (airDataWithParameters
//                   .join(stationsInEmiliaRomagna,
//                         airDataWithParameters("Station_Code") === stationsInEmiliaRomagna("Station_Code")))
// finalTable.show(3)

In [None]:
// finalTable.printSchema()