In [None]:
%%configure -f
{
    "conf": {
        "spark.jars": "s3://shiheng-poc/jars/.ivy2/jars/org.apache.spark_spark-streaming-kinesis-asl_2.11-2.4.0.jar,s3://shiheng-poc/jars/.ivy2/jars/com.amazonaws_aws-java-sdk-core-1.11.271.jar,s3://shiheng-poc/jars/.ivy2/jars/com.amazonaws_aws-java-sdk-s3-1.11.271.jar,s3://shiheng-poc/jars/.ivy2/jars/com.amazonaws_amazon-kinesis-client-1.8.10.jar"
    }
}

In [None]:
//         "spark.jars.ivy": "/home/hadoop/.ivy2",
//         "spark.jars.packages": "org.apache.spark:spark-streaming-kinesis-asl_2.11:2.4.0"
//         "spark.executor.extraClassPath": "s3://shiheng-poc/jars/.ivy2/jars/",


In [2]:
import com.amazonaws.auth.DefaultAWSCredentialsProviderChain
import com.amazonaws.services.kinesis.AmazonKinesisClient
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kinesis.KinesisInitialPositions.Latest
import org.apache.spark.streaming.kinesis.KinesisInputDStream
import org.apache.spark.streaming._
import scala.util.parsing.json._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{StructType,ArrayType}
import org.joda.time.DateTime

def flattenDataframe(df: DataFrame): DataFrame = {
    val fields = df.schema.fields
    val fieldNames = fields.map(x => x.name)
    val length = fields.length
    for(i <- 0 to fields.length-1){
        val field = fields(i)
        val fieldtype = field.dataType
        val fieldName = field.name
        fieldtype match {
            case arrayType: ArrayType =>
                val fieldNamesExcludingArray = fieldNames.filter(_!=fieldName)
                val fieldNamesAndExplode = fieldNamesExcludingArray ++ Array(s"explode_outer($fieldName) as $fieldName")
                // val fieldNamesToSelect = (fieldNamesExcludingArray ++ Array(s"$fieldName.*"))
                val explodedDf = df.selectExpr(fieldNamesAndExplode:_*)
                return flattenDataframe(explodedDf)
            case structType: StructType =>
                val childFieldnames = structType.fieldNames.map(childname => fieldName +"."+childname)
                val newfieldNames = fieldNames.filter(_!= fieldName) ++ childFieldnames
                val renamedcols = newfieldNames.map(x => (col(x.toString()).as(x.toString().replace(".", "_"))))
                val explodedf = df.select(renamedcols:_*)
                return flattenDataframe(explodedf)
            case _ =>
            }
        }
        df
    }

val unixToDT = udf{(ordertime:Long) => new DateTime(ordertime * 1000).toDateTime.toString("yyyyMMddHHmm").toLong}

val appName     = "shiHengKinesisSparkApp"
val streamName  = "shiheng-orders"
val endpointUrl = "https://kinesis.cn-northwest-1.amazonaws.com.cn"
val regionName  = "cn-northwest-1"

val credentials = new DefaultAWSCredentialsProviderChain().getCredentials()
require(credentials != null, "No AWS credentials found. Please specify credentials using one of the methods specified " +
  "in http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/credentials.html")
val kinesisClient = new AmazonKinesisClient(credentials)
kinesisClient.setEndpoint(endpointUrl)
val numShards = kinesisClient.describeStream(streamName).getStreamDescription.getShards().size()

val numStreams = numShards

// Spark Streaming interval
val batchInterval = Milliseconds(5000)
val kinesisCheckpointInterval = batchInterval

// val regionName = getRegion.getRegionNameByEndpoint(endpointUrl)
val sparkConf   = sc //new SparkConf().setAppName(appName)
//.setMaster("local[*]")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
59,application_1573529792134_0087,spark,idle,Link,Link,✔


SparkSession available as 'spark'.
import com.amazonaws.auth.DefaultAWSCredentialsProviderChain
import com.amazonaws.services.kinesis.AmazonKinesisClient
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kinesis.KinesisInitialPositions.Latest
import org.apache.spark.streaming.kinesis.KinesisInputDStream
import org.apache.spark.streaming._
import scala.util.parsing.json._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{StructType, ArrayType}
import org.joda.time.DateTime
flattenDataframe: (df: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame
unixToDT: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,LongType,Some(List(LongType)))
appName: String = shiHengKinesisSparkApp
streamName: String = shiheng-orders
endpointUrl: String = https://kine

In [4]:

val ssc = new StreamingContext(sparkConf, batchInterval)
// ssc.checkpoint("/tmp/checkpoint") // no checkpoint can be set, or you'll get error

val kinesisStreams = (0 until numStreams).map { i =>
  KinesisInputDStream.builder
    .streamingContext(ssc)
    .streamName(streamName)
    .endpointUrl(endpointUrl)
    .regionName(regionName)
    .initialPosition(new Latest())
    .checkpointAppName(appName)
    .checkpointInterval(kinesisCheckpointInterval)
    .storageLevel(StorageLevel.MEMORY_AND_DISK_2)
    .build()
}

val unionStreams = ssc.union(kinesisStreams)
val sqlContext = SparkSession.builder().getOrCreate()

unionStreams.foreachRDD ((rdd: RDD[Array[Byte]], time: Time) => {
  println("**********************************************")
  println(rdd.count)
  println("rdd isempty:" + rdd.isEmpty)
  if (rdd.isEmpty()) {
    println("No data input!!!")
  } else {
    val lines   = rdd.map(byteArray => new String(byteArray))//.collect()//.toList
    val linesDF = sqlContext.read.json(lines)
    // linesDF.show()
    // linesDF.printSchema
    val linesFlattenDF = flattenDataframe(linesDF)
    val linesResultDF  = linesFlattenDF.withColumn("ot", unixToDT(linesFlattenDF("ordertime")))
    // linesResultDF.show()
    
    // set mode to append in case `analysisexception path already exists`
    linesResultDF.write.mode("append").partitionBy("ot").csv("s3://shiheng-poc/binc/parted-prod/")
  }
  println("**********************************************")
})

ssc: org.apache.spark.streaming.StreamingContext = org.apache.spark.streaming.StreamingContext@5da8199e
kinesisStreams: scala.collection.immutable.IndexedSeq[org.apache.spark.streaming.kinesis.KinesisInputDStream[Array[Byte]]] = Vector(org.apache.spark.streaming.kinesis.KinesisInputDStream@5f4d042f)
unionStreams: org.apache.spark.streaming.dstream.DStream[Array[Byte]] = org.apache.spark.streaming.dstream.UnionDStream@5fe43376
sqlContext: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@544b543c


In [None]:
ssc.start()
ssc.awaitTermination()

In [None]:
// to stop executing ssc, click Kernel -> Shutdown in the toolbar




### 下方为测试代码请忽略 ###




In [1]:
import spark.implicits._
val jsonStr = """{ "metadata": { "key": 84896, "value": 54 }},{ "metadata": { "key": 84896, "value": 54 }}"""
val df = spark.read.json(Seq(jsonStr).toDS)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
20,application_1573529792134_0036,spark,idle,Link,Link,✔


SparkSession available as 'spark'.
import spark.implicits._
jsonStr: String = { "metadata": { "key": 84896, "value": 54 }}
df: org.apache.spark.sql.DataFrame = [metadata: struct<key: bigint, value: bigint>]


In [2]:
df.show()

+-----------+
|   metadata|
+-----------+
|[84896, 54]|
+-----------+



In [9]:
val jsonStr = """{ "metadata": { "key": 84896, "value": 54 }}"""
val df = spark.read.json(Seq(jsonStr).toDS)
df.show()

jsonStr: String = { "metadata": { "key": 84896, "value": 54 }}
df: org.apache.spark.sql.DataFrame = [metadata: struct<key: bigint, value: bigint>]
+-----------+
|   metadata|
+-----------+
|[84896, 54]|
+-----------+



In [27]:
import org.apache.spark.sql.SparkSession
// val s_2 = jsonStr + "\n" + jsonStr
val events = sc.parallelize(
  """{"action":"create","timestamp":1452121277}""" ::
  """{"action":"create","timestamp":"1452121277"}""" ::
  """{"action":"create","timestamp":""}""" ::
  """{"action":"create","timestamp":null}""" ::
  """{"action":"create","timestamp":"null"}""" ::
  Nil
)
val sqlContext = SparkSession.builder().getOrCreate()
// val ssss = sc.parallelize(s_2)
val df = sqlContext.read.json(events)
df.show()

import org.apache.spark.sql.SparkSession
events: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[69] at parallelize at <console>:56
sqlContext: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@7e7af19b
df: org.apache.spark.sql.DataFrame = [action: string, timestamp: string]
+------+----------+
|action| timestamp|
+------+----------+
|create|1452121277|
|create|1452121277|
|create|          |
|create|      null|
|create|      null|
+------+----------+



In [31]:
var strList = List.empty[String]
var jsonString1 = """{"ID" : "111","NAME":"Arkay","LOC":"Pune"}"""
var jsonString2 = """{"ID" : "222","NAME":"DineshS","LOC":"PCMC"}"""
strList = strList :+ jsonString1
strList = strList :+ jsonString2
val pS = sc.parallelize(strList)
val df = sqlContext.read.json(pS)
df.show()

strList: List[String] = List()
jsonString1: String = {"ID" : "111","NAME":"Arkay","LOC":"Pune"}
jsonString2: String = {"ID" : "222","NAME":"DineshS","LOC":"PCMC"}
strList: List[String] = List({"ID" : "111","NAME":"Arkay","LOC":"Pune"})
strList: List[String] = List({"ID" : "111","NAME":"Arkay","LOC":"Pune"}, {"ID" : "222","NAME":"DineshS","LOC":"PCMC"})
pS: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[79] at parallelize at <console>:57
df: org.apache.spark.sql.DataFrame = [ID: string, LOC: string ... 1 more field]
+---+----+-------+
| ID| LOC|   NAME|
+---+----+-------+
|111|Pune|  Arkay|
|222|PCMC|DineshS|
+---+----+-------+



In [None]:
val a = 
  Array(
    Array("4580056797", "0", "2015-07-29 10:38:42", "0", "1", "1"), 
    Array("4580056797", "0", "2015-07-29 10:38:42", "0", "1", "1"))

val rdd = sc.makeRDD(a)

Starting Spark application
