In [1]:
# Create the Spark Session
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("btc_streamer") \
    .config("spark.streaming.stopGracefullyOnShutdown", True) \
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0') \
    .config("spark.sql.shuffle.partitions", 1) \
    .master("local[*]") \
    .getOrCreate()

:: loading settings :: url = jar:file:/Users/michieldekoninck/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/michieldekoninck/.ivy2/cache
The jars for the packages stored in: /Users/michieldekoninck/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f2af2ac9-86ad-47ac-830c-16573cc25835;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.3.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.3.0 in central
	found org.apache.kafka#kafka-clients;2.8.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.32 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.2 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.2 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.1

In [2]:
spark

In [4]:
streaming_df = spark.readStream\
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "bitcoin") \
    .option("startingOffsets", "earliest") \
    .load()

In [6]:
from pyspark.sql.types import StringType, StructField, StructType, ArrayType, DoubleType, FloatType, IntegerType, LongType
from pyspark.sql.functions import from_json
from pyspark.sql.functions import explode, col


schema = StructType([
    StructField("price", DoubleType(), True),
    StructField("volume_24h", DoubleType(), True),
    StructField("volume_24h_change_24h", DoubleType(), True),
    StructField("market_cap", LongType(), True),
    StructField("market_cap_change_24h", DoubleType(), True),
    StructField("percent_change_15m", DoubleType(), True),
    StructField("percent_change_30m", DoubleType(), True),
    StructField("percent_change_1h", DoubleType(), True),
    StructField("percent_change_6h", DoubleType(), True),
    StructField("percent_change_12h", DoubleType(), True),
    StructField("percent_change_24h", DoubleType(), True),
    StructField("percent_change_7d", DoubleType(), True),
    StructField("percent_change_30d", DoubleType(), True),
    StructField("percent_change_1y", DoubleType(), True),
    StructField("ath_price", DoubleType(), True),
    StructField("ath_date", StringType(), True),
    StructField("percent_from_price_ath", DoubleType(), True),
    StructField("symbol", StringType(), True),
    StructField("beta_value", DoubleType(), True)
])

# Cast the value from binary to string, since Kafka sends messages as bytes
kafka_df =streaming_df.selectExpr("CAST(value AS STRING) as json_string")

# Parse the JSON string in the 'value' column using the defined schema
json_df = kafka_df.withColumn("json_data", from_json(col("json_string"), schema))

json_df

DataFrame[json_string: string, json_data: struct<price:double,volume_24h:double,volume_24h_change_24h:double,market_cap:bigint,market_cap_change_24h:double,percent_change_15m:double,percent_change_30m:double,percent_change_1h:double,percent_change_6h:double,percent_change_12h:double,percent_change_24h:double,percent_change_7d:double,percent_change_30d:double,percent_change_1y:double,ath_price:double,ath_date:string,percent_from_price_ath:double,symbol:string,beta_value:double>]

In [None]:
# Parse the JSON string in the 'value' column using the defined schema
json_df = kafka_df.withColumn("json_data", from_json(col("json_string"), schema))

# Extract specific fields from the JSON
extracted_df = json_df.select(
    col("json_data.price"),
    col("json_data.volume_24h"),
    col("json_data.market_cap"),
    col("json_data.symbol"),
    col("json_data.percent_change_24h")
)

# Write the output to console (for debugging) or further processing
query = extracted_df.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

# Await termination of the streaming query
query.awaitTermination()

In [76]:
streaming_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [23]:
import os, requests

url = f"https://api.coinpaprika.com/v1/tickers/btc-bitcoin"
    
headers = {
    'Accept-Encoding': 'gzip',
    'Authorization': f'Bearer {os.getenv("COINCAP_API_KEY")}',
}
    
response = requests.get(url, headers=headers)
data = response.json()

In [24]:
data

{'id': 'btc-bitcoin',
 'name': 'Bitcoin',
 'symbol': 'BTC',
 'rank': 1,
 'total_supply': 19758244,
 'max_supply': 21000000,
 'beta_value': 0.979908,
 'first_data_at': '2010-07-17T00:00:00Z',
 'last_updated': '2024-09-24T08:36:35Z',
 'quotes': {'USD': {'price': 63669.604706256774,
   'volume_24h': 27957015512.798916,
   'volume_24h_change_24h': -4.82,
   'market_cap': 1258000158196,
   'market_cap_change_24h': 0.07,
   'percent_change_15m': -0.07,
   'percent_change_30m': 0.08,
   'percent_change_1h': 0.38,
   'percent_change_6h': 0.94,
   'percent_change_12h': 0.4,
   'percent_change_24h': 0.07,
   'percent_change_7d': 8.3,
   'percent_change_30d': -0.23,
   'percent_change_1y': 141.8,
   'ath_price': 73686.92856165291,
   'ath_date': '2024-03-14T07:07:09Z',
   'percent_from_price_ath': -13.59}}}

24/09/24 10:42:09 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

In [22]:
diction = data['quotes']['USD'] 
diction['symbol'] = data['symbol']
diction['beta_value'] = data['beta_value']
diction['timestamp'] = data['last_updated']
del diction['ath_price']
del diction['ath_date']
del diction['percent_from_price_ath']
diction

{'price': 63723.96699995236,
 'volume_24h': 33554610549.493538,
 'volume_24h_change_24h': 114.41,
 'market_cap': 1259052978344,
 'market_cap_change_24h': 1.42,
 'percent_change_15m': 0.19,
 'percent_change_30m': 0.37,
 'percent_change_1h': 0.78,
 'percent_change_6h': 0.22,
 'percent_change_12h': -1,
 'percent_change_24h': 1.41,
 'percent_change_7d': 10.2,
 'percent_change_30d': -1.26,
 'percent_change_1y': 142.44,
 'symbol': 'BTC',
 'beta_value': 0.980327,
 'timestamp': '2024-09-23T15:55:36Z'}

24/09/24 07:02:52 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 2352345 ms exceeds timeout 120000 ms
24/09/24 07:02:52 WARN SparkContext: Killing executors is not supported by current scheduler.
24/09/24 07:51:12 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$

In [91]:
from pyspark.sql.types import StringType, StructField, StructType, ArrayType, DoubleType, FloatType, IntegerType, LongType
json_schema = StructType([StructField('data', ArrayType(StructType([ \
StructField('price', FloatType(), nullable=False), \
StructField('volume_24h', DoubleType(), nullable=False), \
StructField('volume_24h_change_24h', FloatType(), nullable=False), \
StructField('market_cap', IntegerType(), nullable=False),\
StructField('market_cap_change_24h', FloatType(), nullable=False), \
StructField('percent_change_15m', FloatType(), nullable=False), \
StructField('percent_change_30m', FloatType(), nullable=False), \
StructField('percent_change_1h', FloatType(), nullable=False), \
StructField('percent_change_6h', FloatType(), nullable=False), \
StructField('percent_change_12h', IntegerType(), nullable=False),\
StructField('percent_change_24h', FloatType(), nullable=False), \
StructField('percent_change_7d', FloatType(), nullable=False), \
StructField('percent_change_30d', FloatType(), nullable=False), \
StructField('percent_change_1y', FloatType(), nullable=False), \
StructField('BTC', StringType(), nullable=False), \
StructField('beta_value', FloatType(), nullable=False)])),nullable=False)])

In [151]:
# Parse value from binay to string
json_df = streaming_df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

json_df.printSchema()


root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



In [140]:
from pyspark.sql.functions import from_json

json_expanded_df = json_df.withColumn("value", from_json("value", json_schema)).select("value.*") 

json_expanded_df.printSchema()

root
 |-- data: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- price: float (nullable = true)
 |    |    |-- volume_24h: double (nullable = true)
 |    |    |-- volume_24h_change_24h: float (nullable = true)
 |    |    |-- market_cap: integer (nullable = true)
 |    |    |-- market_cap_change_24h: float (nullable = true)
 |    |    |-- percent_change_15m: float (nullable = true)
 |    |    |-- percent_change_30m: float (nullable = true)
 |    |    |-- percent_change_1h: float (nullable = true)
 |    |    |-- percent_change_6h: float (nullable = true)
 |    |    |-- percent_change_12h: integer (nullable = true)
 |    |    |-- percent_change_24h: float (nullable = true)
 |    |    |-- percent_change_7d: float (nullable = true)
 |    |    |-- percent_change_30d: float (nullable = true)
 |    |    |-- percent_change_1y: float (nullable = true)
 |    |    |-- BTC: string (nullable = true)
 |    |    |-- beta_value: float (nullable = true)



AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `price` cannot be resolved. Did you mean one of the following? [`data`].;
'Project ['price]
+- Project [value#561.data AS data#564]
   +- Project [key#539, from_json(StructField(data,ArrayType(StructType(StructField(price,FloatType,false),StructField(volume_24h,DoubleType,false),StructField(volume_24h_change_24h,FloatType,false),StructField(market_cap,IntegerType,false),StructField(market_cap_change_24h,FloatType,false),StructField(percent_change_15m,FloatType,false),StructField(percent_change_30m,FloatType,false),StructField(percent_change_1h,FloatType,false),StructField(percent_change_6h,FloatType,false),StructField(percent_change_12h,IntegerType,false),StructField(percent_change_24h,FloatType,false),StructField(percent_change_7d,FloatType,false),StructField(percent_change_30d,FloatType,false),StructField(percent_change_1y,FloatType,false),StructField(BTC,StringType,false),StructField(beta_value,FloatType,false)),true),false), value#540, Some(Europe/Brussels)) AS value#561]
      +- Project [cast(key#311 as string) AS key#539, cast(value#312 as string) AS value#540]
         +- StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@5dc0e718, kafka, org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@530702d0, [startingOffsets=earliest, kafka.bootstrap.servers=localhost:9092, subscribe=bitcoin], [key#311, value#312, topic#313, partition#314, offset#315L, timestamp#316, timestampType#317], StreamingRelation DataSource(org.apache.spark.sql.SparkSession@329c251,kafka,List(),None,List(),None,Map(kafka.bootstrap.servers -> localhost:9092, subscribe -> bitcoin, startingOffsets -> earliest),None), kafka, [key#304, value#305, topic#306, partition#307, offset#308L, timestamp#309, timestampType#310]


In [142]:

# Apply Schema to JSON value column and expand the value
from pyspark.sql.functions import from_json

json_expanded_df = json_df.withColumn("value", from_json("value", json_schema)).select("value.*") 

json_expanded_df.printSchema()

root
 |-- data: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- price: float (nullable = true)
 |    |    |-- volume_24h: double (nullable = true)
 |    |    |-- volume_24h_change_24h: float (nullable = true)
 |    |    |-- market_cap: integer (nullable = true)
 |    |    |-- market_cap_change_24h: float (nullable = true)
 |    |    |-- percent_change_15m: float (nullable = true)
 |    |    |-- percent_change_30m: float (nullable = true)
 |    |    |-- percent_change_1h: float (nullable = true)
 |    |    |-- percent_change_6h: float (nullable = true)
 |    |    |-- percent_change_12h: integer (nullable = true)
 |    |    |-- percent_change_24h: float (nullable = true)
 |    |    |-- percent_change_7d: float (nullable = true)
 |    |    |-- percent_change_30d: float (nullable = true)
 |    |    |-- percent_change_1y: float (nullable = true)
 |    |    |-- BTC: string (nullable = true)
 |    |    |-- beta_value: float (nullable = true)



In [112]:
query = json_expanded_df \
    .writeStream \
    .format("console") \
    .option("checkpointLocation", "checkpoint_dir") \
    .start()

query.awaitTermination()

24/09/23 16:22:06 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/09/23 16:22:06 WARN StreamingQueryManager: Stopping existing streaming query [id=2c25dccb-fd05-4488-8951-8cf7ef297d58, runId=1656591d-74e3-4452-9b92-7afcf724396e], as a new run is being started.
24/09/23 16:22:06 WARN OffsetSeqMetadata: Updating the value of conf 'spark.sql.shuffle.partitions' in current session from '1' to '4'.
24/09/23 16:22:06 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
24/09/23 16:22:06 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
24/09/23 16:22:06 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
24/09/23 16:22:06 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known config.
24/09/23 16:22:06 WARN AdminClientConfig: The 

-------------------------------------------
Batch: 16
-------------------------------------------
+----+
|data|
+----+
|NULL|
+----+



ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/Users/michieldekoninck/.pyenv/versions/3.10.6/envs/kafka_streamer/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/michieldekoninck/.pyenv/versions/3.10.6/envs/kafka_streamer/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/michieldekoninck/.pyenv/versions/3.10.6/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [66]:
from pyspark.sql.functions import explode, col


exploded_df = json_expanded_df.select('data').withColumn("data", explode("data"))
exploded_df.printSchema()

root
 |-- data: struct (nullable = true)
 |    |-- price: float (nullable = true)
 |    |-- volume_24h: double (nullable = true)
 |    |-- volume_24h_change_24h: float (nullable = true)
 |    |-- market_cap: integer (nullable = true)
 |    |-- market_cap_change_24h: float (nullable = true)
 |    |-- percent_change_15m: float (nullable = true)
 |    |-- percent_change_30m: float (nullable = true)
 |    |-- percent_change_1h: float (nullable = true)
 |    |-- percent_change_6h: float (nullable = true)
 |    |-- percent_change_12h: integer (nullable = true)
 |    |-- percent_change_24h: float (nullable = true)
 |    |-- percent_change_7d: float (nullable = true)
 |    |-- percent_change_30d: float (nullable = true)
 |    |-- percent_change_1y: float (nullable = true)
 |    |-- BTC: string (nullable = true)
 |    |-- beta_value: float (nullable = true)



In [67]:
flattened_df = exploded_df \
    .selectExpr("data.price as price", "data.volume_24h as volume_24h", 
                "data.volume_24h_change_24h as volume_24h_change_24h", "data.market_cap as market_cap",
                "data.market_cap_change_24h as market_cap_change_24h", "data.percent_change_15m as percent_change_15m",
                "data.percent_change_30m as percent_change_30m", "data.percent_change_1h as percent_change_1h",
                "data.percent_change_6h as percent_change_6h", "data.percent_change_12h as percent_change_12h",
                "data.percent_change_24h as percent_change_24h", "data.percent_change_7d as percent_change_7d",
                "data.percent_change_30d as percent_change_30d", "data.percent_change_1y as percent_change_1y",
                "data.BTC as BTC", "data.beta_value as beta_value") 
    
flattened_df.printSchema()

root
 |-- price: float (nullable = true)
 |-- volume_24h: double (nullable = true)
 |-- volume_24h_change_24h: float (nullable = true)
 |-- market_cap: integer (nullable = true)
 |-- market_cap_change_24h: float (nullable = true)
 |-- percent_change_15m: float (nullable = true)
 |-- percent_change_30m: float (nullable = true)
 |-- percent_change_1h: float (nullable = true)
 |-- percent_change_6h: float (nullable = true)
 |-- percent_change_12h: integer (nullable = true)
 |-- percent_change_24h: float (nullable = true)
 |-- percent_change_7d: float (nullable = true)
 |-- percent_change_30d: float (nullable = true)
 |-- percent_change_1y: float (nullable = true)
 |-- BTC: string (nullable = true)
 |-- beta_value: float (nullable = true)



-------------------------------------------
Batch: 10
-------------------------------------------
+----+
|data|
+----+
|NULL|
+----+



In [59]:
# df = spark\
#       .readStream \
#       .format("kafka") \
#       .option("kafka.bootstrap.servers", "localhost:9092") \
#       .option("subscribe", "bitcoin") \
#       .option("startingOffsets", "earliest") \
#       .load()
      
query = flattened_df \
    .writeStream \
    .format("console") \
    .option("checkpointLocation", "checkpoint_dir") \
    .outputMode("complete") \
    .start()

query.awaitTermination()

24/09/23 15:51:15 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


AnalysisException: Complete output mode not supported when there are no streaming aggregations on streaming DataFrames/Datasets;
Project [data#691.price AS price#693, data#691.volume_24h AS volume_24h#694, data#691.volume_24h_change_24h AS volume_24h_change_24h#695, data#691.market_cap AS market_cap#696, data#691.market_cap_change_24h AS market_cap_change_24h#697, data#691.percent_change_15m AS percent_change_15m#698, data#691.percent_change_30m AS percent_change_30m#699, data#691.percent_change_1h AS percent_change_1h#700, data#691.percent_change_6h AS percent_change_6h#701, data#691.percent_change_12h AS percent_change_12h#702, data#691.percent_change_24h AS percent_change_24h#703, data#691.percent_change_7d AS percent_change_7d#704, data#691.percent_change_30d AS percent_change_30d#705, data#691.percent_change_1y AS percent_change_1y#706, data#691.BTC AS BTC#707, data#691.beta_value AS beta_value#708]
+- Project [data#691]
   +- Generate explode(data#663), false, [data#691]
      +- Project [data#663]
         +- Project [value#661.data AS data#663]
            +- Project [from_json(StructField(data,ArrayType(StructType(StructField(price,FloatType,true),StructField(volume_24h,DoubleType,true),StructField(volume_24h_change_24h,FloatType,true),StructField(market_cap,IntegerType,true),StructField(market_cap_change_24h,FloatType,true),StructField(percent_change_15m,FloatType,true),StructField(percent_change_30m,FloatType,true),StructField(percent_change_1h,FloatType,true),StructField(percent_change_6h,FloatType,true),StructField(percent_change_12h,IntegerType,true),StructField(percent_change_24h,FloatType,true),StructField(percent_change_7d,FloatType,true),StructField(percent_change_30d,FloatType,true),StructField(percent_change_1y,FloatType,true),StructField(BTC,StringType,true),StructField(beta_value,FloatType,true)),true),true), value#613, Some(Europe/Brussels)) AS value#661]
               +- Project [cast(value#544 as string) AS value#613]
                  +- StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@3f78339c, kafka, org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@31bfda03, [startingOffsets=earliest, kafka.bootstrap.servers=localhost:9092, subscribe=bitcoin], [key#543, value#544, topic#545, partition#546, offset#547L, timestamp#548, timestampType#549], StreamingRelation DataSource(org.apache.spark.sql.SparkSession@16eed49e,kafka,List(),None,List(),None,Map(kafka.bootstrap.servers -> localhost:9092, subscribe -> bitcoin, startingOffsets -> earliest),None), kafka, [key#536, value#537, topic#538, partition#539, offset#540L, timestamp#541, timestampType#542]


-------------------------------------------
Batch: 9
-------------------------------------------
+-----+----------+---------------------+----------+---------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+-----------------+---+----------+
|price|volume_24h|volume_24h_change_24h|market_cap|market_cap_change_24h|percent_change_15m|percent_change_30m|percent_change_1h|percent_change_6h|percent_change_12h|percent_change_24h|percent_change_7d|percent_change_30d|percent_change_1y|BTC|beta_value|
+-----+----------+---------------------+----------+---------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+-----------------+---+----------+
+-----+----------+---------------------+----------+---------------------+------------------+------------------+-----------------+------

In [37]:
df = spark\
      .readStream \
      .format("kafka") \
      .option("kafka.bootstrap.servers", "localhost:9092") \
      .option("subscribe", "bitcoin") \
      .option("startingOffsets", "earliest") \
      .load()
      
query = flattened_df \
    .writeStream \
    .format("console") \
    .option("checkpointLocation", "path/to/HDFS/dir") \
    .start()

query.awaitTermination()

24/09/23 15:34:40 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/09/23 15:34:40 WARN StreamingQueryManager: Stopping existing streaming query [id=fea4c1ab-b277-430a-a538-0f5cc54a7dca, runId=d6df1f01-e184-4963-b051-c190422d2429], as a new run is being started.
24/09/23 15:34:40 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
24/09/23 15:34:40 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
24/09/23 15:34:40 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
24/09/23 15:34:40 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known config.
24/09/23 15:34:40 WARN AdminClientConfig: The configuration 'auto.offset.reset' was supplied but isn't a known config.


-------------------------------------------
Batch: 1
-------------------------------------------
+-----+----------+---------------------+----------+---------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+-----------------+---+----------+
|price|volume_24h|volume_24h_change_24h|market_cap|market_cap_change_24h|percent_change_15m|percent_change_30m|percent_change_1h|percent_change_6h|percent_change_12h|percent_change_24h|percent_change_7d|percent_change_30d|percent_change_1y|BTC|beta_value|
+-----+----------+---------------------+----------+---------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+-----------------+---+----------+
+-----+----------+---------------------+----------+---------------------+------------------+------------------+-----------------+------

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/Users/michieldekoninck/.pyenv/versions/3.10.6/envs/kafka_streamer/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/michieldekoninck/.pyenv/versions/3.10.6/envs/kafka_streamer/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/michieldekoninck/.pyenv/versions/3.10.6/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 