In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType, TimestampType
from pyspark.sql.functions import col, lag, from_unixtime, round
from pyspark.sql.window import Window

# Step 1: Create Spark session
spark = SparkSession.builder \
    .appName("CryptoDataProcessing").config("spark.executor.memory", "2g").config("spark.sql.shuffle.partitions", "4").getOrCreate()

# Step 2: Define the schema
schema = StructType([
    StructField("price", DoubleType(), True),
    StructField("volume_24h", DoubleType(), True),
    StructField("volume_24h_change_24h", DoubleType(), True),
    StructField("market_cap", LongType(), True),
    StructField("market_cap_change_24h", DoubleType(), True),
    StructField("percent_change_15m", DoubleType(), True),
    StructField("percent_change_30m", DoubleType(), True),
    StructField("percent_change_1h", DoubleType(), True),
    StructField("percent_change_6h", DoubleType(), True),
    StructField("percent_change_12h", DoubleType(), True),
    StructField("percent_change_24h", DoubleType(), True),
    StructField("percent_change_7d", DoubleType(), True),
    StructField("percent_change_30d", DoubleType(), True),
    StructField("percent_change_1y", DoubleType(), True),
    StructField("symbol", StringType(), True),
    StructField("beta_value", DoubleType(), True),
    StructField("timestamp", TimestampType(), True)
])


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/25 15:10:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/25 15:10:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
import os
import re

file_list = os.listdir('../data')

# Regular expression to match a year (four consecutive digits)
year_pattern = re.compile(r'\d{4}')

# Filter files that contain a year in their names
files_with_years = sorted(['../data/' + file for file in file_list if year_pattern.search(file)])
files_with_years

['../data/BTC-2017min.csv',
 '../data/BTC-2018min.csv',
 '../data/BTC-2019min.csv',
 '../data/BTC-2020min.csv',
 '../data/BTC-2021min.csv']

In [3]:
# Step 3: Load the CSV file into a DataFrame
df = spark.read.csv(files_with_years, header=True, inferSchema=True)

                                                                                

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, last, window, from_unixtime, lag
from pyspark.sql.window import Window
from pyspark.sql.types import DoubleType, TimestampType


# Step 1: Convert unix time to timestamp
df = df.withColumn("timestamp", from_unixtime(col("unix")).cast(TimestampType())).orderBy(col("timestamp")).drop('date')

df.show(10)


[Stage 2:>                                                        (0 + 10) / 11]

+----------+-------+------+------+------+------+----------+------------------+-------------------+
|      unix| symbol|  open|  high|   low| close|Volume BTC|        Volume USD|          timestamp|
+----------+-------+------+------+------+------+----------+------------------+-------------------+
|1483228860|BTC/USD|966.34|966.34|966.34|966.34|      7.61| 7353.847400000001|2017-01-01 01:01:00|
|1483228920|BTC/USD|966.16|966.37|966.16|966.37|8.08737633| 7815.397864022099|2017-01-01 01:02:00|
|1483228980|BTC/USD|966.37|966.37|966.37|966.37|       0.0|               0.0|2017-01-01 01:03:00|
|1483229040|BTC/USD|966.37|966.37|966.37|966.37|       0.0|               0.0|2017-01-01 01:04:00|
|1483229100|BTC/USD|966.43|966.43|966.43|966.43|     0.107|103.40800999999999|2017-01-01 01:05:00|
|1483229160|BTC/USD|966.58|966.58|966.58|966.58|0.33173245|320.64595152100003|2017-01-01 01:06:00|
|1483229220|BTC/USD|966.58|966.58|966.58|966.58|       0.0|               0.0|2017-01-01 01:07:00|
|148322928

                                                                                

In [26]:
# Step 2: Resample the DataFrame to 5-minute intervals
df_5min = df.groupBy(window(col("timestamp"), "5 minutes").alias("time_window")) \
    .agg(
        last("close").alias("price"),          # Get the last close price within the 5-minute window
        _sum("Volume USD").alias("volume_5m")  # Sum of Volume USD for the 5-minute window
    ).orderBy(col("time_window.end"))

# Step 4: Select the desired columns and rename window to timestamp
df_final = df_5min.select(
    col("time_window.end").alias("timestamp"),  # Use the end of the 5-minute window as the timestamp
    col("price"),
    col("volume_5m").alias("Volume_USD"),
)

# Verify the 5-minute resampling
df_final.show(1500, truncate=False)




+-------------------+-------+------------------+
|timestamp          |price  |Volume_USD        |
+-------------------+-------+------------------+
|2017-01-01 01:05:00|966.37 |15169.2452640221  |
|2017-01-01 01:10:00|966.58 |424.053961521     |
|2017-01-01 01:15:00|965.55 |6434.2622184592   |
|2017-01-01 01:20:00|965.55 |20078.145180898802|
|2017-01-01 01:25:00|964.87 |647.7043203818    |
|2017-01-01 01:30:00|965.24 |6640.8512         |
|2017-01-01 01:35:00|965.24 |0.0               |
|2017-01-01 01:40:00|966.39 |5519.856963785    |
|2017-01-01 01:45:00|966.38 |10032.1113757011  |
|2017-01-01 01:50:00|966.97 |33706.5180215286  |
|2017-01-01 01:55:00|966.97 |148.91338000000002|
|2017-01-01 02:00:00|966.6  |235.4161105288    |
|2017-01-01 02:05:00|966.6  |308.9669715753    |
|2017-01-01 02:10:00|965.08 |28095.831308792003|
|2017-01-01 02:15:00|962.54 |96347.3           |
|2017-01-01 02:20:00|964.37 |196.83023         |
|2017-01-01 02:25:00|963.95 |1069.5085448447999|
|2017-01-01 02:30:00

                                                                                

In [27]:
window_spec = Window.partitionBy(window(col("timestamp"), "1 week").alias("month")).orderBy("timestamp")
window_spec = Window.orderBy("timestamp")


In [28]:
288*3

864

In [29]:
df_transformed = df_final.withColumn(
            "percent_change_15m", 
            (col("price") - lag("price", 3).over(window_spec)) / lag("price", 3).over(window_spec)
        ).withColumn(
            "percent_change_30m", 
            (col("price") - lag("price", 6).over(window_spec)) / lag("price", 6).over(window_spec)
        ).withColumn(
            "percent_change_1h", 
            (col("price") - lag("price", 12).over(window_spec)) / lag("price", 12).over(window_spec)
        ).withColumn(
            "percent_change_6h", 
            (col("price") - lag("price", 72).over(window_spec)) / lag("price", 72).over(window_spec)
        ).withColumn(
            "percent_change_12h", 
            (col("price") - lag("price", 144).over(window_spec)) / lag("price", 144).over(window_spec)
        ).withColumn(
            "percent_change_24h", 
            (col("price") - lag("price", 288).over(window_spec)) / lag("price", 288).over(window_spec)
        ).withColumn(
            "percent_change_48h", 
            (col("price") - lag("price", 576).over(window_spec)) / lag("price", 576).over(window_spec)
        ).withColumn(
            "percent_change_72h", 
            (col("price") - lag("price", 864).over(window_spec)) / lag("price", 864).over(window_spec)
        )
df_transformed.show(2000, truncate=False)

24/09/25 15:34:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 15:34:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 15:34:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 15:34:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 15:34:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 15:34:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 1

+-------------------+-------+------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+---------------------+----------------------+
|timestamp          |price  |Volume_USD        |percent_change_15m    |percent_change_30m    |percent_change_1h     |percent_change_6h     |percent_change_12h    |percent_change_24h    |percent_change_48h   |percent_change_72h    |
+-------------------+-------+------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+---------------------+----------------------+
|2017-01-01 01:05:00|966.37 |15169.2452640221  |NULL                  |NULL                  |NULL                  |NULL                  |NULL                  |NULL                  |NULL                 |NULL                  |
|2017-01-01 01:10:00|966.58 |424.053961521     |NULL                  |N

In [24]:
df_transformed = df_final.select(
    col("timestamp"),
    col("price"),
    col("Volume_USD"),
    round(calculate_percentage_change(col("price"), lag("price", 1).over(window_spec)), 2).alias("percent_change_5m"),  # Assuming 1m intervals
    round(calculate_percentage_change(col("price"), lag("price", 2).over(window_spec)), 2).alias("percent_change_10m"),  # Assuming 1m intervals
    round(calculate_percentage_change(col("price"), lag("price", 3).over(window_spec)), 2).alias("percent_change_15m"),  # Assuming 1m intervals
    round(calculate_percentage_change(col("price"), lag("price", 6).over(window_spec)), 2).alias("percent_change_30m"),
    round(calculate_percentage_change(col("price"), lag("price", 12).over(window_spec)), 2).alias("percent_change_1h"),
    round(calculate_percentage_change(col("price"), lag("price", 72).over(window_spec)), 2).alias("percent_change_6h"),   # 72 mins is approx. 6h
    round(calculate_percentage_change(col("price"), lag("price", 144).over(window_spec)), 2).alias("percent_change_12h"), # 144 mins is approx. 12h
    round(calculate_percentage_change(col("price"), lag("price", 288).over(window_spec)), 2).alias("percent_change_24h"), # 288 mins is approx. 24h
    round(calculate_percentage_change(col("price"), lag("price", 2016).over(window_spec)), 2).alias("percent_change_7d"), # 2016 mins is approx. 7 days
    round(calculate_percentage_change(col("price"), lag("price", 8640).over(window_spec)), 2).alias("percent_change_30d")# 8640 mins is approx. 30 days
)

NameError: name 'calculate_percentage_change' is not defined

In [18]:
df_transformed.show(10)

24/09/25 10:47:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 10:47:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 10:47:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 10:47:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 10:47:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 10:47:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 1

+-------------------+------+------------------+-----------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+
|          timestamp| price|        Volume_USD|percent_change_5m|percent_change_10m|percent_change_15m|percent_change_30m|percent_change_1h|percent_change_6h|percent_change_12h|percent_change_24h|percent_change_7d|percent_change_30d|
+-------------------+------+------------------+-----------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+
|2017-01-01 01:05:00|966.37|  15169.2452640221|             NULL|              NULL|              NULL|              NULL|             NULL|             NULL|              NULL|              NULL|             NULL|              NULL|
|2017-01-01 01:10:00|966.58|     424.053961521|             0.02

In [33]:
965.55 - ((965.55-966.37)/966.37)*966.37

966.37

In [1]:
from btc_streamer.model.preprocessing import BTCDataloader

In [2]:
btc = BTCDataloader()
btc.setup_spark()
df = btc.load_data()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/25 11:22:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [3]:
train_df, test_df = btc.preproc_split(df)

24/09/25 11:22:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 11:22:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 11:22:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 11:22:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 11:22:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 11:22:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 1

len_train: 18654, len_test: 7925


In [4]:
train_df.show(5)

24/09/25 11:22:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 11:22:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 11:22:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 11:22:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 11:22:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 11:22:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/25 1

+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+
|          timestamp|  percent_change_30m|   percent_change_1h|   percent_change_6h|  percent_change_12h|  percent_change_24h|   percent_change_7d|target|
+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+
|2017-01-08 01:35:00|-0.00137178763842...|0.008186651179413912|0.011553374123131468|0.019977175399148103|0.006579020729490841| -0.0648025361568107|     1|
|2017-01-08 06:20:00|-0.00255981834929...|0.008577462500676838|0.035204535349043976| 0.03518152108668102| 0.10996293250378421|-0.03294911734164072|     0|
|2017-01-08 08:40:00|-0.00210811624755...|-0.00577601080189...| 0.03181823236985221|0.040112107623318356|  0.1156565656565656|-0.03853009451168967|     0|
|2017-01-08 16:50:00|0.007409811844505888|-0.01528446389496...|-0.0323

In [45]:
import ml.dmlc.xgboost4j.scala.spark.XGBoost


ModuleNotFoundError: No module named 'ml'

In [15]:
import re

In [33]:
feature_names = [x.name for x in train_df.schema if re.search(r'percent', x.name)]
feature_names

['percent_change_30m',
 'percent_change_1h',
 'percent_change_6h',
 'percent_change_12h',
 'percent_change_24h',
 'percent_change_7d']

In [40]:
from xgboost.spark import SparkXGBClassifier

In [41]:
SparkXGBClassifier()

SparkXGBClassifier_b550363c931b

In [42]:
xgb_regressor = SparkXGBClassifier(
  features_col=feature_names,
  label_col="target",
  num_workers=2,
  device='cpu'
)

In [43]:
xgb_regressor.fit(dataset=train_df)

ValueError: features_col param with list value requires `device=cuda`.

In [29]:
from pyspark.sql.functions import expr

df_joined = df_5min.alias("a") \
    .join(df_5min.alias("b"), 
          (col("b.time_window.end") <= col("a.time_window.end")) &
          (col("b.time_window.end") > col("a.time_window.end") - expr("INTERVAL 24 HOURS")),
          "left") \
    .groupBy("a.time_window.end") \
    .agg(
        last("a.price").alias("price"),
        _sum("b.volume_5m").alias("volume_24h")
    )

# Add previous 24-hour volume and calculate percentage change
window_spec_change = Window.orderBy(col("time_window.end"))



NameError: name 'df_5min' is not defined

In [76]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, last, window, from_unixtime, lag, expr
from pyspark.sql.window import Window
from pyspark.sql.types import DoubleType, TimestampType


df_joined = df_5min.alias("a") \
    .join(df_5min.alias("b"), 
          (col("b.time_window.end") <= col("a.time_window.end")) &
          (col("b.time_window.end") > col("a.time_window.end") - expr("INTERVAL 24 HOURS")),
          "left") \
    .groupBy("a.time_window.end") \
    .agg(
        last("a.price").alias("price"),
        _sum("b.volume_5m").alias("volume_24h")
    )


[Stage 156:>                                                        (0 + 4) / 4]

In [19]:
df_joined.show(10)

ERROR:root:KeyboardInterrupt while sending command.                 (0 + 4) / 4]
Traceback (most recent call last):
  File "/Users/michieldekoninck/.pyenv/versions/3.10.6/envs/kafka_streamer/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/michieldekoninck/.pyenv/versions/3.10.6/envs/kafka_streamer/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/michieldekoninck/.pyenv/versions/3.10.6/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

[Stage 29:>                                                         (0 + 4) / 4]

In [None]:
df_24h_volume = df_joined.withColumn("prev_volume_24h", lag("volume_24h").over(window_spec_change))
df_24h_volume = df_24h_volume.withColumn(
    "volume_24h_change_24h",
    ((col("volume_24h") - col("prev_volume_24h")) / col("prev_volume_24h") * 100).cast(DoubleType())
)

In [77]:
df_joined.show(10)

ERROR:root:KeyboardInterrupt while sending command.][Stage 164:>  (0 + 0) / 4]  
Traceback (most recent call last):
  File "/Users/michieldekoninck/.pyenv/versions/3.10.6/envs/kafka_streamer/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/michieldekoninck/.pyenv/versions/3.10.6/envs/kafka_streamer/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/michieldekoninck/.pyenv/versions/3.10.6/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [67]:
# Step 3: Calculate the 24-hour volume and percentage change
# Define a window spec to calculate the rolling 24-hour volume
window_spec_24h = Window.orderBy(col("time_window.end")).rangeBetween(-24*60*60, Window.currentRow)  # 24 hours in seconds


# Calculate 24-hour volume
df_5min.withColumn("volume_24h", _sum("volume_5m").over(window_spec_24h))


AnalysisException: [DATATYPE_MISMATCH.RANGE_FRAME_INVALID_TYPE] Cannot resolve "(ORDER BY time_window.end ASC NULLS FIRST RANGE BETWEEN -86400 FOLLOWING AND CURRENT ROW)" due to data type mismatch: The data type "TIMESTAMP" used in the order specification does not match the data type "BIGINT" which is used in the range frame.;
'Project [time_window#2569, price#2580, volume_5m#2582, sum(volume_5m#2582) windowspecdefinition(time_window#2569.end ASC NULLS FIRST, specifiedwindowframe(RangeFrame, -86400, currentrow$())) AS volume_24h#2623]
+- Sort [time_window#2569.end ASC NULLS FIRST], true
   +- Aggregate [window#2583], [window#2583 AS time_window#2569, last(close#2222, false) AS price#2580, sum(Volume USD#2224) AS volume_5m#2582]
      +- Project [named_struct(start, knownnullable(precisetimestampconversion(((precisetimestampconversion(timestamp#2234, TimestampType, LongType) - CASE WHEN (((precisetimestampconversion(timestamp#2234, TimestampType, LongType) - 0) % 300000000) < cast(0 as bigint)) THEN (((precisetimestampconversion(timestamp#2234, TimestampType, LongType) - 0) % 300000000) + 300000000) ELSE ((precisetimestampconversion(timestamp#2234, TimestampType, LongType) - 0) % 300000000) END) - 0), LongType, TimestampType)), end, knownnullable(precisetimestampconversion((((precisetimestampconversion(timestamp#2234, TimestampType, LongType) - CASE WHEN (((precisetimestampconversion(timestamp#2234, TimestampType, LongType) - 0) % 300000000) < cast(0 as bigint)) THEN (((precisetimestampconversion(timestamp#2234, TimestampType, LongType) - 0) % 300000000) + 300000000) ELSE ((precisetimestampconversion(timestamp#2234, TimestampType, LongType) - 0) % 300000000) END) - 0) + 300000000), LongType, TimestampType))) AS window#2583, unix#2216, symbol#2218, open#2219, high#2220, low#2221, close#2222, Volume BTC#2223, Volume USD#2224, timestamp#2234]
         +- Filter isnotnull(timestamp#2234)
            +- Project [unix#2216, symbol#2218, open#2219, high#2220, low#2221, close#2222, Volume BTC#2223, Volume USD#2224, timestamp#2234]
               +- Sort [timestamp#2234 ASC NULLS FIRST], true
                  +- SubqueryAlias timestamp
                     +- Project [unix#2216, cast(from_unixtime(cast(unix#2216 as bigint), yyyy-MM-dd HH:mm:ss, Some(Europe/Brussels)) as timestamp) AS date#2356, symbol#2218, open#2219, high#2220, low#2221, close#2222, Volume BTC#2223, Volume USD#2224, timestamp#2234]
                        +- Sort [timestamp#2234 ASC NULLS FIRST], true
                           +- SubqueryAlias timestamp
                              +- Project [unix#2216, cast(from_unixtime(cast(unix#2216 as bigint), yyyy-MM-dd HH:mm:ss, Some(Europe/Brussels)) as timestamp) AS date#2296, symbol#2218, open#2219, high#2220, low#2221, close#2222, Volume BTC#2223, Volume USD#2224, timestamp#2234]
                                 +- Sort [timestamp#2234 ASC NULLS FIRST], true
                                    +- SubqueryAlias timestamp
                                       +- Project [unix#2216, date#2217, symbol#2218, open#2219, high#2220, low#2221, close#2222, Volume BTC#2223, Volume USD#2224, cast(from_unixtime(cast(unix#2216 as bigint), yyyy-MM-dd HH:mm:ss, Some(Europe/Brussels)) as timestamp) AS timestamp#2234]
                                          +- Relation [unix#2216,date#2217,symbol#2218,open#2219,high#2220,low#2221,close#2222,Volume BTC#2223,Volume USD#2224] csv


In [None]:

# Calculate 24-hour volume
df_24h_volume = df_5min.withColumn("volume_24h", _sum("volume_5m").over(window_spec_24h))
df_24h_volume.show(truncate=False)

In [None]:
# Step 3: Calculate the 24-hour volume and percentage change
# Define a window spec to calculate the rolling 24-hour volume
window_spec_24h = Window.orderBy(col("time_window.end")).rangeBetween(-24*60*60, Window.currentRow)  # 24 hours in seconds

# Calculate 24-hour volume
df_24h_volume = df_5min.withColumn("volume_24h", _sum("volume_5m").over(window_spec_24h))

# Add the previous 24-hour volume for calculating percentage change
window_spec_change = Window.orderBy(col("time_window.end"))

df_24h_volume = df_24h_volume.withColumn("prev_volume_24h", lag("volume_24h").over(window_spec_change))

# Calculate the percentage change in volume
df_24h_volume = df_24h_volume.withColumn(
    "volume_24h_change_24h",
    ((col("volume_24h") - col("prev_volume_24h")) / col("prev_volume_24h") * 100).cast(DoubleType())
)

# Step 4: Select the desired columns and rename window to timestamp
df_final = df_24h_volume.select(
    col("time_window.end").alias("timestamp"),  # Use the end of the 5-minute window as the timestamp
    col("price"),
    col("volume_24h"),
    col("volume_24h_change_24h")
)

# Step 5: Sort the DataFrame by timestamp
df_final = df_final.orderBy(col("timestamp"))

# Step 6: Show the final DataFrame
df_final.show(truncate=False)

In [None]:
# Step 3: Calculate the 24-hour volume and percentage change
# Define a window spec to calculate the rolling 24-hour volume
window_spec_24h = Window.orderBy(col("time_window.end")).rangeBetween(-24*60*60, Window.currentRow)  # 24 hours in seconds

# Calculate 24-hour volume
df_24h_volume = df_5min.withColumn("volume_24h", _sum("volume_5m").over(window_spec_24h))

# Add the previous 24-hour volume for calculating percentage change
window_spec_change = Window.orderBy(col("time_window.end"))

df_24h_volume = df_24h_volume.withColumn("prev_volume_24h", lag("volume_24h").over(window_spec_change))

# Calculate the percentage change in volume
df_24h_volume = df_24h_volume.withColumn(
    "volume_24h_change_24h",
    ((col("volume_24h") - col("prev_volume_24h")) / col("prev_volume_24h") * 100).cast(DoubleType())
)

# Step 4: Select the desired columns and rename window to timestamp
df_final = df_24h_volume.select(
    col("time_window.end").alias("timestamp"),  # Use the end of the 5-minute window as the timestamp
    col("price"),
    col("volume_24h"),
    col("volume_24h_change_24h")
)

# Step 5: Sort the DataFrame by timestamp
df_final = df_final.orderBy(col("timestamp"))

# Step 6: Show the final DataFrame
df_final.show(truncate=False)

In [69]:
window_spec = Window.orderBy("timestamp")

In [70]:
# Helper function to calculate percentage change
def calculate_percentage_change(current, previous):
    return (current - previous) / previous * 100

In [87]:
df_transformed = df_resampled.select(
    col("timestamp"),
    col("price"),
    col("Volume_USD"),
    # Calculate volume_24h_change_24h as the percentage change from the previous row
    round((col("Volume_USD") - lag("Volume_USD", 1).over(window_spec)) / lag("Volume_USD", 1).over(window_spec) * 100, 2)
    .alias("volume_24h_change_24h"),
        # Calculate percentage change for different intervals (15m, 30m, 1h, 6h, 12h, 24h, 7d, 30d, 1y)
    round(calculate_percentage_change(col("price"), lag("price", 1).over(window_spec)), 2).alias("percent_change_5m"),  # Assuming 1m intervals
    round(calculate_percentage_change(col("price"), lag("price", 2).over(window_spec)), 2).alias("percent_change_10m"),  # Assuming 1m intervals
    round(calculate_percentage_change(col("price"), lag("price", 3).over(window_spec)), 2).alias("percent_change_15m"),  # Assuming 1m intervals
    round(calculate_percentage_change(col("price"), lag("price", 6).over(window_spec)), 2).alias("percent_change_30m"),
    round(calculate_percentage_change(col("price"), lag("price", 12).over(window_spec)), 2).alias("percent_change_1h"),
    round(calculate_percentage_change(col("price"), lag("price", 72).over(window_spec)), 2).alias("percent_change_6h"),   # 72 mins is approx. 6h
    round(calculate_percentage_change(col("price"), lag("price", 144).over(window_spec)), 2).alias("percent_change_12h"), # 144 mins is approx. 12h
    round(calculate_percentage_change(col("price"), lag("price", 288).over(window_spec)), 2).alias("percent_change_24h"), # 288 mins is approx. 24h
    round(calculate_percentage_change(col("price"), lag("price", 2016).over(window_spec)), 2).alias("percent_change_7d"), # 2016 mins is approx. 7 days
    round(calculate_percentage_change(col("price"), lag("price", 8640).over(window_spec)), 2).alias("percent_change_30d"),# 8640 mins is approx. 30 days
    round(calculate_percentage_change(col("price"), lag("price", 105120).over(window_spec)), 2).alias("percent_change_1y"),# 105120 mins is approx. 1 year
)


In [89]:
df_transformed.show((100))

24/09/23 20:14:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 20:14:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 20:14:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 20:14:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 20:14:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------------------+------+------------------+---------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+-----------------+
|          timestamp| price|        Volume_USD|volume_24h_change_24h|percent_change_15m|percent_change_30m|percent_change_1h|percent_change_6h|percent_change_12h|percent_change_24h|percent_change_7d|percent_change_30d|percent_change_1y|
+-------------------+------+------------------+---------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+-----------------+
|2017-01-01 01:05:00|966.37|  15169.2452640221|                 NULL|              NULL|              NULL|             NULL|             NULL|              NULL|              NULL|             NULL|              NULL|             NULL|
|2017-01-01 01:10:00|966.58|     424.053961521|     

24/09/23 20:15:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 20:15:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 20:15:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/23 20:15:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

24/09/24 02:05:40 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 3243129 ms exceeds timeout 120000 ms
24/09/24 02:05:40 WARN SparkContext: Killing executors is not supported by current scheduler.
24/09/24 02:05:40 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$