In [None]:
import pandas as pd

# Load the whole CSV into an in-memory pandas DataFrame.
# NOTE(review): the next cell rebinds `df` to a Spark DataFrame read from the same file,
# so this pandas copy is no longer reachable once that cell has run.
df = pd.read_csv(filepath_or_buffer='full_data.csv')


In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Start (or reuse) the Spark session and read the CSV with a header row and inferred column types.
spark = (
    SparkSession.builder
    .appName('financial_analysis')
    .getOrCreate()
)
df = spark.read.csv('full_data.csv', header=True, inferSchema=True)

# Bucket the bars into groups of ten. bar_num starts from 1, so 1 is subtracted before the
# division: bars 1-10 -> bar_range 0, bars 11-20 -> bar_range 1, and so on.
# floor() yields a long, hence the cast down to int.
df = df.withColumn('bar_range', F.floor((F.col('bar_num') - 1) / 10).cast("int"))

In [None]:
# Print the first 20 rows (the default row count of show()) as a quick sanity check.
df.show(n=20)

In [None]:
from pyspark.sql import Window
from pyspark.sql.functions import avg, lag, col
# Mean profit of every (trade_id, bar_range) bucket: one output row per bucket.
df_range_avg = df.groupBy("trade_id", "bar_range").agg(avg("profit").alias("avg_profit"))

# Within each trade, visit the buckets in ascending bar_range order.
window = Window.partitionBy('trade_id').orderBy('bar_range')

# cumulative_avg_profit: running mean of avg_profit up to and including the current bucket
#                        (the default frame of an ordered window).
# profit_lag:            that running mean taken from the previous bucket, i.e. the mean of
#                        avg_profit over all earlier buckets; null for a trade's first bucket.
df_range_avg = (
    df_range_avg
    .withColumn('cumulative_avg_profit', avg('avg_profit').over(window))
    .withColumn('profit_lag', lag('cumulative_avg_profit').over(window))
)

# Sort by bar_range as well as trade_id: sorting by trade_id alone leaves the bucket order
# inside a trade arbitrary, which makes the running/lagged columns impossible to check by eye.
df_range_avg.orderBy('trade_id', 'bar_range').show()

In [None]:
# Attach the per-bucket aggregates back onto the individual rows of df.
#   avg_profit -> mean profit of the row's own bar_range
#   profit_lag -> mean of avg_profit over the bar_ranges that precede the row's bar_range
df_new = df.join(df_range_avg, on=['trade_id', 'bar_range'], how='left')

# Preview the first 200 rows, sorted so each trade's buckets appear in order.
(
    df_new
    .select('trade_id', 'bar_range', 'profit_lag', 'avg_profit', 'profit')
    .orderBy('trade_id', 'bar_range')
    .show(200)
)

In [None]:


# Fetch the session again and rebind `df` to a freshly read copy of the CSV, with the
# bar_range bucket added in the same chain.
# Relies on SparkSession and F having been imported by an earlier cell.
spark = SparkSession.builder.appName('financial_analysis').getOrCreate()
df = (
    spark.read
    .csv('full_data.csv', header=True, inferSchema=True)
    # bar_num starts from 1: subtract 1 so that every ten consecutive bars share one bucket.
    .withColumn('bar_range', F.floor((F.col('bar_num') - 1) / 10).cast("int"))
)




In [None]:
from pyspark.sql import Window
from pyspark.sql.functions import avg

# Frame: every earlier row (ordered by time_stamp) of the same trade_id / bar_range bucket,
# with the current row itself excluded (the frame ends at offset -1).
window = (
    Window
    .partitionBy('trade_id', 'bar_range')
    .orderBy('time_stamp')
    .rowsBetween(Window.unboundedPreceding, -1)
)

# profit_lag: mean profit over that frame; null on the first row of a bucket (empty frame).
df = df.withColumn('profit_lag', avg('profit').over(window))


In [None]:
# Print the first 15 rows, including the profit_lag column added above.
df.show(n=15)

In [None]:
# Mean profit per (trade_id, bar_range) bucket; print the first 15 buckets.
(
    df.groupBy('trade_id', 'bar_range')
    .agg(avg('profit').alias('avg_profit'))
    .show(15)
)

In [None]:
from pyspark.sql.functions import lag
from pyspark.sql.window import Window

# Define the window by 'trade_id', ordered by 'bar_num'
window = Window.partitionBy('trade_id').orderBy('bar_num')

# Add the profits of the 10 preceding bars of the same trade as features
# profit_lag_1 .. profit_lag_10.
# The offset is passed positionally: the keyword for this parameter is `offset` in current
# PySpark (older releases called it `count`), so a keyword argument is not portable.
lag_feature_names = [f'profit_lag_{i}' for i in range(1, 11)]
for i, feature_name in enumerate(lag_feature_names, start=1):
    df = df.withColumn(feature_name, lag('profit', i).over(window))

# The lag features are null wherever fewer than `i` earlier bars exist in the trade.
# Zero-fill only those columns: an unrestricted na.fill(0) would also overwrite nulls in
# every other numeric column (e.g. profit_lag), silently turning "missing" into 0.
df = df.na.fill(0, subset=lag_feature_names)
