In [43]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import log, col, lag, mean, variance, exp, randn, lit

In [4]:
spark = SparkSession.builder.appName('Week9Q1').getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/10/13 14:20:11 WARN Utils: Your hostname, dbl-23, resolves to a loopback address: 127.0.1.1; using 172.16.58.182 instead (on interface eno1)
25/10/13 14:20:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/13 14:20:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [26]:
df = spark.read.csv('datasets/stockdata_5/AAME.csv', inferSchema=True, header=True)

In [27]:
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)



In [28]:
df.show(5)

+---------+----+----+----+-----+------+
|     Date|Open|High| Low|Close|Volume|
+---------+----+----+----+-----+------+
|31-Dec-13|4.04|4.13|3.96| 4.09| 30735|
|30-Dec-13|4.05|4.05|3.84|  3.9| 14646|
|27-Dec-13|4.02|4.05|3.99| 4.05|  5047|
|26-Dec-13|3.99|4.04|3.70| 4.01|  6309|
|24-Dec-13|3.90|3.97|3.84| 3.95| 13592|
+---------+----+----+----+-----+------+
only showing top 5 rows


In [29]:
df.count()

3506

## Monte Carlo Simulation

In [31]:
window_spec = Window.orderBy("Date")

In [34]:
df_with_pdr = df.withColumn("prev_close", lag("Close").over(window_spec)).withColumn("PDR", log(col("Close") / col("prev_close")))

In [37]:
stats = df_with_pdr.select(
    mean(col("PDR")).alias("avg_return"),
    variance(col("PDR")).alias("variance_return")
).collect()[0]

25/10/13 14:31:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/10/13 14:31:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/10/13 14:31:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/10/13 14:31:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [38]:
avg_return = stats["avg_return"]
variance_return = stats["variance_return"]

drift = avg_return - 0.5 * variance_return
drift

-0.13760942271403537

In [None]:
df_pred = df_with_pdr.withColumn("random_value", randn() * variance_return**0.5)

df_pred = df_pred.withColumn("Next_Day_Price",col("Close") * exp(lit(drift) + col("random_value")))

df_pred.select("Date", "Close", "Next_Day_Price").show(15)

+---------+-----+------------------+
|     Date|Close|    Next_Day_Price|
+---------+-----+------------------+
|31-Dec-13| 4.09| 4.387620272045979|
|30-Dec-13|  3.9|4.6048132578231415|
|27-Dec-13| 4.05|  3.08997841532974|
|26-Dec-13| 4.01|1.3885269131695508|
|24-Dec-13| 3.95| 2.190245522349543|
|23-Dec-13| 3.92| 9.903644756052453|
|20-Dec-13| 3.84| 5.160760042667169|
|19-Dec-13| 4.01|2.9525940423474486|
|18-Dec-13|  4.0|3.2883525287126307|
|17-Dec-13| 3.98| 6.161736683497138|
|16-Dec-13| 4.04| 4.398893881278822|
|13-Dec-13| 4.04| 4.134949791461736|
|12-Dec-13| 4.04| 2.537503771822641|
|11-Dec-13| 4.04| 1.521347934585932|
|10-Dec-13| 4.04|1.6808143139714176|
+---------+-----+------------------+
only showing top 15 rows


In [46]:
spark.stop()