In [1]:
import findspark
findspark.init()

import time as t
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
import datetime
import sys
from lib import sparkStructuredStreaming
import os
%matplotlib inline
from matplotlib import pyplot as plt
from pyspark.sql.window import Window
import math as m

### Set-up to stream from Kafka topic + read and write from/to Elasticsearch

In [2]:
# Extra JVM packages for the PySpark shell: the Kafka source for Spark SQL and
# the Elasticsearch Spark connector, joined into one comma-separated --packages value.
spark_packages = ",".join([
    "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5",
    "org.elasticsearch:elasticsearch-spark-20_2.11:7.6.2",
])
os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages {} pyspark-shell".format(spark_packages)

In [3]:
#"127.0.0.1:9092" (local) //"10.0.0.8:9092" (BACC)
# broker address in host:port form
kafka_host, kafka_port = "127.0.0.1", 9092
bootstrap = "{}:{}".format(kafka_host, kafka_port)

In [4]:
# Build (or reuse) the local SparkSession used by every cell below;
# local[*] runs Spark in-process on all available cores.
spark = (
    SparkSession.builder
    .appName("KafkaIEXStructuredStreaming")
    .master("local[*]")
    .getOrCreate()
)

# SQLContext expects a SparkContext as its first argument, not a SparkSession.
# Pass the session's context and hand over the session explicitly so the
# wrapper is bound to this session.
sqlContext = SQLContext(spark.sparkContext, spark)

## 1. Historical Data

#### Read historical data from Yahoo Finance and write it to HDFS

In [5]:
# Tickers to download; ^GSPC is the S&P 500.
symbol = ["AAPL", "MSFT", "AMZN", "^GSPC"]

# minute intervals for the momentum strategy
period_momentum = "5d"
interval_momentum = "1m"

# day intervals for the mean-reverse strategy (defined here, not used in this cell)
period_mean_reverse = "6mo"
interval_mean_reverse = "1d"

hdfs_path = "hdfs://0.0.0.0:19000"

# Write the minute-interval history of every ticker to HDFS.
# The loop variable has its own name so that `symbol` is still the full
# ticker list after the loop instead of being rebound to the last ticker.
for ticker in symbol:
    sparkStructuredStreaming.history().to_hdfs(ticker, interval_momentum, period_momentum, sqlContext, hdfs_path)

## 2. Backtesting

### 2.1 Momentum Strategy

In [5]:
# --- portfolio ---------------------------------------------------------------
symbol = ["AAPL", "MSFT", "AMZN"]
# distribution of the start capital between the stocks
share = [0.4, 0.2, 0.4]
# start capital
startCap = 10000.0
# regulatory trading fee
commission = 0.000119
# risk-free market return, assumed here to be 0.1% (the right value is not really clear)
risk_free = 0.001

# --- data --------------------------------------------------------------------
# granularity of the historical data
interval_momentum = "1m"
interval_mean_reverse = "1d"
hdfs_path = "hdfs://0.0.0.0:19000"

# --- strategies --------------------------------------------------------------
# strategy label followed by the averaging windows to evaluate
# (presumably minutes, given the 1m interval above)
strategy_momentum = ["momentum"] + [30, 60, 120, 180, 240, 300, 400, 500, 600]

# mean-reverse strategy: RSI / IBR buy and sell thresholds followed by a list of
# window lengths; defined here but not passed to the backtest call below
rsi_buy, rsi_sell = 15, 60
ibr_buy, ibr_sell = 0.2, 0.7
strategy_mean_reverse = ["mean_reverse", rsi_buy, rsi_sell, ibr_buy, ibr_sell] + [3, 5, 8, 10, 14, 20]

# --- run the momentum backtest -----------------------------------------------
b = sparkStructuredStreaming.backtest()

performance = b.performance(
    symbol,
    share,
    startCap,
    commission,
    risk_free,
    strategy_momentum,
    interval_momentum,
    hdfs_path,
    sqlContext,
)

In [6]:
# Load the parquet data stored under /performance on HDFS and show it by depot.
hdfs_path = "hdfs://0.0.0.0:19000"
performance_reader = sqlContext.read.format("parquet")
df_performance = performance_reader.load(hdfs_path + "/performance").orderBy("DepotId")
# up to 40 rows, every cell cut to 10 characters so the wide table stays readable
df_performance.show(n=40, truncate=10)

+-------+----------+----------+----------+-------------+----------+----------+------+--------------------+------------------+
|DepotId|     Value|     Alpha|      Beta|Start-Capital|    Profit|Start-Date|Trades|Performance_Strategy|Performance_S&P500|
+-------+----------+----------+----------+-------------+----------+----------+------+--------------------+------------------+
|      1|9895.99...|0.31244...|0.63103...|      10000.0|-104.00...|2020-05-11|   655|          -1.0400...|        -2.1530...|
|      2|9864.21...|0.08200...|0.67130...|      10000.0|-135.78...|2020-05-11|   522|          -1.3578...|        -2.1530...|
|      3|9910.49...|0.58869...|0.69151...|      10000.0|-89.505...|2020-05-11|   342|          -0.8950...|        -2.1530...|
|      4|9907.22...|0.47020...|0.65197...|      10000.0|-92.774...|2020-05-11|   299|          -0.9277...|        -2.1530...|
|      5|9889.82...|0.33279...|0.66882...|      10000.0|-110.17...|2020-05-11|   242|          -1.1017...|        -2.1

In [7]:
# Load the parquet data stored under /depot on HDFS and list it by depot id.
depot_path = hdfs_path + "/depot"
df_depot = sqlContext.read.format("parquet").load(depot_path)
df_depot.orderBy("DepotId").show(n=30)

+-------+-------------+------------+------------------+---------------+
|DepotId|Start-Caputal|    Strategy|              ISIN|          Share|
+-------+-------------+------------+------------------+---------------+
|      1|      10000.0|  momentum30|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
|      2|      10000.0|  momentum60|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
|      3|      10000.0| momentum120|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
|      4|      10000.0| momentum180|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
|      5|      10000.0| momentum240|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
|      6|      10000.0| momentum300|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
|      7|      10000.0| momentum400|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
|      8|      10000.0| momentum500|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
|      9|      10000.0| momentum600|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
|     10|      10000.0|Buy and Hold|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
+-------+-------------+------------+------------------+---------

## 3. Trading Simulation / Performance Evaluation with Real-Time Streams (Spark Streaming)

### Stream real time quotes from Kafka topic

In [5]:
# same portfolio set-up as in backtesting
symbol = ["AAPL", "MSFT", "AMZN"]
share = [0.4, 0.2, 0.4]
startCap = 10000.0
commission = 0.000119
# passed to r.realtime as `index` (presumably the Elasticsearch index name; confirm in lib)
index = "trading"

r = sparkStructuredStreaming.realtime()

# strategy callable, its label and its window; meant to be the momentum that worked best in backtesting
# NOTE(review): window 10 is not among the windows evaluated in the backtesting cell (30-600); confirm the choice
momentum_str = [r.momentum, "momentum", 10]

# Runs until interrupted from the notebook.
r.realtime(
    symbol,
    share,
    startCap,
    commission,
    momentum_str,
    index,
    sqlContext,
)

2020-05-14 15:32:36 9999.999524 10013.43286
2020-05-14 15:33:59 10001.299524000002 10016.42286
2020-05-14 15:35:09 10008.359524 10027.41286
2020-05-14 15:35:24 10008.359524 10038.66286
2020-05-14 15:37:45 10008.359524 10047.70286
2020-05-14 15:38:10 10008.359524 10042.442860000001


KeyboardInterrupt: 