In [1]:
import findspark
findspark.init()

import time as t
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext
from pyspark.sql import functions as F
from pyspark.sql.types import *
import datetime
import sys
from lib import sparkStructuredStreaming
import os
%matplotlib inline
from matplotlib import pyplot as plt
from pyspark.sql.window import Window
import math as m

### Set-up to stream from Kafka topic + read and write from/to Elasticsearch

In [2]:
# Extra Spark packages for this notebook: the Kafka SQL source and the
# Elasticsearch connector (both Scala 2.11 builds).
spark_packages = [
    'org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5',
    'org.elasticsearch:elasticsearch-spark-20_2.11:7.6.2',
]
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages ' + ','.join(spark_packages) + ' pyspark-shell'

In [3]:
#"127.0.0.1:9092" (local) //"10.0.0.8:9092" (BACC)
# Kafka bootstrap server. Defaults to the local broker; set the
# KAFKA_BOOTSTRAP_SERVERS environment variable to point at another broker
# without editing the notebook.
bootstrap = os.environ.get("KAFKA_BOOTSTRAP_SERVERS", "127.0.0.1:9092")

In [4]:
# Create (or reuse) the local Spark session shared by the cells below.
spark = (
    SparkSession.builder
    .appName("KafkaIEXStructuredStreaming")
    .master("local[*]")
    .getOrCreate()
)

# SQLContext's first positional parameter is the SparkContext, not the
# SparkSession. Pass both explicitly so the context wraps this exact session
# instead of relying on the session duck-typing as a SparkContext.
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark)

## 1. Backtesting

### 1.1 Try different strategies

In [5]:
# --- Strategy definitions ----------------------------------------------------
# strategy looks at last 10,120,500... minutes to calculate average.
# The first element of each entry ("1d" / "5d") is passed to performance()
# separately from the remaining elements.
strategy_momentum = [
    ["1d", "momentum", 10, 30, 60, 120, 180],
    ["5d", "momentum", 30, 60, 120, 180, 240, 300, 400, 500, 600, 800],
]

# Mean reverse strategy thresholds.
# NOTE(review): strategy_mean_reverse / interval_mean_reverse are defined here
# but not passed to performance() in this cell -- confirm whether that is intended.
rsi_buy, rsi_sell = 15, 60
ibr_buy, ibr_sell = 0.2, 0.7
strategy_mean_reverse = ["mean_reverse", rsi_buy, rsi_sell, ibr_buy, ibr_sell, 3, 5, 8, 10, 14, 20]

# --- Run parameters ----------------------------------------------------------
# granularity of historical data
interval_momentum = "1m"
interval_mean_reverse = "1d"
hdfs_path = "hdfs://0.0.0.0:19000"
# start capital
startCap = 10000.0
# regulatory trading fee
commission = 0.000119
# risk free market return, assumed here 0.1% but is not really clear
risk_free = 0.001

# --- Backtest ----------------------------------------------------------------
b = sparkStructuredStreaming.backtest()

# `performance` is reassigned on every pass, so after the loop it holds only the
# value returned for the last momentum entry.
for strategy in strategy_momentum:
    strategy_head = strategy[0]
    strategy_tail = strategy[1:]
    performance = b.performance(
        startCap,
        commission,
        risk_free,
        strategy_tail,
        interval_momentum,
        strategy_head,
        hdfs_path,
        sqlContext,
    )

[*********************100%***********************]  3 of 3 completed
[*********************100%***********************]  3 of 3 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  61 of 61 completed

4 Failed downloads:
- EVHC: No data found for this date range, symbol may be delisted
- AET: No data found for this date range, symbol may be delisted
- CELG: No data found, symbol may be delisted
- ESRX: No data found for this date range, symbol may be delisted
[*********************100%***********************]  4 of 4 completed
[*********************100%***********************]  1 of 1 completed


In [6]:
# Load the performance table from HDFS and print up to 80 rows ordered by depot id.
hdfs_path = "hdfs://0.0.0.0:19000"
performance_location = hdfs_path + "/performance"
parquet_reader = sqlContext.read.format('parquet')
df_performance = parquet_reader.load(performance_location).orderBy("DepotId")
df_performance.show(80)

+-------+--------+-----+----+-------------+------+----------+----------+------+--------------------+------------------+
|DepotId|   Value|Alpha|Beta|Start-Capital|Profit|Start-Date|  End-Date|Trades|Performance_Strategy|Performance_S&P500|
+-------+--------+-----+----+-------------+------+----------+----------+------+--------------------+------------------+
|      1|  9982.6|-0.86|0.52|      10000.0| -17.4|2020-05-15|2020-05-15|   385|               -0.17|              1.31|
|      2|  9969.9| -1.1|0.61|      10000.0| -30.1|2020-05-15|2020-05-15|   204|                -0.3|              1.31|
|      3|10124.74|  0.3|0.72|      10000.0|124.74|2020-05-15|2020-05-15|   134|                1.25|              1.31|
|      4|10063.99|-0.37|0.77|      10000.0| 63.99|2020-05-15|2020-05-15|   126|                0.64|              1.31|
|      5|10095.55|-0.31|0.96|      10000.0| 95.55|2020-05-15|2020-05-15|    79|                0.96|              1.31|
|      6|10075.39|-0.19|0.72|      10000

In [7]:
# Load the depot table from HDFS; df_depot itself stays unsorted, only the
# printed view (up to 70 rows, untruncated columns) is ordered by depot id.
depot_location = hdfs_path + "/depot"
df_depot = sqlContext.read.format('parquet').load(depot_location)
depot_by_id = df_depot.orderBy("DepotId")
depot_by_id.show(70, truncate=False)

+-------+-------------+------------+----------------------+-------------------------------------------------------------+
|DepotId|Start-Caputal|Strategy    |ISIN                  |Share                                                        |
+-------+-------------+------------+----------------------+-------------------------------------------------------------+
|1      |10000.0      |momentum10  |[CTL, T, VZ]          |[0.3333333333333333, 0.3333333333333333, 0.33333333333333337]|
|2      |10000.0      |momentum30  |[CTL, T, VZ]          |[0.3333333333333333, 0.3333333333333333, 0.33333333333333337]|
|3      |10000.0      |momentum60  |[CTL, T, VZ]          |[0.3333333333333333, 0.3333333333333333, 0.33333333333333337]|
|4      |10000.0      |momentum120 |[CTL, T, VZ]          |[0.3333333333333333, 0.3333333333333333, 0.33333333333333337]|
|5      |10000.0      |momentum180 |[CTL, T, VZ]          |[0.3333333333333333, 0.3333333333333333, 0.33333333333333337]|
|6      |10000.0      |B

## 2. Trading Simulation / Performance Evaluation with Real-Time Streams (Spark Structured Streaming)

### Stream real-time quotes from a Kafka topic

In [5]:
r = sparkStructuredStreaming.realtime()

# same set up as in backtesting
# The original line here was `symbol, share = # todo ...`, which is a
# SyntaxError, so the cell could not run as committed.
# NOTE(review): the values below are the ISIN / Share columns printed for the
# "momentum10" depot in the backtesting depot table, matching the momentum-10
# set-up used further down. Confirm that this is the selection that performed
# best in backtesting and that realtime() expects plain lists.
# TODO: use the ibes which performed best in backtesting
symbol = ["CTL", "T", "VZ"]
share = [0.3333333333333333, 0.3333333333333333, 0.33333333333333337]

startCap = 10000.0
commission = 0.000119
# presumably the target index for the trading records -- verify against lib
index = "trading"
# choose momentum which worked best in backtesting
momentum_str = [r.momentum, "momentum", 10]

r.realtime(symbol, share, startCap, commission, momentum_str, index, sqlContext)

2020-05-14 15:32:36 9999.999524 10013.43286
2020-05-14 15:33:59 10001.299524000002 10016.42286
2020-05-14 15:35:09 10008.359524 10027.41286
2020-05-14 15:35:24 10008.359524 10038.66286
2020-05-14 15:37:45 10008.359524 10047.70286
2020-05-14 15:38:10 10008.359524 10042.442860000001


KeyboardInterrupt: 