In [1]:
import findspark
findspark.init()

import time as t
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
import datetime
import sys
from lib import sparkStructuredStreaming
import os
%matplotlib inline
from matplotlib import pyplot as plt
from pyspark.sql.window import Window

### Set up streaming from a Kafka topic and reading from / writing to Elasticsearch

In [2]:
# Extra packages handed to spark-submit: the Kafka source for Structured
# Streaming and the Elasticsearch Spark connector (both built for Scala 2.11).
# The adjacent literals are joined into one string by implicit concatenation.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages '
    'org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5,'
    'org.elasticsearch:elasticsearch-spark-20_2.11:7.6.2 '
    'pyspark-shell'
)

In [3]:
# Bootstrap server address (presumably the Kafka broker used for streaming - confirm).
# Known values: "127.0.0.1:9092" (local) // "10.0.0.8:9092" (BACC)
bootstrap = "127.0.0.1:9092"

In [4]:
# Create (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .appName("KafkaIEXStructuredStreaming")
    .master("local[*]")
    .getOrCreate()
)

# SQLContext takes a SparkContext as its first argument and the session as the
# second. Passing the session alone only worked because SparkSession happens to
# expose the private attributes the constructor reads, so pass both explicitly.
sqlContext = SQLContext(spark.sparkContext, spark)

## 1. Historical Data

#### Read historical data from Yahoo Finance and write it into HDFS

In [5]:
# Tickers whose history is downloaded ("^IXIC" is presumably the Nasdaq
# Composite index used as the benchmark - confirm).
symbol = ["AAPL", "MSFT", "AMZN", "^IXIC"]
# Amount of history to fetch and the granularity of each data point.
period = "5d"
interval = "1m"
hdfs_path = "hdfs://0.0.0.0:19000"

# Use a separate loop variable: iterating with `for symbol in symbol` would
# rebind `symbol` to the last ticker string and discard the list.
for ticker in symbol:
    sparkStructuredStreaming.history().to_hdfs(ticker, interval, period, sqlContext, hdfs_path)

## 2. Backtesting

### 2.1 Momentum Strategy

In [6]:
# --- Backtest configuration ---------------------------------------------

# Stocks held in the simulated depot.
symbol = ["AAPL", "MSFT", "AMZN"]

# Strategy specification handed to the backtest. Original note: the strategy
# looks at the last 120 minutes to calculate the average. Presumably the three
# numbers are the look-back windows (in minutes) to evaluate - TODO confirm
# against lib.sparkStructuredStreaming.
strategy = ["momentum", 10, 120, 500]

# Granularity of the historical data.
interval = "1m"
hdfs_path = "hdfs://0.0.0.0:19000"

# Starting capital and how it is split between the stocks in `symbol`.
startCap = 10000.0
share = [0.4, 0.2, 0.4]

# Regulatory trading fee.
commission = 0.000119

# Each strategy under test needs its own depot id.
depotId = 19

# Risk-free market return; 0.1% is assumed here, the right value is uncertain.
risk_free = 0.001

# --- Run the backtest ---------------------------------------------------
b = sparkStructuredStreaming.backtest()

performance = b.performance(
    depotId,
    symbol,
    share,
    startCap,
    commission,
    risk_free,
    strategy,
    interval,
    hdfs_path,
    sqlContext,
)

In [9]:
# Load the stored performance records from HDFS, ordered by depot id.
hdfs_path = "hdfs://0.0.0.0:19000"
df_performance = (
    sqlContext.read
    .format('parquet')
    .load(hdfs_path + "/performance")
    .orderBy("DepotId")
)
# Print up to 20 rows, shortening every cell to 10 characters.
df_performance.show(20, truncate=10)

+-------+----------+----------+----------+-------------+----------+----------+------+--------------------+------------------+
|DepotId|     Value|     Alpha|      Beta|Start-Capital|    Profit|Start-Date|Trades|Performance_Strategy|Performance_Nasdaq|
+-------+----------+----------+----------+-------------+----------+----------+------+--------------------+------------------+
|     15|10303.7...|2.33084...|0.54361...|      10000.0|303.789...|2020-05-06|  8886|          3.03789...|        1.28106...|
|     16|10198.9...|1.37842...|0.46724...|      10000.0|198.943...|2020-05-06|  3140|          1.98943...|        1.28106...|
|     17|9990.36...|-0.2830...|0.12987...|      10000.0|-9.6356...|2020-05-06|  1281|          -0.0963...|        1.28106...|
|     18|10195.3...|0.35966...|1.10370...|      10000.0|195.322...|2020-05-06|    30|          1.95322...|        1.44601...|
|     19|10206.9...|2.04189...|0.54230...|      10000.0|206.947...|2020-05-07|  7135|          2.06947...|        0.03

In [9]:
# Load the depot definitions from HDFS (relies on `hdfs_path` set in an earlier cell).
df_depot = (
    sqlContext.read
    .format('parquet')
    .load(hdfs_path + "/depot")
)
# Print up to 20 depots, ordered by depot id.
df_depot.orderBy("DepotId").show(20)

+-------+-------------+------------+------------------+---------------+
|DepotId|Start-Caputal|    Strategy|              ISIN|          Share|
+-------+-------------+------------+------------------+---------------+
|     15|      10000.0|  momentum10|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
|     16|      10000.0| momentum120|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
|     17|      10000.0| momentum500|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
|     18|      10000.0|Buy and Hold|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
|     19|      10000.0|  momentum10|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
|     20|      10000.0| momentum120|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
|     21|      10000.0| momentum500|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
|     22|      10000.0|Buy and Hold|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
+-------+-------------+------------+------------------+---------------+



## 3. Trading Simulation / Performance Evaluation with Real-Time Streams (Spark Streaming)

### Stream real time quotes from Kafka topic

In [9]:
def realtime(symbol, share, startCap, commission, strategy, r, es, index="test", max_iterations=None):
    """Simulate realtime trading and write the results into Elasticsearch.

    The given strategy is run side by side with a buy-and-hold baseline. After
    every update step one document holding both depot values is indexed.

    According to the original notes, this needs a stream that writes new stock
    prices into "hdfs://0.0.0.0:19000/realtime/momentum" and
    "hdfs://0.0.0.0:19000/realtime/buyAndHold".

    Parameters
    ----------
    symbol, share, startCap, commission :
        Passed through unchanged to ``r.realtime_init`` and ``r.realtime_loop``.
    strategy :
        Strategy specification passed to ``r`` for the strategy under test.
    r :
        Object providing ``buy_and_hold``, ``realtime_init`` and ``realtime_loop``.
    es :
        Client whose ``index(index=..., body=...)`` method stores each document.
    index : str, optional
        Name of the Elasticsearch index to write to (default ``"test"``).
    max_iterations : int or None, optional
        Number of update steps to run. ``None`` (the default) loops until the
        caller interrupts it, e.g. with a KeyboardInterrupt.

    Notes
    -----
    Reads the notebook-level ``sqlContext`` global.
    """
    # Imported here because the notebook only imports timedelta in a later
    # cell; the function must not depend on cell execution order.
    from datetime import timedelta

    # compare the strategy under test with buy and hold
    strategy_b = [r.buy_and_hold, "buyAndHold", 10]

    # initialize both strategies
    init_strategy = r.realtime_init(symbol, share, startCap, commission, strategy, sqlContext)
    init_b = r.realtime_init(symbol, share, startCap, commission, strategy_b, sqlContext)
    # `quote_times` instead of `datetime`, so the imported datetime module is not shadowed
    value, quote_times, moneyForInvesting_list, moneyInStocks_list, stocksOwned, trades_total = init_strategy
    value_b, quote_times_b, moneyForInvesting_list_b, moneyInStocks_list_b, stocksOwned_b, trades_total_b = init_b

    iterations = 0
    while max_iterations is None or iterations < max_iterations:
        # update depot and stocksOwned for both strategies
        depot_strategy = r.realtime_loop(value, quote_times, moneyForInvesting_list, moneyInStocks_list,
                                         stocksOwned, trades_total, symbol, share, commission,
                                         strategy, sqlContext)
        depot_b = r.realtime_loop(value_b, quote_times_b, moneyForInvesting_list_b, moneyInStocks_list_b,
                                  stocksOwned_b, trades_total_b, symbol, share, commission,
                                  strategy_b, sqlContext)
        value, quote_times, moneyForInvesting_list, moneyInStocks_list, stocksOwned, trades_total = depot_strategy
        value_b, quote_times_b, moneyForInvesting_list_b, moneyInStocks_list_b, stocksOwned_b, trades_total_b = depot_b

        # write values into elasticsearch for visualisation
        doc = {}
        # First buy-and-hold timestamp shifted back by two hours
        # (presumably a local-time to UTC conversion - confirm).
        doc['date'] = (quote_times_b[0] - timedelta(hours=2, minutes=0)).strftime("%Y-%m-%d"'T'"%H:%M:%S")
        doc['momentum'] = value
        doc['buy and hold'] = value_b
        es.index(index=index, body=doc)

        iterations += 1

In [10]:
from elasticsearch import Elasticsearch
from datetime import timedelta

# Elasticsearch client for the local node and the realtime helper from the project library.
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
r = sparkStructuredStreaming.realtime()

# Same depot set-up as in the backtesting section.
symbol = ["AAPL", "MSFT", "AMZN"]
share = [0.4, 0.2, 0.4]
startCap = 10000.0
commission = 0.000119

# Use the momentum variant that worked best in backtesting.
momentum_str = [r.momentum, "momentum", 10]

# Start the simulation.
realtime(
    symbol,
    share,
    startCap,
    commission,
    momentum_str,
    r,
    es,
)

KeyboardInterrupt: 