In [1]:
import findspark
findspark.init()

import time as t
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
import datetime
import sys
from lib import sparkStructuredStreaming
import os
%matplotlib inline
from matplotlib import pyplot as plt
from pyspark.sql.window import Window
import math as m

### Set-up to stream from Kafka topic + read and write from/to Elasticsearch

In [2]:
# Extra packages requested through PYSPARK_SUBMIT_ARGS:
#   - spark-sql-kafka-0-10 (Scala 2.11, version 2.4.5)
#   - elasticsearch-spark-20 (Scala 2.11, version 7.6.2)
# The value ends with the "pyspark-shell" token.
# NOTE(review): presumably this has to be set before the SparkSession is
# created for the packages to be picked up - confirm against the PySpark docs.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages '
    'org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5,'
    'org.elasticsearch:elasticsearch-spark-20_2.11:7.6.2 '
    'pyspark-shell'
)

In [3]:
# Bootstrap server address in host:port form (presumably the Kafka broker used
# for the streaming part - it is not referenced by any other cell shown here).
# Known values:
#   "127.0.0.1:9092"  -> local
#   "10.0.0.8:9092"   -> BACC
bootstrap = "127.0.0.1:9092"

In [4]:
# Build (or reuse, via getOrCreate) the Spark session for this notebook,
# using the "local[*]" master.
spark = (
    SparkSession.builder
    .appName("KafkaIEXStructuredStreaming")
    .master("local[*]")
    .getOrCreate()
)

# SQLContext's first positional parameter is a SparkContext, not a SparkSession.
# Pass the session's own context plus the session itself so the legacy
# sqlContext handed to the lib helpers wraps exactly this session and exposes
# a real SparkContext internally.
sqlContext = SQLContext(spark.sparkContext, spark)

## 1. Historical Data

#### Read historical data from Yahoo Finance and write it to HDFS

In [6]:
# Tickers whose history is downloaded: three stocks plus "^IXIC"
# (presumably the Nasdaq index used as benchmark - confirm).
symbol = ["AAPL", "MSFT", "AMZN", "^IXIC"]
# NOTE(review): presumably the look-back range of the download - confirm in lib.
period = "5d"
# granularity of the historical data
interval = "1m"
hdfs_path = "hdfs://0.0.0.0:19000"

# Use a dedicated loop variable: iterating with `for symbol in symbol` would
# rebind `symbol` to its last element ("^IXIC"), so any cell that reads
# `symbol` afterwards without redefining it would get a string, not the list.
for ticker in symbol:
    sparkStructuredStreaming.history().to_hdfs(ticker, interval, period, sqlContext, hdfs_path)

## 2. Backtesting

### 2.1 Momentum Strategy

In [5]:
# --- Portfolio -------------------------------------------------------------
# tickers in the depot
symbol = ["AAPL", "MSFT", "AMZN"]
# how the start capital is distributed between the stocks
share = [0.4, 0.2, 0.4]
# start capital
startCap = 10000.0

# --- Strategy --------------------------------------------------------------
# Original note: the strategy looks at the last 120 minutes to calculate the
# average. NOTE(review): the list carries 10, 120 and 500 after the strategy
# name - confirm in lib how each value is used.
strategy = ["momentum", 10, 120, 500]
# every strategy that is tested needs its own depot id
depotId = 1

# --- Market assumptions ----------------------------------------------------
# regulatory trading fee
commission = 0.000119
# risk-free market return; 0.1% is assumed here, the right value is not really clear
risk_free = 0.001

# --- Data source -----------------------------------------------------------
# granularity of the historical data
interval = "1m"
hdfs_path = "hdfs://0.0.0.0:19000"

b = sparkStructuredStreaming.backtest()

# Backtest run, left disabled as in the original cell:
#performance = b.performance(depotId, symbol, share, startCap, commission, risk_free, strategy, interval, hdfs_path, sqlContext)

In [11]:
# Load the parquet data stored under <hdfs_path>/performance, sorted by depot,
# and print up to 30 rows with cell contents truncated to 10 characters.
hdfs_path = "hdfs://0.0.0.0:19000"
df_performance = (
    sqlContext.read
    .load(hdfs_path + "/performance", format='parquet')
    .orderBy("DepotId")
)
df_performance.show(n=30, truncate=10)

+-------+----------+----------+----------+-------------+----------+----------+------+--------------------+------------------+
|DepotId|     Value|     Alpha|      Beta|Start-Capital|    Profit|Start-Date|Trades|Performance_Strategy|Performance_Nasdaq|
+-------+----------+----------+----------+-------------+----------+----------+------+--------------------+------------------+
|      1|10093.3...|1.68362...|0.55727...|      10000.0|93.3532...|2020-05-07|  8208|          0.93353...|        -1.3645...|
|      2|9854.51...|-0.6767...|0.57741...|      10000.0|-145.48...|2020-05-07|  2188|          -1.4548...|        -1.3645...|
|      3|9914.16...|-0.1697...|0.51299...|      10000.0|-85.833...|2020-05-07|   929|          -0.8583...|        -1.3645...|
|      4|10007.2...|1.39698...|1.09888...|      10000.0|7.29309...|2020-05-07|    29|          0.07293...|        -1.2027...|
+-------+----------+----------+----------+-------------+----------+----------+------+--------------------+------------------+

In [12]:
# Load the parquet data stored under <hdfs_path>/depot; the frame itself is
# kept unsorted, only the printed view (up to 20 rows) is ordered by depot.
df_depot = sqlContext.read.load(hdfs_path + "/depot", format='parquet')
df_depot.orderBy("DepotId").show(n=20)

+-------+-------------+------------+------------------+---------------+
|DepotId|Start-Caputal|    Strategy|              ISIN|          Share|
+-------+-------------+------------+------------------+---------------+
|      1|      10000.0|  momentum10|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
|      2|      10000.0| momentum120|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
|      3|      10000.0| momentum500|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
|      4|      10000.0|Buy and Hold|[AAPL, MSFT, AMZN]|[0.4, 0.2, 0.4]|
+-------+-------------+------------+------------------+---------------+



## 3. Trading Simulation / Performance Evaluation with Real-Time Streams (Spark Streaming)

### Stream real-time quotes from a Kafka topic

In [6]:
# Elasticsearch index name handed to the simulation
index = "trading"

r = sparkStructuredStreaming.realtime()

# The realtime object's `momentum` attribute paired with 10 - the momentum
# setting that worked best in backtesting.
momentum_str = [r.momentum, 10]

# Reuses symbol, share, startCap and commission from the backtesting
# configuration cell; that cell must have been run first.
r.realtime_simulation_es(
    symbol,
    share,
    startCap,
    commission,
    momentum_str,
    index,
    sqlContext,
)

2020-05-13 20:00:00 9999.992384000001 9999.992384000001


KeyboardInterrupt: 