In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
import datetime
import sys
from lib import sparkStructuredStreaming
import os
%matplotlib inline
from matplotlib import pyplot as plt
from pyspark.sql.window import Window

### Set-up to stream from Kafka topic + read and write from/to Elasticsearch

In [3]:
# Extra JVM packages handed to the PySpark launcher: the Kafka source for
# Structured Streaming and the Elasticsearch connector.
os.environ['PYSPARK_SUBMIT_ARGS'] = " ".join((
    "--packages",
    ",".join((
        "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5",
        "org.elasticsearch:elasticsearch-spark-20_2.11:7.6.2",
    )),
    "pyspark-shell",
))

In [4]:
# Kafka bootstrap server address handed to sparkStructuredStreaming.kafka_spark_stream.
# Use "127.0.0.1:9092" for a local broker or "10.0.0.8:9092" for BACC.
bootstrap = "127.0.0.1:9092"

In [5]:
# Local Spark session on all available cores; getOrCreate() reuses a session
# that already exists in this kernel instead of building a second one.
spark = (
    SparkSession.builder
    .appName("KafkaIEXStructuredStreaming")
    .master("local[*]")
    .getOrCreate()
)

# SQLContext's first positional argument is a SparkContext, not a SparkSession.
# Pass the session's context explicitly and hand over the session itself so the
# SQLContext wraps exactly this session rather than relying on duck-typing.
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark)

## 1. Historical Data

#### Read historical data from Yahoo Finance and write it to HDFS

In [5]:
# Tickers whose history is downloaded and stored.
symbol=["AAPL","MSFT","AMZN","^IXIC"]
# Look-back period and bar granularity forwarded to the history helper.
period="5d"
interval="1m"
hdfs_path = "hdfs://0.0.0.0:19000"

# Use a distinct loop variable: iterating with `for symbol in symbol` would
# rebind the list to its last element (a string), so re-running the loop or
# reusing `symbol` afterwards would iterate over single characters.
for ticker in symbol:
    sparkStructuredStreaming.history().to_hdfs(ticker, interval, period, sqlContext, hdfs_path)

## 2. Backtesting

### 2.1 Momentum Strategy

In [5]:
# Stocks traded in the backtest.
symbol=["AAPL","MSFT","AMZN"]
# Strategy name followed by the parameter values to backtest.
# NOTE(review): presumably each number is a moving-average look-back in minutes
# (the depot table lists "momentum100" ... "momentum800") - confirm in lib.
strategy = ["momentum",100,110,120,130,140,800]
# Granularity of the historical data.
interval="1m"
hdfs_path = "hdfs://0.0.0.0:19000"
# Start capital.
startCap = 10000.0
# Distribution of the start capital between stocks.
# NOTE(review): four weights are given for three symbols - confirm how the
# library interprets the extra entry.
share = [0.3,0.2,0.3,0.2]
# Regulatory trading fee.
commission = 0.000119
# When testing different strategies, each one needs its own id.
depotId = 8
# Risk-free market return; 0.1% is an assumption and not well established.
risk_free = 0.001

b = sparkStructuredStreaming.backtest()

# Run the backtest with the configuration above; the results are read back
# from hdfs_path + "/performance" in the following cell.
performance = b.performance(depotId, symbol, share, startCap, commission, risk_free, strategy, interval, hdfs_path, sqlContext)

In [6]:
# Load the stored backtest performance table and preview the first eight rows.
df_performance = sqlContext.read.parquet("{}/performance".format(hdfs_path))
df_performance.show(8)

+-------+------------------+-------------------+--------------------+-------------+------------------+----------+------+--------------------+------------------+
|DepotId|             Value|              Alpha|                Beta|Start-Capital|            Profit|Start-Date|Trades|Performance_Strategy|Performance_Nasdaq|
+-------+------------------+-------------------+--------------------+-------------+------------------+----------+------+--------------------+------------------+
|      1| 10367.81868400003| 0.8289415795010102|  0.4231281977569605|      10000.0|367.81868400003077|2020-05-04|  2364|   3.678186840000297| 6.711052086563263|
|      2| 10309.53392800002|0.17485787444046785| 0.43376936269263766|      10000.0|309.53392800002075|2020-05-04|  2488|   3.095339280000209| 6.711052086563263|
|      3|10539.946582000004| 2.4291610041627716| 0.44121192048364566|      10000.0| 539.9465820000041|2020-05-04|  3222|   5.399465820000038| 6.711052086563263|
|      4|10529.090374000012| 2.383

In [10]:
# Load the stored depot table and preview the first eight rows by DepotId.
df_depot = sqlContext.read.parquet("{}/depot".format(hdfs_path))
df_depot.sort("DepotId").show(8)

+-------+-------------+------------+------------------+--------------------+
|DepotId|Start-Caputal|    Strategy|              ISIN|               Share|
+-------+-------------+------------+------------------+--------------------+
|      1|      10000.0| momentum100|[AAPL, MSFT, AMZN]|[0.3, 0.2, 0.3, 0.2]|
|      2|      10000.0| momentum110|[AAPL, MSFT, AMZN]|[0.3, 0.2, 0.3, 0.2]|
|      3|      10000.0| momentum120|[AAPL, MSFT, AMZN]|[0.3, 0.2, 0.3, 0.2]|
|      4|      10000.0| momentum130|[AAPL, MSFT, AMZN]|[0.3, 0.2, 0.3, 0.2]|
|      5|      10000.0| momentum140|[AAPL, MSFT, AMZN]|[0.3, 0.2, 0.3, 0.2]|
|      6|      10000.0| momentum800|[AAPL, MSFT, AMZN]|[0.3, 0.2, 0.3, 0.2]|
|      7|      10000.0|Buy and Hold|[AAPL, MSFT, AMZN]|[0.3, 0.2, 0.3, 0.2]|
+-------+-------------+------------+------------------+--------------------+



## 3. Trading Simulation / Performance Evaluation with real-time streams (Spark Structured Streaming)

### Stream real time quotes from Kafka topic

In [6]:
def _millis_to_datetime_string(epoch_millis):
    """Format an epoch timestamp in milliseconds as "YYYY-MM-DD HH:MM:SS".

    The conversion uses the local timezone of the Python worker
    (datetime.fromtimestamp). A null input yields None instead of raising a
    TypeError, so a quote without `latestUpdate` does not fail the query.
    """
    if epoch_millis is None:
        return None
    return datetime.datetime.fromtimestamp(epoch_millis / 1000.0).strftime("%Y-%m-%d %H:%M:%S")


# String-returning UDF; the result is cast to a Timestamp column further down.
get_datetime = udf(_millis_to_datetime_string)

sss = sparkStructuredStreaming.kafka_spark_stream(bootstrap)

parsedDF = sss.stream_quotes(spark)

# explode() names its output column "col"; flatten that struct into top-level
# columns and add the parsed update time as "Datetime".
selectDF = parsedDF \
        .select(explode(array("quote_data")))\
        .select("col.*",get_datetime("col.latestUpdate").cast("Timestamp").alias("Datetime"))


In [None]:
# Single-stock configuration for the live simulation.
startCap = 10000.0
symbol = ["AAPL"]
share = [1.0]

# Keep only the quote fields used here and attach the start capital as a
# constant column.
selectDF = selectDF.select("Datetime","symbol","latestPrice").withColumn("startCap",lit(startCap))
# awaitTermination() blocks this cell until the query returned by
# write_console ends; interrupt the kernel to run the cells that follow.
sss.write_console(selectDF).awaitTermination()

In [None]:
def moving_average(spark, df, update, interval, time_col="timestamp", price_col="latestPrice"):
    """Compute a simple moving average of a price column over time windows.

    Args:
        spark: SparkSession used to run the aggregation query.
        df: DataFrame holding a time column and a price column.
        update: slide duration of the window (e.g. "1 minute"), i.e. how
            often a new window starts.
        interval: window duration (e.g. "10 minutes") the average spans.
        time_col: name of the time column in `df`; defaults to "timestamp",
            the column the function originally required.
        price_col: name of the price column in `df`; defaults to "latestPrice".

    Returns:
        DataFrame with the columns `time` (the window) and `average`.

    Side effects:
        Registers (or replaces) the temporary view "windowdf_sql".
    """
    # Alias both columns to fixed names so the SQL below does not depend on
    # the caller's column names.
    windowdf = df.select(
        window(df[time_col], interval, update).alias("window"),
        df[price_col].alias("latestPrice"),
    )

    windowdf.createOrReplaceTempView("windowdf_sql")

    sma = spark.sql("""SELECT windowdf_sql.window AS time, avg(windowdf_sql.latestPrice) AS average
                    FROM windowdf_sql
                    Group BY windowdf_sql.window
                    """)
    return sma

## 4. Visualize results, either here or write results into Elasticsearch -> Kibana