In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
import datetime
import sys
from lib import sparkStructuredStreaming
import os
%matplotlib inline
from matplotlib import pyplot as plt
from pyspark.sql.window import Window

### Set-up to stream from Kafka topic + read and write from/to Elasticsearch

In [2]:
# Extra Maven packages handed to PySpark: the Kafka source for Spark SQL and the
# Elasticsearch-Spark connector (both built for Scala 2.11).
submit_packages = ",".join([
    "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5",
    "org.elasticsearch:elasticsearch-spark-20_2.11:7.6.2",
])
os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages " + submit_packages + " pyspark-shell"

In [3]:
# Kafka bootstrap server as "host:port". Defaults to the local broker
# "127.0.0.1:9092"; export KAFKA_BOOTSTRAP_SERVERS (e.g. "10.0.0.8:9092" for BACC)
# to point at another cluster without editing the notebook.
bootstrap = os.environ.get("KAFKA_BOOTSTRAP_SERVERS", "127.0.0.1:9092")

In [4]:
# Local Spark session ("local[*]" master) shared by every cell below.
spark = (
    SparkSession.builder
    .appName("KafkaIEXStructuredStreaming")
    .master("local[*]")
    .getOrCreate()
)

# SQLContext's first positional argument is a SparkContext, not a SparkSession.
# Pass the session's context and the session itself so the SQLContext wraps
# exactly this session instead of relying on the session's private attributes.
sqlContext = SQLContext(spark.sparkContext, spark)

## 1. Historical Data

#### Read historical data from Yahoo Finance and write it into HDFS

In [9]:
# Tickers whose history is fetched and written to HDFS.
symbol = ["AAPL", "MSFT", "AMZN", "^IXIC"]
period = "5d"
interval = "1m"
hdfs_path = "hdfs://0.0.0.0:19000"

# Use a separate loop variable: iterating with `for symbol in symbol` would
# overwrite the ticker list with its last element and break a re-run of this cell.
for ticker in symbol:
    sparkStructuredStreaming.history().to_hdfs(ticker, interval, period, sqlContext, hdfs_path)

## 2. Backtesting

### 2.1 Momentum Strategy

In [5]:
# --- data selection ---------------------------------------------------------
symbol = ["AAPL", "MSFT", "AMZN"]
interval = "1m"
hdfs_path = "hdfs://0.0.0.0:19000"

# --- strategy parameters ----------------------------------------------------
momentum = 120

# --- depot parameters -------------------------------------------------------
depotId = 1
startCap = 10000
# NOTE(review): four weights are given for three tickers — confirm how
# `depot` interprets the extra entry.
share = [0.3, 0.2, 0.3, 0.2]
commission = 0
# NOTE(review): unit of risk_free (presumably percent) is not visible here — confirm.
risk_free = 2

# --- run the backtest -------------------------------------------------------
b = sparkStructuredStreaming.backtest()
position = b.momentum_portfolio_position(symbol, interval, momentum, sqlContext, hdfs_path)
depot = b.depot(depotId, symbol, share, position, startCap, commission, risk_free)

#### TODO: write results into HDFS, try out different strategies/stocks, and use the best one on real-time data

In [8]:
# Print the backtest summary stored in `depot`: entries 0-5 on the first line
# (followed by a percent sign), entries 6 and 7 (beta, alpha) on the second.
headline_labels = ("wert : ", "start-capital : ", "profit : ", "start-date : ", "trades : ", "performance : ")
headline_parts = []
for entry_index, entry_label in enumerate(headline_labels):
    headline_parts += [entry_label, depot[entry_index]]
print(*headline_parts, "%")

beta_value, alpha_value = depot[6], depot[7]
print("beta : ", beta_value, "alpha : ", alpha_value)

wert :  10509.339999999995 start-capital :  10000 profit :  509.3399999999947 start-date :  2020-05-04 15:31:00 trades :  2934 performance :  5.093399999999937 %
beta :  0.4161927893979215 alpha :  1.1696561072945306


## 3. Trading Simulation / Performance Evaluation with Real-Time Streams (Spark Streaming)

### Stream real time quotes from Kafka topic

In [None]:
# Use this for Elasticsearch, otherwise it won't recognize the date field: the
# numeric `latestUpdate` value (treated as epoch milliseconds) is rendered as a
# "%Y-%m-%dT%H:%M:%S" string.
# NOTE(review): the fixed 2 h shift is applied before a local-time conversion, so
# the result depends on the host time zone — presumably tuned for a UTC+2 machine; confirm.
KAFKA_TS_OFFSET_MS = 7200000


def _format_kafka_timestamp(epoch_ms):
    """Format an epoch-millisecond value as "%Y-%m-%dT%H:%M:%S"; None stays None."""
    if epoch_ms is None:
        # A quote without `latestUpdate` must not raise inside the UDF.
        return None
    shifted = datetime.datetime.fromtimestamp((epoch_ms - KAFKA_TS_OFFSET_MS) / 1000.0)
    return shifted.strftime("%Y-%m-%dT%H:%M:%S")


get_datetime_kafka = udf(_format_kafka_timestamp)

# Kafka-backed stream of quotes, read through the project helper.
sss = sparkStructuredStreaming.kafka_spark_stream(bootstrap)

parsedDF = sss.stream_quotes(spark)

# Expand the fields of `quote_data` into top-level columns and add the formatted
# `date` column derived from `latestUpdate`.
selectDF_es = (
    parsedDF
    .select(explode(array("quote_data")))
    .select("col.*", get_datetime_kafka("col.latestUpdate").cast("String").alias("date"))
)

In [None]:
def moving_average(spark, df, update, interval):
    """Compute a simple moving average of ``latestPrice`` over time windows.

    Args:
        spark: Session object whose ``sql`` method runs the aggregation query.
        df: DataFrame exposing ``timestamp`` and ``latestPrice`` columns.
        update: Passed to ``window`` as its third argument (slide duration).
        interval: Passed to ``window`` as its second argument (window duration).

    Returns:
        DataFrame with the columns ``time`` (the window) and ``average``
        (mean ``latestPrice`` within that window).

    Side effects:
        Creates or replaces the temporary view ``windowdf_sql``.
    """
    price_by_window = df.select(window(df["timestamp"], interval, update), df["latestPrice"])

    # The SQL below refers to the frame through this temp view name.
    price_by_window.createOrReplaceTempView("windowdf_sql")

    return spark.sql("""SELECT windowdf_sql.window AS time, avg(windowdf_sql.latestPrice) AS average
                    FROM windowdf_sql
                    Group BY windowdf_sql.window
                    """)

## 4. Visualize results, either here or write results into Elasticsearch -> Kibana