In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
import datetime
import sys
from lib import sparkStructuredStreaming
import os

### Set-up to stream from Kafka topic + read and write from/to Elasticsearch

In [2]:
# Extra packages handed to spark-submit: the Kafka SQL source and the
# Elasticsearch Spark connector (both built for Scala 2.11).
os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join([
    "--packages",
    ",".join([
        "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5",
        "org.elasticsearch:elasticsearch-spark-20_2.11:7.6.2",
    ]),
    "pyspark-shell",
])

In [3]:
#"127.0.0.1:9092" (local) //"10.0.0.8:9092" (BACC)
# Kafka bootstrap server as "host:port". Set the KAFKA_BOOTSTRAP_SERVERS
# environment variable to switch brokers without editing the notebook; the
# local broker stays the default.
bootstrap = os.environ.get("KAFKA_BOOTSTRAP_SERVERS", "127.0.0.1:9092")

In [5]:
# Create (or reuse) the local Spark session used by the cells below.
spark = (
    SparkSession.builder
    .appName("KafkaIEXStructuredStreaming")
    .master("local[*]")
    .getOrCreate()
)

# SQLContext takes a SparkContext as its first argument and, optionally, the
# SparkSession as its second. Passing the session in the first slot only worked
# by accident, so hand over both explicitly.
sqlContext = SQLContext(spark.sparkContext, spark)

## 1. Implement Strategy

In [157]:
def dummy_strategy(close):
    """Map a single numeric value to a trading action using fixed thresholds.

    The value that is compared is the function's own argument; the body does
    not depend on any notebook-level variable.

    Args:
        close: Numeric value to evaluate. The 10 / 15 thresholds are phrased
            in terms of a price/earnings ratio, so callers presumably should
            pass close / EPS rather than a raw closing price -- TODO confirm.

    Returns:
        "buy" when the value is below 10, "sell" when it is above 15,
        otherwise "hold" (both bounds inclusive).
    """
    # Keep the parameter name for existing callers; alias it for readability.
    pe_ratio = close

    if pe_ratio < 10:
        return "buy"
    if pe_ratio > 15:
        return "sell"
    return "hold"

## 2. Backtesting on static DataFrames with historical data from Yahoo Finance (PySpark)

### 2.1 Read historical data from Yahoo Finance and write it into Elasticsearch

In [6]:
from lib import history

# Request parameters for the historical quotes: ticker, bar size, look-back span.
symbol, interval, period = "AAPL", "60m", "5d"

# Project helper that pulls the history and writes it to Elasticsearch.
h = history.history()
h.to_es(symbol, interval, period, sqlContext)

### 2.2 Read historical data from Elasticsearch into Spark Dataframe and evaluate Strategy with it

In [9]:
# Read the stored history for `symbol` / `interval` back through the project's
# history helper and preview the first 10 rows.
df = h.from_es(symbol,interval,spark)
df.show(10)

+------+------+------+------+-------+-------------------+------+
| Close|  High|   Low|  Open| Volume|               date|symbol|
+------+------+------+------+-------+-------------------+------+
|288.33|288.55|283.96|284.73|9996261|2020-04-29 13:30:00|  AAPL|
|287.53|289.67|287.25|288.33|4689060|2020-04-29 14:30:00|  AAPL|
|287.11|287.88|286.73|287.56|2832405|2020-04-29 15:30:00|  AAPL|
|287.16|288.02|286.99|287.18|2503534|2020-04-29 16:30:00|  AAPL|
|287.36| 287.7|286.33|287.17|2768467|2020-04-29 17:30:00|  AAPL|
| 288.7|288.85|287.01|287.39|3338954|2020-04-29 18:30:00|  AAPL|
|287.83|289.33|287.33| 288.7|3340075|2020-04-29 19:30:00|  AAPL|
| 292.2|293.32|288.46|289.96|9652057|2020-04-30 13:30:00|  AAPL|
|291.06|292.45|290.35|292.21|3630126|2020-04-30 14:30:00|  AAPL|
|289.49| 291.4|288.88|291.03|4048306|2020-04-30 15:30:00|  AAPL|
+------+------+------+------+-------+-------------------+------+
only showing top 10 rows



In [33]:
from yahoofinancials import YahooFinancials

# Earnings per share (EPS), intended as the divisor for a price/earnings ratio.
# NOTE(review): this rebinds the notebook-level `symbol` to "FB", while `df`
# was loaded with the earlier value "AAPL" -- confirm which ticker the EPS
# should belong to before dividing AAPL closing prices by it.
symbol = "FB"
eps = YahooFinancials(symbol).get_earnings_per_share()
print(eps)
# The string literal below is disabled draft code (a UDF dividing "Close" by
# `eps`). Being the last expression of the cell, it is echoed as cell output.
'''udf_pe_ratio = udf(lambda x : x/eps)
df = df.select("date","Close",udf_pe_ratio("Close").alias("pe_ratio"))
df.orderBy("pe_ratio").show(20)'''

7.2880003567661875


'udf_pe_ratio = udf(lambda x : x/eps)\ndf = df.select("date","Close",udf_pe_ratio("Close").alias("pe_ratio"))\ndf.orderBy("pe_ratio").show(20)'

## 3. Trading Simulation / Performance Evaluation with real-time Streams (Spark Streaming)

### Stream real time quotes from Kafka topic

In [None]:
# Use this for Elasticsearch, otherwise it won't recognize the date field.
def _kafka_millis_to_iso(x):
    """Format an epoch-milliseconds value as "YYYY-MM-DDTHH:MM:SS".

    Args:
        x: Timestamp in milliseconds since the epoch, or None.

    Returns:
        The formatted string, or None when `x` is None so that a missing
        timestamp becomes a null column value instead of failing the UDF.
    """
    if x is None:
        return None
    # 7200000 ms = 2 hours; presumably a fixed time-zone correction -- TODO confirm.
    # fromtimestamp() interprets the value in the worker's local time zone.
    shifted_seconds = (x - 7200000) / 1000.0
    return datetime.datetime.fromtimestamp(shifted_seconds).strftime("%Y-%m-%dT%H:%M:%S")


get_datetime_kafka = udf(_kafka_millis_to_iso)

# Project streaming helper bound to the configured Kafka broker.
sss = sparkStructuredStreaming.kafka_spark_stream(bootstrap)

# Result of the helper's stream_quotes() for this Spark session.
parsedDF = sss.stream_quotes(spark)

# Wrap "quote_data" in an array and explode it into a "col" column, then expand
# its fields and add a string "date" derived from "latestUpdate".
selectDF_es = (
    parsedDF
    .select(explode(array("quote_data")))
    .select(
        "col.*",
        get_datetime_kafka("col.latestUpdate").cast("String").alias("date"),
    )
)

## 4. Visualize results, either here with Plotly or write results into Elasticsearch -> Kibana

### Test functions

In [7]:
def time_chart(df, interval):
    """Build per-window high / low / open rows from a quote DataFrame.

    `df` must provide "timestamp" and "latestPrice" columns (the original
    note also lists "Watermark"). Rows are grouped into time windows of
    length `interval` on `timestamp`. Each window yields the maximum
    "latestPrice" as "high", the minimum as "low" and the earliest
    "timestamp" as "open_time". The aggregate is then left-joined back onto
    `df` where "open_time" equals "timestamp", and the matched "latestPrice"
    is renamed to "open".

    Args:
        df: Input DataFrame of quotes.
        interval: Window duration string, also used as the watermark delay.

    Returns:
        The joined DataFrame without the "open_time" and "timestamp" columns.
    """
    # `max` / `min` are the Spark SQL aggregates from the notebook's star
    # import of pyspark.sql.functions, not the Python builtins.
    bucket = window(df.timestamp, interval)
    per_window = df.groupBy(bucket).agg(
        max("latestPrice").alias("high"),
        min("latestPrice").alias("low"),
        min("timestamp").alias("open_time"),
    )
    interval_values = per_window.select(
        "window.start", "window.end", "high", "low", "open_time"
    ).withWatermark("start", interval)

    # Look up the price at the earliest timestamp of each window.
    opens_at = interval_values.open_time == df.timestamp
    joined = interval_values.join(df, opens_at, "left")
    return joined.drop("open_time", "timestamp").withColumnRenamed("latestPrice", "open")

In [8]:
def moving_average(spark, df, update, interval):
    """Simple moving average of "latestPrice" over sliding time windows.

    Windows have length `interval` and slide every `update`, both evaluated
    on `df.timestamp`. The windowed projection is registered as the temp view
    "windowdf_sql" (replacing any view of that name) and averaged per window
    with Spark SQL.

    Args:
        spark: Session used to run the SQL query.
        df: DataFrame with "timestamp" and "latestPrice" columns.
        update: Slide duration string.
        interval: Window duration string.

    Returns:
        DataFrame with columns "time" (the window) and "average".
    """
    sliding = window(df.timestamp, interval, update)
    df.select(sliding, df.latestPrice).createOrReplaceTempView("windowdf_sql")

    query = """SELECT windowdf_sql.window AS time, avg(windowdf_sql.latestPrice) AS average
                    FROM windowdf_sql
                    Group BY windowdf_sql.window
                    """
    return spark.sql(query)