In [31]:
#
# Hedge Fund ML (sandbox/playground/harness)
#
# (c)2022 Guy Resh
#
# - start fund on 1/1/2020 with $1B seed capital
# - +$1B additional seed injections on 1/1/2021 & 1/1/2022
# - tradable securities selected from Dow 30, Nasdaq 100 & S&P 500 (528 unique; less 8 partial)
# - maintain portfolio with 50-100 "best" stocks
# - maintain diversification with 5-10 different industry sectors
# - generate Buy-and-Hold P/L % statistics for EOY 2020, 2021 & YTD 2022 for all 520 securities (long-only)
# - generate "opportunity" (measured move statistics) P/L using fractal-based reversal pivot points for all 520 securities (long-only)
# - maintain 1%-5% minimum monthly profit (stop at second consecutive losing month or if drawdown exceeds 10%)
# - features/strategies based on CCI, DC, KR, LRBO, RSI, VWAP, Half/SuperTrend, Volume, Velocity/Momentum, etc.
# - 0% commissions assumed (though can/should be accounted for at some point)
# - whole share purchases-only (no fractional; round quantities down to nearest 100?)
# - generate portfolio scenarios that rebalance daily, weekly, monthly and quarterly
# - split-handling?
# - dividend income inclusion?
#

In [32]:
import datetime
import json
import math
import os
import sys
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
from finvizfinance.quote import finvizfinance
import pyiqfeed as iq
from pyiqfeed.field_readers import read_posix_ts, date_us_to_datetime, datetime_to_yyyymmdd_hhmmss, us_since_midnight_to_time

from math import floor
from tqdm.notebook import tqdm
from termcolor import colored as cl

plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (20,10)

pd.set_option( 'display.max_rows', None )
pd.set_option( 'display.max_columns', None )
pd.set_option( 'display.width', None )
pd.set_option( 'display.max_colwidth', None )

In [33]:
# Root of the local IQFeed database: Linux uses the /mnt/f mount path, every other platform the F: drive path
home = '/mnt/f/db/IQFeed/' if sys.platform == 'linux' else 'F:/db/IQFeed/'

In [34]:
Dow30Syms = [
"AAPL", "AMGN", "AXP", "BA", "CAT", "CRM", "CSCO", "CVX", "DIS", "DOW",
"GS", "HD", "HON", "IBM", "INTC", "JNJ", "JPM", "KO", "MCD", "MMM",
"MRK", "MSFT", "NKE", "PG", "TRV", "UNH", "V", "VZ", "WBA", "WMT"
]

Nasdaq100Syms = [ # 102
"AAPL", "ABNB", "ADBE", "ADI", "ADP", "ADSK", "AEP", "ALGN", "AMAT", "AMD",
"AMGN", "AMZN", "ANSS", "ASML", "ATVI", "AVGO", "AZN", "BIDU", "BIIB", "BKNG",
"CDNS", "CEG", "CHTR", "CMCSA", "COST", "CPRT", "CRWD", "CSCO", "CSX", "CTAS",
"CTSH", "DDOG", "DLTR", "DOCU", "DXCM", "EA", "EBAY", "EXC", "FAST", "FB",
"FISV", "FTNT", "GILD", "GOOG", "GOOGL", "HON", "IDXX", "ILMN", "INTC", "INTU",
"ISRG", "JD", "KDP", "KHC", "KLAC", "LCID", "LRCX", "LULU", "MAR", "MCHP",
"MDLZ", "MELI", "MNST", "MRNA", "MRVL", "MSFT", "MTCH", "MU", "NFLX", "NTES",
"NVDA", "NXPI", "ODFL", "OKTA", "ORLY", "PANW", "PAYX", "PCAR", "PDD", "PEP",
"PYPL", "QCOM", "REGN", "ROST", "SBUX", "SGEN", "SIRI", "SNPS", "SPLK", "SWKS",
"TEAM", "TMUS", "TSLA", "TXN", "VRSK", "VRSN", "VRTX", "WBA", "WDAY", "XEL",
"ZM", "ZS"
]

SP500Syms = [ # 504
"A", "AAL", "AAP", "AAPL", "ABBV", "ABC", "ABMD", "ABT", "ACN", "ADBE",
"ADI", "ADM", "ADP", "ADSK", "AEE", "AEP", "AES", "AFL", "AIG", "AIZ",
"AJG", "AKAM", "ALB", "ALGN", "ALK", "ALL", "ALLE", "AMAT", "AMCR", "AMD",
"AME", "AMGN", "AMP", "AMT", "AMZN", "ANET", "ANSS", "ANTM", "AON", "AOS",
"APA", "APD", "APH", "APTV", "ARE", "ATO", "ATVI", "AVB", "AVGO", "AVY",
"AWK", "AXP", "AZO", "BA", "BAC", "BAX", "BBWI", "BBY", "BDX", "BEN",
"BF.B", "BIIB", "BIO", "BK", "BKNG", "BKR", "BLK", "BLL", "BMY", "BR",
"BRK.B", "BRO", "BSX", "BWA", "BXP", "C", "CAG", "CAH", "CARR", "CAT",
"CB", "CBOE", "CBRE", "CCI", "CCL", "CDAY", "CDNS", "CDW", "CE", "CEG",
"CERN", "CF", "CFG", "CHD", "CHRW", "CHTR", "CI", "CINF", "CL", "CLX",
"CMA", "CMCSA", "CME", "CMG", "CMI", "CMS", "CNC", "CNP", "COF", "COO",
"COP", "COST", "CPB", "CPRT", "CPT", "CRL", "CRM", "CSCO", "CSX", "CTAS",
"CTLT", "CTRA", "CTSH", "CTVA", "CTXS", "CVS", "CVX", "CZR", "D", "DAL",
"DD", "DE", "DFS", "DG", "DGX", "DHI", "DHR", "DIS", "DISH", "DLR",
"DLTR", "DOV", "DOW", "DPZ", "DRE", "DRI", "DTE", "DUK", "DVA", "DVN",
"DXC", "DXCM", "EA", "EBAY", "ECL", "ED", "EFX", "EIX", "EL", "EMN",
"EMR", "ENPH", "EOG", "EPAM", "EQIX", "EQR", "ES", "ESS", "ETN", "ETR",
"ETSY", "EVRG", "EW", "EXC", "EXPD", "EXPE", "EXR", "F", "FANG", "FAST",
"FB", "FBHS", "FCX", "FDS", "FDX", "FE", "FFIV", "FIS", "FISV", "FITB",
"FLT", "FMC", "FOX", "FOXA", "FRC", "FRT", "FTNT", "FTV", "GD", "GE",
"GILD", "GIS", "GL", "GLW", "GM", "GNRC", "GOOG", "GOOGL", "GPC", "GPN",
"GRMN", "GS", "GWW", "HAL", "HAS", "HBAN", "HCA", "HD", "HES", "HIG",
"HII", "HLT", "HOLX", "HON", "HPE", "HPQ", "HRL", "HSIC", "HST", "HSY",
"HUM", "HWM", "IBM", "ICE", "IDXX", "IEX", "IFF", "ILMN", "INCY", "INTC",
"INTU", "IP", "IPG", "IPGP", "IQV", "IR", "IRM", "ISRG", "IT", "ITW",
"IVZ", "J", "JBHT", "JCI", "JKHY", "JNJ", "JNPR", "JPM", "K", "KEY",
"KEYS", "KHC", "KIM", "KLAC", "KMB", "KMI", "KMX", "KO", "KR", "L",
"LDOS", "LEN", "LH", "LHX", "LIN", "LKQ", "LLY", "LMT", "LNC", "LNT",
"LOW", "LRCX", "LUMN", "LUV", "LVS", "LW", "LYB", "LYV", "MA", "MAA",
"MAR", "MAS", "MCD", "MCHP", "MCK", "MCO", "MDLZ", "MDT", "MET", "MGM",
"MHK", "MKC", "MKTX", "MLM", "MMC", "MMM", "MNST", "MO", "MOH", "MOS",
"MPC", "MPWR", "MRK", "MRNA", "MRO", "MS", "MSCI", "MSFT", "MSI", "MTB",
"MTCH", "MTD", "MU", "NCLH", "NDAQ", "NDSN", "NEE", "NEM", "NFLX", "NI",
"NKE", "NLOK", "NLSN", "NOC", "NOW", "NRG", "NSC", "NTAP", "NTRS", "NUE",
"NVDA", "NVR", "NWL", "NWS", "NWSA", "NXPI", "O", "ODFL", "OGN", "OKE",
"OMC", "ORCL", "ORLY", "OTIS", "OXY", "PARA", "PAYC", "PAYX", "PCAR", "PEAK",
"PEG", "PENN", "PEP", "PFE", "PFG", "PG", "PGR", "PH", "PHM", "PKG",
"PKI", "PLD", "PM", "PNC", "PNR", "PNW", "POOL", "PPG", "PPL", "PRU",
"PSA", "PSX", "PTC", "PVH", "PWR", "PXD", "PYPL", "QCOM", "QRVO", "RCL",
"RE", "REG", "REGN", "RF", "RHI", "RJF", "RL", "RMD", "ROK", "ROL",
"ROP", "ROST", "RSG", "RTX", "SBAC", "SBNY", "SBUX", "SCHW", "SEDG", "SEE",
"SHW", "SIVB", "SJM", "SLB", "SNA", "SNPS", "SO", "SPG", "SPGI", "SRE",
"STE", "STT", "STX", "STZ", "SWK", "SWKS", "SYF", "SYK", "SYY", "T",
"TAP", "TDG", "TDY", "TECH", "TEL", "TER", "TFC", "TFX", "TGT", "TJX",
"TMO", "TMUS", "TPR", "TRMB", "TROW", "TRV", "TSCO", "TSLA", "TSN", "TT",
"TTWO", "TWTR", "TXN", "TXT", "TYL", "UA", "UAA", "UAL", "UDR", "UHS",
"ULTA", "UNH", "UNP", "UPS", "URI", "USB", "V", "VFC", "VLO", "VMC",
"VNO", "VRSK", "VRSN", "VRTX", "VTR", "VTRS", "VZ", "WAB", "WAT", "WBA",
"WBD", "WDC", "WEC", "WELL", "WFC", "WHR", "WM", "WMB", "WMT", "WRB",
"WRK", "WST", "WTW", "WY", "WYNN", "XEL", "XOM", "XRAY", "XYL", "YUM",
"ZBH", "ZBRA", "ZION", "ZTS"
]

# Merge the three index constituent lists into a single sorted list without duplicates
allIndexSymbols = Dow30Syms + Nasdaq100Syms + SP500Syms
uniqueSymbols = sorted( set( allIndexSymbols ))

#
# Exclude symbols whose 5-minute history starts after 1/1/2020 (IPO'd later?).
# First available bar per cached file:
#
#ABNB.pkl: 2020-12-10 13:40:00
#CARR.pkl: 2020-03-19 15:45:00
#CEG.pkl: 2022-01-19 10:25:00
#LCID.pkl: 2020-09-18 09:40:00
#OGN.pkl: 2021-05-14 11:35:00
#OTIS.pkl: 2020-03-19 11:40:00
#VTRS.pkl: 2020-11-12 09:35:00
#WBD.pkl: 2022-04-04 09:35:00
partialSymbols = ["ABNB","CARR","CEG","LCID","OGN","OTIS","VTRS","WBD"]
for symbol in partialSymbols:
  # list.remove raises ValueError if a partial symbol is not (or no longer) part of the universe
  uniqueSymbols.remove( symbol )

print( len( uniqueSymbols ), 'unique symbols' ) # 528-8
#print( list( uniqueSymbols ))

520 unique symbols


In [35]:
#
# Download/persist (some) fundamental data from finviz
#
# The finviz download only runs when the JSON cache file is missing; dfFundamentals is
# always (re)loaded from that cache file afterwards.
fundamentals = {}
fn_json = 'data/fundamentals.json'
if not os.path.isfile( fn_json ):
  for symbol in uniqueSymbols:
    finvizSymbol = symbol.replace( '.', '-' ) # the lookup is made with '-' in place of '.' (e.g. BRK.B -> BRK-B)
    try:
      finvizFundamentals = finvizfinance( finvizSymbol ).ticker_fundament()
      fundamentals[symbol] = {
        'Company':   finvizFundamentals['Company'],
        'Sector':    finvizFundamentals['Sector'],
        'Industry':  finvizFundamentals['Industry'],
        'MarketCap': finvizFundamentals['Market Cap']
      }
    except Exception as e:
      # Best-effort download: report and skip the symbol. Carrying on with the previous
      # iteration's result would file another company's data under this symbol.
      # NOTE: a skipped symbol stays missing until the cache file is deleted and rebuilt.
      print( 'WARNING: no finviz fundamentals for', symbol, '->', repr( e ))
      continue

  dfFundamentals = pd.DataFrame.from_dict( fundamentals, orient="index" )

  # Round-trip through json.loads/json.dumps only to get an indented, human-readable file
  jsonObj = json.loads( dfFundamentals.to_json( orient="index" ))
  jsonFundamentals = json.dumps( jsonObj, indent=2 )
  print( jsonFundamentals )
  os.makedirs( os.path.dirname( fn_json ), exist_ok=True ) # the cache directory may not exist yet
  with open( fn_json, "w" ) as f:
    f.write( jsonFundamentals )

dfFundamentals = pd.read_json( fn_json, orient="index" )
print( "[", len( dfFundamentals ), "] fundamental data loaded..." )

[ 520 ] fundamental data loaded...


In [36]:
#
# Download 5 minute bars from IQFeed from bgn_prd to end_prd and return as a Pandas DataFrame
#
def get_historical_data( symbol, bgn_prd: datetime.datetime, end_prd: datetime.datetime ):
  """
  Download 5-minute bars for `symbol` from IQFeed and return them as a DataFrame.

  symbol  : ticker passed to IQFeed as-is
  bgn_prd : start of the requested period
  end_prd : end of the requested period

  Only bars between 09:30:00 and 16:00:00 are requested, in ascending order.

  Returns a DataFrame indexed by 'datetime' (bar date + bar time) with the columns
  open, high, low, close and volume (the per-bar volume, 'prd_vlm').

  Any exception raised by the request propagates to the caller; the listener and
  the connection are released in every case.
  """

  print( 'get_historical_data(', symbol, ',', datetime_to_yyyymmdd_hhmmss( bgn_prd ), ',', datetime_to_yyyymmdd_hhmmss( end_prd ), ')' )

  histConn = iq.HistoryConn( name="pyiqfeed" )
  histListener = iq.VerboseIQFeedListener( 'History Tick Listener' )
  histConn.connect()
  try:
    histConn.add_listener( histListener )

    # dtype([('date', '<M8[D]'), ('time', '<m8[us]'), ('open_p', '<f8'), ('high_p', '<f8'), ('low_p', '<f8'), ('close_p', '<f8'), ('tot_vlm', '<u8'), ('prd_vlm', '<u8'), ('num_trds', '<u8')])
    bars = histConn.request_bars_in_period(
      ticker = symbol,
      interval_len = 300, # 5 min bars
      interval_type = 's',
      bgn_prd = bgn_prd,
      end_prd = end_prd,
      bgn_flt = datetime.time.fromisoformat( '09:30:00' ),
      end_flt = datetime.time.fromisoformat( '16:00:00' ),
      ascend = True,
      max_bars = None,
      label_at_beginning = False, # NOTE(review): bars are presumably stamped with the interval end - confirm
      timeout = 30
    )
  finally:
    # Release the connection even when the request fails (e.g. timeout), so that a
    # failing symbol does not leave a dangling IQFeed connection behind
    histConn.remove_listener( histListener )
    histConn.disconnect()

  df = pd.DataFrame( bars )
  df['datetime'] = df['date'] + df['time']
  df = (
    df.drop( columns=['date', 'time', 'tot_vlm', 'num_trds'] )
      .rename( columns={'open_p': 'open', 'high_p': 'high', 'low_p': 'low', 'close_p': 'close', 'prd_vlm': 'volume'} )
      .set_index( 'datetime' )
  )

  return df

In [37]:
#
# Generate/persist and load 5-minute OHLCV data from IQFeed for all 520 symbols
#
ohlcv = {}
bgn_prd = datetime.datetime( year=2020, month=1, day=1, hour=0, minute=0, second=0 )
# NOTE: the end of the period is "now" at download time, so a cached file ends whenever it was first downloaded
end_prd = datetime.datetime.now()

dir5min = 'data/5min/'
os.makedirs( dir5min, exist_ok=True ) # without the directory the first to_pickle() would fail

for symbol in tqdm( uniqueSymbols, leave=False ):

  fn_pkl = dir5min + symbol + '.pkl'

  if not os.path.isfile( fn_pkl ): # only download if we don't already have the data locally
    get_historical_data( symbol, bgn_prd, end_prd ).to_pickle( fn_pkl )

  # Always load from the pickle so that fresh and cached runs go through the same path
  ohlcv[symbol] = pd.read_pickle( fn_pkl )

print( "[", len( ohlcv ), "] 5-minute data loaded..." )

  0%|          | 0/520 [00:00<?, ?it/s]

[ 520 ] 5-minute data loaded...


In [38]:
#
# Generate and load hourly, daily and weekly data from 5 minute OHLCV data for all 520 symbols
#
ohlcvH = {}
ohlcvD = {}
ohlcvW = {}
# Per-column aggregation used when several 5-minute bars are merged into one larger bar
how = {
  'open': 'first',
  'high': 'max',
  'low': 'min',
  'close': 'last',
  'volume': 'sum'
}

def _load_or_resample( symbol, rule, subdir ):
  """
  Return the `rule`-resampled OHLCV frame of `symbol`, using data/<subdir>/<symbol>.pkl as cache.

  symbol : key into the 5-minute `ohlcv` dict
  rule   : pandas resample rule ('1h', '1d', '1w')
  subdir : cache sub-directory below data/ ('hourly', 'daily', 'weekly')

  When the cache file is missing, the 5-minute bars are resampled with `how`, rows
  containing a NaN (bins without any bar) are dropped and the result is pickled.
  The frame is always read back from the pickle.

  NOTE(review): resample() is called with its default bin edges/labels; if the 5-minute
  bars are stamped with their interval end, the bar stamped exactly on a bin edge lands
  in the following bin - confirm that this is intended.
  """
  fn_pkl = 'data/' + subdir + '/' + symbol + '.pkl'

  if not os.path.isfile( fn_pkl ): # only resample if we don't already have the data locally
    os.makedirs( os.path.dirname( fn_pkl ), exist_ok=True )
    ohlcv[symbol].resample( rule, offset=0 ).apply( how ).dropna().to_pickle( fn_pkl )

  return pd.read_pickle( fn_pkl )

for symbol in tqdm( uniqueSymbols, leave=False ):
  ohlcvH[symbol] = _load_or_resample( symbol, '1h', 'hourly' )
  ohlcvD[symbol] = _load_or_resample( symbol, '1d', 'daily' )
  ohlcvW[symbol] = _load_or_resample( symbol, '1w', 'weekly' )

print( "[", len( ohlcvH ), "] hourly data loaded..." )
print( "[", len( ohlcvD ), "] daily data loaded..." )
print( "[", len( ohlcvW ), "] weekly data loaded..." )

  0%|          | 0/520 [00:00<?, ?it/s]

[ 520 ] hourly data loaded...
[ 520 ] daily data loaded...
[ 520 ] weekly data loaded...


In [39]:
#
# Generate fractal/pivot reversal points
#
#
# Four 0/1 columns are added to every 5-minute frame:
#   fh42  / fl42  - the bar's high (low) is strictly above (below) the highs (lows) of the
#                   4 previous and the 2 following bars
#   fh42v / fl42v - same fractal, "verified": measured from the open of the bar right after
#                   the fractal, price moved at least TP hundredths in the reversal direction
#                   on at least one of the bars +2..+6
#
# All four columns look ahead (2 and 6 bars respectively), so they are after-the-fact
# labels, not signals that are known at the time of the fractal bar.
#
TP = 50 # required move in units of .01 of price; TODO: ATR-based?
FRACTAL_NEIGHBOURS = (1, 2, 3, 4, -1, -2) # shift offsets: 4 bars back, 2 bars forward
VERIFY_BARS = (2, 3, 4, 5, 6)             # bars after the fractal that may complete the move
startTime = datetime.datetime.now()

for symbol in tqdm( uniqueSymbols, leave=False ):

  bars = ohlcv[symbol]
  high = bars['high']
  low = bars['low']
  entry = bars['open'].shift(-1) # open of the bar following the fractal bar

  # Comparisons against the NaN produced by shift() at either end of the frame are False,
  # so the first 4 and last 2 bars can never be fractals
  isFractalHigh = np.ones( len( bars ), dtype=bool )
  isFractalLow = np.ones( len( bars ), dtype=bool )
  for k in FRACTAL_NEIGHBOURS:
    isFractalHigh &= (high > high.shift(k)).to_numpy()
    isFractalLow &= (low < low.shift(k)).to_numpy()

  # After a fractal high price has to fall, after a fractal low it has to rise
  movedDown = np.zeros( len( bars ), dtype=bool )
  movedUp = np.zeros( len( bars ), dtype=bool )
  for k in VERIFY_BARS:
    movedDown |= (((entry - low.shift(-k)) / .01) >= TP).to_numpy()
    movedUp |= (((high.shift(-k) - entry) / .01) >= TP).to_numpy()

  bars['fh42'] = np.where( isFractalHigh, 1, 0 )
  bars['fl42'] = np.where( isFractalLow, 1, 0 )
  bars['fh42v'] = np.where( isFractalHigh & movedDown, 1, 0 )
  bars['fl42v'] = np.where( isFractalLow & movedUp, 1, 0 )

deltaTime = datetime.datetime.now() - startTime

print( "[", len( ohlcv ), "] Fractal/Pivot generation elapsed {:.3f}s".format( deltaTime.total_seconds() )) # seconds

  0%|          | 0/520 [00:00<?, ?it/s]

[ 520 ] Fractal/Pivot generation elapsed 19.813s


In [40]:
# Spot-check: the first ten 5-minute AAPL bars
ohlcv['AAPL'].head( 10 )

Unnamed: 0_level_0,open,high,low,close,volume,fh42,fl42,fh42v,fl42v
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-01-02 09:35:00,296.24,296.926,295.19,296.87,1649209,0,0,0,0
2020-01-02 09:40:00,296.93,297.95,296.9,297.27,1071400,0,0,0,0
2020-01-02 09:45:00,297.27,297.71,296.87,297.12,674630,0,0,0,0
2020-01-02 09:50:00,297.13,297.42,296.8,296.83,536942,0,0,0,0
2020-01-02 09:55:00,296.8,297.3,296.7,297.28,434840,0,0,0,0
2020-01-02 10:00:00,297.3,298.15,297.25,297.97,720271,0,0,0,0
2020-01-02 10:05:00,297.99,298.08,297.51,297.8,524797,0,0,0,0
2020-01-02 10:10:00,297.8,298.41,297.8,298.26,580352,1,0,1,0
2020-01-02 10:15:00,298.2693,298.31,297.3101,297.39,556495,0,0,0,0
2020-01-02 10:20:00,297.3701,297.61,296.93,297.22,527967,0,0,0,0


In [41]:
seedCapital = 1000000000
# Equal-weight allocation: every symbol gets the same share of the seed capital
investmentCapitalPerSymbol = seedCapital / len( uniqueSymbols )
print( 'seedCapital=[ ${:,.2f}'.format( seedCapital ), '] investmentCapitalPerSymbol=[ ${:,.2f}'.format( investmentCapitalPerSymbol ), ']' )

# Buy-and-hold periods: label -> (timestamp of the opening bar, timestamp of the closing bar).
# Each period is evaluated independently with the same per-symbol allocation.
bnhPeriods = {
  '2020': ('2020-01-02 09:35:00', '2020-12-31 16:00:00'),
  '2021': ('2021-01-04 09:35:00', '2021-12-31 16:00:00'),
  '2022': ('2022-01-03 09:35:00', '2022-04-14 16:00:00') # YTD (the close was previously read from 2021-04-14)
}
bnhStats = ('open', 'close', 'pctChg', 'invest', 'PnL')

def _buy_and_hold_stats( bars, bgnBar, endBar, capital ):
  """
  Buy-and-hold figures for one symbol over one period.

  bars    : 5-minute frame with 'open' and 'close' columns indexed by bar timestamp
  bgnBar  : timestamp of the bar whose open is the purchase price
  endBar  : timestamp of the bar whose close is the exit price
  capital : cash available for the purchase

  Returns [open, close, percent change, invested amount, P&L]. Only whole shares are
  bought, so the invested amount is the largest multiple of the open price that fits
  into `capital`. A missing timestamp raises KeyError.
  """
  openPx = bars['open'][bgnBar]
  closePx = bars['close'][endBar]
  pctChg = (closePx - openPx) / openPx
  invested = math.floor( capital / openPx ) * openPx
  return [openPx, closePx, pctChg * 100.0, invested, invested * pctChg]

dictPortfolio = {}
for symbol in tqdm( uniqueSymbols, leave=False ):
  row = []
  for bgnBar, endBar in bnhPeriods.values():
    row += _buy_and_hold_stats( ohlcv[symbol], bgnBar, endBar, investmentCapitalPerSymbol )
  dictPortfolio[symbol] = row
print( end="\r" )

# Columns: open2020, close2020, pctChg2020, invest2020, PnL2020, open2021, ... PnL2022
dfPortfolio = pd.DataFrame.from_dict(
  dictPortfolio, orient='index',
  columns=[ stat + year for year in bnhPeriods for stat in bnhStats ]
)

seedCapital=[ $1,000,000,000.00 ] investmentCapitalPerSymbol=[ $1,923,076.92 ]


  0%|          | 0/520 [00:00<?, ?it/s]



In [46]:
def _top_gainers( portfolio, year, n=50 ):
  """
  Return the `n` rows of `portfolio` with the highest pctChg<year>.

  portfolio : frame indexed by symbol with open<year>, close<year> and pctChg<year> columns
  year      : period suffix as a string, e.g. '2020'
  n         : number of rows to keep

  The result has a fresh 0..n-1 index and the columns symbol, open<year>, close<year>,
  pctChg<year>. `portfolio` itself is left untouched (no in-place sort), so the cell can
  be re-run and does not change what other cells see.
  """
  columns = ['open' + year, 'close' + year, 'pctChg' + year]
  ranked = portfolio.sort_values( by=['pctChg' + year], ascending=False )
  return ranked[columns][0:n].reset_index().rename( columns={'index': 'symbol'} )

df2020Top50 = _top_gainers( dfPortfolio, '2020' )
df2021Top50 = _top_gainers( dfPortfolio, '2021' )
df2022Top50 = _top_gainers( dfPortfolio, '2022' )

with pd.option_context( 'display.width', 1000, 'display.precision', 2 ):
  print( df2020Top50, df2021Top50 )


   symbol  open2020  close2020  pctChg2020
0    ENPH     26.37     175.47      565.42
1    MRNA     19.57     104.47      433.83
2      ZM     68.80     337.16      390.06
3     PDD     38.50     177.67      361.48
4      ZS     46.87     199.71      326.09
5    CRWD     50.03     211.82      323.39
6    ETSY     44.71     177.84      297.76
7    PENN     25.97      86.38      232.61
8    SEDG     97.00     319.12      228.99
9    DOCU     74.31     222.39      199.27
10   MELI    576.94    1674.80      190.29
11   DDOG     38.22      98.42      157.51
12     JD     35.96      87.85      144.30
13   GNRC    101.38     227.42      124.32
14   NVDA    238.75     522.12      118.69
15   OKTA    116.75     254.27      117.79
16   PYPL    109.47     234.31      114.04
17   BBWI     18.25      37.18      103.73
18   MPWR    180.12     366.23      103.33
19    ALB     73.50     147.51      100.69
20    AMD     46.86      91.72       95.73
21    FCX     13.35      26.01       94.83
22   CDNS  

In [None]:
#with pd.option_context( 'display.width', 1000, 'display.precision', 2 ):
#  print( dfPortfolio )

# Buy-and-hold P&L over the whole universe; all percentages are relative to seedCapital
BnHPnL2020All, BnHPnL2021All, BnHPnL2022All = (
  dfPortfolio[pnlColumn].sum() for pnlColumn in ('PnL2020', 'PnL2021', 'PnL2022')
)

print( "\n2020 B&H PnL (All)=[ ${:,.2f}".format( BnHPnL2020All ), "({:.1f}%) ] ".format( (BnHPnL2020All / seedCapital) * 100.0 ))
print( "2021 B&H PnL (All)=[ ${:,.2f}".format( BnHPnL2021All ), "({:.1f}%) ] ".format( (BnHPnL2021All / seedCapital) * 100.0 ))
print( "YTD 2022 B&H PnL (All)=[ ${:,.2f}".format( BnHPnL2022All ), "({:.1f}%) ] ".format( (BnHPnL2022All / seedCapital) * 100.0 ))

# Same figures restricted to the symbols that closed the period with a gain
winners2020 = dfPortfolio.loc[dfPortfolio['pctChg2020'] > 0.0, 'PnL2020']
winners2021 = dfPortfolio.loc[dfPortfolio['pctChg2021'] > 0.0, 'PnL2021']
winners2022 = dfPortfolio.loc[dfPortfolio['pctChg2022'] > 0.0, 'PnL2022']

BnHPnL2020WinnersOnly = winners2020.sum()
BnHPnL2021WinnersOnly = winners2021.sum()
BnHPnL2022WinnersOnly = winners2022.sum()

numBnHPnL2020WinnersOnly = winners2020.count()
numBnHPnL2021WinnersOnly = winners2021.count()
numBnHPnL2022WinnersOnly = winners2022.count()

numSymbols = len( uniqueSymbols )
print( "\n2020 B&H PnL (Winners-only)=[ ${:,.2f}".format( BnHPnL2020WinnersOnly ), "({:.1f}%;".format( (BnHPnL2020WinnersOnly / seedCapital) * 100.0 ), numBnHPnL2020WinnersOnly, 'of', numSymbols, ')]' )
print( "2021 B&H PnL (Winners-only)=[ ${:,.2f}".format( BnHPnL2021WinnersOnly ), "({:.1f}%;".format( (BnHPnL2021WinnersOnly / seedCapital) * 100.0 ), numBnHPnL2021WinnersOnly, 'of', numSymbols, ')]' )
print( "YTD 2022 B&H PnL (Winners-only)=[ ${:,.2f}".format( BnHPnL2022WinnersOnly ), "({:.1f}%;".format( (BnHPnL2022WinnersOnly / seedCapital) * 100.0 ), numBnHPnL2022WinnersOnly, 'of', numSymbols, ')]' )

# Raises SystemExit at this point - presumably to keep a "Run All" from continuing into later cells
sys.exit( 0 )

In [None]:
#
# Refactor from here and below...
#
def get_roc(close, n):
  """Rate of change: percentage difference between each value of `close` and the value `n` rows earlier.

  The first `n` entries have no earlier value and are NaN.
  """
  earlier = close.shift(n)
  return ((close - earlier) / earlier) * 100

def get_kst(close, sma1, sma2, sma3, sma4, roc1, roc2, roc3, roc4, signal):
  """KST oscillator and its signal line.

  Four rate-of-change series (periods roc1..roc4) are each smoothed with a simple
  moving average (windows sma1..sma4) and summed with the weights 1, 2, 3 and 4.
  The signal line is the `signal`-row simple moving average of that sum.

  Returns (kst, signal_line).
  """
  components = ((roc1, sma1), (roc2, sma2), (roc3, sma3), (roc4, sma4))
  weighted = [
    get_roc(close, period).rolling(window).mean() * weight
    for weight, (period, window) in enumerate(components, start=1)
  ]
  kst = weighted[0] + weighted[1] + weighted[2] + weighted[3]
  signal_line = kst.rolling(signal).mean()
  return kst, signal_line

# NOTE(review): `tsla` is not created anywhere in the visible part of this notebook; it has to be a
# frame with a 'close' column and a date-comparable index - confirm where it is supposed to come from.
# KST arguments: SMA windows 10/10/10/15, ROC periods 10/15/20/30, signal window 9.
tsla['kst'], tsla['signal_line'] = get_kst(tsla['close'], 10, 10, 10, 15, 10, 15, 20, 30, 9)
# Keep only 2022 onwards; filtering after the KST calculation keeps the indicator warm-up out of the plot
tsla = tsla[tsla.index >= '2022-01-01']
print(tsla.tail())

# Two stacked panels on an 11-row grid (5 rows each, one spacer row): close on top, KST + signal below
ax1 = plt.subplot2grid((11,1), (0,0), rowspan = 5, colspan = 1)
ax2 = plt.subplot2grid((11,1), (6,0), rowspan = 5, colspan = 1)
ax1.plot(tsla['close'], linewidth = 2.5)
ax1.set_title('TSLA CLOSING PRICES')
ax2.plot(tsla['kst'], linewidth = 2, label = 'KST', color = 'orange')
ax2.plot(tsla['signal_line'], linewidth = 2, label = 'SIGNAL', color = 'mediumorchid')
ax2.legend()
ax2.set_title('TSLA KST')
plt.show()

def implement_kst_strategy(prices, kst_line, signal_line):
  """
  Turn KST / signal-line crossovers into alternating buy and sell markers.

  prices      : price per bar (list, array or Series)
  kst_line    : KST value per bar
  signal_line : signal-line value per bar

  All three inputs are read by position and must have the same length.

  Returns (buy_price, sell_price, kst_signal), three lists with one entry per bar:
    buy_price  - the bar's price where the KST crosses above the signal line, else NaN
    sell_price - the bar's price where the KST crosses below the signal line, else NaN
    kst_signal - 1 on a buy bar, -1 on a sell bar, 0 otherwise

  A crossing in the same direction as the previously emitted signal is ignored. The
  first bar has no predecessor and therefore never produces a signal. NaN values
  (indicator warm-up) compare False and never produce a signal either.
  """
  # Plain arrays make the [i] / [i-1] lookups positional, whatever index a Series carries
  price_vals = np.asarray(prices)
  kst_vals = np.asarray(kst_line, dtype=float)
  sig_vals = np.asarray(signal_line, dtype=float)

  buy_price = []
  sell_price = []
  kst_signal = []
  signal = 0 # last emitted signal: 1 = bought, -1 = sold, 0 = nothing yet

  for i in range(len(kst_vals)):
    crossed_up = False
    crossed_down = False
    if i > 0: # at i == 0 the index i-1 would wrap around to the LAST bar
      crossed_up = kst_vals[i-1] < sig_vals[i-1] and kst_vals[i] > sig_vals[i]
      crossed_down = kst_vals[i-1] > sig_vals[i-1] and kst_vals[i] < sig_vals[i]

    if crossed_up and signal != 1:
      buy_price.append(price_vals[i])
      sell_price.append(np.nan)
      signal = 1
      kst_signal.append(signal)
    elif crossed_down and signal != -1:
      buy_price.append(np.nan)
      sell_price.append(price_vals[i])
      signal = -1
      kst_signal.append(signal)
    else:
      buy_price.append(np.nan)
      sell_price.append(np.nan)
      kst_signal.append(0)

  return buy_price, sell_price, kst_signal

# NOTE(review): relies on `tsla` already carrying the 'kst' and 'signal_line' columns - confirm it is defined before this runs
buy_price, sell_price, kst_signal = implement_kst_strategy(tsla['close'], tsla['kst'], tsla['signal_line'])

# Top panel: close with buy (green ^) / sell (red v) markers; bottom panel: KST and its signal line
ax1 = plt.subplot2grid((11,1), (0,0), rowspan = 5, colspan = 1)
ax2 = plt.subplot2grid((11,1), (6,0), rowspan = 5, colspan = 1)
ax1.plot(tsla['close'], linewidth = 2, label = 'TSLA')
# linewidth = 0 draws the markers only; the NaN entries of the lists leave no marker
ax1.plot(tsla.index, buy_price, marker = '^', markersize = 12, linewidth = 0, color = 'green', label = 'BUY SIGNAL')
ax1.plot(tsla.index, sell_price, marker = 'v', markersize = 12, linewidth = 0, color = 'r', label = 'SELL SIGNAL')
ax1.legend()
ax1.set_title('TSLA KST TRADING SIGNALS')
ax2.plot(tsla['kst'], linewidth = 2, label = 'KST', color = 'orange')
ax2.plot(tsla['signal_line'], linewidth = 2, label = 'SIGNAL', color = 'mediumorchid')
ax2.legend()
ax2.set_title('TSLA KST')
plt.show()

In [None]:
#
# Bokeh plot(s)
#
from math import pi
import bokeh
from bokeh.plotting import figure, ColumnDataSource
from bokeh.io import output_notebook, show, curdoc

from bokeh.models import BooleanFilter, CDSView, Select, Range1d
from bokeh.models import Span, CrosshairTool, HoverTool, ResetTool, PanTool, WheelZoomTool
from bokeh.models.formatters import NumeralTickFormatter, DatetimeTickFormatter
from bokeh.models.widgets import Dropdown

from bokeh.layouts import column
from bokeh.palettes import Category20
from bokeh.resources import INLINE

output_notebook()

# Colour constants for the Bokeh charts: either picked from the Category20 palettes
# (Category20[size][position]) or given as an explicit hex value.
RED        = Category20[7][6]
DARKRED    = '#d00000'
#GREEN      = Category20[5][4]
GREEN      = '#00ff00'
DARKGREEN  = '#008000'
#BLUE       = Category20[3][0]
BLUE       = '#0000ff'
BLUE_LIGHT = Category20[3][1]
#ORANGE     = Category20[3][2]
ORANGE     = '#ff9900'
#PURPLE     = Category20[9][8]
PURPLE     = '#7b7bc0'
BROWN      = Category20[11][10]
WHITE      = '#ffffff'
GRAY       = '#505050'
YELLOW     = '#ffff00'
CYAN       = '#00ffff'
MAGENTA    = '#ff00ff'

# Chart settings read by BokehChart() ('title' and 'size').
# NOTE(review): 'title' takes whatever value the global `symbol` has when this cell runs, i.e. the
# symbol left over from an earlier loop - confirm that this is the symbol actually being charted.
chart_params = {
  'title' : symbol,
  'colors': {'up':'Green', 'down': 'Red'},
  'size'  : {'height': 500 , 'width': 1000} #,
  #'days'  : 100
}

# Candle body width in x-axis units (bars are spaced 1 apart)
VBAR_WIDTH = .5

def BokehChart( price_data ):
  """
  Build a candlestick-style Bokeh figure for (at most) the first 10000 rows of price_data.

  Bars are placed on a 1-based integer 'bar' axis instead of a datetime axis; the integer
  tick labels are replaced by the HH:MM:SS of the row that is drawn at that position.

  price_data : DataFrame with 'Open', 'High', 'Low' and 'Close' columns and an index whose
               entries provide .hour, .minute and .second.
               NOTE(review): the OHLCV frames built earlier in this notebook use lower-case
               column names ('open', 'high', ...) - rename before passing them in, or confirm
               which frame this function is meant for.

  Title and size are taken from the module-level chart_params, the body width from VBAR_WIDTH.

  Returns (figure, chart_data, cds): the figure, the plotted slice of price_data with the
  added 'bar' column, and the ColumnDataSource built from that slice.
  """

  numBars = min( 10000, price_data.shape[0] )

  chart_data = price_data[0:numBars]
  # 1-based bar counter, used as the x coordinate of every glyph
  chart_data = chart_data.assign( bar = np.arange( 1, len( chart_data ) + 1))

  cds = ColumnDataSource( chart_data )

  # Panning and zooming are restricted to the x-axis ('width'). TODO: hover tooltips for time/OHLC
  TOOLS = [
    CrosshairTool(line_color='white'),
    PanTool(dimensions='width'),
    WheelZoomTool(dimensions='width'),
    ResetTool()
  ]

  # NOTE(review): plot_width/plot_height and CDSView(source=..., filters=...) below belong to the
  # Bokeh 2.x API - confirm the installed Bokeh version before upgrading
  p = figure(
    tools=TOOLS, toolbar_location = 'above',
    plot_width = chart_params['size']['width'], plot_height = chart_params['size']['height'],
    title = chart_params['title'],
    x_range = (1, 100), # initial view: the first 100 bars
    y_axis_location = 'right'
  )

  p.background_fill_color = "black"
  p.xaxis.ticker.desired_num_ticks = 5
  # Row i is drawn at x = i + 1 (see 'bar' above), so its time label has to be keyed by i + 1;
  # keying by i would label every bar with the time of the bar that follows it
  p.xaxis.major_label_overrides = {
    i + 1: '{:02d}:{:02d}:{:02d}'.format( ts.hour, ts.minute, ts.second )
    for i, ts in enumerate( chart_data.index )
  }
  p.grid.grid_line_dash = [1, 3]
  p.grid.grid_line_alpha = 0.4

  p.yaxis.axis_label_text_font_size = "12pt"

  #
  # Price bars (x=CurrentBar, y=price)
  #
  # Rising bars (Close > Open) and falling/unchanged bars get separate views so that
  # their bodies can be coloured differently
  inc = chart_data.Close > chart_data.Open
  dec = chart_data.Open >= chart_data.Close
  view_inc = CDSView( source=cds, filters=[BooleanFilter( inc )] )
  view_dec = CDSView( source=cds, filters=[BooleanFilter( dec )] )

  # Wicks (Low..High), then bodies (Open..Close): white for rising bars, ORANGE for the rest
  p.segment( x0='bar', x1='bar', y0='Low', y1='High', color='white', source=cds, view=view_inc )
  p.segment( x0='bar', x1='bar', y0='Low', y1='High', color='white', source=cds, view=view_dec )
  p.vbar( x='bar', width=VBAR_WIDTH, top='Open', bottom='Close', fill_color='white', line_color='white', source=cds, view=view_inc, name="price")
  p.vbar( x='bar', width=VBAR_WIDTH, top='Open', bottom='Close', fill_color=ORANGE, line_color=ORANGE, source=cds, view=view_dec, name="price")

  return p, chart_data, cds

In [None]:
# Build the price chart for `df` and show it as a one-element column layout.
# Planned overlays/panes whose Bokeh* helpers are not defined in this part of the notebook:
#   overlays on p : BokehDonchian( p, cds ), BokehReversals( p, chart_data ), BokehFractals( p, chart_data )
#   extra panes   : pADX, pMFI, pOBV, pPPO, pSMI, pVelocity
#   full layout   : column( children=[p, pADX, pSMI, pPPO, pOBV, pVelocity], spacing=0 )
p, chart_data, cds = BokehChart( df )

c = column( children=[p], spacing=0 )
show( c )

In [None]:
# Look-back window shared by RSI, SMA, the rolling correlation and ADX; it is also
# the exclusive upper bound for the lagged-return columns further down.
n = 8

# Percentage Price Oscillator (3, 10, 16) on the Open column.  Its three output
# series are copied into df positionally (via .values), not by index label.
dfPPO = TA.PPO( df, 3, 10, 16, column="Open" )
for ppo_dst, ppo_src in (('PPO', 'PPO'), ('PPOAvg', 'SIGNAL'), ('PPODiff', 'HISTO')):
  df.loc[:, ppo_dst] = dfPPO[ppo_src].values

# One-bar-lagged inputs, computed once and reused below.  (Earlier experiments
# fed the lagged Close instead of the lagged Open into RSI / SMA / Corr.)
prev_open = df['Open'].shift( 1 )
prev_high = df['High'].shift( 1 )
prev_low = df['Low'].shift( 1 )

df['RSI'] = ta.RSI( np.asarray( prev_open ), timeperiod=n )
df['SMA'] = prev_open.rolling( window=n ).mean()
df['Corr'] = prev_open.rolling( window=n ).corr( df['SMA'].shift( 1 ) )
df['SAR'] = ta.SAR( np.asarray( prev_high ), np.asarray( prev_low ), 0.2, 0.2 )

# ADX takes the lagged High/Low but the *current* Open in the "close" slot.
df['ADX'] = ta.ADX( np.asarray( prev_high ),
                    np.asarray( prev_low ),
                    np.asarray( df['Open'] ),
                    timeperiod=n )

df['Prev_High'] = prev_high
df['Prev_Low'] = prev_low
df['Prev_Close'] = df['Close'].shift( 1 )

# 'OO': current open minus previous open; 'OC': current open minus previous close.
df['OO'] = df['Open'] - prev_open
df['OC'] = df['Open'] - df['Prev_Close']

# 'Ret': open-to-next-open return (looks one bar ahead).
next_open = df['Open'].shift( -1 )
df['Ret'] = (next_open - df['Open']) / df['Open']

# Lagged copies of 'Ret': return1 .. return{n-1}.
for i in range( 1, n ):
  df[f'return{i}'] = df['Ret'].shift( i )

# Clamp the rolling correlation into [-1, 1].
df['Corr'] = df['Corr'].clip( lower=-1, upper=1 )

# Drop every row that still contains a NaN (indicator warm-up, final 'Ret').
df = df.dropna()

In [None]:
def calc_features(df: pd.DataFrame) -> pd.DataFrame:
    """Append a fixed set of TA-Lib indicator columns to an OHLCV frame.

    Parameters
    ----------
    df : DataFrame with the columns 'op', 'hi', 'lo', 'cl' and 'volume'.

    Returns
    -------
    The same ``df`` object; the indicator columns are added in place, so the
    caller's frame is modified as well.

    Notes
    -----
    Overlap studies (Bollinger bands and the moving-average family) are stored
    relative to the bar midpoint ``(hi + lo) / 2``; LINEARREG and
    LINEARREG_INTERCEPT are stored relative to the close.
    """
    # `open_` rather than `open` so the builtin is not shadowed inside the function.
    open_ = df['op']
    high = df['hi']
    low = df['lo']
    close = df['cl']
    volume = df['volume']

    # Bar midpoint used to de-trend the price-level indicators below.
    hilo = (high + low) / 2

    # --- Overlap studies (expressed as an offset from the bar midpoint) ---
    df['BBANDS_upperband'], df['BBANDS_middleband'], df['BBANDS_lowerband'] = talib.BBANDS(close, timeperiod=5, nbdevup=2, nbdevdn=2, matype=0)
    df['BBANDS_upperband'] -= hilo
    df['BBANDS_middleband'] -= hilo
    df['BBANDS_lowerband'] -= hilo
    df['DEMA'] = talib.DEMA(close, timeperiod=30) - hilo
    df['EMA'] = talib.EMA(close, timeperiod=30) - hilo
    df['HT_TRENDLINE'] = talib.HT_TRENDLINE(close) - hilo
    df['KAMA'] = talib.KAMA(close, timeperiod=30) - hilo
    df['MA'] = talib.MA(close, timeperiod=30, matype=0) - hilo
    df['MIDPOINT'] = talib.MIDPOINT(close, timeperiod=14) - hilo
    df['SMA'] = talib.SMA(close, timeperiod=30) - hilo
    df['T3'] = talib.T3(close, timeperiod=5, vfactor=0) - hilo
    df['TEMA'] = talib.TEMA(close, timeperiod=30) - hilo
    df['TRIMA'] = talib.TRIMA(close, timeperiod=30) - hilo
    df['WMA'] = talib.WMA(close, timeperiod=30) - hilo

    # --- Momentum indicators ---
    df['ADX'] = talib.ADX(high, low, close, timeperiod=14)
    df['ADXR'] = talib.ADXR(high, low, close, timeperiod=14)
    df['APO'] = talib.APO(close, fastperiod=12, slowperiod=26, matype=0)
    df['AROON_aroondown'], df['AROON_aroonup'] = talib.AROON(high, low, timeperiod=14)
    df['AROONOSC'] = talib.AROONOSC(high, low, timeperiod=14)
    df['BOP'] = talib.BOP(open_, high, low, close)
    df['CCI'] = talib.CCI(high, low, close, timeperiod=14)
    df['DX'] = talib.DX(high, low, close, timeperiod=14)
    df['MACD_macd'], df['MACD_macdsignal'], df['MACD_macdhist'] = talib.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)
    # MACDEXT and MACDFIX are skipped -- they are probably the same as MACD.
    df['MFI'] = talib.MFI(high, low, close, volume, timeperiod=14)
    df['MINUS_DI'] = talib.MINUS_DI(high, low, close, timeperiod=14)
    df['MINUS_DM'] = talib.MINUS_DM(high, low, timeperiod=14)
    df['MOM'] = talib.MOM(close, timeperiod=10)
    df['PLUS_DI'] = talib.PLUS_DI(high, low, close, timeperiod=14)
    df['PLUS_DM'] = talib.PLUS_DM(high, low, timeperiod=14)
    df['RSI'] = talib.RSI(close, timeperiod=14)
    df['STOCH_slowk'], df['STOCH_slowd'] = talib.STOCH(high, low, close, fastk_period=5, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
    df['STOCHF_fastk'], df['STOCHF_fastd'] = talib.STOCHF(high, low, close, fastk_period=5, fastd_period=3, fastd_matype=0)
    df['STOCHRSI_fastk'], df['STOCHRSI_fastd'] = talib.STOCHRSI(close, timeperiod=14, fastk_period=5, fastd_period=3, fastd_matype=0)
    df['TRIX'] = talib.TRIX(close, timeperiod=30)
    df['ULTOSC'] = talib.ULTOSC(high, low, close, timeperiod1=7, timeperiod2=14, timeperiod3=28)
    df['WILLR'] = talib.WILLR(high, low, close, timeperiod=14)

    # --- Volume indicators ---
    df['AD'] = talib.AD(high, low, close, volume)
    df['ADOSC'] = talib.ADOSC(high, low, close, volume, fastperiod=3, slowperiod=10)
    df['OBV'] = talib.OBV(close, volume)

    # --- Volatility indicators ---
    df['ATR'] = talib.ATR(high, low, close, timeperiod=14)
    df['NATR'] = talib.NATR(high, low, close, timeperiod=14)
    df['TRANGE'] = talib.TRANGE(high, low, close)

    # --- Hilbert-transform cycle indicators ---
    df['HT_DCPERIOD'] = talib.HT_DCPERIOD(close)
    df['HT_DCPHASE'] = talib.HT_DCPHASE(close)
    df['HT_PHASOR_inphase'], df['HT_PHASOR_quadrature'] = talib.HT_PHASOR(close)
    df['HT_SINE_sine'], df['HT_SINE_leadsine'] = talib.HT_SINE(close)
    df['HT_TRENDMODE'] = talib.HT_TRENDMODE(close)

    # --- Statistic functions (regression value / intercept relative to close) ---
    df['BETA'] = talib.BETA(high, low, timeperiod=5)
    df['CORREL'] = talib.CORREL(high, low, timeperiod=30)
    df['LINEARREG'] = talib.LINEARREG(close, timeperiod=14) - close
    df['LINEARREG_ANGLE'] = talib.LINEARREG_ANGLE(close, timeperiod=14)
    df['LINEARREG_INTERCEPT'] = talib.LINEARREG_INTERCEPT(close, timeperiod=14) - close
    df['LINEARREG_SLOPE'] = talib.LINEARREG_SLOPE(close, timeperiod=14)
    df['STDDEV'] = talib.STDDEV(close, timeperiod=5, nbdev=1)

    return df

# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling -- only
# load 'df_ohlcv_with_fee.pkl' from a trusted source.
#
# This data set uses the op/hi/lo/cl/volume schema that calc_features expects, so
# it gets its own names instead of rebinding the notebook-wide `df`, whose
# Open/High/Low/Close columns (and the derived 'Ret') are still read by later cells.
df_ohlcv = pd.read_pickle('df_ohlcv_with_fee.pkl').dropna()
df_features = calc_features(df_ohlcv)

# Preview only: the notebook sets display.max_rows=None, so displaying the whole
# frame would render every row.
display(df_features.head())
df_features.to_pickle('df_features.pkl')

def stock_features(key=None, data=None):
  """Add lagged-price and TA-Lib feature columns to one frame of a collection.

  Parameters
  ----------
  key  : key/index of the frame to process inside ``data``.  Defaults to the
         notebook-level variable ``i`` (the original zero-argument behaviour).
  data : collection of frames indexable by ``key``, each with 'Open', 'High',
         'Low' and 'Close' columns.  Defaults to the notebook-level ``Stock_Data``.

  Returns
  -------
  None -- ``data[key]`` is modified in place.

  All indicators are computed on the previous bar's close ('Close Shifted').
  """
  # Fall back to the notebook globals only when the caller did not pass a value,
  # so existing zero-argument calls keep working.
  if data is None:
    data = Stock_Data
  if key is None:
    key = i

  frame = data[key]

  # Previous-bar prices.
  frame['High Shifted'] = frame['High'].shift(1)
  frame['Low Shifted'] = frame['Low'].shift(1)
  frame['Close Shifted'] = frame['Close'].shift(1)

  # Indicators on the lagged close.
  frame['Upper BBand'], frame['Middle BBand'], frame['Lower BBand'] = ta.BBANDS(frame['Close Shifted'], timeperiod=20,)
  frame['RSI'] = ta.RSI(np.array(frame['Close Shifted']), timeperiod=14)
  frame['Macd'], frame['Macd Signal'], frame['Macd Hist'] = ta.MACD(frame['Close Shifted'], fastperiod=12, slowperiod=26, signalperiod=9)
  frame['Momentum'] = ta.MOM(frame['Close Shifted'], timeperiod=12)

  # Open-to-open log return.
  frame['Returns'] = np.log(frame['Open'] / frame['Open'].shift(1))
    

In [None]:
# Fraction of rows (taken from the start of df) treated as the training slice.
t = .8
split = int( t * len( df ) )
print( len(df), split )

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Label thresholds are the 66th / 34th percentiles of 'Ret' over the training
# slice only; the labels themselves are then assigned across the whole frame.
train_ret = df['Ret'][:split]
upper_cut = train_ret.quantile( q=0.66 )
lower_cut = train_ret.quantile( q=0.34 )

df['Signal'] = 0
df.loc[df['Ret'] > upper_cut, 'Signal'] = 1    # highest-return bucket
df.loc[df['Ret'] < lower_cut, 'Signal'] = -1   # lowest-return bucket

print( 'split=[', split, ']' )
df[-5:]  # bare expression in mid-cell: evaluated, but nothing is displayed
print( len( train_ret ))

# Feature matrix: a copy of df without the raw price/volume columns, the target
# and any columns written by a previous prediction run.  Names that are absent
# from df are skipped.  (Several other drop lists were tried here before.)
dropCols = ['Close', 'Low', 'High', 'Volume', 'Ret', 'Ret1', 'Signal', 'Pred_Signal']
X = df.drop( columns=[name for name in dropCols if name in df.columns] )

print( df[0:5] )
print( X[0:5] )

y = df['Signal']

# Disabled: shuffled 70/30 split.
if False:
  X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=0 )

  print( '#X_train=[', len( X_train ), ']' )
  print( '#X_test=[', len( X_test ), ']' )
  print( '#y_train=[', len( y_train ), ']' )
  print( '#y_test=[', len( y_test ), ']' )

  #X_train.info()
  print( X_train[0:5] )
  print( y_train[0:5] )

print( X[0:5] )
print( y[0:5] )

In [None]:
# Disabled experiment: compare three scale -> 2-component PCA -> classifier
# pipelines on the current train/test split.
if False:
  pipeline_lr = Pipeline( steps=[
    ('scalar1', StandardScaler()),
    ('pca1', PCA( n_components=2 )),
    ('lr_classifier', LogisticRegression( random_state=0 )),
  ])
  pipeline_dt = Pipeline( steps=[
    ('scalar2', StandardScaler()),
    ('pca2', PCA( n_components=2 )),
    ('dt_classifier', DecisionTreeClassifier()),
  ])
  pipeline_rf = Pipeline( steps=[
    ('scaler3', StandardScaler()),
    ('pca3', PCA( n_components=2 )),
    ('rf_classifier', RandomForestClassifier()),
  ])

  # Position in `pipelines` -> display name.
  pipelines = [pipeline_lr, pipeline_dt, pipeline_rf]
  pipe_dict = {0:'Logistic Regression', 1:'Decision Tree', 2:'Random Forest'}

  # Placeholders for tracking the winner; nothing below updates them yet.
  best_accuracy = 0.0
  best_classifier = 0
  best_pipeline = ""

  # Fit everything first, then report held-out accuracy per pipeline.
  for pipe in pipelines:
    pipe.fit( X_train, y_train )

  for i, model in enumerate( pipelines ):
    print( f'{pipe_dict[i]} test accuracy: {model.score( X_test, y_test )}' )

In [None]:
# Python port of an R (quantmod / e1071 / ggplot2) SVM walk-through.  The original
# R statements are kept as comments next to their pandas / TA-Lib equivalents.

# A great quantitative trading resource
#install.packages( 'quantmod' )
#library( quantmod )

# The library containing our SVM
#install.packages( 'e1071' )
#library( e1071 )

# The plotting tools we will use
#install.packages( 'ggplot2' )
#library( ggplot2 )

# R reference data: 4-hour bars of AUD/USD dating back to 01/01/2010.
# This port runs on the notebook's `df` instead.
#Data <- AUD/USD

# The 3-period relative strength index calculated off the open
#RSI3 <- RSI(Op( Data ), n=3 )
n = 3
df['RSI3'] = ta.RSI(np.array( df['Open'] ), timeperiod=n)

# Our measure of trend: the difference between the open price and the 50-period simple moving average.
#SMA50 <- SMA( Op( Data ), n=50 )
df['SMA50'] = df['Open'].rolling( window=50 ).mean()
#Trend <- Op( Data ) - SMA50
df['Trend'] = df['Open'] - df['SMA50']

# The variable we are looking to predict: the direction of the bar (close vs. open).
#Price <- Cl( Data ) - Op( Data )
df['Price'] = df['Close'] - df['Open']
# NOTE: flat bars (Price == 0) are labelled 1 here, whereas the R reference uses a strict "> 0".
#Class <- ifelse( Price > 0, "UP", "DOWN" )
df['Class'] = np.where( df['Price'] >= 0, 1, -1 )

# Create the data set and remove the points where our indicators are still being calculated
#DataSet <- data.frame( RSI3, Trend, Class )
#DataSet <- DataSet[-c(1:49),]
df = df.dropna()
print( df[0:5] )

# The R reference splits 60% / 20% / 20% (training / test / validation);
# this port uses a single chronological 70% / 30% train / test split.
#Training <- DataSet[1:4528,]
#Test <- DataSet[4529:6038,]
#Val <- DataSet[6039:7548,]

# Select the model inputs explicitly.  Dropping a fixed list of columns would keep
# every other column df has accumulated from earlier cells (including the
# forward-looking 'Ret' and the 'Signal' label), which leaks the target and breaks
# the four-feature layout the training-data cell further down relies on.
featureCols = ['Open', 'RSI3', 'SMA50', 'Trend']
X = df[featureCols].copy()

y = df['Class']

# shuffle=False keeps the rows in time order: the first 70% train, the last 30% test.
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=None, shuffle=False )

print( '#X_train=[', len( X_train ), ']' )
print( '#y_train=[', len( y_train ), ']' )
print( '#X_test=[', len( X_test ), ']' )
print( '#y_test=[', len( y_test ), ']' )

# Row position where the test slice starts; later cells slice y with it.
split = len( X_train )

#X_train.info()
print( X_train[0:5] )
print( y_train[0:5] )

In [None]:
# Candidate values for the SVC regularisation strength 'C' and RBF width 'gamma'
#c = [10, 100, 1000, 2000, 3000, 4000, 5000, 10000]
c = [100, 200, 300, 400, 500, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
g = [1e-3, 1e-2, 1e-1, 1e0]

# Search grid.  Keys carry the 'svc__' prefix so the search routes them to the
# pipeline step named 'svc'.
parameters = {
  'svc__C': c,
  'svc__gamma': g,
  'svc__kernel': ['rbf'], # 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
  #'max_depth': [6, 9, None],
  #'n_estimators': [50, 70, 100, 150],
  #'max_features': [ random.randint( 1, 6 )],
  #'criterion': ['gini', 'entropy'],
  #'bootstrap': [True, False],
  #'min_samples_leaf': [ random.randint( 1, 4 )]
}

# Pipeline: standardise the features, then fit the SVC
steps = [('scaler', StandardScaler()), ('svc', SVC())]
pipeline = Pipeline( steps )

# Build the search object.  NOTE: this timer only covers constructing the
# GridSearchCV object; the actual search runs in rcv.fit() below.
startTime = datetime.datetime.now()
#rcv = RandomizedSearchCV( pipeline, parameters, cv=TimeSeriesSplit( n_splits=2 ), n_jobs=16 )
rcv = GridSearchCV( pipeline, parameters, cv=TimeSeriesSplit( n_splits=2 ), n_jobs=16 )
deltaTime = datetime.datetime.now() - startTime
#print( "RandomizedSearchCV elapsed {:.3f}s".format( deltaTime.total_seconds() )) # seconds
print( "GridSearchCV elapsed {:.3f}s".format( deltaTime.total_seconds() )) # seconds

#X_train = X.iloc[:split]
#y_train = y.iloc[:split]

print( '#X_train=[', len( X_train ), ']' )
print( '#y_train=[', len( y_train ), ']' )

#print( '#X_test=[', len( X_test ), ']' )
#print( '#y_test=[', len( y_test ), ']' )

# Run the grid search on the training data
startTime = datetime.datetime.now()
rcv.fit( X_train, y_train )
deltaTime = datetime.datetime.now() - startTime
print( "rcv.fit( X_train, y_train ) elapsed {:.3f}s\n".format( deltaTime.total_seconds() )) # seconds

startTime = datetime.datetime.now()

# Best hyper-parameters found by the search
best_C = rcv.best_params_['svc__C']
best_kernel = rcv.best_params_['svc__kernel']
best_gamma = rcv.best_params_['svc__gamma']

print( 'best_C=[', best_C, ']' )
print( 'best_kernel=[', best_kernel, ']' )
print( 'best_gamma=[', best_gamma, ']' )

# Fresh SVC configured with the best parameters, wrapped one-vs-rest
clf = OneVsRestClassifier(
  SVC(
    C = best_C, # 3
    kernel = best_kernel,
    gamma = best_gamma,
    verbose= True
    # cache_size = 200,
    # class_weight = None,
    # coef0 = 0.0, # 3.0
    # decision_function_shape = None,
    # degree = 3, # 4
    # max_iter = -1,
    # probability = False,
    # random_state = None,
    # shrinking = True,
    # tol = 0.001
  ), n_jobs=16
)

# NOTE(review): `cls` is a hand-tuned alternative that is never fitted in this
# cell; decision_function_shape=None may be rejected by recent scikit-learn
# releases at fit time -- confirm before using it.
cls = SVC(C=3.0, cache_size=100, class_weight=None, coef0=3.0,
          decision_function_shape=None, degree=4, 
          gamma='auto', kernel='rbf', max_iter=-1, probability=False,
          random_state=None, shrinking=True, 
          tol=0.001, verbose=False,  )

deltaTime = datetime.datetime.now() - startTime
print( "OneVsRestClassifier elapsed {:.3f}s\n".format( deltaTime.total_seconds() )) # seconds

startTime = datetime.datetime.now()

# The scaler is fitted on the training data only and reused for the test data
ss1 = StandardScaler()

# Fit the classifier on the scaled training data
clf.fit( ss1.fit_transform( X_train ), y_train ) # X = max_abs_scaler.fit_transform( X )

#X_test = X.iloc[split:]
print( '#X_test=[', len( X_test ), ']' )
print( X_test[0:20] )

# Predict on the test data, scaled with the training-set statistics
y_predict = clf.predict( ss1.transform( X_test ))
print( '#y_predict=[', len( y_predict ), ']' )
print( y_predict[0:20] )

# Disabled: write predictions back into df and compute strategy returns.
if False:
  # Initiate a column by name, 'Pred_Signal' and assign 0 to it
  df[ 'Pred_Signal' ] = 0
  predCol = df.columns.get_loc( 'Pred_Signal' )

  # Save the predicted values for the train data.  The prediction array is
  # assigned directly: wrapping it in a default-indexed pd.Series can, depending
  # on the pandas version, be aligned on index labels instead of by position.
  df.iloc[:split, predCol] = clf.predict( ss1.transform( X_train ))

  # Save the predicted values for the test data
  df.iloc[split:, predCol] = y_predict

  # Calculate strategy returns and store them in 'Ret1' column
  df['Ret1'] = df['Ret'] * df['Pred_Signal']

  deltaTime = datetime.datetime.now() - startTime
  print( "Predict elapsed {:.3f}s".format( deltaTime.total_seconds() )) # seconds

  # Accuracy on the held-out set: score the *scaled* features against the true
  # labels (scoring against y_predict would compare the model with itself).
  print( 'score=[', clf.score( ss1.transform( X_test ), y_test ), ']' )

In [None]:
# R reference: build the SVM with a radial basis function kernel, cost (C) = 1 and
# gamma = 1/2, i.e. 1 over the number of inputs used.
#SVM <- svm( Class~RSI3+Trend, data=Training, kernel="radial", cost=1, gamma=1/2 )

# Run the algorithm once more over the training set to visualize the patterns it found
#TrainingPredictions <- predict( SVM, Training, type="class" )

# Create a data set with the predictions
#TrainingData <- data.frame( Training, TrainingPredictions )
print( X_train[0:5] )
print( y_train[0:5] )

# Join features and label side by side, keeping their own column names.
# Renumbering the columns and then assigning a hard-coded list of five names
# raises a length-mismatch error as soon as X_train does not have exactly four
# columns; with the four expected features the resulting labels are the same.
TrainingData = pd.concat( [X_train, y_train], axis=1 )
print( TrainingData[0:5] )

# Now let's see what patterns it was able to find
#ggplot(
#  TrainingData,
#  aes( x=Trend,y=RSI3)) +
#  stat_density2d(geom="contour",aes(color=TrainingPredictions)) + 
#  labs(title="SVM RSI3 and Trend Predictions", x="Open - SMA50", y="RSI3", color="Training Predictions"
#)

In [None]:
# Size check: full label vector vs. number of test-set predictions.
print( len( y ), len( y_predict ))

# An earlier hand-rolled accuracy computed from confusion_matrix( y[split:], y_predict )
# used to live here; the display helper below replaces it.

# Confusion matrix of the held-out labels (rows from `split` onward) against the
# predictions.
ConfusionMatrixDisplay.from_predictions( y_true=y[split:], y_pred=y_predict )

In [None]:
# Text summary produced by classification_report for the held-out rows.
cr = classification_report( y[split:], y_predict )
print( cr )

# Headline scalar metrics, printed in the order accuracy, precision, recall.
for metric in ( accuracy_score, precision_score, recall_score ):
  print( metric( y[split:], y_predict ))

In [None]:
def create_features_v1(i):
    """Build the four `i`-day features from the frame returned by create_HLCV(i).

    Reads the 'high_{i}D', 'low_{i}D', 'close_{i}D' and 'volume_{i}D' columns and
    returns a frame on ``prices.index`` with, in this order:

    - volume_{i}D        : the volume column, unchanged
    - price_spread_{i}D  : high - low
    - close_loc_{i}D     : (high - close) / (high - low)
    - close_change_{i}D  : close.pct_change()
    """
    hlcv = create_HLCV(i)
    tag = f'{i}D'
    hi, lo, cl, vol = (hlcv[f'{field}_{tag}'] for field in ('high', 'low', 'close', 'volume'))

    bar_range = hi - lo

    # assign() adds the columns one by one in dict order, aligning each series to
    # the prices index.
    return pd.DataFrame(index=prices.index).assign(**{
        f'volume_{tag}': vol,
        f'price_spread_{tag}': bar_range,
        f'close_loc_{tag}': (hi - cl) / bar_range,
        f'close_change_{tag}': cl.pct_change(),
    })

In [None]:
def create_bunch_of_features_v1():
  """Join the create_features_v1 output for every look-back horizon.

  Horizons are 1, 2 and 3 days, 1 week, and 1, 2 and 3 months, approximated
  in trading days as [1, 2, 3, 5, 20, 40, 60].  The result is indexed like
  ``prices`` and holds each horizon's columns side by side, in horizon order.
  """
  horizons = (1, 2, 3, 5, 20, 40, 60)

  combined = pd.DataFrame(index=prices.index)
  for horizon in horizons:
    combined = combined.join(create_features_v1(horizon))

  return combined

In [None]:
bunch_of_features_v1 = create_bunch_of_features_v1()

# Outcome target.  To keep things easy to understand, only the next day's return
# is predicted.  It has to exist before the correlation check below uses it.
outcomes = pd.DataFrame(index=prices.index)
# Next day's return, stored on today's row: (close[t+1] - close[t]) / close[t].
# pct_change(-1) would instead give close[t] / close[t+1] - 1, i.e. the change
# measured backwards from tomorrow's close, with the opposite sign.
outcomes['close_1'] = prices.close.pct_change().shift(-1)

# Correlation of every feature with the outcome
corr_v1 = bunch_of_features_v1.corrwith(outcomes.close_1)
corr_v1.sort_values(ascending=False).plot.barh( title = 'Strength of Correlation')

# Feature-to-feature correlation, clustered to spot groups of redundant features
corr_matrix_v1 = bunch_of_features_v1.corr()

sns.clustermap(corr_matrix_v1)

sns.clustermap(corr_matrix_v1, cmap='coolwarm', linewidth=1)
sns.clustermap(corr_matrix_v1, cmap='coolwarm', linewidth=1, method='ward')

# Features judged redundant from the cluster map
deselected_features_v1 = ['close_loc_3D','close_loc_60D',
                       'volume_3D', 'volume_60D',
                       'price_spread_3D','price_spread_60D',
                       'close_change_3D','close_change_60D']
selected_features_v1 = bunch_of_features_v1.drop(labels=deselected_features_v1, axis=1)

sns.pairplot(selected_features_v1)

# Join the features and outcome together to remove the outliers
features_outcomes = selected_features_v1.join(outcomes)
stats = features_outcomes.describe()