In [1]:
!pip install yfinance

Collecting yfinance
  Downloading yfinance-0.1.70-py2.py3-none-any.whl (26 kB)
Collecting multitasking>=0.0.7
  Downloading multitasking-0.0.10.tar.gz (8.2 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: multitasking
  Building wheel for multitasking (setup.py) ... [?25ldone
[?25h  Created wheel for multitasking: filename=multitasking-0.0.10-py3-none-any.whl size=8488 sha256=161ddbcac14c38e6f6ac753e22bd252f341b6044c3506eb7f259bab855714471
  Stored in directory: /home/ec2-user/.cache/pip/wheels/15/e6/fa/f4bf8d84e804547b3c1b1d4b09a671768502b32ca33ec60651
Successfully built multitasking
Installing collected packages: multitasking, yfinance
Successfully installed multitasking-0.0.10 yfinance-0.1.70


In [2]:
import numpy as np
import pandas as pd
import yfinance as yf
import os
import json

from sklearn.model_selection import cross_validate
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, make_scorer, accuracy_score, precision_score, f1_score

from sklearn.model_selection import learning_curve

import seaborn as sns
import matplotlib.pyplot as plt


def gen_ret(df, tickers): # generates daily returns 
    """Add a ``ret_<ticker>`` column of period-over-period returns for every ticker.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``adj close_<ticker>`` column (lower-case ticker) for
        each requested ticker. The frame is modified in place.
    tickers : iterable of str
        Ticker symbols; they are lower-cased before building column names.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new return columns added.
    """
    for ticker in tickers:
        ticker = ticker.lower()
        # pct_change leaves NaN in the first row (no previous price to compare with).
        df.loc[:, f"ret_{ticker}"] = df.loc[:, f"adj close_{ticker}"].pct_change()
    return df

def gen_vol(df, tickers, lookback = 10): # generates ohlc volatility
    """Add a ``vol_<ticker>`` column built from open/high/low/close prices.

    For each ticker the function combines three rolling terms over ``lookback``
    rows: squared open-vs-previous-close log returns, squared close-to-close
    log returns and a high/low/open/close range term. The weighted sum is
    square-rooted and scaled by ``sqrt(252)``.
    NOTE(review): the weighting looks like the usual Yang-Zhang form; confirm
    that using close-to-close returns without mean-centering is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``open_<t>``, ``high_<t>``, ``low_<t>`` and ``close_<t>`` columns
        for every lower-cased ticker ``t``. Modified in place.
    tickers : iterable of str
        Ticker symbols (lower-cased internally).
    lookback : int
        Rolling window length; the sums are divided by ``lookback - 1``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the volatility columns added.
    """
    for symbol in (name.lower() for name in tickers):
        open_px = df[f"open_{symbol}"]
        high_px = df[f"high_{symbol}"]
        low_px = df[f"low_{symbol}"]
        close_px = df[f"close_{symbol}"]
        prev_close = close_px.shift(1)

        # Blend weight between the close term and the range term.
        k = 0.34 / (1.34 + (lookback+1)/(lookback-1))
        norm = 1.0 / (lookback - 1.0)

        log_cc = np.log(close_px / prev_close)   # close vs previous close
        log_ho = np.log(high_px / open_px)
        log_lo = np.log(low_px / open_px)
        log_co = np.log(close_px / open_px)
        log_oc = np.log(open_px / prev_close)    # open vs previous close

        range_term = log_ho*(log_ho-log_co)+log_lo*(log_lo-log_co)

        close_var = (log_cc**2).rolling(lookback).sum() * norm
        open_var = (log_oc**2).rolling(lookback).sum() * norm
        range_var = range_term.rolling(lookback).sum() * norm

        combined = open_var + k * close_var + (1-k) * range_var
        vol = np.sqrt(combined) * np.sqrt(252)
        # Blank the leading rows that cannot hold a full window.
        vol.iloc[:lookback-1] = np.nan

        df.loc[:, f'vol_{symbol}'] = vol
    return df

def gen_rsi(df, tickers, periods = 14, ema = True): # technical indicator: Relative Strength Index
    """Add an ``rsi_<ticker>`` column (Relative Strength Index) for every ticker.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``adj close_<ticker>`` for each lower-cased ticker.
        Modified in place.
    tickers : iterable of str
        Ticker symbols (lower-cased internally).
    periods : int
        Smoothing length: ``com = periods - 1`` for the exponential average,
        or the window size for the simple rolling average.
    ema : bool
        True -> exponentially weighted averages of gains/losses;
        False -> simple rolling means.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the RSI columns added.
    """
    for ticker in tickers: 
        ticker = ticker.lower()
        close_delta = df[f"adj close_{ticker}"].diff()

        # Split the daily change into its gain part and its (positive) loss part.
        up = close_delta.clip(lower=0)
        down = -1 * close_delta.clip(upper=0)

        if ema:
            ma_up = up.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()
            ma_down = down.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()
        else:
            # Series.rolling() has no `adjust` argument (that belongs to ewm);
            # passing it raised TypeError and made this branch unusable.
            ma_up = up.rolling(window = periods).mean()
            ma_down = down.rolling(window = periods).mean()
        rs = ma_up / ma_down
        rsi = 100 - 100 / (1+rs)
        df.loc[:,f"rsi_{ticker}"] = rsi
    
    return df

def gen_stok(df, tickers, periods = 14): # technical indicator: stocastic oscillator k%
    """Add a ``stok_<ticker>`` column: position of the adjusted close inside
    the rolling high/low range, expressed on a 0-100 scale.

    ``df`` needs ``high_<t>``, ``low_<t>`` and ``adj close_<t>`` for each
    lower-cased ticker ``t``; it is modified in place and returned.
    ``periods`` is the rolling window used for the highest high / lowest low.
    """
    for symbol in (name.lower() for name in tickers):
        highest = df[f"high_{symbol}"].rolling(periods).max()
        lowest = df[f"low_{symbol}"].rolling(periods).min()
        adj_close = df[f"adj close_{symbol}"]

        distance_from_low = 100 * (adj_close - lowest)
        df.loc[:, f"stok_{symbol}"] = distance_from_low / (highest - lowest)

    return df

def gen_wilr(df, tickers, periods = 14): # technical indicator: William's R%
    """Add a ``wilr_<ticker>`` column: distance of the adjusted close below the
    rolling high, relative to the rolling high/low range, times -100.

    ``df`` needs ``high_<t>``, ``low_<t>`` and ``adj close_<t>`` for each
    lower-cased ticker ``t``; it is modified in place and returned.
    ``periods`` is the rolling window used for the highest high / lowest low.
    """
    for symbol in (name.lower() for name in tickers):
        highest = df[f"high_{symbol}"].rolling(periods).max()
        lowest = df[f"low_{symbol}"].rolling(periods).min()
        adj_close = df[f"adj close_{symbol}"]

        share_below_high = (highest - adj_close) / (highest - lowest)
        df.loc[:, f"wilr_{symbol}"] = share_below_high * (-100)

    return df

def gen_macd(df, tickers, periods = (12, 26, 9)): # technical indicator: MACD - MACD_signal
    """Add a ``macd_<ticker>`` column holding MACD minus its signal line.

    ``periods`` is ``(fast_span, slow_span, signal_span)``. Each span is also
    used as ``min_periods`` of its exponential average, so the leading rows
    are NaN. ``df`` needs ``adj close_<t>`` per lower-cased ticker ``t``; it is
    modified in place and returned.
    """
    fast_span, slow_span, signal_span = periods[0], periods[1], periods[2]
    for symbol in (name.lower() for name in tickers):
        price = df[f"adj close_{symbol}"]
        fast_ema = price.ewm(span=fast_span, adjust=False, min_periods=fast_span).mean()
        slow_ema = price.ewm(span=slow_span, adjust=False, min_periods=slow_span).mean()
        macd_line = fast_ema - slow_ema
        signal_line = macd_line.ewm(span=signal_span, adjust=False, min_periods=signal_span).mean()
        df.loc[:, f"macd_{symbol}"] = macd_line - signal_line

    return df

def gen_obv(df, tickers, periods = 14): # technical indicator: OBV - OBV exponential MA 
    """Add an ``obv_<ticker>`` column: exponential average of signed volume.

    Each row's volume is signed +1 / -1 / 0 according to whether the adjusted
    close rose, fell or was unchanged (the first row, having no previous
    price, gets 0). The signed volume is then smoothed with an exponential
    average of span ``periods``; it is not cumulated.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``adj close_<t>`` and ``volume_<t>`` for each lower-cased
        ticker ``t``. Modified in place.
    tickers : iterable of str
        Ticker symbols (lower-cased internally).
    periods : int
        Span and ``min_periods`` of the exponential average.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns added.
    """
    for ticker in tickers:
        ticker = ticker.lower()
        # Read from the frame that was passed in, not from a notebook-level
        # global: the function must work on any DataFrame and on a fresh kernel.
        ac = df.loc[:, f"adj close_{ticker}"]
        vo = df.loc[:, f"volume_{ticker}"]
        delta = ac.diff()
        signed_volume = ((delta > 0) * 1 - (delta < 0) * 1) * vo
        obv_ema = signed_volume.ewm(span = periods, adjust = False, min_periods = periods).mean()
        # Assign positionally, as a plain 1-D array.
        df.loc[:, f"obv_{ticker}"] = obv_ema.to_numpy()
    return df

def gen_retbbk(df, tickers, periods = 14): # technical indicator: Bollinger Band coefficient k
    """Add a ``retbbk_<ticker>`` column: rolling z-score of the return series.

    For each ticker the value is ``(ret - rolling_mean) / rolling_std`` over a
    window of ``periods`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must already contain ``ret_<t>`` for each lower-cased ticker ``t``.
        Modified in place.
    tickers : iterable of str
        Ticker symbols (lower-cased internally).
    periods : int
        Rolling window length.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns added.
    """
    for ticker in tickers: 
        ticker = ticker.lower()
        # Use the frame passed in rather than a notebook-level global, so the
        # function works on any DataFrame and on a fresh kernel.
        ret = df.loc[:, f"ret_{ticker}"]
        ma = ret.rolling(periods).mean()
        std = ret.rolling(periods).std()
        df.loc[:,f"retbbk_{ticker}"] = (ret - ma)/std
    return df
        

def gen_features(df, tickers):
    """Run every feature generator on ``df`` for the given tickers.

    The generators are applied in a fixed order; ``gen_ret`` comes first and
    ``gen_retbbk`` last because the latter reads the ``ret_<ticker>`` columns
    the former creates. Returns the frame produced by the last generator.
    """
    pipeline = (
        gen_ret,
        gen_vol,
        gen_rsi,
        gen_stok,
        gen_wilr,
        gen_macd,
        gen_obv,
        gen_retbbk,
    )
    for step in pipeline:
        df = step(df, tickers)

    return df

def gen_lags(df, tickers, lags, lagged_cols, dropnan = True):
    """Append lagged copies of selected columns to ``df``.

    For every lag ``i`` from ``lags`` down to 1, the columns named in
    ``lagged_cols`` that exist in ``df`` are shifted down by ``i`` rows and
    added as ``<column>_lag_<i>``. ``tickers`` is accepted but not used.
    With ``dropnan`` true, every row containing any NaN is dropped from the
    combined frame. A new DataFrame is returned.
    """
    shifted_frames = []
    for lag in range(lags, 0, -1):
        available = [name for name in lagged_cols if name in df.columns]
        shifted = df.loc[:, available].shift(lag).add_suffix(f"_lag_{lag}")
        shifted_frames.append(shifted)

    combined = pd.concat([df, *shifted_frames], axis = 1)
    return combined.dropna(how = 'any') if dropnan else combined
    



In [18]:
# Universe of symbols to download.
tickers = ["FB", "AMZN", "AAPL", # FAANG stocks 
           "NFLX", "GOOG", 
           "SPY",]
start = "2017-1-1"
end = "2022-1-31"

# Raw fields delivered by the download; once flattened, each raw column is
# named "<field>_<ticker>".
raw_fields = ("adj close", "close", "high", "low", "open", "volume")
# Prefixes of the engineered columns created by gen_features.
feature_names = ("ret", "vol", "rsi", "stok", "wilr", "macd", "obv", "retbbk")

data = yf.download(tickers, start, end, interval = "1d")
# Flatten the two-level column index into lower-case "<field>_<ticker>" names.
data.columns = ['_'.join(col).lower().strip() for col in data.columns.values]
# Round-trip through CSV: keeps a raw snapshot on disk and brings the index
# back as an ordinary column (shown as "Date" in the recorded output).
data.to_csv("data.csv", index = True)
data = pd.read_csv("data.csv")
print(data.head(5))

data = gen_features(data, tickers = tickers)
print(data.head(5))

# One list of column names per feature, in ticker order.
lagged_cols = [['_'.join([str(x).lower(), str(y).lower()]) for y in tickers]
               for x in feature_names]
data = gen_lags(data, tickers = tickers, lags = 3, 
                lagged_cols = [item for sublist in lagged_cols for item in sublist]
               )

# Drop the raw price/volume columns by their "<field>_" prefix. Matching on a
# bare substring ("low", "open", ...) would also throw away every feature of a
# ticker whose symbol contains one of those words (e.g. LOW or OPEN).
raw_prefixes = tuple(f"{field}_" for field in raw_fields)
data = data.loc[:, [x for x in data.columns if not x.startswith(raw_prefixes)]]

for col in data.columns:
    print(col)
# Defensive: make sure no incomplete row reaches the saved feature file.
data = data.dropna(how = 'any', axis = 0)
data.to_csv("featured_data.csv", index = False)

[*********************100%***********************]  6 of 6 completed
         Date  adj close_aapl  adj close_amzn  adj close_fb  adj close_goog  \
0  2017-01-03       27.332468      753.669983    116.860001      786.140015   
1  2017-01-04       27.301876      757.179993    118.690002      786.900024   
2  2017-01-05       27.440720      780.450012    120.669998      794.020020   
3  2017-01-06       27.746634      795.989990    123.410004      806.150024   
4  2017-01-09       28.000782      796.919983    124.900002      806.650024   

   adj close_nflx  adj close_spy  close_aapl  close_amzn    close_fb  ...  \
0      127.489998     206.147552   29.037500  753.669983  116.860001  ...   
1      129.410004     207.373947   29.004999  757.179993  118.690002  ...   
2      131.809998     207.209198   29.152500  780.450012  120.669998  ...   
3      131.070007     207.950577   29.477501  795.989990  123.410004  ...   
4      130.949997     207.264084   29.747499  796.919983  124.900002  .

In [26]:
def frame_to_obj(df, target_col):
    """Serialise a frame into one JSON object string.

    The object holds ``"start"`` (first value of the ``Date`` column),
    ``"target"`` (values of ``target_col`` as a list) and one list per
    remaining column, keyed by column name. The target column and any column
    whose lower-cased name is ``date`` are not repeated as extra keys.
    """
    dates = df.loc[:, 'Date'].tolist()
    obj = {"start": dates[0], "target": df.loc[:, target_col].tolist()}

    other_cols = [
        name for name in df.columns
        if not (name == target_col or name.lower() == 'date')
    ]
    columns_as_dicts = df.loc[:, other_cols].to_dict()
    obj.update({name: list(by_row.values()) for name, by_row in columns_as_dicts.items()})

    return json.dumps(obj)

def split_data(df, train_size, test_size, target_col):
    """Write JSON-lines train / validation / test files under ``./data``.

    Rows whose ``Date`` year is greater than 2020 are written as a single JSON
    object to ``test.json``. The remaining rows are cut into rolling windows:
    each window contributes one line to ``train.json`` (the training rows) and
    one line to ``validation.json`` (the training rows followed by the
    window's test rows).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``Date`` column whose string form starts with ``YYYY-`` and
        the ``target_col`` column.
    train_size : int
        Maximum number of rows in each training window.
    test_size : int
        Number of rows following each training window.
    target_col : str
        Column serialised under the ``"target"`` key.

    Returns
    -------
    str
        A short status message naming the files written.
    """
    encoding = "utf-8"
    output_dir = os.path.join(os.getcwd(), "data")
    os.makedirs(output_dir, exist_ok=True)
    FILE_TRAIN = os.path.join(output_dir, "train.json")
    FILE_VALID = os.path.join(output_dir, "validation.json")
    FILE_TEST = os.path.join(output_dir, "test.json")

    # Hold out every row dated after 2020 (year > 2020) for backtesting.
    filt_post20 = df.loc[:, 'Date'].apply(
        lambda x: int(str(x).split('-')[0]) > 2020
    )
    with open(FILE_TEST, "wb") as f0:
        f0.write(frame_to_obj(df.loc[filt_post20, :], target_col).encode(encoding))
        f0.write("\n".encode(encoding))

    df = df.loc[~filt_post20]
    # Largest number of windows that still leaves at least `train_size` rows
    # ahead of the first test window. For test_size == 1 this equals
    # len(df) - train_size; the former `len - train - test + 1` asked for more
    # splits than the data can hold whenever test_size > 1.
    n_splits = (len(df) - train_size) // test_size
    tsplit = TimeSeriesSplit(max_train_size=train_size,
                             test_size=test_size,
                             n_splits=n_splits)

    with open(FILE_TRAIN, "wb") as f1, open(FILE_VALID, "wb") as f2:
        for train_index, test_index in tsplit.split(df):
            # Validation series = training window followed by its test rows.
            valid_index = list(train_index) + list(test_index)
            f1.write(frame_to_obj(df.iloc[train_index], target_col).encode(encoding))
            f1.write("\n".encode(encoding))
            f2.write(frame_to_obj(df.iloc[valid_index], target_col).encode(encoding))
            f2.write("\n".encode(encoding))
    return "Files written. \"test.json\", \"train.json\" and \"validation.json\" created/overwritten."

def split_data_csv(df, train_size, test_size):
    """Write headerless train/test CSV files (plus a test file with header)
    into ``./data``.

    The last column of ``df`` is moved to the front first. Rows whose ``Date``
    year is greater than 2020 go to ``data/test.csv`` (no header) and
    ``data/test_header.csv`` (with header), restricted to columns that contain
    an underscore or are named ``target``. All other rows, minus the date
    column, go to ``data/train.csv`` (no header). ``train_size`` and
    ``test_size`` are accepted but not used. Returns a short status string.
    """
    # Rotate the column order so the last column comes first.
    ordered = df.columns.tolist()
    df = df[ordered[-1:] + ordered[:-1]]

    os.makedirs(os.path.join(os.getcwd(), "data"), exist_ok=True)
    train_stem = os.path.join("data", "train")
    test_stem = os.path.join("data", "test")

    is_holdout = df['Date'].apply(lambda stamp: int(str(stamp).split('-')[0]) > 2020)

    model_cols = [name for name in df.columns if "_" in name or name.lower() == "target"]
    holdout = df.loc[is_holdout, model_cols]
    holdout.to_csv(test_stem + '.csv', header = False, index = False) # This is for S3 channel
    holdout.to_csv(test_stem + '_header.csv', header = True, index = False) # This is for trading simulation

    non_date_cols = [name for name in df.columns if name.lower() != 'date']
    df = df.loc[~is_holdout, non_date_cols]
    print(df.columns)

    df.to_csv(train_stem + '.csv', header = False, index = False)

    return ".csv files written."
        
    
        

In [27]:
target_ticker = "spy" # ticker whose next-day move we predict
data = pd.read_csv("featured_data.csv")

# Label for row t: 1 when the return on the NEXT row is below 0.5%, else 0.
next_ret = data.loc[:, f"ret_{target_ticker}"].shift(-1)
# The final row has no next-day return. Comparing NaN with the threshold would
# silently give it label 0, so rows without a known outcome are dropped.
has_label = next_ret.notna()
data = data.loc[has_label].copy()
# 'target' must be the LAST column here: split_data_csv moves the last column
# to the front before writing the CSV files.
data.loc[:, 'target'] = (next_ret.loc[has_label] < 0.005) * 1

# train_size / test_size are not used by split_data_csv; -1 is a placeholder.
split_data_csv(data, 
           train_size = -1, 
           test_size = -1, 
)

Index(['target', 'ret_fb', 'ret_amzn', 'ret_aapl', 'ret_nflx', 'ret_goog',
       'ret_spy', 'vol_fb', 'vol_amzn', 'vol_aapl',
       ...
       'obv_aapl_lag_1', 'obv_nflx_lag_1', 'obv_goog_lag_1', 'obv_spy_lag_1',
       'retbbk_fb_lag_1', 'retbbk_amzn_lag_1', 'retbbk_aapl_lag_1',
       'retbbk_nflx_lag_1', 'retbbk_goog_lag_1', 'retbbk_spy_lag_1'],
      dtype='object', length=193)


'.csv files written.'

In [23]:
import boto3
import sagemaker
from sagemaker import get_execution_role

# Key prefix under which everything for this experiment is stored in the bucket.
prefix = "sagemaker/binaryforecaster"

sagemaker_session = sagemaker.Session()
role = get_execution_role()
bucket = sagemaker_session.default_bucket()

s3_data_path = f"{bucket}/{prefix}/data"
s3_output_path = f"{bucket}/{prefix}/output"

s3 = boto3.client("s3")

# Push every CSV in ./data to the prefix matching its name: names containing
# "test" go under data/test/, otherwise names containing "train" go under
# data/train/. Anything else is left alone.
# NOTE(review): "test_header.csv" also contains "test", so it is uploaded next
# to the headerless test.csv under data/test/ - confirm that is intended.
for fname in os.listdir("data"):
    if ".csv" not in fname:
        continue
    if "test" in fname:
        channel = "/data/test/"
    elif "train" in fname:
        channel = "/data/train/"
    else:
        continue
    s3.upload_file(os.path.join("data", fname), bucket, prefix + channel + fname)
        
        
# s3.upload_file(FILE_TRAIN, bucket, prefix + "/data/train/" + FILE_TRAIN)
# s3.upload_file(FILE_VALID, bucket, prefix + "/data/validation/" + FILE_VALID)
# s3.upload_file(FILE_TEST, bucket, prefix + "/data/test/" + FILE_TEST)