# Stock purchase recommendations with Machine Learning

In [23]:
import pandas_datareader.data as web
import pandas as pd
import numpy as np
import talib as ta
import matplotlib.pyplot as plt
from tqdm import tqdm # progress bar

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier

In [2]:
# read in the Quandl.com API key - saved in separate file to keep it private

filepath = 'confidential-API-key.txt'  
with open(filepath) as fp:  
   my_Quandl_API_key = fp.readline()

## Load stock data from Quandl

In [3]:
def load_stock_data(ticker, start_date, end_date, Quandl_API_key=my_Quandl_API_key):
    '''
    Downloads stock data from Quandl, drop some columns, resort datafram, and return
    Inputs:
        ticker - a stock ticker symbol (needs to be valid - no error checking implemented)
        start_date - first date of stock prices
        end_date - last date of stock prices
        Quandl_API_key - string with valid API key for Quandl.com data queries
    Outputs:
        stock_data - DataFrame with stock price data, sorted in ascending date order
    '''
    
    # download data from Quandl with Pandas Datareader
    stock_data = web.DataReader(name=symbol, data_source='quandl', start=start_date, end=end_date, access_key=Quandl_API_key)
    
    # need ascending index for the TA-lib indicators to work properly
    stock_data.sort_index(inplace=True)
    
    # keep only the columns with adjusted data to eliminate any issues due to stock splits
    stock_data = stock_data[['AdjVolume', 'AdjOpen', 'AdjHigh', 'AdjLow', 'AdjClose']]
    
    # DataFram has a second level column index with the stock ticker - not needed so drop it
    stock_data.columns = stock_data.columns.droplevel(1)
    
    # add row index (highest number is most recent date) - this will be used to later restack the rows for feature matrix
    stock_data['row_index'] = range(0, stock_data.shape[0])
    
    return stock_data    

In [53]:
# test loading stock data:

start = '2015-04-22'
end = '2017-04-22'
symbol = ['AAPL']

df = load_stock_data(symbol, start, end)

df.head()

Attributes,AdjVolume,AdjOpen,AdjHigh,AdjLow,AdjClose,row_index
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-04-22,37654505.0,121.113778,122.906785,120.474781,122.668353,0
2015-04-23,45770902.0,122.36316,124.385061,122.210564,123.669766,1
2015-04-24,44525905.0,124.451822,124.585344,123.250126,124.25154,2
2015-04-27,96954207.0,126.187605,126.969661,125.081282,126.511872,3
2015-04-28,118923970.0,128.233349,128.314416,123.574394,124.518583,4


## Define Target column: profitability of trade

The goal is the define a column "setup_for_profitable_trade" based on a trading strategy. The data will be analyzed for this trade setup:

Run the analysis in the evening after market close. That day's row of market data will then be either deemed as profitable or not for a trade to be initiated and closed in the following days. This setup of the data ensures no lookahead is happening.

## Trading Strategy:

1. After market close on day N decide if buying stock at market Open on day N+1
2. Open position: submit market order for day N+1 prior to market open
3. Close position: submit sell order for market open for day N+2

Threshold to decide to buy the stock: expected profit from open to close: 0.5% profit

Store in row N if this trade was deemed profitable.

In [54]:
def add_flag_for_profitable_trade_setups(df, delete_interim_calculation_cols=True):
    '''
    Adds column to dataframe that identifies profitable trading setup
    Inputs:
        df - dataframe with stock data
        delete_interim_calculation_cols - if false: keep the interim calculations - good for debugging
    Outputs:
        df - dataframe with additional columns
    
    '''
    
    # trade strategy: after market close on day N, set a buy at market open on day N+1 and sell at market close on day N+1
    # for expected gain of at least 0.5%
    profitability_threshold = 0.005

    # use helper columns to calculate profit
    df['strategy_open_price'] = df['AdjOpen'].shift(-1) # AdjOpen from day N+1
    df['strategy_close_price'] = df['AdjOpen'].shift(-2) # AdjOpen from day N+2
    df['strategy_profit_dollars'] = df['strategy_close_price'] - df['strategy_open_price']
    df['strategy_profitability'] = df['strategy_profit_dollars'] / df['strategy_open_price']

    # use categorical field to encode "setup_for_succesful_trade": 1=yes, 0=no
    df['setup_for_profitable_trade'] = df['strategy_profitability'] >= profitability_threshold
    
    if delete_interim_calculation_cols:
        df = df.drop(columns=['strategy_open_price', 'strategy_close_price', 'strategy_profit_dollars', 'strategy_profitability'])
    
    # drop any rows that have NaNs in them. especially the last few rows will have NaNs because the profitability calc
    # looks into the future beyond the last row. these rows need to get deleted - the profitability is not defined there
    df = df.loc[df.notnull().all(axis=1), :]
    
    return df

In [57]:
# testing: show calculations:
df = add_flag_for_profitable_trade_setups(df, False)
df.tail(10)

Attributes,AdjVolume,AdjOpen,AdjHigh,AdjLow,AdjClose,row_index,setup_for_profitable_trade,strategy_open_price,strategy_close_price,strategy_profit_dollars,strategy_profitability
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2017-04-03,19985714.0,142.545929,142.952608,141.891275,142.53601,491,True,142.089655,143.051798,0.962143,0.006771
2017-04-04,19891354.0,142.089655,143.71637,142.010303,143.597342,492,False,143.051798,143.121231,0.069433,0.000485
2017-04-05,27717854.0,143.051798,144.281753,142.645119,142.853418,493,False,143.121231,142.565767,-0.555464,-0.003881
2017-04-06,21149034.0,143.121231,143.349367,142.288035,142.496334,494,False,142.565767,142.43682,-0.128947,-0.000904
2017-04-07,16658543.0,142.565767,143.012122,142.109493,142.178926,495,False,142.43682,141.782166,-0.654654,-0.004596
2017-04-10,18933397.0,142.43682,142.713758,141.74249,142.010303,496,False,141.782166,140.45302,-1.329146,-0.009375
2017-04-11,30379376.0,141.782166,142.188845,138.925494,140.482777,497,False,140.45302,140.760509,0.307489,0.002189
2017-04-12,20350000.0,140.45302,140.998565,139.867799,140.6514,498,False,140.760509,140.333992,-0.426517,-0.00303
2017-04-13,17822880.0,140.760509,141.226702,139.907475,139.907475,499,False,140.333992,140.264559,-0.069433,-0.000495
2017-04-17,16582094.0,140.333992,140.730752,139.728933,140.681157,500,False,140.264559,140.730752,0.466193,0.003324


In [59]:
# without verbose results:
df = add_flag_for_profitable_trade_setups(df)
df.tail(10)

Attributes,AdjVolume,AdjOpen,AdjHigh,AdjLow,AdjClose,row_index,setup_for_profitable_trade
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-04-03,19985714.0,142.545929,142.952608,141.891275,142.53601,491,True
2017-04-04,19891354.0,142.089655,143.71637,142.010303,143.597342,492,False
2017-04-05,27717854.0,143.051798,144.281753,142.645119,142.853418,493,False
2017-04-06,21149034.0,143.121231,143.349367,142.288035,142.496334,494,False
2017-04-07,16658543.0,142.565767,143.012122,142.109493,142.178926,495,False
2017-04-10,18933397.0,142.43682,142.713758,141.74249,142.010303,496,False
2017-04-11,30379376.0,141.782166,142.188845,138.925494,140.482777,497,False
2017-04-12,20350000.0,140.45302,140.998565,139.867799,140.6514,498,False
2017-04-13,17822880.0,140.760509,141.226702,139.907475,139.907475,499,False
2017-04-17,16582094.0,140.333992,140.730752,139.728933,140.681157,500,False


In [60]:
df['setup_for_profitable_trade'].sum()

180

## Feature Engineering

In [61]:
#split df into features and target (column setup_for_profitable_trade)
df_X_base_data = df.drop(columns=['setup_for_profitable_trade'])

### Feature Engineering - Add Technical Analysis Indicators

In [62]:
def add_TALib_indicator(df, attribute, indicator_func, *args):
    '''
    Adds a column to a dataframe:
        column name is the name of the technical indicator as specified by indicator_func
        column content is the function calculated on the attribute column
    Example: add_TALib_indicator(df, 'AdjClose', ta.RSI, 14) creates a new column called RSI with 
             the 14 day RSI of the values of the column 'AdjClose'
    Inputs:
        df - dataframe - needs to be sorted in date ascending order
        attribute - column name to be used in TA-Lib calculation
        indicator_func - name of a TA-Lib function
        *args - optional parameters for indicator_func
        
    Oupputs:
        df - datarame with new column added
        func_name - name of the new colunm
    
    '''
    # get the name of the indicator from TA-Lib
    func_name = attribute + indicator_func.__name__
    
    # add new column, calculated based on attribute column
    df.loc[:, func_name] = indicator_func(df.loc[:, attribute].values, *args)
    
    return df, func_name

In [63]:
df_X_base_data, indicator_name = add_TALib_indicator(df_X_base_data, 'AdjClose', ta.RSI, 14)

df_X_base_data.tail()

Attributes,AdjVolume,AdjOpen,AdjHigh,AdjLow,AdjClose,row_index,AdjCloseRSI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-04-10,18933397.0,142.43682,142.713758,141.74249,142.010303,496,63.211365
2017-04-11,30379376.0,141.782166,142.188845,138.925494,140.482777,497,53.389801
2017-04-12,20350000.0,140.45302,140.998565,139.867799,140.6514,498,54.235139
2017-04-13,17822880.0,140.760509,141.226702,139.907475,139.907475,499,49.932546
2017-04-17,16582094.0,140.333992,140.730752,139.728933,140.681157,500,54.01813


In [64]:
df_X_base_data.head(16)
#confirms NaN for RSI on top for first 14 dates since it is a 14 day RSI

Attributes,AdjVolume,AdjOpen,AdjHigh,AdjLow,AdjClose,row_index,AdjCloseRSI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-04-22,37654505.0,121.113778,122.906785,120.474781,122.668353,0,
2015-04-23,45770902.0,122.36316,124.385061,122.210564,123.669766,1,
2015-04-24,44525905.0,124.451822,124.585344,123.250126,124.25154,2,
2015-04-27,96954207.0,126.187605,126.969661,125.081282,126.511872,3,
2015-04-28,118923970.0,128.233349,128.314416,123.574394,124.518583,4,
2015-04-29,63386083.0,124.137092,125.500922,122.36316,122.687428,5,
2015-04-30,83195423.0,121.600179,121.962595,118.815296,119.358921,6,
2015-05-01,58512638.0,120.264961,124.108481,119.50198,122.983083,7,
2015-05-04,50988278.0,123.507633,124.52812,122.325011,122.744651,8,
2015-05-05,49271416.0,122.220101,122.506029,119.959769,119.978843,9,


### Feature Engineering - Changing from actual values to percentage changes

In [65]:
def feat_eng_changes_values_to_change(df, cols_set_vals_to_change, delete_original_cols=True):
    '''
    Instead of the actual values in some columns, we care about the change from one day to the next.
    This function calculates that change for the given columns and then either keeps or drops (default) the origianl columns
    Input:
        df - a dataframe
        cols_set_vals_to_change - names of columns to work on.
        delete_original_cols - keep or delete original columns
    Output:
        df - dataframe with new columns added. the value in row N is now the change from row N-1 to row N (instead of the actual values)
    '''    

    # calculate the change from row N-1 to row N
    df_chg_cols = (df[cols_set_vals_to_change] / df[cols_set_vals_to_change].shift(1) - 1)

    # add suffix to the column names
    df_chg_cols = df_chg_cols.add_suffix('_chg')

    # join the data onto the original data fram
    df = df.join(df_chg_cols)

    if delete_original_cols:
        # drop the original columns
        df = df.drop(columns=cols_set_vals_to_change)
        
    return df

cols_set_vals_to_change = ['AdjVolume', 'AdjOpen', 'AdjLow', 'AdjHigh', 'AdjClose']
df_X_base_data = feat_eng_changes_values_to_change(df_X_base_data, cols_set_vals_to_change)

df_X_base_data.tail()

Attributes,row_index,AdjCloseRSI,AdjVolume_chg,AdjOpen_chg,AdjLow_chg,AdjHigh_chg,AdjClose_chg
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-04-10,496,63.211365,0.136558,-0.000904,-0.002583,-0.002086,-0.001186
2017-04-11,497,53.389801,0.604539,-0.004596,-0.019874,-0.003678,-0.010756
2017-04-12,498,54.235139,-0.330138,-0.009375,0.006783,-0.008371,0.0012
2017-04-13,499,49.932546,-0.124183,0.002189,0.000284,0.001618,-0.005289
2017-04-17,500,54.01813,-0.069618,-0.00303,-0.001276,-0.003512,0.00553


### Feature Engineering - Reshaping the prior day data into ML-ready features matrix
#### --- build pieces for the dataframe section unpivoting ---
Goal: pull out n_features rows, flatten, and build meaningful column names that indicate how many days' back the data is from

In [66]:
# parameter for how many days of history to include in feature list
n_days_features = 5

# total lenght of df
n_data_points = df.shape[0]

In [67]:
i = 505
# pull out n_days_features of rows from current position
df_extract = df_X_base_data.iloc[i-n_days_features:i, :].copy()

# change the index the be "days into the past" - eg current day is 0, prior day is -1, ...
df_extract.loc[:, 'row_index'] = range(-n_days_features+1, 1)

# make this the new index
df_extract.set_index('row_index', inplace=True)

df_extract

ValueError: Must have equal len keys and value when setting with an iterable

In [16]:
# unstack and make it tall (ie unpivot)
df_extract = df_extract.unstack().reset_index()
df_extract

Unnamed: 0,Attributes,row_index,0
0,AdjCloseRSI,-4,54.01813
1,AdjCloseRSI,-3,50.43809
2,AdjCloseRSI,-2,47.632042
3,AdjCloseRSI,-1,56.46099
4,AdjCloseRSI,0,55.487879
5,AdjVolume_chg,-4,-0.069618
6,AdjVolume_chg,-3,-0.11365
7,AdjVolume_chg,-2,0.178998
8,AdjVolume_chg,-1,0.345744
9,AdjVolume_chg,0,-0.257236


In [17]:
# create new column with combined field names of attribute and index
# eg: AdjClose-1 for the adjusted close of day N-1 or AdjHigh-4 for the adjusted High of day N-4
df_extract['Attribute-index'] = df_extract['Attributes'] + df_extract['row_index'].apply(str)
# then drop Attributes and row_index columns since they are not needed anymore
df_extract.drop(columns=['Attributes', 'row_index'], inplace=True)
df_extract

Unnamed: 0,0,Attribute-index
0,54.01813,AdjCloseRSI-4
1,50.43809,AdjCloseRSI-3
2,47.632042,AdjCloseRSI-2
3,56.46099,AdjCloseRSI-1
4,55.487879,AdjCloseRSI0
5,-0.069618,AdjVolume_chg-4
6,-0.11365,AdjVolume_chg-3
7,0.178998,AdjVolume_chg-2
8,0.345744,AdjVolume_chg-1
9,-0.257236,AdjVolume_chg0


In [18]:
# set index one and transpose
target_row = df_extract.set_index('Attribute-index').T
# we now have one row of data that represents the prior n_feature_days worth of data:
target_row

Attribute-index,AdjCloseRSI-4,AdjCloseRSI-3,AdjCloseRSI-2,AdjCloseRSI-1,AdjCloseRSI0,AdjVolume_chg-4,AdjVolume_chg-3,AdjVolume_chg-2,AdjVolume_chg-1,AdjVolume_chg0,...,AdjHigh_chg-4,AdjHigh_chg-3,AdjHigh_chg-2,AdjHigh_chg-1,AdjHigh_chg0,AdjClose_chg-4,AdjClose_chg-3,AdjClose_chg-2,AdjClose_chg-1,AdjClose_chg0
0,54.01813,50.43809,47.632042,56.46099,55.487879,-0.069618,-0.11365,0.178998,0.345744,-0.257236,...,-0.003512,0.001128,-0.000282,0.006479,-0.001679,0.00553,-0.004442,-0.003683,0.012511,-0.001193


In [19]:
df_X_base_data.index.to_list()[504]

Timestamp('2017-04-21 00:00:00')

In [20]:
target_row['Index'] = df_X_base_data.index.to_list()[504]
target_row = target_row.set_index('Index')
target_row

Attribute-index,AdjCloseRSI-4,AdjCloseRSI-3,AdjCloseRSI-2,AdjCloseRSI-1,AdjCloseRSI0,AdjVolume_chg-4,AdjVolume_chg-3,AdjVolume_chg-2,AdjVolume_chg-1,AdjVolume_chg0,...,AdjHigh_chg-4,AdjHigh_chg-3,AdjHigh_chg-2,AdjHigh_chg-1,AdjHigh_chg0,AdjClose_chg-4,AdjClose_chg-3,AdjClose_chg-2,AdjClose_chg-1,AdjClose_chg0
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-04-21,54.01813,50.43809,47.632042,56.46099,55.487879,-0.069618,-0.11365,0.178998,0.345744,-0.257236,...,-0.003512,0.001128,-0.000282,0.006479,-0.001679,0.00553,-0.004442,-0.003683,0.012511,-0.001193


#### --- done building the pieces, now implement in one loop to cycle through all rows of df ---

In [68]:
def create_feature_cols_df(df_X_base_data, n_days_features=n_days_features):
    '''
    Take dataframe with date index (sorted increasing time) with multiple columns and return a new wider dataframe
    where the rows for the last n_days_features have been pivoted into additional columns
    Input:
        df_X_base_data - dataframe with date index
    Output:
        df_X - datafram that has length of df_X_base_data.shape[0] - n_days_features and more columns than df_X_base_data
    '''
    
    df_X = pd.DataFrame()
    
    # total lenght of df
    n_data_points = df_X_base_data.shape[0]
    
    # cycle through each row of df, start at n_days_features-1 because we wouldn't have enough history for first rows
    for i in tqdm(range(n_days_features, n_data_points+1), desc='reshaping data into feature rows'):
        # i contains the rows number of df

        df_extract = df_X_base_data.iloc[i-n_days_features:i, :]

        # pull out n_days_features of rows from current position
        df_extract = df_X_base_data.iloc[i-n_days_features:i, :].copy()

        # change the index to be "days into the past" - eg current day is 0, prior day is -1, ...
        df_extract.loc[:, 'row_index'] = range(-n_days_features+1, 1)

        # make this the new index
        df_extract.set_index('row_index', inplace=True)

        # unstack and make it tall (ie unpivot)
        df_extract = df_extract.unstack().reset_index()

        # create new column with combined field names of attribute and index
        # eg: AdjClose-1 for the adjusted close of day N-1 or AdjHigh-4 for the adjusted High of day N-4
        df_extract['Attribute-index'] = df_extract['Attributes'] + df_extract['row_index'].apply(str)
        # then drop Attributes and row_index columns since they are not needed anymore
        df_extract.drop(columns=['Attributes', 'row_index'], inplace=True)

        # set index one and transpose
        target_row = df_extract.set_index('Attribute-index').T
        # we now have one row of data that represents the prior n_feature_days worth of data

        # fill in the target_row index with the date from the index of the source dataframe df_X_base_data (ie, day N)
        target_row['Index'] = df_X_base_data.index.to_list()[i-1] # zero-indexed so need minus 1
        target_row = target_row.set_index('Index')

        df_X = df_X.append(target_row)
        
    return df_X

# split df into features and target (column setup_for_profitable_trade)
df_X_base_data = df.drop(columns=['setup_for_profitable_trade'])
df_y = df['setup_for_profitable_trade']

# feature engineering: instead of dollars/absolute values, calculate change from one day to next
cols_set_vals_to_change = ['AdjVolume', 'AdjOpen', 'AdjLow', 'AdjHigh', 'AdjClose']
df_X_base_data = feat_eng_changes_values_to_change(df_X_base_data, cols_set_vals_to_change)

# create wide features matrix that includes prior days' data as columns
df_X = create_feature_cols_df(df_X_base_data)
    
df_X.tail()

reshaping data into feature rows: 100%|██████| 497/497 [00:07<00:00, 67.13it/s]


Attribute-index,AdjVolume_chg-4,AdjVolume_chg-3,AdjVolume_chg-2,AdjVolume_chg-1,AdjVolume_chg0,AdjOpen_chg-4,AdjOpen_chg-3,AdjOpen_chg-2,AdjOpen_chg-1,AdjOpen_chg0,...,AdjHigh_chg-4,AdjHigh_chg-3,AdjHigh_chg-2,AdjHigh_chg-1,AdjHigh_chg0,AdjClose_chg-4,AdjClose_chg-3,AdjClose_chg-2,AdjClose_chg-1,AdjClose_chg0
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-04-10,-0.004721,0.393462,-0.236989,-0.212326,0.136558,-0.003201,0.006771,0.000485,-0.003881,-0.000904,...,0.005343,0.003934,-0.006462,-0.002353,-0.002086,0.007446,-0.005181,-0.0025,-0.002227,-0.001186
2017-04-11,0.393462,-0.236989,-0.212326,0.136558,0.604539,0.006771,0.000485,-0.003881,-0.000904,-0.004596,...,0.003934,-0.006462,-0.002353,-0.002086,-0.003678,-0.005181,-0.0025,-0.002227,-0.001186,-0.010756
2017-04-12,-0.236989,-0.212326,0.136558,0.604539,-0.330138,0.000485,-0.003881,-0.000904,-0.004596,-0.009375,...,-0.006462,-0.002353,-0.002086,-0.003678,-0.008371,-0.0025,-0.002227,-0.001186,-0.010756,0.0012
2017-04-13,-0.212326,0.136558,0.604539,-0.330138,-0.124183,-0.003881,-0.000904,-0.004596,-0.009375,0.002189,...,-0.002353,-0.002086,-0.003678,-0.008371,0.001618,-0.002227,-0.001186,-0.010756,0.0012,-0.005289
2017-04-17,0.136558,0.604539,-0.330138,-0.124183,-0.069618,-0.000904,-0.004596,-0.009375,0.002189,-0.00303,...,-0.002086,-0.003678,-0.008371,0.001618,-0.003512,-0.001186,-0.010756,0.0012,-0.005289,0.00553


### Feature Engineering - Date information

In [70]:
def feat_eng_append_date_index_content(df):
    '''
    Assumes that the df index is date-time. Bolts on additional columns about the date
    '''

    df['year'] = df.index.year.values
    df['month'] = df.index.month.values
    df['week'] = df.index.week.values
    df['weekday'] = df.index.weekday.values
    df['day'] = df.index.day.values
    df['year'] = df.index.year.values
    df['year'] = df.index.year.values
    
    return df
    
df_X = feat_eng_append_date_index_content(df_X)
df_X.tail()

Attribute-index,AdjVolume_chg-4,AdjVolume_chg-3,AdjVolume_chg-2,AdjVolume_chg-1,AdjVolume_chg0,AdjOpen_chg-4,AdjOpen_chg-3,AdjOpen_chg-2,AdjOpen_chg-1,AdjOpen_chg0,...,AdjClose_chg-4,AdjClose_chg-3,AdjClose_chg-2,AdjClose_chg-1,AdjClose_chg0,year,month,week,weekday,day
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-04-10,-0.004721,0.393462,-0.236989,-0.212326,0.136558,-0.003201,0.006771,0.000485,-0.003881,-0.000904,...,0.007446,-0.005181,-0.0025,-0.002227,-0.001186,2017,4,15,0,10
2017-04-11,0.393462,-0.236989,-0.212326,0.136558,0.604539,0.006771,0.000485,-0.003881,-0.000904,-0.004596,...,-0.005181,-0.0025,-0.002227,-0.001186,-0.010756,2017,4,15,1,11
2017-04-12,-0.236989,-0.212326,0.136558,0.604539,-0.330138,0.000485,-0.003881,-0.000904,-0.004596,-0.009375,...,-0.0025,-0.002227,-0.001186,-0.010756,0.0012,2017,4,15,2,12
2017-04-13,-0.212326,0.136558,0.604539,-0.330138,-0.124183,-0.003881,-0.000904,-0.004596,-0.009375,0.002189,...,-0.002227,-0.001186,-0.010756,0.0012,-0.005289,2017,4,15,3,13
2017-04-17,0.136558,0.604539,-0.330138,-0.124183,-0.069618,-0.000904,-0.004596,-0.009375,0.002189,-0.00303,...,-0.001186,-0.010756,0.0012,-0.005289,0.00553,2017,4,16,0,17


##### just testing aroung: can also use only one or a few columns and then join the dataframes back together

In [None]:
df_XO = create_feature_cols_df(df_X_base_data[['AdjOpen']])
df_XO.tail()

In [None]:
df_X.join(df_XO).tail()

## Machine Learning

### Build Model

In [34]:
# need to remove first n_days_features rows - they were eliminated during build of df_X and sizes need to match
df_y = df_y.iloc[n_days_features-1:]

In [75]:
# remove all rows that have any NaNs in them - they come from technical indicators or the reshaping and we just don't have 
# any good strategy for imputation other than starting with more time series data
df_X = df_X.loc[df_X.notnull().all(axis=1), :]
df_X.shape

(496, 30)

In [79]:
# now we need to keep the same rows that we kept in df_X in df_y. We'll use the index to filter
df_y = df_y.loc[df_X.index]
df_y.shape

(496,)

In [80]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2)

In [81]:
pipeline = Pipeline([
    ('randomForest', RandomForestClassifier())
])
pipeline.get_params()

{'memory': None,
 'steps': [('randomForest',
   RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
               max_depth=None, max_features='auto', max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
               oob_score=False, random_state=None, verbose=0,
               warm_start=False))],
 'randomForest': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False),
 'randomForest__bootstrap': True,
 'randomForest__c

In [82]:
pipeline.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('randomForest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [83]:
y_pred = pipeline.predict(X_test)

In [85]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.69      0.89      0.78        65
        True       0.56      0.26      0.35        35

   micro avg       0.67      0.67      0.67       100
   macro avg       0.63      0.57      0.57       100
weighted avg       0.65      0.67      0.63       100



In [86]:
y_pred.shape

(100,)