## Importing Required Libraries

In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import math
from datetime import datetime, timedelta

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn import linear_model
from sklearn.linear_model import SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import LinearSVR

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error

import warnings

# supress svm ConvergenceWarning to increase iterations. Already set to max_iter=10000 
warnings.filterwarnings('ignore', category=UserWarning) 


## Setting up utility/helper functions for data import and preparation


In [21]:

# see docstrings for each functions purpose, inputs and outputs

def get_data(symbols):
    
    """Get prices for ticker symbols provided over the date range given by start and end dates
    
    Input: symbols - (str or list) ticker(s) of stocks for which prices are required
               
    Output: Series of stock prices if one ticker provided
            Dataframe of stock prices if list of tickers is provided 
    """
    
    is_single_ticker = False #initialize flag for single ticker request
    
    # if symbols is a list containing more than 1 tickers combine to one string
    # yfinance takes list as one string with tickers separated by space
    # for example ['GOOG', 'AAPL'] should be passed as 'GOOG AAPL'
         
    if len(symbols) > 1:
        symbols = ' '.join(map(str, symbols))
    else:       
        is_single_ticker = True   # flag if only one ticker symbol was provided. used later
    
    # API call to Yahoo Finance
    df = yf.download(symbols, period='max', group_by='ticker')
    
    # This section retains the multi-index column structure for consistency (if one ticker is requested)
    # yfinance does not return a multi-index structure if one ticker is requested
    
    if is_single_ticker:    
              
        # create list containing 6 identical elements representing the symbol
        # 5 because the API column return 6 columns and symbols already contains 1 column
        # hence 5 repeats needed to align with the 6 columns returned
        
        symbols.extend([symbols[0] for i in range(5)]) 
                
        # create multi-index column hierachy
        tuples = tuple(zip(symbols, df.columns))
               
        # convert column indext to a multi-index structure
        df.columns = pd.MultiIndex.from_tuples(tuples)
    
    return df

def format_tickers(tickers):
    
    """Formats the user entry into a list of tickers without punctuation and extra spaces
    
    Input: tickers (str) a string representing ticker symbols for which stock data is requested
    
    Output: clean_tickers (list) a list representing the list of stock tickers 
    
    """
    
    # Keep alphabetic characters and '-', remove all other characters 
    # also remove trailing spaces with strip()
    cleaned_tickers = re.sub("[^a-zA-z-]"," ", tickers).strip()
    
    # convert symbol to uppercase in case user entered as lower case
    # this is for consistency with calls for multiple tickers which display tickers as uppercase
    cleaned_tickers = cleaned_tickers.upper().split(" ")
    
    # for certain inputs, split function above may add additional blank elements
    # use list comprehension to retain elements that are not blanks ('')
    cleaned_tickers = [x for x in cleaned_tickers if x != '']
    
    return cleaned_tickers

def plot_data(df, title="Stock Prices", figsize=(8,6), xlabel="Date", ylabel="Price"):
    """ Plot stock prices
    Input: df - dataframe with stock prices for single or multiple stocks
    Outpt: None - function only plots the stock prices
    """
    
    ax = df.plot(title=title, fontsize=12)
    plt.figure(figsize=figsize)
    
    # add axis labels    
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
 
    plt.show();

def get_rolling_mean(values, window):
    """Return rolling mean of given values, using specified window size."""
    return values.rolling(window=window).mean()

def get_rolling_exponential_mean(values, span):
    """Return rolling mean of given values, using specified window size."""
    return values.ewm(span=span).mean()

def get_rolling_std(values, window):
    """Return rolling standard deviation of given values, using specified window size."""
    return values.rolling(window=window).std()

def create_features(df, forecast_range=28):
    
    """Creates new columns for features to be used for modelling
    
    Input: df (dataframe) representing the adjusted closing prices
           forecast_range (int) number of days in the future to be predicted
               
    Output: df_features (dataframe) which adds the following columns:
            1. forward price: this the value to be predicted (Y-value)
            2. rolling_mean: moving average over the forecast period
            3. sma_50: 50-day simple moving average (sma)
            4. sma_100: 100-day simple moving average (sma)
            5. rolling_std: moving standard deviation over the forecast period
            6. exponential_rolling_mean: exponential average over the forecast period
    """
    
    df_features = df.copy()    
    
    # forward fill
    # not using backfill after 'ffill' since remaining NaNs represent period before stock started trading
    df_features.fillna(method='ffill', inplace=True)
    
    df_features['forward_price'] = df_features['Adj Close'].shift(-forecast_range)
    df_features['rolling_mean'] = get_rolling_mean(df, forecast_range)
    df_features['sma_50'] = get_rolling_mean(df, 50)
    df_features['sma_100'] = get_rolling_mean(df, 100)
    df_features['rolling_std'] = get_rolling_std(df, forecast_range)
    df_features['exponential_rolling_mean'] = get_rolling_exponential_mean(df, forecast_range)
    df_features.dropna(how='any', inplace=True)
    
    return df_features

def get_returns(df):
    """Return the period returns for a given time series"""
   
    # calculate returns for each row by diving closing price by previous period closing
    # df.shift(0) gives the previous period closing
    
    df_returns = (df / df.shift(1)) - 1
    
    # replace values in first row with 0 
    # since first row will have NaN because it has no previous period closing
    df_returns.replace(to_replace=np.nan, value = 0, inplace=True)
    
    return df_returns

def get_cumulative_returns(df):
    """Return the cumulative returns over a given time series"""
    
    try:
        cum_returns = (df / df.iloc[0, :]) - 1    
    except:
        cum_returns = (df / df[0]) - 1
         
    return cum_returns


## Importing and preparing data

In [3]:
# import data from yahoo finance using the get_data helper function created above
tickers = ['XOM', 'AAPL', 'MSFT', 'GOOG']

# get stock data from yahoo finance using get_data function above
df = get_data(tickers)

# print number of rows in dataset
print("\nThere are {} tradings days worth of data\n".format(df.shape[0]))

display(df.head())

[*********************100%***********************]  4 of 4 completed

There are 15119 tradings days worth of data



Unnamed: 0_level_0,MSFT,MSFT,MSFT,MSFT,MSFT,MSFT,GOOG,GOOG,GOOG,GOOG,...,AAPL,AAPL,AAPL,AAPL,XOM,XOM,XOM,XOM,XOM,XOM
Unnamed: 0_level_1,Open,High,Low,Close,Adj Close,Volume,Open,High,Low,Close,...,Low,Close,Adj Close,Volume,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1962-01-02,,,,,,,,,,,...,,,,,0.0,1.589844,1.578125,1.578125,0.102984,902400
1962-01-03,,,,,,,,,,,...,,,,,0.0,1.601563,1.578125,1.601563,0.104513,1200000
1962-01-04,,,,,,,,,,,...,,,,,0.0,1.613281,1.597656,1.605469,0.104768,1088000
1962-01-05,,,,,,,,,,,...,,,,,0.0,1.613281,1.566406,1.570313,0.102474,1222400
1962-01-08,,,,,,,,,,,...,,,,,0.0,1.582031,1.546875,1.566406,0.102219,1388800


In [4]:
forecast_range = 20

tickers = df.columns.get_level_values(0).unique()

df_features = pd.DataFrame()

for ticker in tickers: 
    ticker_id = ticker
    adj_close = df.columns.get_level_values(1)[4]
    df_adj_close = pd.DataFrame(df[ticker][adj_close])
    df_temp = create_features(df_adj_close, int(forecast_range))
    df_temp['Ticker'] = ticker
    print("\n", ticker)
    display(df_temp.head(3))
 
    df_features = pd.concat([df_features, df_temp])



 MSFT


Unnamed: 0_level_0,Adj Close,forward_price,rolling_mean,sma_50,sma_100,rolling_std,exponential_rolling_mean,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1986-08-04,0.061926,0.06083,0.063913,0.067692,0.066505,0.002578,0.064696,MSFT
1986-08-05,0.061378,0.059186,0.063776,0.06756,0.066505,0.002639,0.06438,MSFT
1986-08-06,0.06083,0.064119,0.063749,0.067374,0.066478,0.002668,0.064042,MSFT



 GOOG


Unnamed: 0_level_0,Adj Close,forward_price,rolling_mean,sma_50,sma_100,rolling_std,exponential_rolling_mean,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2005-01-10,97.165802,98.949112,93.763052,90.512337,76.899263,4.019539,94.195688,GOOG
2005-01-11,96.408638,95.432297,94.338147,90.541229,77.363522,3.47114,94.406453,GOOG
2005-01-12,97.325203,93.639015,94.753839,90.534753,77.797247,3.29289,94.68444,GOOG



 AAPL


Unnamed: 0_level_0,Adj Close,forward_price,rolling_mean,sma_50,sma_100,rolling_std,exponential_rolling_mean,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1981-05-06,0.095649,0.112246,0.09696,0.091378,0.097571,0.003657,0.096439,AAPL
1981-05-07,0.09696,0.110499,0.097091,0.091553,0.097536,0.003605,0.096489,AAPL
1981-05-08,0.097833,0.106568,0.097178,0.091718,0.097562,0.0036,0.096617,AAPL



 XOM


Unnamed: 0_level_0,Adj Close,forward_price,rolling_mean,sma_50,sma_100,rolling_std,exponential_rolling_mean,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1962-05-23,0.108139,0.102667,0.111604,0.111682,0.110221,0.001654,0.111109,XOM
1962-05-24,0.108661,0.101625,0.111354,0.111618,0.110278,0.001705,0.110876,XOM
1962-05-25,0.106315,0.101104,0.111026,0.111471,0.110296,0.002001,0.110442,XOM


In [5]:
df_features.head()

Unnamed: 0_level_0,Adj Close,forward_price,rolling_mean,sma_50,sma_100,rolling_std,exponential_rolling_mean,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1986-08-04,0.061926,0.06083,0.063913,0.067692,0.066505,0.002578,0.064696,MSFT
1986-08-05,0.061378,0.059186,0.063776,0.06756,0.066505,0.002639,0.06438,MSFT
1986-08-06,0.06083,0.064119,0.063749,0.067374,0.066478,0.002668,0.064042,MSFT
1986-08-07,0.060282,0.066859,0.063776,0.067133,0.066434,0.002627,0.063684,MSFT
1986-08-08,0.06083,0.066311,0.063749,0.06687,0.066412,0.002656,0.063412,MSFT


## Setting up learner class to help fit/train model and generate predictions

In [6]:
##########################################################################################################

class StockPredictionLearner(object):
    
    """Trains and evaluates a stock prediction model given the model type and X and Y values.
       Also generates a prediction using the trained model
       
       Attributes: model (instatiated object) machine learning model specified e.g linear regression, SVM, KNN
                   X and Y (array) representing predictor variables and target variablereg = make_pipeline(StandardScaler(),
...                     SGDRegressor(max_iter=1000, tol=1e-3)) split into train/test
    """
    
    def __init__(self, clf, params=None):
        self.clf = clf(**params)
        
    def train(self, X_train, Y_train):
        """Train a model given model object/type, independent (X) and dependent (Y) values (Y)
        
        Input: model   - instantiated model object
               X_train - array of predictor variables
               Y_train - array of target variables
        
        Output: A fitted linear regression model
        """
        
        self.clf.fit(X_train, Y_train)
                
    def predict(self, X_input):
        """ Returns a prediction (dependent variable) given the model and indepent variables (X_input)
        Input: model - a trained linear regression model
               X_input - array of the independent variables to be used as predictors
        Output: array of predicted values based on the model and input values provided
        """
        
        return self.clf.predict(X_input)
    
    def fit(self,x,y):
        
        return self.clf.fit(x, y)
    
    def feature_importances(self, x, y):
        
        print(self.clf.fit(x, y).feature_importances_)
        
#########################################################################################################

## Setting up utility/helper functions for model training and evaluation

In [7]:
def prepare_train_test_split(df):
    
    """Generates train_test_split for given dataframe"""
    
    X = np.array(df.drop(['forward_price', 'Ticker'], axis=1))
    y = np.array(df['forward_price'])
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False, random_state=42)
    
    return X_train, X_test, y_train, y_test, X, y

def train_k_folds(clf, X_train, y_train, X_test, X, y):
    """ Fits a model using K-fold cross validation
    
    Input: clf (object) instantiated model object
           X/y train/test (arrays) representing the train/test X/y data 
    
    Output: arrays of cross validation results
    
    """
    
    # Set K-Fold parameters
    ntrain = X_train.shape[0]
    ntest = X_test.shape[0]
    nfolds = 5 
    kf = KFold(n_splits=nfolds, shuffle=False)
    
    kf_train = np.zeros((ntrain,))
    kf_test = np.zeros((ntest,))
    kf_test_skf = np.empty((nfolds, ntest))
    
    i = 0
    
    for train_index, test_index in kf.split(X_train):
        i=+1
        x_tr, x_te = X_train[train_index], X[test_index]
        y_tr, y_te = y[train_index], y[test_index]
       
        clf.train(x_tr, y_tr)

        kf_train[test_index] = clf.predict(x_te)
        kf_test_skf[i, :] = clf.predict(X_test)

    kf_test[:] = kf_test_skf.mean(axis=0)
    
    return kf_train.reshape(-1, 1), kf_test.reshape(-1, 1)

def evaluate_model(model, X_test, Y_test):
    """ Returns scoring metrics to evaluate the performance of the model

    Input: model - a trained linear regression model
           X_test, Y_test - independent and dependent variables from the training dataset

    Output: r-squared, mean squared error, root mean squared error, 
            mean absolute error, mean absolute percentage error 
    """
    
    y_pred = model.predict(X_test)
    
    r_squared = r2_score(Y_test, y_pred)
    mse = mean_squared_error(Y_test, y_pred)
    rmse = math.sqrt(mse)
    mae = mean_absolute_error(Y_test, y_pred)
    mape = mean_absolute_percentage_error(Y_test, y_pred)

    return r_squared, mse, rmse, mae, mape
    
def print_evaluation(result):
    
    """Prints the results for the evaluation of the model performed in the evaluate_model function
    
    Input: result (list/tuple) containing the r_squared, mse, rmse, mae, mape
    
    Output: None (performs print action)
    """
    r_squared, mse, rmse, mae, mape = result[0], result[1], result[2], result[3], result[4]
   
    print("R-squared:\t {:.2f}".format(r_squared))
    print("MSE:\t\t {:.2f}".format(mse))
    print("RMSE:\t\t {:.2f}".format(rmse))
    print("MAE:\t\t {:.2f}".format(mae))
    print("MAPE:\t\t {:.1%}".format(mape))
    print("\n")
    

## Training model

In [8]:
# Set model paramenters
ln_params = {}  # Linear Regression
kr_params = {'alpha': 1.0} # Kernel Ridge Regression
svr_params = {'max_iter':10000} # Linear Support Vector Regression

# Set up object for each models
ln_reg = StockPredictionLearner(clf=linear_model.LinearRegression, params=ln_params)
kr = StockPredictionLearner(clf=KernelRidge, params=kr_params)
ln_svr = StockPredictionLearner(clf=LinearSVR, params=svr_params)

# set up dictionary to be used to loop through the evaluation results of each model
models = {"Linear Regression": ln_reg,
          "Kernel Ridge Regression": kr,
          "Linear Support Vector Regression": ln_svr
         }


In [9]:
def train_test_evaluate(df_data):
    
    """Combines the prepare_train_test_split, train_k_folds, evaluate_model functions above
    to complete the training and testing cycles and recommend the best model to be use for 
    prediction for each ticker provided
    
    Input: df_data (dataframe) representing dataset to be used for training and testing
    
    Output: best_models (dict) representing the model that should be used to generate predictions of each ticker
    
    """
        
    best_models = dict() # set up dictionary to store the best model for each ticker

    for ticker in tickers: # train each model for each ticker in the list of tickers requested

        df_temp_ticker = df_data[df_data['Ticker'] == ticker]

        # generate train/test split using the prepare_train_test_split function 
        # prepare_train_test_split function defined in set-up section above
        X_train, X_test, y_train, y_test, X, y = prepare_train_test_split(df_temp_ticker)

        # Train and test models by calling the train_k_folds defined in the set-up section above
        ln_train, ln_test = train_k_folds(ln_reg, X_train, y_train, X_test, X, y) # Linear Regression
        kr_train, kr_test = train_k_folds(kr, X_train, y_train, X_test, X, y) # Kernel Ridge Regression
        svr_train, svr_test = train_k_folds(ln_svr, X_train, y_train, X_test, X, y) # Linear Support Vector Regression

        print("Training completed for {}\n".format(ticker))

        # set up dataframe to store the evaluation results of each model
        df_results = pd.DataFrame(columns=['R-Squared', 'MSE', 'RMSE', 'MAE', 'MAPE'])

        # loop through the testing results of each model and add results to the dataframe
        for model_name, model_obj in models.items():
            df_results.loc[model_name] = list(evaluate_model(model_obj, X_test, y_test))

        display(df_results)

        # identify the best model (model with lowest RSME)
        best_model_name = df_results['RMSE'].idxmin()   
        best_models[ticker] = best_model_name

        print("The best model for {} is the \033[1m {} \033[1m\033[0m model\n\n".format(ticker, best_model_name))

    print("Training and evaluation completed successfully for all tickers and all models!")
    
    return best_models


In [10]:
# Call train_test_evaluate function above to train and test
# and get results for best model to use

model_to_use_for_prediction = train_test_evaluate(df_features)

Training completed for MSFT



Unnamed: 0,R-Squared,MSE,RMSE,MAE,MAPE
Linear Regression,0.985019,100.9501,10.047393,6.608835,0.050497
Kernel Ridge Regression,0.986825,88.774959,9.422046,6.136735,0.047285
Linear Support Vector Regression,0.986536,90.725643,9.525001,6.155947,0.047352


The best model for MSFT is the [1m Kernel Ridge Regression [1m[0m model


Training completed for GOOG



Unnamed: 0,R-Squared,MSE,RMSE,MAE,MAPE
Linear Regression,0.961671,14958.121133,122.303398,91.393461,0.05565
Kernel Ridge Regression,0.963536,14230.316415,119.29089,88.663699,0.054437
Linear Support Vector Regression,0.949323,19776.988929,140.630683,109.037956,0.065055


The best model for GOOG is the [1m Kernel Ridge Regression [1m[0m model


Training completed for AAPL



Unnamed: 0,R-Squared,MSE,RMSE,MAE,MAPE
Linear Regression,0.982253,30.423583,5.515758,3.488243,0.063445
Kernel Ridge Regression,0.98263,29.777555,5.456881,3.445058,0.061827
Linear Support Vector Regression,0.981581,31.576432,5.619291,3.519596,0.064446


The best model for AAPL is the [1m Kernel Ridge Regression [1m[0m model


Training completed for XOM



Unnamed: 0,R-Squared,MSE,RMSE,MAE,MAPE
Linear Regression,0.843355,14.36269,3.789814,2.725567,0.050918
Kernel Ridge Regression,0.842338,14.455968,3.8021,2.737166,0.051131
Linear Support Vector Regression,0.840571,14.61791,3.823337,2.777341,0.051634


The best model for XOM is the [1m Linear Regression [1m[0m model


Training and evaluation completed successfully for all tickers and all models!


In [11]:
# display summary of the best model for each ticker symbol 
# based on the results of the train/test/evaluate cycle above

model_to_use_for_prediction

{'MSFT': 'Kernel Ridge Regression',
 'GOOG': 'Kernel Ridge Regression',
 'AAPL': 'Kernel Ridge Regression',
 'XOM': 'Linear Regression'}

## USER INTERFACE - for generating a prediction

If suggestions for valid ticker symbols are needed, execute this code cell to get the list of ticker symbols for stocks in the S&P 500 index. Yahoo Finance requires the ticker symbol to be provided as the identifier in order to download stock price data using the API

In [12]:
# Get list of S&P 500 stocks

# Import table of S&P 500 stocks from wikipedia
sp500_universe = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

# replace "." with "-" in ticker symbols to conform with yahoo finance format
# for example "BRK.B" should be converted to "BRK-B"
sp500_universe['Symbol'] = sp500_universe['Symbol'].replace(regex=r"[.]", value="-")

display(sp500_universe.head())
print(list(sp500_universe['Symbol']))

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M,reports,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1976-08-09,66740,1902
1,AOS,A. O. Smith,reports,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
3,ABBV,AbbVie,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ABMD,Abiomed,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094,1981


['MMM', 'AOS', 'ABT', 'ABBV', 'ABMD', 'ACN', 'ATVI', 'ADBE', 'AAP', 'AES', 'AFL', 'A', 'AIG', 'APD', 'AKAM', 'ALK', 'ALB', 'ARE', 'ALGN', 'ALLE', 'LNT', 'ALL', 'GOOGL', 'GOOG', 'MO', 'AMZN', 'AMCR', 'AMD', 'AEE', 'AAL', 'AEP', 'AXP', 'AMT', 'AWK', 'AMP', 'ABC', 'AME', 'AMGN', 'APH', 'ADI', 'ANSS', 'ANTM', 'AON', 'APA', 'AAPL', 'AMAT', 'APTV', 'ADM', 'ANET', 'AJG', 'AIZ', 'T', 'ATO', 'ADSK', 'ADP', 'AZO', 'AVB', 'AVY', 'BKR', 'BLL', 'BAC', 'BBWI', 'BAX', 'BDX', 'BRK-B', 'BBY', 'BIO', 'TECH', 'BIIB', 'BLK', 'BK', 'BA', 'BKNG', 'BWA', 'BXP', 'BSX', 'BMY', 'AVGO', 'BR', 'BRO', 'BF-B', 'CHRW', 'CDNS', 'CZR', 'CPB', 'COF', 'CAH', 'KMX', 'CCL', 'CARR', 'CTLT', 'CAT', 'CBOE', 'CBRE', 'CDW', 'CE', 'CNC', 'CNP', 'CDAY', 'CERN', 'CF', 'CRL', 'SCHW', 'CHTR', 'CVX', 'CMG', 'CB', 'CHD', 'CI', 'CINF', 'CTAS', 'CSCO', 'C', 'CFG', 'CTXS', 'CLX', 'CME', 'CMS', 'KO', 'CTSH', 'CL', 'CMCSA', 'CMA', 'CAG', 'COP', 'ED', 'STZ', 'COO', 'CPRT', 'GLW', 'CTVA', 'COST', 'CTRA', 'CCI', 'CSX', 'CMI', 'CVS', 'DHI', '

**Get user input on the list of ticker symbols for the stocks for which predictions are required**


In [36]:
# import data from yahoo finance using the get_data helper function created above

try:
    # request list of ticker symbols from user
    ticker_input = input("Enter list of ticker symbols:---->  ")
    tickers = format_tickers(ticker_input) # clean user entry to required format for yahoo finance using clean_tickers helper function

    print("\n")

    # get stock data from yahoo finance using get_data function above
    df1 = get_data(tickers)

    # print number of rows in dataset
    print("\nThere are {} tradings days worth of data\n".format(df.shape[0]))

    display(df1.head())

except:
    print("\nPlease enter valid ticker symbols. Check list of S&P 500 continuents above if you need ticker examples")


Enter list of ticker symbols:---->  tsla, jpm, c, brk-b


[*********************100%***********************]  4 of 4 completed

There are 15119 tradings days worth of data



Unnamed: 0_level_0,TSLA,TSLA,TSLA,TSLA,TSLA,TSLA,C,C,C,C,...,BRK-B,BRK-B,BRK-B,BRK-B,JPM,JPM,JPM,JPM,JPM,JPM
Unnamed: 0_level_1,Open,High,Low,Close,Adj Close,Volume,Open,High,Low,Close,...,Low,Close,Adj Close,Volume,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1977-01-03,,,,,,,16.133125,16.236876,16.133125,16.236876,...,,,,,,,,,,
1977-01-04,,,,,,,16.236876,16.28875,16.184999,16.28875,...,,,,,,,,,,
1977-01-05,,,,,,,16.28875,16.28875,16.133125,16.184999,...,,,,,,,,,,
1977-01-06,,,,,,,16.133125,16.133125,15.925625,16.081249,...,,,,,,,,,,
1977-01-07,,,,,,,16.081249,16.133125,15.925625,16.133125,...,,,,,,,,,,


**Get user input on the date range to be used for training and testing**


In [37]:
print("The stocks you have selected have data from {:%Y-%m-%d} to {:%Y-%m-%d}\n".format(df1.index[0], df1.index[-1]))

train_start = input("Enter the training start date in the same format (yyyy-mm-dd):---->  ")
train_end = input("Enter the training end date in the same formate (yyyy-mm-dd):---->  ") 
msg_try_again = "*****Please try again*****"

try:
    train_start = datetime.strptime(train_start, '%Y-%m-%d')
    train_end = datetime.strptime(train_end, '%Y-%m-%d')
    if df1.index[0] < train_start < df1.index[-1]:
        if df1.index[0] < train_end < df1.index[-1]:
            print("\nTraining and testing will be completed over the period from {:%Y-%m-%d} to {:%Y-%m-%d}".format(train_start, train_end))
        else:
            print("\nThe training end date provided in out of range")
            print(msg_try_again)
    else:
        print("\nThe training start date provided in out of range")
        print(msg_try_again)        
except:
    print("\nPlease enter dates in valid date format (yyyy-mm-dd)")
    print(msg_try_again)   


The stocks you have selected have data from 1977-01-03 to 2022-01-21

Enter the training start date in the same format (yyyy-mm-dd):---->  2010-01-01
Enter the training end date in the same formate (yyyy-mm-dd):---->  2019-12-31

Training and testing will be completed over the period from 2010-01-01 to 2019-12-31


**Get user input on the number of days into future they wish to make a prediction and create features using this forecast range**


In [38]:

forecast_range = input("Enter number of days into the future for which stock prediction is requested:---->  ")

try:
    forecast_range = int(forecast_range)
    
    max_train_date = df1.index[-1] - timedelta(days=forecast_range)
    
    if train_end <= max_train_date:
 
        tickers = df1.columns.get_level_values(0).unique()    
        df1_features = pd.DataFrame()    

        for ticker in tickers: 
            ticker_id = ticker
            adj_close = df1.columns.get_level_values(1)[4]
            df_adj_close = pd.DataFrame(df1[ticker][adj_close])
            df_temp = create_features(df_adj_close, int(forecast_range))
            df_temp['Ticker'] = ticker
            print("\n", ticker)
            display(df_temp.head(3))
            df1_features = pd.concat([df1_features, df_temp])
    else:
        max_delta = df1.index[-1] - train_end
        print("\nThe training end date does not allow for sufficient testing for forecast period of {} days".format(forecast_range))
        print("The maximum forecast range that can be selected is {} days".format(max_delta.days))
        print(msg_try_again)

except:
    print("Enter valid number of days for which stock prediction is required")
    print(msg_try_again)


Enter number of days into the future for which stock prediction is requested:---->  28

 TSLA


Unnamed: 0_level_0,Adj Close,forward_price,rolling_mean,sma_50,sma_100,rolling_std,exponential_rolling_mean,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-11-17,5.898,5.546,4.653786,4.42844,4.19758,0.728183,4.908231,TSLA
2010-11-18,5.978,5.3,4.722714,4.46516,4.20958,0.759393,4.982062,TSLA
2010-11-19,6.198,5.326,4.7995,4.50844,4.2239,0.796437,5.065977,TSLA



 C


Unnamed: 0_level_0,Adj Close,forward_price,rolling_mean,sma_50,sma_100,rolling_std,exponential_rolling_mean,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1977-05-24,7.980888,8.005678,7.932204,7.910004,7.83193,0.070582,7.944351,C
1977-05-25,8.005678,8.055243,7.929549,7.912483,7.834409,0.066069,7.948584,C
1977-05-26,7.980888,8.129599,7.932204,7.913474,7.836392,0.066602,7.950813,C



 BRK-B


Unnamed: 0_level_0,Adj Close,forward_price,rolling_mean,sma_50,sma_100,rolling_std,exponential_rolling_mean,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1996-09-30,21.459999,21.860001,21.31,21.1364,21.3232,0.288778,21.365822,BRK-B
1996-10-01,21.24,21.879999,21.311428,21.1444,21.3036,0.288312,21.357138,BRK-B
1996-10-02,21.42,21.92,21.315714,21.1648,21.2778,0.289027,21.361477,BRK-B



 JPM


Unnamed: 0_level_0,Adj Close,forward_price,rolling_mean,sma_50,sma_100,rolling_std,exponential_rolling_mean,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1980-08-06,1.384274,1.443539,1.401358,1.384443,1.314891,0.039417,1.399691,JPM
1980-08-07,1.40544,1.447772,1.403021,1.385628,1.317431,0.038531,1.400088,JPM
1980-08-08,1.396974,1.452006,1.404533,1.386983,1.319801,0.037376,1.399873,JPM


**Train model and get back results of the best model to use given the date range specified for training and the number of days into future that the user wishes to predict**

In [39]:
model_recommendation = train_test_evaluate(df1_features)

Training completed for TSLA



Unnamed: 0,R-Squared,MSE,RMSE,MAE,MAPE
Linear Regression,0.832848,16790.588463,129.578503,92.741111,0.208592
Kernel Ridge Regression,0.838163,16256.699016,127.501761,91.067087,0.205085
Linear Support Vector Regression,0.826094,17469.032483,132.170468,94.385095,0.211335


The best model for TSLA is the [1m Kernel Ridge Regression [1m[0m model


Training completed for C



Unnamed: 0,R-Squared,MSE,RMSE,MAE,MAPE
Linear Regression,0.740967,32.27067,5.680728,3.615112,0.072441
Kernel Ridge Regression,0.747162,31.498883,5.612387,3.660991,0.072571
Linear Support Vector Regression,0.739351,32.472021,5.698423,3.660066,0.073118


The best model for C is the [1m Kernel Ridge Regression [1m[0m model


Training completed for BRK-B



Unnamed: 0,R-Squared,MSE,RMSE,MAE,MAPE
Linear Regression,0.829876,226.176371,15.039161,11.604034,0.053562
Kernel Ridge Regression,0.870611,172.019731,13.115629,9.723025,0.045459
Linear Support Vector Regression,0.845537,205.356429,14.330263,11.024832,0.050993


The best model for BRK-B is the [1m Kernel Ridge Regression [1m[0m model


Training completed for JPM



Unnamed: 0,R-Squared,MSE,RMSE,MAE,MAPE
Linear Regression,0.947332,64.162542,8.010152,5.269022,0.05813
Kernel Ridge Regression,0.948586,62.634919,7.914223,5.161315,0.057112
Linear Support Vector Regression,0.951857,58.650404,7.658355,4.921889,0.055288


The best model for JPM is the [1m Linear Support Vector Regression [1m[0m model


Training and evaluation completed successfully for all tickers and all models!


In [40]:
# display the summary of models to be used to generate prediction

model_recommendation

{'TSLA': 'Kernel Ridge Regression',
 'C': 'Kernel Ridge Regression',
 'BRK-B': 'Kernel Ridge Regression',
 'JPM': 'Linear Support Vector Regression'}

**Get user to specifiy the date for which they require a prediction**

In [41]:
max_prediction_date = df1.index[-1] - timedelta(days=forecast_range)

print("Enter a date between {:%Y-%m-%d} and {:%Y-%m-%d} for which a prediction is required\n".format(train_end, max_prediction_date))

user_prediction_date = input("Enter date in the format yyyy-mm-dd:---->  ")

try:
    user_prediction_date = datetime.strptime(user_prediction_date, '%Y-%m-%d')
    if train_end < user_prediction_date < max_prediction_date:
        
        print("\nThank you! Proceed to the final step to generate your prediction")
    else:
        print("\nThe prediction date provided in out of range")
        print(msg_try_again)        
except:
    print("\nPlease enter a valid date in the format (yyyy-mm-dd)")
    print(msg_try_again)   


Enter a date between 2019-12-31 and 2021-12-24 for which a prediction is required

Enter date in the format yyyy-mm-dd:---->  2020-06-30

Thank you! Proceed to the final step to generate your prediction


**Finally, generate and present prediction for date specified by user**

In [42]:
# X_input_date is the reference date for which predictor values should be retrieved
X_input_date = user_prediction_date - timedelta(days=forecast_range)
X_input_date = X_input_date.strftime('%Y-%m-%d')

# check if the date n-days before prediction date was a day
# if not then check the previous day until a valid trading day is found
# 4 used as the maximum continuous period for scenarios with 2 public holidays on either sides of a weekend 

rng_count = forecast_range # counter to move back one day if calculated prediction date was not a trading day

# attempt to get X/predictor values using X_input_date
# if successful then row found in dataset hence break loop
for i in range(5):
    
    X_input = df1_features.loc[X_input_date, :]
    
    if len(X_input) == 0:
        # if attempt to get X/predictor values from X_input_date fails
        # then set X_input_date to the previous day
        # until a valid trading day is found and X_input_date is set to that date in else statement
        X_input_date = user_prediction_date - timedelta(days=rng_count)
        X_input_date = X_input_date.strftime('%Y-%m-%d')
    else:
        prediction_date = X_input_date
        break
        
    rng_count = rng_count + 1   

# initial dictionary to store prediction results    
predicted_prices = dict()

# loop through each ticker and generate prediction
# also calculate the error and the percentage error (absolute)

for ticker in tickers:
    # Get the prediction model to be used
    prediction_model = models[model_recommendation[ticker]]
    
    # the data for the date to be predicted
    data = pd.DataFrame(df1_features[df1_features['Ticker'] == ticker].loc[prediction_date, :]).T
    X_input = np.array(data.drop(['forward_price', 'Ticker'], axis=1))
    actual = np.array(data['forward_price'])
    
    ### ******generate predictions and store results in predicted_prices dict****** ###
    prediction = prediction_model.predict(X_input)
    error = actual[0] - prediction[0]
    perct_error = abs(error)/actual[0]
    predicted_prices[ticker] = (prediction[0], actual[0], error, perct_error)
    
# display results as a dataframe    
columns = ['Actual Price', 'Predicted Price', 'Error', 'Percent Error']
df_predictions = pd.DataFrame.from_dict(predicted_prices, orient='index', columns=columns)     
df_predictions

Unnamed: 0,Actual Price,Predicted Price,Error,Percent Error
TSLA,169.335745,299.411987,130.076242,0.434439
C,51.473197,49.531532,-1.941664,0.039201
BRK-B,193.351116,184.130005,-9.221111,0.050079
JPM,98.829831,93.819687,-5.010144,0.053402
