In [1]:
import nbimporter
import numpy as np
import pandas as pd
import preprocessing as pp
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score, accuracy_score, confusion_matrix, classification_report, matthews_corrcoef, cohen_kappa_score

Importing Jupyter notebook from preprocessing.ipynb


In [2]:
def mean_squared_error(Y_true, Y_pred):
    """Calculate and return the Mean Squared Error (MSE) of a set of predictions.

    Parameters
    ----------
    Y_true : array-like
        Ground-truth values (list, NumPy array, pandas Series, ...).
    Y_pred : array-like
        Predicted values, one per element of ``Y_true``.

    Returns
    -------
    float
        Sum of the squared differences divided by the number of samples.
        For multi-dimensional inputs the result is computed row by row and
        therefore has the shape of a single row.

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same number of samples.
    """
    # np.asarray drops any pandas index, so samples are always paired by
    # position. Indexing a Series with Y_true[i] is label-based and breaks
    # (KeyError / wrong pairing) when the index does not start at 0, which is
    # exactly what a test split taken from the tail of a frame looks like.
    y_true = np.asarray(Y_true)
    y_pred = np.asarray(Y_pred)

    # number of test samples
    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError('mean_squared_error requires at least one sample')
    if len(y_pred) != n_samples:
        raise ValueError(
            'Y_true and Y_pred must have the same length, got %d and %d'
            % (n_samples, len(y_pred))
        )

    total_error = sum((truth - pred) ** 2 for truth, pred in zip(y_true, y_pred))
    return total_error / n_samples

https://clevertap.com/blog/the-best-metric-to-measure-accuracy-of-classification-models/

In [1]:
def accuracy_metrics(Y_test, Y_pred):
    """Print a set of classification metrics for the given predictions.

    The following are printed, in this order: accuracy, Matthews correlation
    coefficient, Cohen's kappa score, the confusion matrix and the
    classification report (all computed with the scikit-learn functions
    imported at the top of the notebook).

    Parameters
    ----------
    Y_test : array-like
        True class labels.
    Y_pred : array-like
        Predicted class labels.

    Returns
    -------
    None
        The metrics are only printed.
    """
    print('Accuracy: ', accuracy_score(Y_test, Y_pred))
    print('Matthews Correlation Coefficient: ', matthews_corrcoef(Y_test, Y_pred))
    print('Cohen Kappa Score: ', cohen_kappa_score(Y_test, Y_pred))
    # Heading previously read 'Confustion Matrix' (typo in the printed report)
    print('Confusion Matrix')
    print(confusion_matrix(Y_test, Y_pred))
    print('Classification Report')
    print(classification_report(Y_test, Y_pred))

In [4]:
def train_test_split(df, train_ratio = 0.75):
    """Split a data frame chronologically into train and test features/outputs.

    The first ``int(train_ratio * number_of_rows)`` rows form the train set and
    the remaining rows form the test set (no shuffling). The last column of
    ``df`` is treated as the output (Y); every other column is a feature (X).

    Parameters
    ----------
    df : pandas.DataFrame
        Data with the output values in the last column.
    train_ratio : float, optional
        Fraction of the rows assigned to the train set (default 0.75).

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``; the X parts are DataFrames and
        the Y parts are Series.
    """
    n_train_rows = int(train_ratio * len(df.index))

    # Separate features from the output column first, then cut both by row
    features = df.iloc[:, :-1]
    output = df.iloc[:, -1]

    X_train, X_test = features.iloc[:n_train_rows], features.iloc[n_train_rows:]
    Y_train, Y_test = output.iloc[:n_train_rows], output.iloc[n_train_rows:]

    return X_train, X_test, Y_train, Y_test

In [5]:
def get_X_Y(data):
    """Split rows into their X (features) and Y (output) parts.

    The output value of every row is assumed to be its last element; all the
    preceding elements are the features.

    Parameters
    ----------
    data : iterable of sequences
        Rows to split.

    Returns
    -------
    tuple of (list, list)
        ``(X_data, Y_data)`` where ``X_data[j]`` is row ``j`` without its last
        element and ``Y_data[j]`` is the last element of row ``j``.
    """
    # A single pass over `data` keeps one-shot iterables (generators) working
    split_rows = [(row[:-1], row[-1]) for row in data]
    X_data = [features for features, _ in split_rows]
    Y_data = [output for _, output in split_rows]
    return X_data, Y_data

In [6]:
def output_to_binary_indicators(df):
    """Convert the output (last) column to binary class labels for classification.

    Values ``>= 0`` become ``1`` (positive return) and every other value,
    including NaN, becomes ``-1`` (negative return).

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose last column holds the return values. The frame is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its last column replaced by the labels.
    """
    # Nothing to convert (and no last column to address) in an empty frame
    if df.empty:
        return df

    # One vectorized comparison instead of a per-row scalar .iloc write
    returns = df.iloc[:, -1]
    df.iloc[:, -1] = np.where(returns >= 0, 1, -1)
    return df

In [7]:
def add_lag_returns(df, data, num_lags):
    """Add the returns of the past ``num_lags`` rows as lagged feature columns.

    For every row at position ``p >= num_lags`` of ``df`` the column
    ``'Lag-k'`` (k = num_lags, ..., 1) holds the ``'RETURNS'`` value found at
    position ``p - k``. The first ``num_lags`` rows of ``df`` have no complete
    history and therefore get no entry.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data containing a ``'RETURNS'`` column.
    data : pandas.DataFrame
        Frame that receives the new columns (modified in place).
    num_lags : int
        Number of lagged returns to add.

    Returns
    -------
    pandas.DataFrame
        ``data`` with the columns ``'Lag-<num_lags>' ... 'Lag-1'`` added,
        indexed by ``df.index[num_lags:]``.
    """
    # number of datapoints in the data frame
    n_rows = len(df.index)

    # every lagged series is re-labelled with the index of the rows it describes
    target_index = df.index[num_lags:]
    returns = df['RETURNS']

    for offset in range(num_lags):
        # Window of returns at t = t0 - (num_lags - offset). It is sliced by
        # position: looking the window up again by label (df.loc) yields extra
        # rows whenever the index contains duplicate labels.
        lagged = returns.iloc[offset:(n_rows - num_lags + offset)].copy()
        lagged.index = target_index
        data['Lag-' + str(num_lags - offset)] = lagged
    return data

In [8]:
def add_moving_average_returns(df, data, num_days):
    """Add moving averages of the previous 2 .. ``num_days`` rows as features.

    For every window size ``w`` in ``2 .. num_days`` the column ``'MovAvg-w'``
    of ``data`` receives, for each row from position ``w`` onwards, the mean of
    the ``w`` rows that precede it (the current row is excluded by shifting
    the rolling mean down by one row).

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. The rolling mean of the whole frame is assigned to a
        single column, so this presumably expects a frame with exactly one
        (returns) column -- TODO confirm against the caller.
    data : pandas.DataFrame
        Frame that receives the new columns (modified in place).
    num_days : int
        Largest window size to compute.

    Returns
    -------
    pandas.DataFrame
        ``data`` with the columns ``'MovAvg-2' ... 'MovAvg-<num_days>'`` added.
    """
    for window in range(2, num_days + 1):
        # mean of the current and the (window - 1) preceding rows ...
        rolling_mean = df.rolling(window).mean()
        # ... moved down one row so that each row only sees the past `window` rows
        past_mean = rolling_mean.shift(1)
        # the first `window` rows have no complete history and are skipped
        data['MovAvg-' + str(window)] = past_mean.iloc[window:]
    return data

In [13]:
def prepare_data(num_features, symbol_name = 'AAPL', is_binary_ouput = True, train_ratio = 0.75):
    """
    Prepare the train and test sets for the stock identified by ``symbol_name``.

    Steps performed:
      1. read the stock data with ``pp.read_file`` and compute the daily returns
         of the 'ADJ_CLOSE' column with ``pp.data_daily_returns``;
      2. take the residual component (``.resid``) of the additive decomposition of
         'RETURNS' returned by ``pp.series_decomposition``;
      3. add the ``num_features`` previous returns of every datapoint as lagged
         features ('Lag-n' ... 'Lag-1') and the return of the day itself as the
         output column 'RETURNS';
      4. if a binary output is wanted (stock going up or down), convert returns
         >= 0 to 1 and all other returns to -1;
      5. split the resulting data frame chronologically into train and test sets.

    INPUT:
      num_features    -- number of lagged returns used as features
      symbol_name     -- symbol of the stock to study (default 'AAPL')
      is_binary_ouput -- when True the output column is converted to 1 / -1 labels
                         (the parameter keeps its original spelling so existing
                         keyword callers continue to work)
      train_ratio     -- fraction of the rows used for training (default 0.75)

    RETURN: X_train, X_test, Y_train, Y_test as returned by ``train_test_split``
    """

    df = pp.read_file(symbol_name)
    df = pp.data_daily_returns(df, 'ADJ_CLOSE')

    # Removing seasonality: only the residual of the decomposition is kept
    # NOTE(review): the code below reads a 'RETURNS' column from this frame, so the
    # residual series is presumably named 'RETURNS' -- confirm against
    # pp.series_decomposition. If the residual contains NaN values they end up in
    # the lag features and are labelled -1 by the binary conversion -- confirm.
    residue = pp.series_decomposition(df, 'additive', 'RETURNS').resid
    df = residue.to_frame()

    # creating dataframe to store the LAGGED RETURNS data points
    data = pd.DataFrame()

    # Moving-average features can be added in the same way with
    # add_moving_average_returns(df, data, num_features); currently disabled.
    data = add_lag_returns(df, data, num_features)

    # Adding the True Return Value of the day to the Dataframe
    data['RETURNS'] = df.loc[df.index[num_features:], 'RETURNS']

    if is_binary_ouput:
        data = output_to_binary_indicators(data)

    return train_test_split(data, train_ratio)

In [10]:
# Build the data set with 5 lagged returns as features and keep only the
# underlying NumPy arrays of the four train/test parts
X_train, X_test, Y_train, Y_test = (part.values for part in prepare_data(5))

<class 'pandas.core.frame.DataFrame'>


### Sklearn TimeSeriesSplit
http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html

In [11]:
def rolling_cross_validation(X, Y, num_splits, algorithm, parameters, is_classification = True):
    """Cross-validate ``algorithm`` on the folds produced by sklearn's TimeSeriesSplit.

    Parameters
    ----------
    X, Y : indexable by an array of integer positions (e.g. NumPy arrays)
        Features and outputs.
    num_splits : int
        Number of splits; a value that is not in ``1 .. len(X) - 1`` is replaced
        by ``len(X) - 1``.
    algorithm : callable
        Called as ``algorithm(X_train, Y_train, X_test, parameters[0])``; must
        return the predictions for ``X_test``.
    parameters : sequence
        Printed as a whole; only its first element is passed to ``algorithm``.
    is_classification : bool, optional
        Score every fold with ``accuracy_score`` when True, with ``r2_score``
        otherwise.

    Returns
    -------
    float
        Mean score over all folds (printed with the label 'Accuracy:' in both
        modes).
    """
    print('Inbuilt Rolling Cross Validation')
    if num_splits >= len(X) or num_splits <= 0:
        num_splits = len(X) - 1
    print('Parameters ------------------------>', parameters)

    # The metric only depends on the kind of problem, so pick it once
    score_fold = accuracy_score if is_classification else r2_score
    splitter = TimeSeriesSplit(n_splits = num_splits)

    fold_scores = []
    for train_idx, test_idx in splitter.split(X):
        X_fit, Y_fit = X[train_idx], Y[train_idx]
        X_eval, Y_eval = X[test_idx], Y[test_idx]

        predictions = algorithm(X_fit, Y_fit, X_eval, parameters[0])
        fold_scores.append(score_fold(Y_eval, predictions))

    mean_accuracy = np.mean(fold_scores)
    print('Accuracy:', mean_accuracy)
    return mean_accuracy

### Cross Validation on a Rolling Basis - Implementation

We train our model on an initial segment of the time series, from the beginning until some time $t$, make predictions for the next $n$ steps (from $t$ to $t+n$), and calculate the error. Then we expand the training sample up to $t+n$, make predictions from $t+n$ until $t+2n$, and keep moving the test segment forward until we reach the last available observation. As a result, we get as many folds as there are segments of length $n$ that fit between the initial training sample and the last observation.  

http://francescopochetti.com/pythonic-cross-validation-time-series-pandas-scikit-learn/
https://stats.stackexchange.com/a/268847
https://robjhyndman.com/hyndsight/tscv/

In [12]:
def timeSeriesCV(X_train, Y_train, num_splits, algorithm, is_classification = True):
    """Cross-validate ``algorithm`` on a rolling (expanding window) basis.

    The data is cut into ``num_splits`` chunks of equal size (rows left over
    after the integer division are never used). In round ``i``
    (i = 2 .. num_splits) the model is fitted on the first ``i - 1`` chunks and
    scored on chunk ``i``, so the first chunk is only ever used for training.

    Parameters
    ----------
    X_train : array with a ``shape`` attribute, sliceable by position
        Features.
    Y_train : sliceable by position
        Outputs.
    num_splits : int
        Number of chunks; a value that is not in ``1 .. len(X_train) - 1`` is
        replaced by 10.
    algorithm : object with ``fit(X, Y)`` and ``predict(X)`` methods
        Estimator that is re-fitted in every round.
    is_classification : bool, optional
        Score every round with ``accuracy_score`` when True, with ``r2_score``
        otherwise.

    Returns
    -------
    float
        Mean score over the ``num_splits - 1`` rounds (printed with the label
        'Accuracy:' in both modes).
    """
    print('Implemented Rolling Cross Validation')
    if num_splits <= 0 or num_splits >= len(X_train):
        num_splits = 10

    # Size of each chunk: number of rows divided by the number of chunks, floored
    chunk_size = X_train.shape[0] // num_splits

    # One score per round; there are (num_splits - 1) rounds because the first
    # chunk is always used for training and never tested
    scores = np.zeros(num_splits - 1)
    score_round = accuracy_score if is_classification else r2_score

    for n_chunks in range(2, num_splits + 1):
        # The first (n_chunks - 1) chunks train the model, chunk n_chunks tests it
        train_end = chunk_size * (n_chunks - 1)
        test_end = chunk_size * n_chunks

        # Chunks used to train the model
        X_fit, Y_fit = X_train[:train_end], Y_train[:train_end]

        # Chunk used to test the model
        X_eval, Y_eval = X_train[train_end:test_end], Y_train[train_end:test_end]

        algorithm.fit(X_fit, Y_fit)
        predictions = algorithm.predict(X_eval)

        scores[n_chunks - 2] = score_round(Y_eval, predictions)

    mean_accuracy = scores.mean()
    print('Accuracy:', mean_accuracy, '\n')
    return mean_accuracy