In [1]:
import nbimporter
import numpy as np
import pandas as pd
import preprocessing as pp
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score, accuracy_score

Importing Jupyter notebook from preprocessing.ipynb


In [2]:
# This function calculates and returns the Mean Squared Error (MSE) for a model
def mean_squared_error(Y_true, Y_pred):
    """
    INPUT: Y_true - observed values (list, NumPy array or pandas Series)
           Y_pred - predicted values, same shape as Y_true

    Both inputs are converted to float NumPy arrays before anything else, so
    elements are always paired by POSITION. This matters for pandas Series:
    indexing a Series with Y_true[i] is label based, which fails (or pairs the
    wrong elements) whenever the index does not start at 0, e.g. the test part
    of a chronological split.

    RETURN: the mean of the squared differences, averaged over the first axis
            (a scalar for 1-D inputs)

    RAISES: ValueError if the inputs are empty or their shapes differ
    """
    y_true = np.asarray(Y_true, dtype = float)
    y_pred = np.asarray(Y_pred, dtype = float)

    # number of test samples
    M = len(y_true)
    if M == 0:
        raise ValueError('mean_squared_error needs at least one sample')
    if y_true.shape != y_pred.shape:
        # an explicit check prevents NumPy from silently broadcasting mismatched inputs
        raise ValueError('Y_true and Y_pred must have the same shape, got %s and %s'
                         % (y_true.shape, y_pred.shape))

    return np.mean((y_true - y_pred) ** 2, axis = 0)

In [3]:
# Chronological hold-out split: the first `train_ratio` share of the rows becomes the
# training set and the remaining rows become the test set (no shuffling).
def train_test_split(df, train_ratio = 0.75):
    """
    INPUT: df - DataFrame whose last column holds the output values
           train_ratio - fraction of rows (taken from the top) used for training

    RETURN: X_train, X_test, Y_train, Y_test, where the X parts contain every
            column except the last and the Y parts contain only the last column
    """
    cut = int(train_ratio * len(df.index))
    parts = (df.iloc[:cut, :], df.iloc[cut:, :])

    # for each part: (all columns but the last, the last column)
    (X_train, Y_train), (X_test, Y_test) = [
        (part.iloc[:, :-1], part.iloc[:, -1]) for part in parts
    ]

    return X_train, X_test, Y_train, Y_test

In [4]:
# Separates every row of the input into its features (all elements but the last)
# and its output (the last element).
def get_X_Y(data):
    """
    INPUT: data - iterable of indexable rows; the output value is taken from the
           last position of each row

    RETURN: (X_data, Y_data) as two lists, in the same order as the input rows
    """
    # walk the input exactly once, so one-shot iterables are handled too
    split_rows = [(row[:-1], row[-1]) for row in data]
    X_data = [features for features, _ in split_rows]
    Y_data = [target for _, target in split_rows]
    return X_data, Y_data

In [5]:
def output_to_binary_indicators(df):
    """
    INPUT: df - DataFrame whose last column holds the output values

    Replaces every value in the last column with 1 if it is strictly greater
    than 0 and with 0 otherwise (NaN compares as not greater, so it becomes 0).
    The comparison is done on the whole column at once instead of one cell at a
    time. The column keeps its original dtype, and the frame is modified in
    place, as before.

    RETURN: the same DataFrame object that was passed in
    """
    if df.empty:
        # nothing to convert
        return df

    last_column = df.iloc[:, -1]
    indicators = np.where(last_column.values > 0, 1, 0).astype(last_column.dtype)
    df.iloc[:, -1] = indicators
    return df

In [6]:
def prepare_data(num_features, symbol_name = 'AAPL', is_binary_ouput = True, train_ratio = 0.75):
    """
    INPUT: num_features - number of previous returns used as features for each datapoint
           symbol_name - symbol of the stock to study, passed to pp.read_file
           is_binary_ouput - if True, the output column is converted to 1 (positive
                             return) / 0 (otherwise); the spelling of this parameter
                             is kept for backward compatibility
           train_ratio - fraction of the rows, taken from the start, used for training

    It reads the data for the symbol through pp.read_file and passes it through
    pp.data_daily_returns (presumably this computes the daily returns; the result
    must contain a 'RETURNS' column). Each datapoint at time t0 then gets the
    returns at t0 - num_features, ..., t0 - 1 as features and the return at t0 as
    output, in columns named str(num_features), ..., '1', '0'. The first
    num_features rows have no full history and are dropped.

    RETURN: X_train, X_test, Y_train, Y_test as produced by train_test_split

    RAISES: ValueError if num_features is negative or larger than the number of
            available datapoints
    """
    if num_features < 0:
        raise ValueError('num_features must be non-negative, got %s' % num_features)

    df = pp.read_file(symbol_name)
    df = pp.data_daily_returns(df)

    # number of datapoints in the data frame
    M = len(df.index)
    if num_features > M:
        raise ValueError('num_features (%s) exceeds the number of datapoints (%s)'
                         % (num_features, M))

    # columns for t = (t0 - n) return to t = t0 return
    col_names = [str(lag) for lag in range(num_features, -1, -1)]

    # positional slices of the returns: column str(num_features - i) holds the
    # returns shifted so that row t0 sees the value at t0 - (num_features - i)
    returns = df['RETURNS'].values
    num_rows = M - num_features
    lagged = {str(num_features - i): returns[i:(num_rows + i)]
              for i in range(num_features + 1)}

    # dataframe of the RETURNS data points with their corresponding features
    data = pd.DataFrame(lagged, index = df.index[num_features:], columns = col_names)

    if is_binary_ouput:
        data = output_to_binary_indicators(data)

    return train_test_split(data, train_ratio)

In [7]:
# Build the dataset with 5 lagged returns as features and keep only the underlying
# NumPy arrays of each split
X_train, X_test, Y_train, Y_test = (part.values for part in prepare_data(5))

### Sklearn TimeSeriesSplit
http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html

In [8]:
def rolling_cross_validation(X, Y, num_splits, algorithm, parameters, is_classification = True):
    """
    INPUT: X - feature rows in chronological order
           Y - output values aligned with X
           num_splits - number of splits for sklearn's TimeSeriesSplit; a value
                        <= 0 or >= len(X) is replaced by len(X) - 1
           algorithm - callable invoked as
                       algorithm(X_train, Y_train, X_test, parameters[0]),
                       expected to return the predictions for X_test
           parameters - indexable; only its first element is forwarded to algorithm
           is_classification - True scores each fold with accuracy_score,
                               False scores it with r2_score

    X and Y are converted to NumPy arrays first so that the index arrays produced
    by TimeSeriesSplit always select ROWS. Indexing a DataFrame with X[train]
    would look up columns instead, and a plain list cannot be indexed by an array.

    RETURN: the mean score over all splits (it is also printed)
    """
    print('Inbuilt Rolling Cross Validation')
    X = np.asarray(X)
    Y = np.asarray(Y)

    if num_splits <= 0 or num_splits >= len(X):
        num_splits = len(X) - 1
    print('Parameters ------------------------>', parameters)

    score = accuracy_score if is_classification else r2_score
    scores = []
    tscv = TimeSeriesSplit(n_splits = num_splits)

    for train, test in tscv.split(X):
        Y_pred = algorithm(X[train], Y[train], X[test], parameters[0])
        scores.append(score(Y[test], Y_pred))

    mean_accuracy = np.array(scores).mean()
    print('Accuracy:', mean_accuracy)
    return mean_accuracy

### Cross Validation on a Rolling Basis - Implementation

We train our model on a small segment of the time series from the beginning until some $t$, make predictions for the next $n$ steps (from $t$ to $t+n$), and calculate an error. Then we expand our training sample up to $t+n$, make predictions from $t+n$ until $t+2n$, and continue moving the test segment along the time series until we hit the last available observation. As a result, we have as many folds as $n$ will fit between the initial training sample and the last observation.  

http://francescopochetti.com/pythonic-cross-validation-time-series-pandas-scikit-learn/

In [9]:
def timeSeriesCV(X_train, Y_train, num_splits, algorithm, parameters, is_classification = True):
    """
    INPUT: X_train - feature rows in chronological order (sliceable, e.g. NumPy array)
           Y_train - output values aligned with X_train
           num_splits - number of equally sized chunks the data is cut into; a
                        value < 2 or >= len(X_train) is replaced by
                        min(10, len(X_train))
           algorithm - callable invoked as
                       algorithm(X_trainFolds, Y_trainFolds, X_testFolds, parameters[0]),
                       expected to return the predictions for X_testFolds
           parameters - indexable; only its first element is forwarded to algorithm
           is_classification - True scores each fold with accuracy_score,
                               False scores it with r2_score

    With k = len(X_train) // num_splits samples per chunk, fold i
    (i = 2 .. num_splits) trains on the first i - 1 chunks and tests on chunk i,
    which gives num_splits - 1 scores. Samples beyond k * num_splits are not used.

    RETURN: the mean score over all folds (it is also printed)

    RAISES: ValueError if fewer than 2 samples are supplied
    """
    print('Implemented Rolling Cross Validation')
    num_samples = len(X_train)
    if num_samples < 2:
        raise ValueError('timeSeriesCV needs at least 2 samples, got %d' % num_samples)

    # A single split would produce no fold at all (mean of an empty array), and the
    # default of 10 must not exceed the sample count or every chunk would be empty.
    if num_splits < 2 or num_splits >= num_samples:
        num_splits = min(10, num_samples)

    print('Parameters ------------------------>', parameters)

    # size of each chunk
    k = num_samples // num_splits

    accuracies = np.zeros(num_splits - 1)
    score = accuracy_score if is_classification else r2_score

    for i in range(2, num_splits + 1):
        # Boundaries in integer arithmetic: flooring (k * i) * float(i - 1) / i is not
        # guaranteed to give exactly k * (i - 1) under floating-point rounding.
        train_end = k * (i - 1)
        test_end = k * i

        X_trainFolds = X_train[:train_end]
        Y_trainFolds = Y_train[:train_end]

        X_testFolds = X_train[train_end:test_end]
        Y_testFolds = Y_train[train_end:test_end]

        Y_pred = algorithm(X_trainFolds, Y_trainFolds, X_testFolds, parameters[0])
        accuracies[i - 2] = score(Y_testFolds, Y_pred)

    mean_accuracy = accuracies.mean()
    print('Accuracy:', mean_accuracy)
    return mean_accuracy