In [1]:
import nbimporter
import numpy as np
import pandas as pd
import preprocessing as pp
from sklearn.metrics import r2_score

Importing Jupyter notebook from preprocessing.ipynb


In [2]:
# This function calculates and returns the Mean Squared Error (MSE) for a model
def mean_squared_error(Y_true, Y_pred):
    """Return the mean squared error between true and predicted values.

    Both inputs are converted to numpy arrays first, so elements are paired
    by POSITION. This matters for pandas Series whose index is not 0..M-1
    (e.g. the tail of a frame after a train/test split): label-based
    `Y_true[i]` would raise KeyError or pick the wrong rows.

    Parameters
    ----------
    Y_true : array-like
        Observed target values (list, numpy array or pandas Series).
    Y_pred : array-like
        Predicted values; must have the same shape as `Y_true`.

    Returns
    -------
    float (or numpy array for 2-D input)
        Squared error averaged over the first axis (the samples).

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    y_true = np.asarray(Y_true, dtype=float)
    y_pred = np.asarray(Y_pred, dtype=float)

    # Refuse mismatched shapes explicitly; otherwise numpy broadcasting could
    # silently turn e.g. (M,) vs (M, 1) into an (M, M) error matrix.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            'Y_true and Y_pred must have the same shape, got '
            + str(y_true.shape) + ' and ' + str(y_pred.shape)
        )
    if y_true.size == 0:
        raise ValueError('cannot compute the MSE of empty inputs')

    # Average over the sample axis, as the original per-sample loop did.
    return np.mean((y_true - y_pred) ** 2, axis=0)

In [3]:
def train_test_split(df, train_ratio = 0.75):
    """Chronologically split a frame into train/test features and targets.

    The first `train_ratio` fraction of the rows (in their existing order,
    no shuffling) becomes the training set and the remainder the test set.
    The last column of `df` is treated as the target (Y); every other
    column is a feature (X).

    Returns the tuple (X_train, X_test, Y_train, Y_test).
    """
    # Row position at which the training portion ends.
    cut = int(train_ratio * len(df.index))

    # Separate the columns first, then slice the rows of each part.
    features = df.iloc[:, :-1]
    target = df.iloc[:, -1]

    return (
        features.iloc[:cut],
        features.iloc[cut:],
        target.iloc[:cut],
        target.iloc[cut:],
    )

In [4]:
def get_X_Y(data):
    """Separate each row of `data` into its features and its output value.

    The output (Y) is taken to be the last element of every row; all the
    elements before it are the features (X).

    Returns the pair (X_data, Y_data), both plain lists with one entry per
    row of `data`.
    """
    # Materialize once so `data` is only iterated a single time.
    rows = list(data)
    X_data = [row[:-1] for row in rows]
    Y_data = [row[-1] for row in rows]
    return X_data, Y_data

In [5]:
def prepare_data(num_features, symbol_name = 'AAPL', train_ratio = 0.75):
    """Build a lagged-returns dataset for one stock and split it into train/test sets.

    The frame for `symbol_name` is obtained through `pp.read_file` and passed
    through `pp.data_daily_returns`; the result is expected to contain a
    'RETURNS' column (presumably the daily returns -- both helpers live in
    the preprocessing notebook, confirm there).

    For every data point from position `num_features` onwards, one row is
    built whose columns are named str(num_features), ..., '1', '0'. Column
    'j' holds the RETURNS value j rows earlier, so column '0' (the last
    column) is the value at the row itself and serves as the target, while
    the preceding columns are the `num_features` previous returns.

    Parameters
    ----------
    num_features : int
        Number of previous returns used as features for each data point.
    symbol_name : str
        Symbol of the stock to load (default 'AAPL').
    train_ratio : float
        Fraction of the rows, taken from the start, used for training
        (default 0.75).

    Returns
    -------
    X_train, X_test, Y_train, Y_test
        As returned by `train_test_split` on the lagged frame.

    Raises
    ------
    ValueError
        If `num_features` is negative or larger than the number of rows.
    """
    df = pp.read_file(symbol_name)
    df = pp.data_daily_returns(df)

    # number of datapoints in the data frame
    M = len(df.index)

    if num_features < 0 or num_features > M:
        raise ValueError(
            'num_features must be between 0 and the number of datapoints ('
            + str(M) + '), got ' + str(num_features)
        )

    # Positional view of the returns; avoids label lookups, which misbehave
    # if the frame's index contains duplicate labels.
    returns = df['RETURNS'].values

    # Column str(num_features - i) is the RETURNS series shifted so that each
    # row sees the value (num_features - i) steps before it.
    lagged = {
        str(num_features - i): returns[i:(M - num_features + i)]
        for i in range(num_features + 1)
    }

    # Rows are labelled by the datapoint whose return sits in column '0'.
    data = pd.DataFrame(lagged, index=df.index[num_features:])

    return train_test_split(data, train_ratio)

In [6]:
# Number of previous returns used as features for each data point.
num_features = 5
# Build the lagged-returns dataset (prepare_data defaults to the 'AAPL' symbol)
# and split it into train and test sets.
X_train, X_test, Y_train, Y_test = prepare_data(num_features)

### Cross Validation
Reference for the time-series cross-validation approach used below: http://francescopochetti.com/pythonic-cross-validation-time-series-pandas-scikit-learn/

In [7]:
def timeSeriesCV(X_train, Y_train, num_folds, algorithm, parameters):
    """Forward-chaining cross-validation for time-ordered data.

    The data is cut into `num_folds` consecutive chunks of equal size
    k = len(X_train) // num_folds (trailing samples that do not fill a whole
    chunk are ignored). For i = 2 .. num_folds the model is trained on the
    first i - 1 chunks and evaluated on chunk i, so training data always
    precedes test data.

    Parameters
    ----------
    X_train, Y_train : sliceable sequences (numpy array, DataFrame, Series)
        Features and targets, in chronological order.
    num_folds : int
        Number of chunks; must be at least 2.
    algorithm : callable
        Called as algorithm(X_fit, Y_fit, X_eval, parameters[0]) and must
        return the predictions for X_eval.
    parameters : sequence
        Printed in full, but only parameters[0] is forwarded to `algorithm`.
        NOTE(review): confirm that passing just the first entry is intended.

    Returns
    -------
    float
        Mean of the R^2 scores over the num_folds - 1 evaluations.

    Raises
    ------
    ValueError
        If `num_folds` < 2 or there are fewer samples than folds.
    """
    if num_folds < 2:
        raise ValueError('num_folds must be at least 2, got ' + str(num_folds))

    # Size of each chunk; integer division keeps all split points exact.
    k = len(X_train) // num_folds
    if k == 0:
        raise ValueError(
            'need at least num_folds samples: got ' + str(len(X_train))
            + ' samples for ' + str(num_folds) + ' folds'
        )

    print('Parameters ------------------------>', parameters)

    accuracies = np.zeros(num_folds - 1)

    for i in range(2, num_folds + 1):
        # Train on chunks 1 .. i-1, test on chunk i. Computing the boundary
        # as k * (i - 1) avoids the float round-trip floor(k*i * (i-1)/i),
        # which can land one sample short of the chunk boundary.
        index = k * (i - 1)
        end = k * i

        X_trainFolds = X_train[:index]
        Y_trainFolds = Y_train[:index]

        X_testFolds = X_train[index:end]
        Y_testFolds = Y_train[index:end]

        Y_pred = algorithm(X_trainFolds, Y_trainFolds, X_testFolds, parameters[0])

        accuracies[i - 2] = r2_score(Y_testFolds, Y_pred)

    return accuracies.mean()