In [1]:
import numpy as np
import pandas as pd
import math

In [2]:
def mean_squared_error(Y_true, Y_pred):
    """Calculate and return the Mean Squared Error (MSE) of a set of predictions.

    The two sequences are paired by position, not by label, so a pandas
    Series with a non-default index (for example a date index) is handled
    the same way as a plain list or a numpy array.

    Parameters
    ----------
    Y_true : sized iterable of numbers
        Observed target values.
    Y_pred : sized iterable of numbers
        Predicted values, in the same order as ``Y_true``.

    Returns
    -------
    The sum of squared differences divided by the number of samples.

    Raises
    ------
    ValueError
        If ``Y_true`` and ``Y_pred`` have different lengths.
    ZeroDivisionError
        If the inputs are empty.
    """
    # number of test samples
    M = len(Y_true)
    if len(Y_pred) != M:
        raise ValueError(
            'Y_true and Y_pred must have the same length, got %d and %d'
            % (M, len(Y_pred))
        )

    total_error = 0
    # zip() walks both inputs positionally; indexing with Y_true[i] would do a
    # label lookup on a pandas Series and fail on a date-indexed test set.
    for actual, predicted in zip(Y_true, Y_pred):
        total_error += (actual - predicted) ** 2
    return total_error / M

In [3]:
def train_test_split(df, train_ratio):
    """Split a dataframe into train/test features and targets, preserving row order.

    The first ``int(train_ratio * number_of_rows)`` rows form the training
    set and the remaining rows form the test set. The last column of ``df``
    is treated as the output (Y); every other column is a feature (X).

    Returns the tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    cutoff = int(train_ratio * len(df.index))

    # Separate the columns first, then cut both pieces at the same row.
    features = df.iloc[:, :-1]
    target = df.iloc[:, -1]

    return (
        features.iloc[:cutoff],
        features.iloc[cutoff:],
        target.iloc[:cutoff],
        target.iloc[cutoff:],
    )

In [4]:
def get_X_Y(data):
    """Separate each row of ``data`` into its features (X) and its output (Y).

    The output value is taken from the last position of every row; everything
    before it is the feature part. Returns ``(X_data, Y_data)`` as two lists
    in the same order as the input rows.
    """
    # Single pass over `data` so one-shot iterables are consumed only once.
    pairs = [(row[:-1], row[-1]) for row in data]
    X_data = [features for features, _ in pairs]
    Y_data = [output for _, output in pairs]
    return X_data, Y_data

In [21]:
def prepare_data(df, num_features, train_ratio=0.75):
    """Build a lagged-returns feature table and split it into train/test sets.

    For every row of ``df`` from position ``num_features`` onward, one sample
    is created with ``num_features + 1`` columns named
    ``str(num_features)`` ... ``'1'``, ``'0'``. Column ``'k'`` holds the
    ``RETURNS`` value found ``k`` rows above the sample's own row in ``df``,
    so column ``'0'`` (the last column, used as the output) is the row's own
    return.

    NOTE(review): the features are "previous" returns in time only if ``df``
    is sorted oldest-first; confirm against the ordering of the input data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'RETURNS'`` column.
    num_features : int
        Number of lagged returns to use as features for each sample.
    train_ratio : float, optional
        Fraction of the samples (taken from the top) placed in the training
        set. Defaults to 0.75.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)`` as produced by
        ``train_test_split``.

    Raises
    ------
    ValueError
        If ``num_features`` is negative or larger than the number of rows.
    """
    # number of datapoints in the data frame
    M = len(df.index)
    if num_features < 0 or num_features > M:
        raise ValueError(
            'num_features must be between 0 and the number of rows (%d), got %r'
            % (M, num_features)
        )

    # Work on the raw values positionally: label-based lookups break when the
    # index contains duplicate labels.
    returns = df['RETURNS'].to_numpy()
    num_samples = M - num_features

    # Column str(num_features - i) is the returns vector shifted by i rows.
    # The dict preserves insertion order, so '0' ends up as the last column.
    data = pd.DataFrame(
        {
            str(num_features - i): returns[i:i + num_samples]
            for i in range(num_features + 1)
        },
        index=df.index[num_features:],
    )

    return train_test_split(data, train_ratio)

In [6]:
def read_file(company_symbol = 'AAPL'):
    """Load the daily adjusted price CSV for ``company_symbol`` into a dataframe.

    The file is read from
    ``../data/alphaVantage/<company_symbol>-full-daily_adjusted.csv``. Its
    first line is skipped and the columns are named explicitly. The ``DATE``
    column is parsed as dates and used as the index, and the ``INDEX`` column
    is dropped from the returned dataframe.
    """
    filepath = '../data/alphaVantage/' + company_symbol + '-full-daily_adjusted.csv'
    column_names = [
        'INDEX', 'DATE', 'OPEN',
        'HIGH', 'LOW', 'CLOSE',
        'ADJ_CLOSE', 'VOLUME',
        'DIV_AMT', 'SPLIT_COEFF',
    ]
    raw = pd.read_csv(
        filepath,
        skiprows=1,
        header=None,
        names=column_names,
        index_col=['DATE'],
        parse_dates=['DATE'],
    )
    return raw.drop(columns=['INDEX'])

In [10]:
def data_log_returns(data, param = 'ADJ_CLOSE'):
    """Compute daily log returns of ``data[param]``.

    Each output row is ``ln(value at row i) - ln(value at row i + 1)`` and is
    labelled with the index entry of row ``i``; the last input row has no
    successor and therefore produces no output row.

    The natural logarithm is used because the approximation
    ``ln(1 + r) ~= r`` (first-order Taylor expansion) that makes a log return
    stand in for a simple return only holds in base e; a base-10 logarithm
    would scale every value by ``1 / ln(10)``.

    NOTE(review): subtracting row ``i + 1`` from row ``i`` gives the usual
    "today minus yesterday" return only if ``data`` is sorted newest-first;
    confirm against the ordering of the input file.

    Parameters
    ----------
    data : pandas.DataFrame
        Price data containing the column ``param``.
    param : str, optional
        Name of the column to compute returns from. Defaults to 'ADJ_CLOSE'.

    Returns
    -------
    pandas.DataFrame
        Single-column frame named ``'RETURNS'`` with ``len(data) - 1`` rows
        (empty when ``data`` has fewer than two rows).

    Raises
    ------
    ValueError
        If any value in ``data[param]`` is zero or negative.
    """
    # Positional access keeps this correct even with duplicate index labels.
    prices = data[param].to_numpy(dtype=float)
    if (prices <= 0).any():
        # Match math.log's behaviour instead of silently emitting nan/-inf.
        raise ValueError('log returns require strictly positive values in %r' % (param,))

    log_prices = np.log(prices)
    log_returns = log_prices[:-1] - log_prices[1:]
    return pd.DataFrame(log_returns, index=list(data.index[:-1]), columns=['RETURNS'])

In [20]:
# Load the price history for the default company symbol ('AAPL').
df = read_file()
# Rebind `df` to the single-column 'RETURNS' frame derived from the prices;
# the original price frame is no longer reachable under this name.
df = data_log_returns(df)
# Number of lagged returns used as feature columns for each sample.
num_features = 5
# Build the lagged feature table and split it into train and test sets.
X_train, X_test, Y_train, Y_test = prepare_data(df, num_features)

In [22]:
def dummy():
    """Placeholder function: prints 'abc' and returns the constant 10."""
    result = 10
    print('abc')
    return result