In [1]:
import nbimporter
import math
import numpy as np
import pandas as pd
import preprocessing as pp

from sklearn.metrics import r2_score, accuracy_score, confusion_matrix, classification_report, matthews_corrcoef, cohen_kappa_score
from sklearn.metrics import median_absolute_error, mean_absolute_error
from sklearn.metrics import median_absolute_error, mean_squared_error, mean_squared_log_error

Importing Jupyter notebook from preprocessing.ipynb


## Error Metrics

In [2]:
def classification_metrics(Y_test, Y_pred):
    """Print a summary of classification quality for a set of predictions.

    Reports, in order: accuracy, the Matthews correlation coefficient,
    Cohen's kappa, the confusion matrix and scikit-learn's classification
    report.

    Parameters
    ----------
    Y_test : array-like
        Ground-truth class labels.
    Y_pred : array-like
        Predicted class labels, in the same order as ``Y_test``.

    Returns
    -------
    None
        All results are written to standard output.
    """
    print('Accuracy: ', accuracy_score(Y_test, Y_pred))
    print('Matthews Correlation Coefficient: ', matthews_corrcoef(Y_test, Y_pred))
    print('Cohen Kappa Score: ', cohen_kappa_score(Y_test, Y_pred))
    # Label previously read 'Confustion Matrix' (typo).
    print('Confusion Matrix')
    print(confusion_matrix(Y_test, Y_pred))
    print('Classification Report')
    print(classification_report(Y_test, Y_pred))

In [3]:
def mean_absolute_percentage_error(Y_test, Y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Each error ``Y_test - Y_pred`` is divided by the corresponding actual
    value, the absolute values are averaged and the result is scaled by 100.

    Parameters
    ----------
    Y_test : array-like
        Actual values. Zero entries are not special-cased; the division is
        performed as-is.
    Y_pred : array-like
        Predicted values, aligned element-wise with ``Y_test``.

    Returns
    -------
    float
        ``mean(|(actual - predicted) / actual|) * 100``.
    """
    actual = np.array(Y_test)
    predicted = np.array(Y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return np.mean(relative_errors) * 100

def symmetric_mean_absolute_percentage_error(Y_test, Y_pred):
    """Return the symmetric mean absolute percentage error (SMAPE), in percent.

    Computed as ``100 / n * sum(2 * |pred - actual| / (|actual| + |pred|))``.

    A pair in which both the actual and the predicted value are zero is a
    perfect prediction, but its term evaluates to 0/0. Such terms are counted
    as zero error so that one exact zero match does not turn the whole score
    into NaN.

    Parameters
    ----------
    Y_test : array-like
        Actual values.
    Y_pred : array-like
        Predicted values, aligned element-wise with ``Y_test``.

    Returns
    -------
    float
        SMAPE in percent.

    Raises
    ------
    ZeroDivisionError
        If the inputs are empty (``n == 0``).
    """
    actual = np.asarray(Y_test, dtype=float)
    predicted = np.asarray(Y_pred, dtype=float)

    numerator = 2 * np.abs(predicted - actual)
    denominator = np.abs(actual) + np.abs(predicted)

    # Divide only where the denominator is non-zero; the pre-filled zeros in
    # `out` are kept for the 0/0 positions (actual == predicted == 0).
    ratios = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return 100 / len(actual) * np.sum(ratios)

def regression_metrics(Y_test, Y_pred):
    """Print a summary of regression quality for a set of predictions.

    Reports, in order: R2, mean absolute error, median absolute error,
    mean squared error, root mean squared error, mean squared log error,
    MAPE and SMAPE. Each metric is printed exactly once.

    Parameters
    ----------
    Y_test : array-like
        Actual target values.
    Y_pred : array-like
        Predicted target values, aligned element-wise with ``Y_test``.

    Returns
    -------
    None
        All results are written to standard output.
    """
    # Computed once and reused for both the MSE and the RMSE lines.
    mse = mean_squared_error(Y_test, Y_pred)

    print('Coefficient of Determination (R2 Score): ', r2_score(Y_test, Y_pred))
    print('Mean Absolute Error: ', mean_absolute_error(Y_test, Y_pred))
    print('Median Absolute Error: ', median_absolute_error(Y_test, Y_pred))
    print('Mean Squared Error: ', mse)
    print('Root Mean Squared Error: ', math.sqrt(mse))
    print('Mean Squared Log Error: ', mean_squared_log_error(Y_test, Y_pred))
    print('Mean Absolute Percentage Error: ', mean_absolute_percentage_error(Y_test, Y_pred))
    print('Symmetric Mean Absolute Percentage Error: ', symmetric_mean_absolute_percentage_error(Y_test, Y_pred))

## Dealing with returns

In [4]:
def data_daily_returns(data):
    """Compute the return between each pair of consecutive prices.

    ``data`` is assumed to be ordered oldest first (``prepare_data`` reverses
    the source column for that purpose), so ``data[i + 1]`` is the later
    observation and a price increase yields a positive return.

    For two strictly positive prices the log return
    ``ln(data[i + 1]) - ln(data[i])`` is used; by the Taylor expansion
    ``ln(1 + r) ~= r`` this approximates the simple return, which holds for
    the natural logarithm only. When either price is non-positive the
    logarithm is undefined and the simple return
    ``(data[i + 1] - data[i]) / data[i]`` is used instead. Both branches share
    the same sign convention and scale.

    Parameters
    ----------
    data : sequence of numbers
        Price series supporting slicing, oldest observation first.

    Returns
    -------
    list
        ``len(data) - 1`` returns (an empty list for fewer than two prices).

    Notes
    -----
    A price of exactly zero in the earlier position makes the simple-return
    branch divide by zero.
    """
    daily_returns = []
    for previous, current in zip(data[:-1], data[1:]):
        if previous <= 0 or current <= 0:
            # log is undefined here: fall back to the simple return.
            daily_returns.append((current - previous) / previous)
        else:
            # later minus earlier, so the sign matches the simple return.
            daily_returns.append(math.log(current) - math.log(previous))
    return daily_returns

In [5]:
def output_to_binary_indicators(returns):
    """Turn return values into binary class labels for classification.

    Every element is overwritten in place: values that satisfy ``>= 0``
    become ``1`` (positive return) and all others become ``-1`` (negative
    return). The same, now modified, container is returned.
    """
    for position in range(len(returns)):
        if returns[position] >= 0:
            returns[position] = 1
        else:
            returns[position] = -1
    return returns

## Fetching the data

In [6]:
def prepare_data(symbol_name, train_ratio):
    """
    Load a stock's adjusted closing prices and split them into train/test sets.

    INPUT:
        symbol_name: symbol of the stock to study; passed to ``pp.read_file``.
        train_ratio: fraction of the series assigned to the training set;
                     passed to ``series_split``.

    The 'ADJ_CLOSE' column of the loaded data is reversed so that, as the
    original author notes, the values run from past to future.

    RETURN: the ``(train_data, test_data)`` pair produced by ``series_split``.
    """
    frame = pp.read_file(symbol_name)
    chronological_prices = list(reversed(frame['ADJ_CLOSE']))
    train_data, test_data = series_split(chronological_prices, train_ratio)
    return train_data, test_data

## Splitting data into training and testing sets

In [7]:
def series_split(series, train_ratio):
    """Cut a series into a leading train part and a trailing test part.

    The cut position is ``int(train_ratio * len(series))``; everything before
    it is the training data and everything from it onwards is the test data.

    Returns the pair ``(train_data, test_data)``.
    """
    cut = int(train_ratio * len(series))
    return series[:cut], series[cut:]

## Cross Validation

In [9]:
def statTimeSeriesCV(data, num_splits, algorithm, is_classification):
    """Forward-chaining cross validation for a time series.

    The series is divided into ``num_splits`` chunks of
    ``k = len(data) // num_splits`` elements. For ``i = 2 .. num_splits`` the
    first ``i - 1`` chunks are used for training and chunk ``i`` for testing,
    so the first chunk is always trained on and never tested. One metric value
    is recorded per iteration and their mean is printed and returned.

    Parameters
    ----------
    data : sequence
        The series to validate on; must support ``len`` and slicing.
    num_splits : int
        Number of chunks. Values ``<= 0`` or ``>= len(data)`` are replaced
        by 10.
    algorithm : object
        Must provide ``fit(train)`` and ``predict(test, is_classification)``.
    is_classification : bool
        If true, the test chunk is converted to +1/-1 return indicators
        (using the last training value as the starting price) and scored with
        Cohen's kappa; ``predict`` is presumably expected to return one label
        per test element in that case. Otherwise the score is the root mean
        squared error between the test chunk and the predictions.

    Returns
    -------
    float
        Mean of the per-iteration metric values.
    """
    if num_splits <= 0 or num_splits >= len(data):
        num_splits = 10

    # Size of each fold: number of rows divided by num_splits, rounded down.
    fold_size = len(data) // num_splits

    # One slot per tested fold. There are (num_splits - 1) of them because the
    # first fold is always used for training and never tested.
    metric_values = np.zeros(num_splits - 1)

    for i in range(2, num_splits + 1):
        # Portion of the series used in this iteration: the first i folds.
        temp_data = data[:fold_size * i]

        # Train on the first (i - 1) folds, test on fold i. The boundary is
        # computed in integers; multiplying the length by the float fraction
        # (i - 1) / i is not guaranteed to be exact and, after flooring, can
        # land one element short of the fold boundary.
        index = fold_size * (i - 1)
        train_folds = temp_data[:index]
        test_folds = temp_data[index:]

        algorithm.fit(train_folds)
        Y_pred = algorithm.predict(test_folds, is_classification)

        if is_classification:
            # Prepend the last training value so the first test element gets a
            # return too. Convert to lists explicitly: `+` concatenates lists
            # but adds element-wise for numpy arrays / pandas Series.
            price_window = list(train_folds[-1:]) + list(test_folds)
            true_labels = output_to_binary_indicators(data_daily_returns(price_window))
            metric_values[i - 2] = cohen_kappa_score(true_labels, Y_pred)
        else:
            metric_values[i - 2] = math.sqrt(mean_squared_error(test_folds, Y_pred))

    mean_value = metric_values.mean()
    print('Mean Metric Value: ', mean_value, '\n')
    return mean_value