In [1]:
import nbimporter
import stats_helper as sh
import preprocessing as pp

import time
import math
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix, cohen_kappa_score

Importing Jupyter notebook from stats_helper.ipynb
Importing Jupyter notebook from preprocessing.ipynb


Kohen Cappa - Kappa or Cohen’s Kappa is like classification accuracy, except that it is normalized at the baseline of random chance on your dataset. It is a more useful measure to use on problems that have an imbalance in the classes (e.g. 70-30 split for classes 0 and 1 and you can achieve 70% accuracy by predicting all instances are for class 0)  
https://machinelearningmastery.com/machine-learning-evaluation-metrics-in-r/  

The kappa statistic, which is a number between -1 and 1. The maximum value means complete agreement; zero or lower means chance agreement.

# Double Exponential Smoothing - Holt's Method

In [2]:
class Holt:
    """
        data - dataset with timestamps
        alpha - float [0.0, 1.0], smoothing parameter
        beta - float [0.0, 1.0], smoothing parameter for trend
    """
    
    def __init__(self, alpha, beta):
        self.alpha = alpha
        self.beta = beta
        self.result = []
    
    def fit(self, data_train):
        self.data_train = data_train
        
        # First value is same as series
        self.result.append(data_train[0])
        for n in range(1, len(data_train) + 1):
            
            # Initialising level and trend
            if n == 1:
                level, trend = data_train[0], data_train[1] - data_train[0]
            
            # Forecasting the point ahead
            if n >= len(data_train): 
                value = self.result[-1]
            else:
                value = data_train[n]
            
            last_level, level = level, self.alpha*value + (1 - self.alpha)*(level + trend)
            trend = self.beta*(level - last_level) + (1 - self.beta)*trend
            self.result.append(level + trend)

        # Returning the smoothed values (without the forecast)
        return self.result[:-1]
    
    # Returns the forecasted point during the fit
    def predict_one(self):
        return self.result[-1]

    def predict(self, data_test, is_classification):
        predictions = []
        self.result.append(self.data_train[0])
        for n in range(1, len(self.data_train) + len(data_test)):
            if n == 1:
                level, trend = self.data_train[0], self.data_train[1] - self.data_train[0]
            if n >= len(self.data_train): # we are forecasting
                value = data_test[n - len(self.data_train)]
                
                predict_value = self.result[-1]
                
                # Adding it for generating binary returns
                if n == len(self.data_train) and is_classification:
                    predictions.append(predict_value)
                
                predict_last_level, predict_level = level, self.alpha*predict_value + (1 - self.alpha)*(level + trend)
                predict_trend = self.beta*(predict_level - predict_last_level) + (1 - self.beta)*trend
                predictions.append(predict_level + predict_trend)
            else:
                value = self.data_train[n]
                
            last_level, level = level, self.alpha*value + (1 - self.alpha)*(level + trend)
            trend = self.beta*(level - last_level) + (1 - self.beta)*trend
            self.result.append(level + trend)
        
        if is_classification:
            return sh.output_to_binary_indicators(sh.data_daily_returns(predictions))
        
        return predictions  

## Plotting Holt

In [3]:
def plotDoubleExponentialSmoothing(data, alphas, betas):
    """
        Plots double exponential smoothing with different alphas and betas
        
        data - dataset with timestamps
        alphas - list of floats, smoothing parameters for level
        betas - list of floats, smoothing parameters for trend
    """
    
    with plt.style.context('seaborn-white'):    
        plt.figure(figsize=(20, 8))
        for alpha in alphas:
            for beta in betas:
                holt = Holt(alpha, beta)
                plt.plot(holt.fit(data), label="Alpha {}, beta {}".format(alpha, beta))
        plt.plot(data, label = "Actual")
        plt.legend(loc="best")
        plt.axis('tight')
        plt.title("Double Exponential Smoothing - Holt")
        plt.grid(True)
        
# plotDoubleExponentialSmoothing(values, alphas=[0.8, 0.05], betas=[0.8, 0.05])

In [4]:
def plotHolt(data, fit_data):
    plt.figure(figsize=(20, 7))
    plt.plot(range(len(data)), data, 'bo-')
    plt.plot(range(len(fit_data)), fit_data, 'r^--')
    plt.show()

## Finding optimal parameters

In [5]:
def print_metric_values(metric_values):
    print('Alpha \t\t\t Beta \t\t\t Metric')
    for i in range(len(metric_values)):
        print(metric_values[i][0], '\t\t\t', metric_values[i][1], '\t\t\t', metric_values[i][2])
    print()

In [6]:
def find_optimal_parameters(symbol_name, num_splits, alphas, betas, is_classification):
    metric_values = list()
    if is_classification:
        print('Using Accuracy for CV')
    else:
        print('Using Mean Squared Error for CV')
        
    data_train, data_test = sh.prepare_data(symbol_name, train_ratio = 0.75)
        
    for alpha in alphas:
        for beta in betas:
            print("Alpha {}, Beta {}".format(alpha, beta))
            holt = Holt(alpha, beta)
            metric_value = sh.statTimeSeriesCV(data_train, num_splits, holt, is_classification)
            metric_values.append([alpha, beta, metric_value])
    
    print_metric_values(metric_values)
    
    # Sorting the Metric Values
    metric_values.sort(reverse=True, key=lambda x: x[len(metric_values) - 1])
    print_metric_values(metric_values)
    
    return metric_values[0][0], metric_values[0][1]

### RMS

In [7]:
# # Alpha:  0.99 			 Beta:  0.0 			 RMS:  1.778531300846033
# train_data, test_data = series_split(values)
# min_rms = 1000
# min_alpha = 0
# min_beta = 0
# for alpha in np.arange(0.0, 1.0, 0.01):
#     for beta in np.arange(0.0, 1.0, 0.01):
#         holt = Holt(alpha, beta)
#         updated = holt.predict_and_update(series_train=train_data, series_test=test_data)
#         rms = math.sqrt(mean_squared_error(test_data, updated))
#         if rms < min_rms:
#             min_rms = rms
#             min_alpha = alpha
#             min_beta = beta
# print('Alpha: ', min_alpha, '\t\t\t Beta: ', min_beta, '\t\t\t RMS: ', min_rms)

### Returns and Accuracy Score

In [8]:
# ## Alpha:  0.14 		 Beta:  0.86 		 acc:  0.5362872421695951
# train_data, test_data = series_split(values)
# bin_test_data = output_to_binary_indicators(data_daily_returns(train_data[-1:] + test_data))
# max_acc = 0
# max_alpha = 0
# max_beta = 0
# for alpha in np.arange(0.01, 1.0, 0.01):
#     for beta in np.arange(0.0, 1.0, 0.01):
#         holt = Holt(alpha, beta)
#         predictions = holt.predict_and_update(train_data, test_data[:])
#         acc = accuracy_score(bin_test_data, predictions)
#         if acc > max_acc:
#             max_acc = acc
#             max_alpha = alpha
#             max_beta = beta
# print('Alpha: ', max_alpha, '\t\t Beta: ', max_beta, '\t\t acc: ', max_acc)

## Error Metric Values

### Regression Metrics

### Classification Metrics

## Using Holt for Prediction

In [9]:
def forecast(data_train, data_test, alpha, beta, is_classification):
    print('Alpha: ', alpha, '\t Beta: ', beta)
    
    holt = Holt(alpha, beta)
    print('Fitting...')
    holt.fit(data_train)
    
    print('Predicting...') 
    predictions = holt.predict(data_test, is_classification)
    
    if is_classification:
        bin_data_test = sh.output_to_binary_indicators(sh.data_daily_returns(data_train[-1:] + data_test))
        sh.classification_metrics(bin_data_test, predictions)
    else:
        sh.regression_metrics(data_test, predictions)

In [10]:
def get_data_ready(symbol_name, alphas, betas, is_classification):
    start_time = time.time()
    num_splits = 10
    alpha, beta = find_optimal_parameters(symbol_name, num_splits, alphas, betas, is_classification)
    end_time = time.time()
    print('Time taken for Cross Validation:', end_time - start_time)
    
    data_train, data_test = sh.prepare_data(symbol_name, train_ratio = 0.8)
    return data_train, data_test, alpha, beta

In [11]:
def run_Holt(symbol_name):
    is_classification = False
    alphas = np.arange(0, 1, 0.1)
    betas = np.arange(0, 1, 0.1)
    
    data_train, data_test, alpha, beta = get_data_ready(symbol_name, alphas, betas, is_classification)
    forecast(data_train, data_test, alpha, beta, is_classification = True)

In [12]:
run_Holt(symbol_name = 'AAPL')

Using Mean Squared Error for CV
Alpha 0.0, Beta 0.0
Mean Metric Value:  2962.9828453843575 

Alpha 0.0, Beta 0.1
Mean Metric Value:  2962.9828453843593 

Alpha 0.0, Beta 0.2
Mean Metric Value:  2962.9828453844593 

Alpha 0.0, Beta 0.30000000000000004
Mean Metric Value:  2962.9828453844593 

Alpha 0.0, Beta 0.4
Mean Metric Value:  2962.9828453844593 

Alpha 0.0, Beta 0.5
Mean Metric Value:  2962.9828453844593 

Alpha 0.0, Beta 0.6000000000000001
Mean Metric Value:  2962.9828453844593 

Alpha 0.0, Beta 0.7000000000000001
Mean Metric Value:  2962.9828453844593 

Alpha 0.0, Beta 0.8
Mean Metric Value:  2962.9828453844593 

Alpha 0.0, Beta 0.9
Mean Metric Value:  2962.9828453844593 

Alpha 0.1, Beta 0.0
Mean Metric Value:  1.7592341876684452 

Alpha 0.1, Beta 0.1
Mean Metric Value:  1.417887374675793 

Alpha 0.1, Beta 0.2
Mean Metric Value:  1.374525262315953 

Alpha 0.1, Beta 0.30000000000000004
Mean Metric Value:  1.4647808486111897 

Alpha 0.1, Beta 0.4
Mean Metric Value:  1.630792925332

Matthews Correlation Coefficient:  0.0
Cohen Kappa Score:  0.0
Confustion Matrix
[[  0 546]
 [  0 501]]
Classification Report
             precision    recall  f1-score   support

         -1       0.00      0.00      0.00       546
          1       0.48      1.00      0.65       501

avg / total       0.23      0.48      0.31      1047



  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  'precision', 'predicted', average, warn_for)
