In [3]:
# Basic libraries
import os
import random
from collections import defaultdict
from datetime import datetime as dt
import warnings

import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

# Machine learning libraries
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    mean_squared_error, 
    mean_absolute_error, 
    mean_absolute_percentage_error, 
    r2_score, 
    mean_squared_log_error
)
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.cluster import DBSCAN
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import (
    RandomForestRegressor, 
    GradientBoostingRegressor, 
    ExtraTreesRegressor, 
    AdaBoostRegressor
)
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, DotProduct, WhiteKernel
import xgboost as xgb

# Time series libraries
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_breusch_godfrey, acorr_ljungbox, het_white
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import acf, q_stat, adfuller
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAXResults
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from arch import arch_model
import pmdarima as pm
from sktime.forecasting.exp_smoothing import ExponentialSmoothing
from sktime.forecasting.model_selection import ForecastingGridSearchCV, SlidingWindowSplitter

#Libraries for Statistical Models
import statsmodels.api as sm
from statsmodels.stats.diagnostic import acorr_breusch_godfrey, acorr_ljungbox, het_white
from scipy.stats import jarque_bera
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf, q_stat, adfuller
from scipy.stats import probplot, moment

# LightGBM library
import lightgbm as lgb

# Set options
%matplotlib inline
warnings.filterwarnings("ignore")


In [33]:
def evaluate(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    return {
        'mae': mae,
        'rmse': rmse,
    }

# univaritae
## Simple models for benchmarking
def random_walk_forecast(start_point, forecast_length, return_mean, return_std, index):
    prediction = [start_point]

    for i in range(1, forecast_length):
        next_value = prediction[-1] * (1 + random.normalvariate(mu=return_mean, sigma=return_std))
        prediction.append(next_value)

    return pd.Series(prediction, index=index)

def calculate_benchmark_errors(series):
    mean_returns, std_returns = calculate_returns_stats(series)
    forecast_length = len(series)
    start_point = series[0]
    predictions = random_walk_forecast(start_point, forecast_length, mean_returns, std_returns, series.index)
    # print(predictions)
    evaluation_result = evaluate(series, predictions)
    # Return a dictionary containing the evaluation results for each baseline
    return {
        'Benchmark MAE': evaluation_result['mae'],
        'Benchmark RMSE': evaluation_result['rmse'],
    }

def calculate_returns_stats(time_series):
    # Calculate the returns
    returns = time_series / time_series.shift(1) - 1
    returns = returns.dropna()

    # Calculate mean returns and standard deviation of returns
    mean_returns = returns.mean()
    std_returns = returns.std()

    return mean_returns, std_returns

# multivariate
def random_walk_forecast_multivariate(start_points, forecast_length, return_means, return_stds, index):
    num_variables = len(start_points)
    predictions = [start_points]

    for _ in range(1, forecast_length):
        next_values = [predictions[-1][i] * (1 + random.normalvariate(mu=return_means[i], sigma=return_stds[i])) for i in range(num_variables)]
        predictions.append(next_values)

    # Transpose the list of predictions and convert each variable's predictions to a pandas Series
    predictions_transposed = list(map(list, zip(*predictions)))
    prediction_series = [pd.Series(pred, index=index) for pred in predictions_transposed]

    return prediction_series
    
def monte_carlo_simulation_univariate(series, n_trials=10000):
    # Initialize a dictionary to store the accumulated evaluation results
    accumulated_results = {
        'Benchmark MAE': 0,
        'Benchmark RMSE': 0,
    }

    for _ in range(n_trials):
        # Calculate benchmark errors for this trial
        trial_results = calculate_benchmark_errors(series)

        # Accumulate the results of this trial
        for key in accumulated_results.keys():
            accumulated_results[key] += trial_results[key]

    # Average the accumulated results over the number of trials
    for key in accumulated_results.keys():
        accumulated_results[key] /= n_trials

    return accumulated_results

def calculate_benchmark_errors_mult(series1, series2):
    mean_returns1, std_returns1 = calculate_returns_stats(series1)
    mean_returns2, std_returns2 = calculate_returns_stats(series2)
    
    forecast_length = len(series1)
    start_points = [series1[0], series2[0]]
    return_means = [mean_returns1, mean_returns2]
    return_stds = [std_returns1, std_returns2]
    
    predictions = random_walk_forecast_multivariate(start_points, forecast_length, return_means, return_stds, series1.index)
    
    evaluation_result1 = evaluate(series1, predictions[0])
    evaluation_result2 = evaluate(series2, predictions[1])
    # plot_time_series(series1, predictions[0])
    # plot_time_series(series2, predictions[1])
    
    return {
        'Series1 Benchmark MAE': evaluation_result1['mae'],
        'Series1 Benchmark RMSE': evaluation_result1['rmse'],
        'Series2 Benchmark MAE': evaluation_result2['mae'],
        'Series2 Benchmark RMSE': evaluation_result2['rmse'],
    }
    
def monte_carlo_simulation_multivariate(series1, series2, n_trials=10000):
    # Initialize a dictionary to store the accumulated evaluation results
    accumulated_results = {
        'Series1 Benchmark MAE': 0,
        'Series1 Benchmark RMSE': 0,
        'Series2 Benchmark MAE': 0,
        'Series2 Benchmark RMSE': 0,
    }

    for _ in range(n_trials):
        # Calculate benchmark errors for this trial
        trial_results = calculate_benchmark_errors_mult(series1, series2)

        # Accumulate the results of this trial
        for key in accumulated_results.keys():
            accumulated_results[key] += trial_results[key]

    # Average the accumulated results over the number of trials
    for key in accumulated_results.keys():
        accumulated_results[key] /= n_trials

    return accumulated_results


In [20]:
def load_raw_data(filename):
    raw_df = pd.read_csv(filename, header=None)
    raw_df.columns = ['datetime', 'ts1', 'ts2']
    raw_df['datetime'] = pd.to_datetime(raw_df['datetime'] - 719529, unit='d').round('s')
    raw_df.set_index('datetime', inplace=True)
    raw_df.dropna(inplace=True)
    return raw_df

def calculate_log_returns(df):
    rt = np.log(df / df.shift(1))
    rt.dropna(inplace=True)
    return rt

def load_clean_data(filename):
    clean_df = pd.read_csv(filename)
    clean_df.columns = ['datetime', 'ts1', 'ts2']
    clean_df.set_index('datetime', inplace=True)
    clean_df.index = pd.to_datetime(clean_df.index)
    return clean_df   

In [21]:
def split_series(series, split_date):
    series.index = pd.to_datetime(series.index)
    split_date = pd.Timestamp(split_date)
    
    before_split = series.loc[series.index <= split_date]
    after_split = series.loc[series.index > split_date]
    
    return before_split, after_split

In [52]:
#load raw data
df = load_raw_data("Test_data.csv")
#calcualte log returns
raw_rt = calculate_log_returns(df)
raw_rt = raw_rt.resample('D').mean().dropna()
train_raw, test_raw = split_series(raw_df, '2011-12-31')


# df = load_clean_data('imputed_df.csv')
# #calcualte log returns
# imp_rt = calculate_log_returns(df)
# imp_rt_daily = imp_rt.resample('D').mean()

# # prerpare train and test experiments
# # train_imp, test_imp = split_series(imp_rt_daily, '2011-12-31')
# train_raw, test_raw = split_series(raw_rt_daily, '2011-12-31')

In [53]:
def calculate_benchmark_univariate(series, var, n_trials):
    print('\nTrain')
    results = monte_carlo_simulation_univariate(series[var], n_trials=10000)
    print(results)
    print('\nTest')
    results = monte_carlo_simulation_univariate(series[var], n_trials=10000)
    print(results)    
    
def calulate_benchmark_multivariate(series1, series2, var1, var2, n_trials):
    print('\nTrain')
    results = monte_carlo_simulation_multivariate(series1[var1], series1[var2], n_trials=10000)
    print(results)
    print('\nTest')
    results = monte_carlo_simulation_multivariate(series2[var1], series2[var2], n_trials=10000)     
    print(results)
    
    
n_trials = 10000
print('Univariate')
print('\nH1')
calculate_benchmark_univariate(train_raw, 'ts1', n_trials=n_trials)
print('\nH2')
calculate_benchmark_univariate(test_raw, 'ts2', n_trials=n_trials)
print('Multivariate')
print('\nH3/H4')
calulate_benchmark_multivariate(train_raw, test_raw, 'ts1', 'ts2', n_trials=n_trials)







Univariate

H1

Train
{'Benchmark MAE': 1.425311743138642, 'Benchmark RMSE': 1.6848674310245457}

Test
{'Benchmark MAE': 1.4344625599349239, 'Benchmark RMSE': 1.696045764746107}

H2

Train
{'Benchmark MAE': 0.04702588537341932, 'Benchmark RMSE': 0.055172784438008944}

Test
{'Benchmark MAE': 0.04752462822795691, 'Benchmark RMSE': 0.05565652110530692}
Multivariate

H3/H4

Train
{'Series1 Benchmark MAE': 1.4333394867764166, 'Series1 Benchmark RMSE': 1.69375050541172, 'Series2 Benchmark MAE': 0.18362620521903997, 'Series2 Benchmark RMSE': 0.2174841209653639}

Test
{'Series1 Benchmark MAE': 0.445771108719327, 'Series1 Benchmark RMSE': 0.5211173588374408, 'Series2 Benchmark MAE': 0.04686230573111296, 'Series2 Benchmark RMSE': 0.05491780070621427}


In [None]:
print('Univariate')
# H1
print('H1')
print('train')
average_results = monte_carlo_simulation_univariate(train_raw['ts1'], n_trials=100)
print(average_results)
print('test')
average_results = monte_carlo_simulation_univariate(test_raw['ts1'], n_trials=100)
print(average_results)
# H2
print('\nH2')
print('train')
average_results = monte_carlo_simulation_univariate(train_raw['ts2'], n_trials=100)
print(average_results)
print('test')
average_results = monte_carlo_simulation_univariate(test_raw['ts2'], n_trials=100)
print(average_results)
# H3/h4
print('\nH3/H4')
print('train')
average_results = monte_carlo_simulation_multivariate(train_raw['ts1'], train_raw['ts2'], n_trials=100)
print(average_results)
print('test')
average_results = monte_carlo_simulation_multivariate(test_raw['ts1'], test_raw['ts2'], n_trials=100)
print(average_results)


In [13]:
df = load_raw_data("Test_data.csv")
raw_df = df.resample('D').mean().dropna()
raw_ts_1_daily = raw_df['ts1'].dropna()
raw_ts_2_daily = raw_df['ts2'].dropna()


In [None]:
# Run the Monte Carlo simulation on your time series data
print('Univariate')
average_results = monte_carlo_simulation(raw_ts_1_daily, n_trials=1000)
print(average_results)
average_results = monte_carlo_simulation(raw_ts_2_daily, n_trials=1000)
print(average_results)
print('Multivariate')
# Run the Monte Carlo simulation on your time series data
average_results = monte_carlo_simulation_mult(raw_ts_1_daily, raw_ts_2_daily)
print(average_results)

In [27]:
def evaluate(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    return {
        'mae': mae,
        'rmse': rmse,
    }

def calculate_log_returns(series):
    return np.log(series / series.shift(1)).dropna()

def calculate_return_statistics(time_series):
    mean_returns = time_series.mean()
    std_returns = time_series.std()

    return mean_returns, std_returns

def random_walk_forecast(start_point, forecast_length, return_mean, return_std, index):
    prediction = [start_point]

    for _ in range(1, forecast_length):
        next_value = prediction[-1] * (1 + random.normalvariate(mu=return_mean, sigma=return_std))
        prediction.append(next_value)

    return pd.Series(prediction, index=index)

def evaluate_univariate_random_walk(series, n_trials=10000):
    accumulated_results = {
        'Benchmark MAE': 0,
        'Benchmark RMSE': 0,
    }

    mean_returns, std_returns = calculate_return_statistics(series)
    forecast_length = len(series)
    start_point = series[0]

    for _ in range(n_trials):
        predictions = random_walk_forecast(start_point, forecast_length, mean_returns, std_returns, series.index)
        trial_results = evaluate(series, predictions)

        for key in accumulated_results.keys():
            accumulated_results[key] += trial_results[key]

    for key in accumulated_results.keys():
        accumulated_results[key] /= n_trials

    return accumulated_results

def evaluate_multivariate_random_walk(series1, series2, n_trials=10000):
    accumulated_results = {
        'Series1 Benchmark MAE': 0,
        'Series1 Benchmark RMSE': 0,
        'Series2 Benchmark MAE': 0,
        'Series2 Benchmark RMSE': 0,
    }

    mean_returns1, std_returns1 = calculate_return_statistics(series1)
    mean_returns2, std_returns2 = calculate_return_statistics(series2)
    forecast_length = len(series1)
    start_points = [series1[0], series2[0]]
    return_means = [mean_returns1, mean_returns2]
    return_stds = [std_returns1, std_returns2]

    for _ in range(n_trials):
        prediction_series = random_walk_forecast_multivariate(start_points, forecast_length, return_means, return_stds, series1.index)
        evaluation_result1 = evaluate(series1, prediction_series[0])
        evaluation_result2 = evaluate(series2, prediction_series[1])

        for key in accumulated_results.keys():
            if "Series1" in key:
                accumulated_results[key] += evaluation_result1[key.replace("Series1 ", "")]
            else:
                accumulated_results[key] += evaluation_result2[key.replace("Series2 ", "")]

    for key in accumulated_results.keys():
        accumulated_results[key] /= n_trials

    return accumulated_results


In [28]:
print('Univariate')
# H1
print('H1')
print('train')
average_results = evaluate_univariate_random_walk(train_raw['ts1'], n_trials=100)
print(average_results)
print('test')
average_results = evaluate_univariate_random_walk(test_raw['ts1'], n_trials=100)
print(average_results)
# H2
print('\nH2')
print('train')
average_results = evaluate_univariate_random_walk(train_raw['ts2'], n_trials=100)
print(average_results)
print('test')
average_results = evaluate_univariate_random_walk(test_raw['ts2'], n_trials=100)
print(average_results)
# H3/h4
print('\nH3/H4')
print('train')
average_results = evaluate_multivariate_random_walk(train_raw['ts1'], train_raw['ts2'], n_trials=100)
print(average_results)
print('test')
average_results = evaluate_multivariate_random_walk(test_raw['ts1'], test_raw['ts2'], n_trials=100)
print(average_results)


Univariate
H1
train


ValueError: Input contains infinity or a value too large for dtype('float64').