In [None]:
import os

In [None]:
# Load libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pandas.plotting import scatter_matrix
import seaborn as sns
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, DotProduct, WhiteKernel
import lightgbm as lgb
from sklearn.neural_network import MLPRegressor


#Libraries for Deep Learning Models
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from keras.layers import LSTM
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

#Libraries for Statistical Models
import statsmodels.api as sm
from statsmodels.stats.diagnostic import acorr_breusch_godfrey, acorr_ljungbox, het_white
from scipy.stats import jarque_bera
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf, q_stat, adfuller
from scipy.stats import probplot, moment

Functions

In [None]:
# Add datetime features
def add_datetime_features(df):
    """Return a copy of ``df`` with calendar columns derived from its index.

    The index is converted with ``pd.to_datetime`` when it is not already a
    ``DatetimeIndex``. The conversion is applied to the copy, so the frame
    passed in by the caller is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds (or can be parsed into) timestamps.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with a ``DatetimeIndex`` and the added integer columns
        ``year``, ``month``, ``day`` and ``weekday``.
    """
    df_copy = df.copy()
    if not isinstance(df_copy.index, pd.DatetimeIndex):
        # Convert on the copy only; assigning to df.index would mutate the caller's frame.
        df_copy.index = pd.to_datetime(df_copy.index)

    # Extract year, month, day and weekday information from the index
    df_copy['year'] = df_copy.index.year
    df_copy['month'] = df_copy.index.month
    df_copy['day'] = df_copy.index.day
    df_copy['weekday'] = df_copy.index.weekday

    return df_copy

def generate_lagged_features(df, var, max_lag):
    """Return a copy of ``df`` with ``max_lag`` lagged versions of column ``var``.

    For every ``t`` in ``1..max_lag`` a column ``<var>_lag<t>`` is added that
    holds ``var`` shifted by ``t`` calendar days (``shift(t, freq='1D')``
    moves the timestamps, and the assignment re-aligns on the index). Rows
    containing any missing value are dropped afterwards.

    The index is converted with ``pd.to_datetime`` when needed; the conversion
    is applied to the copy so the caller's frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by (parseable) timestamps.
    var : str
        Name of the column to lag.
    max_lag : int
        Largest lag, in days, to generate.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the lag columns appended and NaN rows removed.
    """
    ts_data = df.copy()
    if not isinstance(ts_data.index, pd.DatetimeIndex):
        # Convert on the copy only; assigning to df.index would mutate the caller's frame.
        ts_data.index = pd.to_datetime(ts_data.index)

    for t in range(1, max_lag + 1):
        ts_data[f'{var}_lag{t}'] = ts_data[var].shift(t, freq='1D')

    return ts_data.dropna()

def prepare_time_series_data(df, var, max_lag):
    """Build the modelling frame: lag columns of ``var`` first, then calendar columns.

    Chains ``generate_lagged_features`` and ``add_datetime_features`` and
    returns the resulting frame.
    """
    return add_datetime_features(generate_lagged_features(df, var, max_lag))

def extract_column(df, column_name):
    """Split ``df`` into ``(frame without column_name, the column_name Series)``.

    ``df`` itself is not modified.
    """
    target = df[column_name]
    return df.drop(columns=column_name), target

# def scale_data(train_data, test_data):
#     scaler = MinMaxScaler()
#     train_scaled = scaler.fit_transform(train_data)
#     test_scaled = scaler.transform(test_data)
#     return train_scaled, test_scaled, scaler

def scale_data(train_data, test_data, target_column):
    """Min-max scale features and target separately, fitting on the training frame only.

    Two ``MinMaxScaler`` instances are used: ``scaler_X`` for every column
    except ``target_column`` and ``scaler_Y`` for the target alone, so that
    predictions can later be inverse-transformed with ``scaler_Y``.

    The returned arrays keep the **column order of the input frames**. The
    scaled target is put back at the position it had in the frame, so the
    arrays can be wrapped with ``columns=frame.columns`` without shifting
    labels. (Appending the target as the last column would mislabel every
    column whenever the target is not the frame's last column.)

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing ``target_column`` plus the feature columns.
    target_column : str
        Name of the target column.

    Returns
    -------
    train_data_scaled, test_data_scaled : numpy.ndarray
        Scaled values, shaped like the inputs, columns in input order.
    scaler_X, scaler_Y : MinMaxScaler
        The fitted feature and target scalers.
    """
    scaler_X = MinMaxScaler()
    scaler_Y = MinMaxScaler()

    train_X = train_data.drop(target_column, axis=1)
    train_Y = train_data[[target_column]]

    test_X = test_data.drop(target_column, axis=1)
    test_Y = test_data[[target_column]]

    # Fit on train only; the test frame is transformed with the train statistics.
    train_X_scaled = scaler_X.fit_transform(train_X)
    train_Y_scaled = scaler_Y.fit_transform(train_Y)

    test_X_scaled = scaler_X.transform(test_X)
    test_Y_scaled = scaler_Y.transform(test_Y)

    def _reassemble(frame, X_scaled, Y_scaled):
        # Re-insert the scaled target at the column position it has in `frame`.
        target_pos = list(frame.columns).index(target_column)
        return np.insert(X_scaled, target_pos, Y_scaled.ravel(), axis=1)

    train_data_scaled = _reassemble(train_data, train_X_scaled, train_Y_scaled)
    test_data_scaled = _reassemble(test_data, test_X_scaled, test_Y_scaled)

    return train_data_scaled, test_data_scaled, scaler_X, scaler_Y


def unscale_data(pred, actual, scaler):
    """Inverse-transform predictions and actuals with the same fitted scaler.

    Returns the pair ``(unscaled_pred, unscaled_actual)``.
    """
    unscaled_pred = scaler.inverse_transform(pred)
    unscaled_actual = scaler.inverse_transform(actual)
    return unscaled_pred, unscaled_actual

Data import

In [None]:
# Load the daily series and use the 'datetime' column as the index.
rt_d = pd.read_csv('rt_daily.csv').set_index('datetime')

In [None]:
# Distribution overview of the raw columns: histograms, densities, box plots
rt_d.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1, figsize=(6, 6))
plt.show()

rt_d.plot(kind='density', subplots=True, layout=(4, 4), sharex=False, legend=True, fontsize=1, figsize=(6, 6))
plt.show()

rt_d.plot(kind='box', subplots=True, layout=(4, 4), sharex=False, sharey=False, figsize=(6, 6))
plt.show()

# Box plot of the modelling frame (2 lags of ts1 + calendar columns) on its original scale
rt_transformed = prepare_time_series_data(rt_d, 'ts1', 2)
rt_transformed.boxplot(figsize=(20, 5))
plt.xticks(rotation=90)
plt.show()

# Same box plot after min-max scaling every column.
# The scaled frame is rebuilt without passing an index, so it gets a default integer index.
minmax_scaler = MinMaxScaler()
rt_transformed = pd.DataFrame(
    minmax_scaler.fit_transform(rt_transformed),
    columns=rt_transformed.columns,
)
rt_transformed.boxplot(figsize=(20, 5))
plt.xticks(rotation=90)
plt.show()

In [None]:
# Univariate feature scoring with SelectKBest / f_regression
from sklearn.feature_selection import SelectKBest, f_regression

# Modelling frame: 2 lags of ts1 plus calendar columns
rt_transformed = prepare_time_series_data(rt_d, 'ts1', 2)

# Target (Y) is ts1; every other column is a candidate feature (X)
Y = rt_transformed['ts1']
X = rt_transformed.drop(columns='ts1')

# k='all' keeps every feature; only the scores are of interest here
bestfeatures = SelectKBest(score_func=f_regression, k='all')
fit = bestfeatures.fit(X, Y)

# Pair each feature name with its score
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
feature_scores = pd.concat([dfcolumns, dfscores], axis=1).set_axis(['Feature', 'Score'], axis=1)

# Highest-scoring features first
sorted_feature_scores = feature_scores.sort_values('Score', ascending=False)

print(sorted_feature_scores)

Univariate ML
-   Grid search CV
-   Report the best model's mean and std of the CV error
-   Train on all training data
-   Predict on the train set
-   Plot residuals + squared residuals
-   Predict on the test set
-   Plot residuals + squared residuals

Functions

In [None]:
def evaluate(y_true, y_pred):
    """Return a dict with the root mean squared error under the key ``'rmse'``."""
    return {'rmse': np.sqrt(mean_squared_error(y_true, y_pred))}

def perform_grid_search(model, param_grid, X_train, Y_train, cv):
    """Grid-search ``model`` and report the CV error of the winning candidate.

    The search is scored with ``neg_mean_squared_error``. The returned error
    statistics describe the **best** parameter combination only: its per-fold
    MSE values are converted to RMSE, and the mean and standard deviation of
    those fold RMSEs are returned. (Averaging ``mean_test_score`` over the
    whole grid would mix in every losing candidate, and the square root of a
    standard deviation of MSEs is not an RMSE spread.)

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible estimator.
    param_grid : dict
        Grid passed to ``GridSearchCV``.
    X_train, Y_train
        Training features and target.
    cv
        Cross-validation splitter (or anything ``GridSearchCV`` accepts as ``cv``).

    Returns
    -------
    best_model : estimator
        ``best_estimator_`` of the search (refit on all of ``X_train``).
    rmse_mean : float
        Mean of the best candidate's per-fold RMSE.
    rmse_std : float
        Standard deviation of the best candidate's per-fold RMSE.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        scoring='neg_mean_squared_error',
        verbose=3,
        n_jobs=-1,
    )
    grid_search.fit(X_train, Y_train)
    best_model = grid_search.best_estimator_
    cv_results = grid_search.cv_results_

    # Per-fold scores of the winning candidate; scores are negated MSE.
    best_index = grid_search.best_index_
    fold_mse = -np.array([
        cv_results[f'split{fold}_test_score'][best_index]
        for fold in range(grid_search.n_splits_)
    ])
    fold_rmse = np.sqrt(fold_mse)

    rmse_mean = fold_rmse.mean()
    rmse_std = fold_rmse.std()

    return best_model, rmse_mean, rmse_std

def train_best_model(best_model, X_train, Y_train):
    """Fit ``best_model`` on the given training data and return that same object."""
    fitted_model = best_model
    fitted_model.fit(X_train, Y_train)
    return fitted_model

def predict(model, X):
    """Return ``model.predict(X)``; thin wrapper kept for symmetry with the other pipeline steps."""
    predictions = model.predict(X)
    return predictions

def reshape_and_unscale_predictions(preds, actuals, scaler_Y):
    """Inverse-transform predictions and actuals and return them as aligned Series.

    Parameters
    ----------
    preds : numpy.ndarray
        Model predictions on the scaled target.
    actuals : pandas.Series
        Scaled target values; its index is reused for both returned Series.
    scaler_Y
        Fitted scaler exposing ``inverse_transform``.

    Returns
    -------
    (pandas.Series, pandas.Series)
        Unscaled predictions and unscaled actuals, both indexed like ``actuals``.
    """
    # Both inputs are reshaped to a single column before the inverse transform.
    unscaled_preds, unscaled_actuals = unscale_data(
        preds.reshape(-1, 1),
        actuals.values.reshape(-1, 1),
        scaler_Y,
    )

    # Back to 1-D and onto the original index.
    shared_index = actuals.index
    return (
        pd.Series(unscaled_preds.squeeze(), index=shared_index),
        pd.Series(unscaled_actuals.squeeze(), index=shared_index),
    )

    
def create_directory_if_not_exists(directory):
    """Create ``directory`` (including missing parents); do nothing if it already exists."""
    os.makedirs(name=directory, exist_ok=True)


In [None]:
def save_plots_for_experiment(experiment, hypothesis, model_name, train_or_test, plot_type):
    """Return the PNG path for a plot, creating its directory on the way.

    The directory is ``<experiment>/<hypothesis>/<train_or_test>`` and the
    file name is ``<model_name>_<plot_type>.png``. Nothing is written here;
    only the directory is created.
    """
    target_dir = os.path.join(experiment, hypothesis, train_or_test)
    create_directory_if_not_exists(target_dir)

    file_name = f"{model_name}_{plot_type}.png"
    return os.path.join(target_dir, file_name)

def _report_split_diagnostics(preds, actuals, scaler_Y, experiment, hypothesis, model_name, split, series_title, show_plot):
    """Unscale one split's predictions, then plot/save the fit and run the residual diagnostics.

    ``split`` is ``"train"`` or ``"test"``; it selects the output sub-directory
    and appears in the printed header.
    """
    def path_for(plot_type):
        return save_plots_for_experiment(experiment, hypothesis, model_name, split, plot_type)

    # Unscale and plot predictions vs actuals
    unscaled_preds, unscaled_actuals = reshape_and_unscale_predictions(preds, actuals, scaler_Y)
    plot_series(
        unscaled_actuals, unscaled_preds,
        title=series_title,
        save_path=path_for("predictions_vs_actuals"),
        show_plot=show_plot,
    )

    # Residual analysis on the unscaled values
    residuals = unscaled_actuals - unscaled_preds
    print(f"Residual analysis for {split} set:")
    plot_correlogram(residuals, save_path=path_for("correlogram"), show_plot=show_plot)
    residual_analysis(residuals, unscaled_preds, save_path=path_for("homoskedasticity_plot"))

    # Same diagnostics on the squared residuals
    print("Residuals quared\n")
    plot_correlogram(
        residuals.pow(2),
        title='Residuals Squared',
        save_path=path_for("correlogram_squared"),
        show_plot=False,
    )
    residual_analysis(residuals.pow(2), unscaled_preds.pow(2), save_path=None)


def model_pipeline(model, param_grid, X_train, Y_train, X_test, Y_test, tscv, scaler_Y, hypothesis, model_name, experiment='Experiment1', show_plot=False):
    """Tune, refit and diagnose one model on the train and test splits.

    Steps: grid search with ``tscv``; refit the best estimator on all training
    data; predict on train and test; for each split, plot predictions against
    actuals (unscaled with ``scaler_Y``) and run the residual diagnostics on
    the residuals and on the squared residuals. Plots are saved under
    ``<experiment>/<hypothesis>/<train|test>/``.

    Parameters
    ----------
    model, param_grid
        Estimator and grid handed to ``perform_grid_search``.
    X_train, Y_train, X_test, Y_test
        Scaled features / target; the targets must be pandas Series.
    tscv
        Cross-validation splitter used by the grid search.
    scaler_Y
        Fitted target scaler used to unscale predictions and actuals.
    hypothesis, model_name : str
        Used to build output paths.
    experiment : str, default 'Experiment1'
        Top-level output directory. It has a default so that calls passing
        only ``hypothesis`` and ``model_name`` keep working.
    show_plot : bool, default False
        Forwarded to the plotting helpers.

    Returns
    -------
    (best_model_trained, rmean_error_train, rmse_test)
        The refit estimator, the cross-validation RMSE reported by
        ``perform_grid_search`` (on the scaled target), and the RMSE on the
        test split (also on the scaled target).
    """
    # Perform grid search
    best_model, rmean_error_train, std_error = perform_grid_search(model, param_grid, X_train, Y_train, tscv)

    print("Best model:", best_model)
    print("Mean error:", rmean_error_train)
    print("Standard error:", std_error)

    # Train best model on all training data
    best_model_trained = train_best_model(best_model, X_train, Y_train)

    # Train split: predict, plot, diagnose
    train_preds = predict(best_model_trained, X_train)
    _report_split_diagnostics(
        train_preds, Y_train, scaler_Y, experiment, hypothesis, model_name,
        split="train",
        series_title="Unscaled Predictions vs Actual Training Data",
        show_plot=show_plot,
    )

    # Test split: predict, score on the scaled target, plot, diagnose
    test_preds = predict(best_model_trained, X_test)
    rmse_test = np.sqrt(mean_squared_error(Y_test, test_preds))
    print("RMSE:", rmse_test)
    _report_split_diagnostics(
        test_preds, Y_test, scaler_Y, experiment, hypothesis, model_name,
        split="test",
        series_title="Unscaled Predictions vs Actual Test Data",
        show_plot=show_plot,
    )

    return best_model_trained, rmean_error_train, rmse_test

In [None]:
def plot_series(series1, series2, label1="Actual", label2="Predicted", title="Unscaled Predictions vs Actual Data", save_path=None, show_plot=False):
    """Draw two series on one axes and optionally save the figure.

    ``series1`` is drawn as a solid line and ``series2`` as a dashed line,
    each against its own index. The figure is written to ``save_path`` when
    one is given. ``show_plot`` is accepted but not used by this function.
    """
    fig, ax = plt.subplots()

    ax.plot(series1.index, series1, label=label1)
    ax.plot(series2.index, series2, label=label2, linestyle="--")
    ax.legend()

    ax.set_xlabel("Date")
    ax.set_ylabel("Value")
    ax.set_title(title)

    if save_path:
        fig.savefig(save_path)

def plot_correlogram(x, lags=None, title=None, save_path=None, show_plot=False):
    """Draw a 2x2 diagnostic figure for a series and optionally save it.

    Panels: the series with its 21-period rolling mean (annotated with the
    largest Ljung-Box p-value over lags ``1..lags`` and the ADF p-value), a
    normal probability plot (annotated with the first four moments), the ACF
    and the PACF.

    Parameters
    ----------
    x : pandas.Series
        Series to analyse (e.g. residuals).
    lags : int, optional
        Number of lags; defaults to ``min(10, len(x) // 5)``.
    title : str, optional
        Figure super-title.
    save_path : str, optional
        File to save the figure to.
    show_plot : bool
        Accepted but not used by this function.
    """
    lags = min(10, int(len(x) / 5)) if lags is None else lags
    x = x + np.random.normal(0, 1e-10, len(x))  # Add noise to avoid non-invertibility
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 8))

    # Series plot with rolling mean
    x.plot(ax=axes[0][0], title='Residuals')
    x.rolling(21).mean().plot(ax=axes[0][0], c='k', lw=1)
    # q_stat expects the autocorrelations for lags 1..k, while acf() returns
    # lag 0 (always 1.0) first; drop it so the statistic is not inflated.
    # The value shown under the 'Q-Stat' label is the largest p-value over the lags.
    q_p = np.max(q_stat(acf(x, nlags=lags)[1:], len(x))[1])
    stats = f'Q-Stat: {q_p:>8.2f}\nADF: {adfuller(x)[1]:>11.2f}'
    axes[0][0].text(x=.02, y=.85, s=stats, transform=axes[0][0].transAxes)

    # Probability plot
    probplot(x, plot=axes[0][1])
    mean, var, skew, kurtosis = moment(x, moment=[1, 2, 3, 4])
    s = f'Mean: {mean:>12.2f}\nSD: {np.sqrt(var):>16.2f}\nSkew: {skew:12.2f}\nKurtosis:{kurtosis:9.2f}'
    axes[0][1].text(x=.02, y=.75, s=s, transform=axes[0][1].transAxes)

    # ACF and PACF plots
    plot_acf(x=x, lags=lags, zero=False, ax=axes[1][0])
    plot_pacf(x, lags=lags, zero=False, ax=axes[1][1])
    axes[1][0].set_xlabel('Lag')
    axes[1][1].set_xlabel('Lag')

    fig.suptitle(title, fontsize=14)
    sns.despine()
    fig.tight_layout()
    fig.subplots_adjust(top=.9)

    if save_path:
        fig.savefig(save_path)
    
    
def residual_analysis(residuals, y_pred, save_path=None):
    """Scatter residuals against predictions and print Ljung-Box and Jarque-Bera results.

    The scatter plot is drawn on its own new figure, so it is not painted onto
    (and ``save_path`` does not capture) whatever figure happened to be
    current, such as a correlogram drawn just before this call.

    Parameters
    ----------
    residuals : array-like
        Residuals to analyse.
    y_pred : array-like
        Predicted values, used as the x-axis of the scatter plot.
    save_path : str, optional
        File to save the scatter plot to.
    """
    # Check for homoscedasticity
    print("Homoscedasticity scatter plot:")
    fig, ax = plt.subplots()
    ax.scatter(y_pred, residuals)
    ax.set_xlabel("Predicted values")
    ax.set_ylabel("Residuals")
    ax.axhline(y=0, color="r", linestyle="--")
    if save_path:
        fig.savefig(save_path)

    # Perform Ljung-Box test for autocorrelation in residuals
    lb_test = acorr_ljungbox(residuals, lags=[10], return_df=True)
    print("Ljung-Box test for autocorrelation in residuals:")
    print(lb_test)
    # A single lag was requested, so the result has one row; read it positionally.
    p_value = lb_test['lb_pvalue'].iloc[0]
    if p_value < 0.05:
        print("The Ljung-Box test suggests that there is autocorrelation in the residuals (p-value < 0.05).")
    else:
        print("The Ljung-Box test suggests that there is no significant autocorrelation in the residuals (p-value >= 0.05).")

    # Perform Jarque-Bera test for normality in residuals
    jb_test = jarque_bera(residuals)
    print("\nJarque-Bera test for normality in residuals:")
    print(f"Test statistic: {jb_test[0]}, p-value: {jb_test[1]}")
    if jb_test[1] < 0.05:
        print("The Jarque-Bera test suggests that the residuals are not normally distributed (p-value < 0.05).")
    else:
        print("The Jarque-Bera test suggests that the residuals are normally distributed (p-value >= 0.05).")
        
    # # Calculate the residuals
    # residuals = y_train - y_pred

    # # Plot the residuals
    # fig, ax = plt.subplots(figsize=(12, 6))
    # ax.plot(residuals)
    # ax.axhline(y=0, color='b', linestyle='-')
    # ax.set_xlabel('Time')
    # ax.set_ylabel('Residuals')
    # ax.set_title('Residual Analysis')
    # plt.show()
    # print('Residual Analysis:')
    # print('Mean of residuals:', round(residuals.mean(), 4))
    # print('Standard deviation of residuals:', round(residuals.std(), 4))

    # # Check for autocorrelation
    # print("Autocorrelation plot:")
    # plot_acf(residuals)
    # plt.show()

    # # Check for normality
    # print("Normality Q-Q plot:")
    # probplot(residuals, dist="norm", plot=plt)
    # plt.show()        
            
    # # Perform Breusch-Godfrey test for autocorrelation in residuals
    # bg_test = acorr_breusch_godfrey(model, X_train, nlags=10)
    # print("\nBreusch-Godfrey test for autocorrelation in residuals:")
    # if bg_test[1] < 0.05:
    #     print(f"LM test statistic: {bg_test[0]}, p-value: {bg_test[1]}")
    #     print("The Breusch-Godfrey test suggests that there is autocorrelation in the residuals (p-value < 0.05).")
    # else:
    #     print(f"LM test statistic: {bg_test[0]}, p-value: {bg_test[1]}")
    #     print("The Breusch-Godfrey test suggests that there is no significant autocorrelation in the residuals (p-value >= 0.05).")

    # # Perform White test for heteroscedasticity in residuals
    # white_test = het_white(residuals, X_train)
    # print("\nWhite test for heteroscedasticity in residuals:")
    # if white_test[1] < 0.05:
    #     print(f"Test statistic: {white_test[0]}, p-value: {white_test[1]}")
    #     print("The White test suggests that there is heteroscedasticity in the residuals (p-value < 0.05).")
    # else:
    #     print(f"Test statistic: {white_test[0]}, p-value: {white_test[1]}")
    #     print("The White test suggests that there is no significant heteroscedasticity in the residuals (p-value >= 0.05).")

    # # Perform Shapiro-Wilk test for normality in residuals
    # sw_test = shapiro(residuals)
    # print("\nShapiro-Wilk test for normality in residuals:")
    # if sw_test[1] < 0.05:
    #     print(f"Test statistic: {sw_test[0]}, p-value: {sw_test[1]}")
    #     print("The Shapiro-Wilk test suggests that the residuals are not normally distributed (p-value < 0.05).")
    # else:
    #     print(f"Test statistic: {sw_test[0]}, p-value: {sw_test[1]}")
    #     print("The Shapiro-Wilk test suggests that the residuals are normally distributed (p-value >= 0.05).")
    
    
def plot_rmse_comparison(results, hypothesis, experiment):
    """Draw and save a grouped bar chart of each model's 'train_rmse' and 'test_rmse'.

    ``results`` maps a model name to a dict holding ``'train_rmse'`` and
    ``'test_rmse'``. The figure is saved to the path returned by
    ``save_plots_for_experiment(experiment, hypothesis, "Results", "", "Model_Comparison")``.
    """
    fig = plt.figure()

    model_names = list(results)
    train_rmse_values = [entry['train_rmse'] for entry in results.values()]
    test_rmse_values = [entry['test_rmse'] for entry in results.values()]

    ind = np.arange(len(model_names))  # x position of each model's bar group
    width = 0.35  # width of a single bar

    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    ax.bar(ind - width/2, train_rmse_values, width=width, label='Train Error')
    ax.bar(ind + width/2, test_rmse_values, width=width, label='Test Error')
    fig.set_size_inches(15, 8)
    ax.legend()
    ax.set_xticks(ind)
    ax.set_xticklabels(model_names)

    # Save the plot
    save_path = save_plots_for_experiment(experiment, hypothesis, "Results", "", "Model_Comparison")
    if save_path:
        fig.savefig(save_path)

In [None]:
# Models to compare, each with the hyper-parameter grid searched for it.
# Every entry provides: 'model' (unfitted estimator), 'param_grid' (GridSearchCV grid)
# and 'model_name' (label used in printed output and output file names).
models_and_param_grids = [
    {
        'model': DecisionTreeRegressor(random_state=42),
        'param_grid': {
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            # 1.0 = consider all features; replaces 'auto', which recent
            # scikit-learn releases no longer accept for regressors.
            'max_features': [1.0],
            'min_samples_leaf': [1, 3, 5, 10]
        },
        'model_name': 'CART',
    },
    # {
    #     'model': MLPRegressor(random_state=42),
    #     'param_grid': {
    #         'hidden_layer_sizes': [(50,), (100,), (50, 50),(100, 100), (50, 50, 50), (100, 100, 100), (50, 50, 50, 50), (100, 100, 100, 100)],
    #         'activation': ['tanh', 'relu', 'logistic'],
    #         'solver': ['sgd'],
    #         'alpha': [0.00005, 0.0005, 0.005],
    #         'early_stopping': [True],
    #         'max_iter': [600],
    #         'shuffle': [False],
    #     },
    #     'model_name': 'MLP',
    # },
    {
        'model': KNeighborsRegressor(),
        'param_grid': {
            'n_neighbors': [3, 5, 7, 9, 11],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto'],
            'leaf_size': [10, 30, 50],
        },
        'model_name': 'KNN',
    },
    # {
    #     'model': GaussianProcessRegressor(random_state=42),
    #     'param_grid': {
    #         'kernel': [RBF(), DotProduct()+ WhiteKernel()],
    #         'alpha': [1e-10, 1e-5, 1e-2, 1],
    #         'n_restarts_optimizer': [0, 1, 3],
    #     },
    #     'model_name': 'GPR',
    # },
    # {
    #     'model': lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=-1, random_state=42),
    #     'param_grid': {
    #         'lgbmregressor__n_estimators': [100],
    #         'lgbmregressor__learning_rate': [0.01],
    #         'lgbmregressor__max_depth': [5, 10, 20],
    #         'lgbmregressor__num_leaves': [31, 50],
    #     },
    #     'model_name': 'GBR',
    # },
    {
        'model': RandomForestRegressor(random_state=42),
        'param_grid': {
            'n_estimators': [10, 50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            # 1.0 = consider all features (see the CART entry).
            'max_features': [1.0],
        },
        'model_name': 'RF',
    },
]



In [None]:
# Cross-validation splitter shared by the hypothesis cells: 30 splits, test_size of 14 rows each.
tscv = TimeSeriesSplit(n_splits=30, test_size=14)
# Number of lags, passed as max_lag to prepare_time_series_data in the hypothesis cells.
lags = 3

# H1

In [None]:
# 1. Chronological train/test split (no shuffling)
train_data, test_data = train_test_split(rt_d, test_size=0.2, shuffle=False)
# 2. Supervised frames for H1: lags of ts1 (plus calendar columns) predict ts1; ts2 is not used
train_transformed = prepare_time_series_data(train_data, 'ts1', lags).drop(columns=['ts2'])
test_transformed = prepare_time_series_data(test_data, 'ts1', lags).drop(columns=['ts2'])
# 3. Scale both frames with scalers fitted on the training frame
train_data_scaled, test_data_scaled, scaler_X, scaler_Y = scale_data(train_transformed, test_transformed, 'ts1')
# 4. Wrap the scaled arrays back into DataFrames and split off the target
train_data_df = pd.DataFrame(train_data_scaled, index=train_transformed.index, columns=train_transformed.columns)
test_data_df = pd.DataFrame(test_data_scaled, index=test_transformed.index, columns=test_transformed.columns)
X_train, Y_train = extract_column(train_data_df, 'ts1')
X_test, Y_test = extract_column(test_data_df, 'ts1')

hypothesis = 'H1'
experiment = 'Experiment1'

# 5. Run every configured model through the pipeline and collect its scores
results = {}

for model_info in models_and_param_grids:
    model, param_grid, model_name = model_info['model'], model_info['param_grid'], model_info['model_name']
    print(f"\nTraining {model_name} model...")

    best_model_trained, train_rmse, test_rmse = model_pipeline(
        model, param_grid, X_train, Y_train, X_test, Y_test,
        tscv, scaler_Y, hypothesis, model_name, experiment=experiment,
    )

    results[model_name] = dict(model=best_model_trained, train_rmse=train_rmse, test_rmse=test_rmse)

# 6. Summary table and comparison chart
print("\nResults:")
for model_name, model_results in results.items():
    print(
        f"{model_name}: Train RMSE: {model_results['train_rmse']:.4f}, Test RMSE: {model_results['test_rmse']:.4f}"
    )

plot_rmse_comparison(results, hypothesis, experiment=experiment)

# H2

In [None]:
# 1. Chronological train/test split (no shuffling)
train_data, test_data = train_test_split(rt_d, test_size=0.2, shuffle=False)
# 2. Supervised frames for H2: lags of ts2 (plus calendar columns) predict ts2; ts1 is not used
train_transformed = prepare_time_series_data(train_data, 'ts2', lags).drop(columns=['ts1'])
test_transformed = prepare_time_series_data(test_data, 'ts2', lags).drop(columns=['ts1'])
# 3. Scale both frames with scalers fitted on the training frame
train_data_scaled, test_data_scaled, scaler_X, scaler_Y = scale_data(train_transformed, test_transformed, 'ts2')
# 4. Wrap the scaled arrays back into DataFrames and split off the target
train_data_df = pd.DataFrame(train_data_scaled, index=train_transformed.index, columns=train_transformed.columns)
test_data_df = pd.DataFrame(test_data_scaled, index=test_transformed.index, columns=test_transformed.columns)
X_train, Y_train = extract_column(train_data_df, 'ts2')
X_test, Y_test = extract_column(test_data_df, 'ts2')


hypothesis = 'H2'
experiment = 'Experiment1'

# 5. Run every configured model through the pipeline and collect its scores
results = {}

for model_info in models_and_param_grids:
    model, param_grid, model_name = model_info['model'], model_info['param_grid'], model_info['model_name']
    print(f"\nTraining {model_name} model...")

    best_model_trained, train_rmse, test_rmse = model_pipeline(
        model, param_grid, X_train, Y_train, X_test, Y_test,
        tscv, scaler_Y, hypothesis, model_name, experiment=experiment,
    )

    results[model_name] = dict(model=best_model_trained, train_rmse=train_rmse, test_rmse=test_rmse)

# 6. Summary table and comparison chart
print("\nResults:")
for model_name, model_results in results.items():
    print(
        f"{model_name}: Train RMSE: {model_results['train_rmse']:.4f}, Test RMSE: {model_results['test_rmse']:.4f}"
    )

plot_rmse_comparison(results, hypothesis, experiment=experiment)

# H3

In [None]:
# 1. Chronological train/test split (no shuffling)
train_data, test_data = train_test_split(rt_d, test_size=0.2, shuffle=False)
# 2. Supervised frames for H3: lags of ts1 (plus calendar columns) predict ts2;
#    the contemporaneous ts1 column is dropped
train_transformed = prepare_time_series_data(train_data, 'ts1', lags).drop(columns=['ts1'])
test_transformed = prepare_time_series_data(test_data, 'ts1', lags).drop(columns=['ts1'])
# 3. Scale both frames with scalers fitted on the training frame
train_data_scaled, test_data_scaled, scaler_X, scaler_Y = scale_data(train_transformed, test_transformed, 'ts2')
# 4. Wrap the scaled arrays back into DataFrames and split off the target
train_data_df = pd.DataFrame(train_data_scaled, index=train_transformed.index, columns=train_transformed.columns)
test_data_df = pd.DataFrame(test_data_scaled, index=test_transformed.index, columns=test_transformed.columns)
X_train, Y_train = extract_column(train_data_df, 'ts2')
X_test, Y_test = extract_column(test_data_df, 'ts2')


hypothesis = 'H3'
experiment = 'Experiment1'

# 5. Run every configured model through the pipeline and collect its scores
results = {}

for model_info in models_and_param_grids:
    model, param_grid, model_name = model_info['model'], model_info['param_grid'], model_info['model_name']
    print(f"\nTraining {model_name} model...")

    best_model_trained, train_rmse, test_rmse = model_pipeline(
        model, param_grid, X_train, Y_train, X_test, Y_test,
        tscv, scaler_Y, hypothesis, model_name, experiment=experiment,
    )

    results[model_name] = dict(model=best_model_trained, train_rmse=train_rmse, test_rmse=test_rmse)

# 6. Summary table and comparison chart
print("\nResults:")
for model_name, model_results in results.items():
    print(
        f"{model_name}: Train RMSE: {model_results['train_rmse']:.4f}, Test RMSE: {model_results['test_rmse']:.4f}"
    )

plot_rmse_comparison(results, hypothesis, experiment=experiment)

# H4

In [None]:
# 1. Chronological train/test split (no shuffling)
train_data, test_data = train_test_split(rt_d, test_size=0.2, shuffle=False)
# 2. Supervised frames for H4: lags of ts2 (plus calendar columns) predict ts1;
#    the contemporaneous ts2 column is dropped
train_transformed = prepare_time_series_data(train_data, 'ts2', lags).drop(columns=['ts2'])
test_transformed = prepare_time_series_data(test_data, 'ts2', lags).drop(columns=['ts2'])
# 3. Scale both frames with scalers fitted on the training frame
train_data_scaled, test_data_scaled, scaler_X, scaler_Y = scale_data(train_transformed, test_transformed, 'ts1')
# 4. Wrap the scaled arrays back into DataFrames and split off the target
train_data_df = pd.DataFrame(train_data_scaled, index=train_transformed.index, columns=train_transformed.columns)
test_data_df = pd.DataFrame(test_data_scaled, index=test_transformed.index, columns=test_transformed.columns)
X_train, Y_train = extract_column(train_data_df, 'ts1')
X_test, Y_test = extract_column(test_data_df, 'ts1')


hypothesis = 'H4'
experiment = 'Experiment1'

# 5. Run every configured model through the pipeline and collect its scores
results = {}

for model_info in models_and_param_grids:
    model, param_grid, model_name = model_info['model'], model_info['param_grid'], model_info['model_name']
    print(f"\nTraining {model_name} model...")

    best_model_trained, train_rmse, test_rmse = model_pipeline(
        model, param_grid, X_train, Y_train, X_test, Y_test,
        tscv, scaler_Y, hypothesis, model_name, experiment=experiment,
    )

    results[model_name] = dict(model=best_model_trained, train_rmse=train_rmse, test_rmse=test_rmse)

# 6. Summary table and comparison chart
print("\nResults:")
for model_name, model_results in results.items():
    print(
        f"{model_name}: Train RMSE: {model_results['train_rmse']:.4f}, Test RMSE: {model_results['test_rmse']:.4f}"
    )

plot_rmse_comparison(results, hypothesis, experiment=experiment)

In [None]:


# rt_transformed = prepare_time_series_data(rt_d, 'ts1', 2)
# rt_transformed = rt_transformed.drop(['ts2'], axis=1)
# X, Y = extract_column(rt_transformed, 'ts1')
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=False)
# train_data = pd.concat([X_train, Y_train], axis=1)
# test_data = pd.concat([X_test, Y_test], axis=1)
# train_data_scaled, test_data_scaled = scale_data(train_data, test_data)
# train_data_scaled


RF

In [None]:
tscv = TimeSeriesSplit(n_splits=30, test_size=14)
# Find the best random forest.
# max_features=1.0 considers all features; it replaces 'auto', which recent
# scikit-learn releases no longer accept for regressors.
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'max_features': [1.0],
}
model = RandomForestRegressor()
# model_pipeline returns (fitted model, CV RMSE, test RMSE); `experiment` is passed
# explicitly because the pipeline needs it to build the plot output paths.
best_model_trained, cv_rmse_rf, test_rmse_rf = model_pipeline(
    model, param_grid, X_train, Y_train, X_test, Y_test, tscv, scaler_Y,
    'H1', 'RF', experiment='Experiment1',
)


KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor
# Note: this rebinds the notebook-level `tscv` to a 40-split splitter.
tscv = TimeSeriesSplit(n_splits=40, test_size=14)
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 30, 50],
}
model_knn = KNeighborsRegressor()
# model_pipeline returns (fitted model, CV RMSE, test RMSE); `experiment` is passed
# explicitly because the pipeline needs it to build the plot output paths.
best_model_trained_knn, cv_rmse_knn, test_rmse_knn = model_pipeline(
    model_knn, param_grid_knn, X_train, Y_train, X_test, Y_test, tscv, scaler_Y,
    'H1', 'KNN', experiment='Experiment1',
)


CART

In [None]:
cart_param_grid = {
    'max_depth': [None, 3, 5, 7, 9, 11],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 3, 5, 10]
}

model_cart = DecisionTreeRegressor()
# model_pipeline returns (fitted model, CV RMSE, test RMSE); `experiment` is passed
# explicitly because the pipeline needs it to build the plot output paths.
best_model_trained_cart, cv_rmse_cart, test_rmse_cart = model_pipeline(
    model_cart, cart_param_grid, X_train, Y_train, X_test, Y_test, tscv, scaler_Y,
    'H1', 'CART', experiment='Experiment1',
)

SVR

In [None]:
# from sklearn.svm import SVR

# param_grid_svr = {
#     'kernel': ['poly', 'rbf', 'sigmoid'],
#     'C': [0.1, 1, 10],
#     'epsilon': [0.01, 0.1, 1,10],
#     'gamma': [0.001, 0.01, 0.1, 1, 2, 5],
# }
# model_svr = SVR()
# best_model_trained_svr = model_pipeline(model_svr, param_grid_svr, X_train, Y_train, X_test, Y_test, tscv, scaler_Y, 'H1', 'SVR')


LightGBM

In [None]:

model_GBR = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=-1, random_state=42)
# The estimator is searched directly (not inside a Pipeline), so the grid keys are
# the estimator's own parameter names without a 'lgbmregressor__' step prefix.
param_grid_GBM = {
    'n_estimators': [100],
    'learning_rate': [0.01],
    'max_depth': [5, 10, 20],
    'num_leaves': [31, 50],
}

# model_pipeline returns (fitted model, CV RMSE, test RMSE); `experiment` is passed
# explicitly because the pipeline needs it to build the plot output paths.
best_model_trained_BGM, cv_rmse_gbm, test_rmse_gbm = model_pipeline(
    model_GBR, param_grid_GBM, X_train, Y_train, X_test, Y_test, tscv, scaler_Y,
    'H1', 'GBR', experiment='Experiment1',
)

GPR

In [None]:

param_grid_gpr = {
    'kernel': [DotProduct() + WhiteKernel()],
    'alpha': [1e-10, 1e-5, 1e-2, 1],
    'n_restarts_optimizer': [0, 1, 3],
}
model_gpr = GaussianProcessRegressor()
# model_pipeline returns (fitted model, CV RMSE, test RMSE); `experiment` is passed
# explicitly because the pipeline needs it to build the plot output paths.
best_model_trained_gpr, cv_rmse_gpr, test_rmse_gpr = model_pipeline(
    model_gpr, param_grid_gpr, X_train, Y_train, X_test, Y_test, tscv, scaler_Y,
    'H1', 'GPR', experiment='Experiment1',
)


In [None]:

param_grid_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100), (50, 50, 50), (100, 100, 100), (50, 50, 50, 50), (100, 100, 100, 100)],
    'activation': ['tanh', 'relu', 'logistic'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'early_stopping': [True],
    'max_iter': [600],
    'shuffle': [False],
}
model_mlp = MLPRegressor()
# model_pipeline returns (fitted model, CV RMSE, test RMSE); `experiment` is passed
# explicitly because the pipeline needs it to build the plot output paths.
best_model_trained_mlp, cv_rmse_mlp, test_rmse_mlp = model_pipeline(
    model_mlp, param_grid_mlp, X_train, Y_train, X_test, Y_test, tscv, scaler_Y,
    'H1_test', 'MLP', experiment='Experiment1',
)


In [None]:
# 1. Chronological train/test split (no shuffling)
train_data, test_data = train_test_split(rt_d, test_size=0.2, shuffle=False)
# 2. Scale the raw frames with scalers fitted on the training frame.
#    scale_data requires the target column name; ts1 is the target in this cell.
train_data_scaled, test_data_scaled, scaler_X, scaler_Y = scale_data(train_data, test_data, 'ts1')
# 3. Wrap the scaled arrays back into DataFrames
train_data_df = pd.DataFrame(train_data_scaled, index=train_data.index, columns=train_data.columns)
test_data_df = pd.DataFrame(test_data_scaled, index=test_data.index, columns=test_data.columns)
# 4. Supervised frames for H1: 2 lags of (scaled) ts1 plus calendar columns predict ts1
train_transformed = prepare_time_series_data(train_data_df, 'ts1', 2)
test_transformed = prepare_time_series_data(test_data_df, 'ts1', 2)
train_transformed = train_transformed.drop(['ts2'], axis=1)
test_transformed = test_transformed.drop(['ts2'], axis=1)
X_train, Y_train = extract_column(train_transformed, 'ts1')
X_test, Y_test = extract_column(test_transformed, 'ts1')

tscv = TimeSeriesSplit(n_splits=30, test_size=14)

results = {}

for model_info in models_and_param_grids:
    model = model_info['model']
    param_grid = model_info['param_grid']
    model_name = model_info['model_name']
    print(f"\nTraining {model_name} model...")

    # `experiment` is passed explicitly because the pipeline needs it to build
    # the plot output paths.
    best_model_trained, train_rmse, test_rmse = model_pipeline(
        model, param_grid, X_train, Y_train, X_test, Y_test, tscv, scaler_Y, 'H1', model_name,
        experiment='Experiment1',
    )

    results[model_name] = {
        'model': best_model_trained,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
    }

print("\nResults:")
for model_name, model_results in results.items():
    print(
        f"{model_name}: Train RMSE: {model_results['train_rmse']:.4f}, Test RMSE: {model_results['test_rmse']:.4f}"
    )

In [None]:
# NOTE(review): `lags` is not read by any active code in the surrounding cells;
# presumably the lag count for feature construction — confirm or remove.
lags = 3

In [None]:
# Grouped bar chart: train vs. test RMSE for every model stored in `results`.
model_names = list(results.keys())
train_rmse_values = [entry['train_rmse'] for entry in results.values()]
test_rmse_values = [entry['test_rmse'] for entry in results.values()]

ind = np.arange(len(model_names))  # one x position per model
width = 0.35  # width of each bar

fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)

# Train bars sit half a bar-width left of the tick, test bars half a bar-width right.
ax.bar(ind - width/2, train_rmse_values, width=width, label='Train Error')
ax.bar(ind + width/2, test_rmse_values, width=width, label='Test Error')

fig.set_size_inches(15, 8)
ax.legend()
ax.set_xticks(ind)
ax.set_xticklabels(model_names)
plt.show()

In [None]:
# from pmdarima.model_selection import RollingForecastCV
# rfcv = RollingForecastCV(
#     h=30,  # forecast horizon
#     step=30,  # step size
# )
# i=0
# for train_index, test_index in rfcv.split(X_train):
#     i+=1
#     print(f"fold {i}\n")
#     print("TRAIN:", train_index, "TEST:", test_index)
    

In [None]:
def perform_grid_search(model, param_grid, X_train, Y_train, cv):
    """Run an exhaustive grid search and report the winning candidate's CV error.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor.
    param_grid : dict or list of dict
        Search space handed to ``GridSearchCV``.
    X_train, Y_train : array-like
        Training features and targets.
    cv : int or cross-validation splitter
        Splitting strategy (e.g. a ``TimeSeriesSplit``).

    Returns
    -------
    best_model : estimator
        ``best_estimator_`` refitted on all of ``X_train`` / ``Y_train``.
    mean_score : float
        Mean cross-validated MSE of the best candidate (sign flipped so that
        it is a positive error).
    std_score : float
        Standard deviation of that candidate's MSE across the CV folds.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        scoring='neg_mean_squared_error',
        verbose=2,
        n_jobs=-1,
    )
    grid_search.fit(X_train, Y_train)

    cv_results = grid_search.cv_results_
    # Report the statistics of the selected candidate only; averaging
    # `mean_test_score` over the whole grid mixes in every rejected setting.
    best_idx = grid_search.best_index_
    mean_score = -cv_results['mean_test_score'][best_idx]
    std_score = cv_results['std_test_score'][best_idx]

    return grid_search.best_estimator_, mean_score, std_score

In [None]:
# Tune only the number of trees of a random forest with time-ordered CV
# (30 folds, 14 validation rows each) and print what perform_grid_search returns.
model = RandomForestRegressor()
param_grid = {'n_estimators': [10, 50, 100, 200]}
tscv = TimeSeriesSplit(n_splits=30, test_size=14)
best_model, mean_error, std_error = perform_grid_search(model, param_grid, X_train, Y_train, tscv)
print("Best model:", best_model)
print("Mean error:", mean_error)
# The value printed under "Standard error" is the third item returned by
# perform_grid_search (a standard deviation of CV scores).
print("Standard error:", std_error)

In [None]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

# 30 time-ordered folds, each validated on 14 rows.
tscv = TimeSeriesSplit(n_splits=30, test_size=14)

# Random-forest search space (4 x 4 x 3 = 48 candidates).
# `max_features=1.0` (use every feature) is what the former 'auto' setting meant
# for forest regressors; the string itself is rejected by recent scikit-learn.
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'max_features': [1.0],
}

# Perform GridSearchCV
# NOTE(review): `model`, `X_train` and `Y_train` come from earlier cells.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=tscv, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search.fit(X_train, Y_train)
best_model = grid_search.best_estimator_
cv_results = grid_search.cv_results_

# CV statistics of the selected candidate (not an average over the whole grid).
best_idx = grid_search.best_index_
mean_score = -cv_results['mean_test_score'][best_idx]
std_score = cv_results['std_test_score'][best_idx]

Refactor code in a logical way such that it follows a sequence of functions applied to my already defined training and testing data:
-   Grid search cv
-   best model mean and std
-   train on all training
-   predict on train
-   plot resid + squared resid of unscaled predictions
-   predict on test
-   plot resid + squared resid of unscaled predictions

Write each function's return values so that they feed the next step of the pipeline, as if the functions were called one after another in the main body of the code.

In [None]:
def plot_unscaled_predictions(preds, actuals, scaler_Y, title="Unscaled Predictions vs Actual Data", residual_analysis_func=None):
    """Plot predictions against actuals after unscaling both with ``unscale_data``.

    Parameters
    ----------
    preds : numpy.ndarray
        Model predictions; reshaped to a column before unscaling.
    actuals : pandas.Series
        Observed targets; their index is reused as the x axis of the plot.
    scaler_Y : object
        Scaler forwarded as the third argument of ``unscale_data``.
    title : str
        Plot title.
    residual_analysis_func : callable, optional
        When given, called as ``residual_analysis_func(unscaled_preds, unscaled_actuals)``
        with both arguments as Series.
        NOTE(review): other cells call ``residual_analysis(residuals, preds)`` —
        confirm which argument convention the callback expects.
    """
    # unscale_data is fed column vectors.
    unscaled_preds, unscaled_actuals = unscale_data(
        preds.reshape(-1, 1),
        actuals.values.reshape(-1, 1),
        scaler_Y,
    )

    # Back to Series aligned on the original index.
    date_index = actuals.index
    unscaled_preds_series = pd.Series(unscaled_preds.squeeze(), index=date_index)
    unscaled_actuals_series = pd.Series(unscaled_actuals.squeeze(), index=date_index)

    fig, ax = plt.subplots()
    ax.plot(unscaled_actuals_series.index, unscaled_actuals_series, label="Actual")
    ax.plot(unscaled_preds_series.index, unscaled_preds_series, label="Predicted", linestyle="--")
    ax.legend()
    ax.set_xlabel("Date")
    ax.set_ylabel("Value")
    ax.set_title(title)
    plt.show()

    if residual_analysis_func is None:
        return
    residual_analysis_func(unscaled_preds_series, unscaled_actuals_series)

# In-sample check: predict on the training rows, print the CV score, then plot.
# NOTE(review): `best_model`, `mean_score` and `residual_analysis` come from other cells.
preds = best_model.predict(X_train)
print(mean_score)
plot_unscaled_predictions(preds, Y_train, scaler_Y, title="Unscaled Predictions vs Actual Training Data", residual_analysis_func=residual_analysis)
    

In [None]:
# 30 time-ordered folds, each validated on 14 rows.
tscv = TimeSeriesSplit(n_splits=30, test_size=14)
#find best model
# `max_features=1.0` (all features) replaces the former 'auto', which meant the same
# for forest regressors but is rejected by recent scikit-learn releases.
param_grid = { 'n_estimators': [10, 50, 100, 200], 'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'max_features': [1.0]}
model = RandomForestRegressor()

# NOTE(review): other cells pass two extra label arguments to model_pipeline
# (e.g. 'H1', 'GBR') — confirm this shorter call matches its current signature.
best_model_trained = model_pipeline(model, param_grid, X_train, Y_train, X_test, Y_test, tscv, scaler_Y)


In [None]:
# Step-by-step random-forest pipeline: grid search -> refit -> residual analysis
# on unscaled train predictions -> residual analysis on unscaled test predictions.
tscv = TimeSeriesSplit(n_splits=30, test_size=14)
#find best model
# `max_features=1.0` (all features) replaces the former 'auto', which meant the same
# for forest regressors but is rejected by recent scikit-learn releases.
param_grid = { 'n_estimators': [10, 50, 100, 200], 'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'max_features': [1.0]}
model = RandomForestRegressor()

best_model, mean_error, std_error = perform_grid_search(model, param_grid, X_train, Y_train, tscv)
print("Best model:", best_model)
print("Mean error:", mean_error)
print("Standard error:", std_error)
# Train best model on all training data
best_model_trained = train_best_model(best_model, X_train, Y_train)
# Predict on train
train_preds = predict(best_model_trained, X_train)
# Bring predictions and actuals back to the original scale (train)
unscaled_preds_series, unscaled_actuals_series = reshape_and_unscale_predictions(train_preds, Y_train, scaler_Y)
# Residuals and squared residuals of the unscaled predictions (train)
residuals = unscaled_actuals_series - unscaled_preds_series
residual_analysis(residuals,unscaled_preds_series)
residual_analysis(residuals.pow(2), unscaled_preds_series.pow(2))


# Predict on test
test_preds = predict(best_model_trained, X_test)
# Bring predictions and actuals back to the original scale (test)
unscaled_preds_series, unscaled_actuals_series = reshape_and_unscale_predictions(test_preds, Y_test, scaler_Y)
# Residuals and squared residuals of the unscaled predictions (test)
residuals = unscaled_actuals_series - unscaled_preds_series
residual_analysis(residuals,unscaled_preds_series)
residual_analysis(residuals.pow(2), unscaled_preds_series.pow(2))





In [None]:
# Perform grid search
# Same step-by-step pipeline as a flat script, with a predictions-vs-actuals plot
# before each residual analysis.
# NOTE(review): `model`, `param_grid` and `tscv` are whatever earlier cells left in
# scope; `predict`, `reshape_and_unscale_predictions`, `plot_series` and
# `residual_analysis` are defined elsewhere in the notebook.
best_model, mean_error, std_error = perform_grid_search(model, param_grid, X_train, Y_train, tscv)

print("Best model:", best_model)
print("Mean error:", mean_error)
print("Standard error:", std_error)

# Train best model on all training data
best_model_trained = train_best_model(best_model, X_train, Y_train)

# Predict on train set
train_preds = predict(best_model_trained, X_train)

# Unscale and plot predictions vs actuals for train set
unscaled_preds_series, unscaled_actuals_series = reshape_and_unscale_predictions(train_preds, Y_train, scaler_Y)
plot_series(unscaled_actuals_series, unscaled_preds_series, title="Unscaled Predictions vs Actual Training Data")

# Perform residual analysis for train set
residuals_train = unscaled_actuals_series - unscaled_preds_series
residual_analysis(residuals_train, unscaled_preds_series)
# Second pass on the squared residuals against the squared predictions.
residual_analysis(residuals_train.pow(2), unscaled_preds_series.pow(2))

# Predict on test set
test_preds = predict(best_model_trained, X_test)

# Unscale and plot predictions vs actuals for test set
# (the two *_series names are reused, so the train versions are overwritten here)
unscaled_preds_series, unscaled_actuals_series = reshape_and_unscale_predictions(test_preds, Y_test, scaler_Y)
plot_series(unscaled_actuals_series, unscaled_preds_series, title="Unscaled Predictions vs Actual Test Data")

# Perform residual analysis for test set
residuals_test = unscaled_actuals_series - unscaled_preds_series
residual_analysis(residuals_test, unscaled_preds_series)
residual_analysis(residuals_test.pow(2), unscaled_preds_series.pow(2))

KNN

In [None]:
# k-nearest-neighbours search space: 5 x 2 x 3 = 30 candidates.
# NOTE(review): with the default p=2, 'minkowski' should coincide with 'euclidean',
# which would make a third of the grid redundant — confirm.
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}



In [None]:
# NOTE(review): this cell sits under the KNN heading but reuses `best_model` and
# `mean_score` from whichever grid search ran last — no KNN model is fitted in the
# cell above, which only defines `knn_param_grid`.
preds = best_model.predict(X_train)
print(mean_score)
plot_unscaled_predictions(preds, Y_train, scaler_Y, title="Unscaled Predictions vs Actual Training Data", residual_analysis_func=residual_analysis)

SVR

In [None]:
# Support-vector-regression search space: 4 kernels x 4 C x 3 epsilon = 48 candidates.
svr_param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.01, 0.1, 1]
}

lightGBM

In [None]:
# LightGBM with a wide search space: nine parameters with three values each,
# i.e. 3**9 = 19,683 candidates per CV fold for an exhaustive grid search.
# NOTE(review): keys use the nested `<step>__<param>` form, so they only resolve if
# the estimator is wrapped in a pipeline whose step is named `lgbmregressor`.
model = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=-1, random_state=42)
param_grid = {
    'lgbmregressor__n_estimators': [100, 200, 500],
    'lgbmregressor__learning_rate': [0.01, 0.05, 0.1],
    'lgbmregressor__max_depth': [-1, 5, 10],
    'lgbmregressor__num_leaves': [31, 50, 100],
    'lgbmregressor__min_child_samples': [10, 20, 30],
    'lgbmregressor__subsample': [0.8, 0.9, 1.0],
    'lgbmregressor__colsample_bytree': [0.8, 0.9, 1.0],
    'lgbmregressor__reg_alpha': [0.0, 0.1, 0.5],
    'lgbmregressor__reg_lambda': [0.0, 0.1, 0.5]
}

MLP

In [None]:
# MLP search space: 4 layer layouts x 4 activations x 3 alphas = 48 candidates.
# The learning-rate *schedule* ('constant' / 'invscaling' / 'adaptive') is not
# searched: MLPRegressor only uses it with solver='sgd', so with 'adam' it would
# triple the number of fits without changing any of them.
mlp_param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (200,), (50, 50)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001, 0.01],
}

GPR

In [None]:
# Gaussian-process search space: 4 x 2 x 4 = 32 candidates.
# GaussianProcessRegressor accepts only the string 'fmin_l_bfgs_b', a callable, or
# None (keep the kernel hyper-parameters fixed) as `optimizer`; other SciPy
# routine names such as 'fmin_cg' are rejected when the model is fitted.
gpr_param_grid = {
    'alpha': [1e-10, 1e-9, 1e-8, 1e-7],
    'optimizer': ['fmin_l_bfgs_b', None],
    'n_restarts_optimizer': [0, 1, 2, 3]
}

In [None]:
LSTM

In [None]:
from pmdarima.model_selection import RollingForecastCV
# Rolling-origin cross-validation splitter from pmdarima, shared by the cells below.
rfcv = RollingForecastCV(
    h=170,  # forecast horizon
    step=30,  # step size
)

def grid_search(data, cfg_list):
    """Score every configuration by walk-forward validation and pick the best.

    Parameters
    ----------
    data : sequence
        Series handed unchanged to ``walk_forward_validation``.
    cfg_list : iterable of hashable
        Candidate configurations; each one becomes a key of the returned dicts.

    Returns
    -------
    best_model : object
        Model returned for the configuration with the lowest score.
    scores : dict
        Configuration -> validation score (lower is better).
    models : dict
        Configuration -> model returned by ``walk_forward_validation``.
    mean_score, std_score : float
        Mean and standard deviation of the scores over all configurations.

    Raises
    ------
    ValueError
        If ``cfg_list`` is empty.
    """
    scores = {}
    models = {}

    for cfg in cfg_list:
        score, model = walk_forward_validation(data, cfg)
        scores[cfg] = score
        models[cfg] = model

    if not scores:
        raise ValueError("cfg_list must contain at least one configuration")

    # Calculate mean and standard deviation of scores
    score_values = list(scores.values())
    mean_score = np.mean(score_values)
    std_score = np.std(score_values)

    # Lowest error wins.
    best_cfg = min(scores, key=scores.get)
    best_model = models[best_cfg]

    return best_model, scores, models, mean_score, std_score


def evaluate(y_true, y_pred):
    """Return the root-mean-squared error of ``y_pred`` as ``{'rmse': value}``."""
    return {'rmse': np.sqrt(mean_squared_error(y_true, y_pred))}
    
   
    
    
def get_best_model_mean_std(grid_search_result):
    """Return the CV mean and standard deviation of the selected candidate.

    Parameters
    ----------
    grid_search_result : fitted search object
        Must expose ``best_index_`` and ``cv_results_`` (as a fitted
        ``GridSearchCV`` does).

    Returns
    -------
    best_model_mean : float
        ``mean_test_score`` of the best candidate, in the search's own scoring
        convention (no sign flip is applied here).
    best_model_std : float
        ``std_test_score`` of the best candidate across the CV folds.
    """
    best_idx = grid_search_result.best_index_
    cv_results = grid_search_result.cv_results_
    best_model_mean = cv_results['mean_test_score'][best_idx]
    best_model_std = cv_results['std_test_score'][best_idx]

    return best_model_mean, best_model_std

def train_best_model(best_estimator, X_train, y_train):
    """Fit ``best_estimator`` on the full training set and return what ``fit`` returns."""
    return best_estimator.fit(X_train, y_train)

def predict_on_train(fitted_model, X_train):
    """Return ``fitted_model``'s predictions for the training features.

    Parameters
    ----------
    fitted_model : object
        Already-fitted estimator exposing ``predict``.
    X_train : array-like
        Training features.

    Returns
    -------
    Whatever ``fitted_model.predict(X_train)`` returns.
    """
    train_predictions = fitted_model.predict(X_train)
    return train_predictions

In [None]:
def grid_search(data, cfg_list):
    """Run walk-forward validation for every configuration in ``cfg_list``.

    Returns a dict mapping each configuration to ``{'score': ..., 'model': ...}``,
    followed by the mean and the standard deviation of all scores.
    """
    scores_and_models = {}
    for cfg in cfg_list:
        cfg_score, cfg_model = walk_forward_validation(data, cfg)
        scores_and_models[cfg] = {'score': cfg_score, 'model': cfg_model}

    # Summary statistics over every configuration that was tried.
    scores = [entry['score'] for entry in scores_and_models.values()]
    return scores_and_models, np.mean(scores), np.std(scores)


def train_model(train, config):
    """Fit an ARIMA model whose ``order`` is ``config`` on ``train``; return the fit result."""
    return ARIMA(train, order=config).fit()

def model_predict(model, start, end):
    """Forward ``start`` and ``end`` to ``model.predict`` and return its result."""
    forecast = model.predict(start=start, end=end)
    return forecast

# def evaluate(test_data, prediction):
#     return measure_rmse(test_data, prediction)

def evaluate(y_true, y_pred):
    """Return the root-mean-squared error of ``y_pred`` as ``{'rmse': value}``."""
    return {'rmse': np.sqrt(mean_squared_error(y_true, y_pred))}

def walk_forward_validation(data, cfg):
    """Score one configuration with rolling-origin cross-validation.

    Parameters
    ----------
    data : pandas.Series, pandas.DataFrame or numpy.ndarray
        Series to split with ``RollingForecastCV(h=170, step=30)``.
    cfg : object
        Configuration forwarded to ``train_model``.

    Returns
    -------
    mean_rmse : float
        Average RMSE over all folds.
    model_fit : object
        Model fitted on the training part of the last fold.

    Raises
    ------
    ValueError
        If the splitter yields no fold for ``data``.
    """
    rfcv = RollingForecastCV(h=170, step=30)
    fold_rmses = []
    model_fit = None

    for train_index, test_index in rfcv.split(data):
        # The splitter yields integer positions, so pandas objects must be
        # sliced positionally rather than by label.
        if hasattr(data, 'iloc'):
            train_data = data.iloc[train_index]
            test_data = data.iloc[test_index]
        else:
            train_data = data[train_index]
            test_data = data[test_index]

        model_fit = train_model(train_data, cfg)
        prediction = model_predict(model_fit, start=len(train_data), end=len(train_data)+len(test_data)-1)
        # evaluate() returns {'rmse': value}; keep the number so it can be averaged.
        fold_rmses.append(evaluate(test_data, prediction)['rmse'])

    if not fold_rmses:
        raise ValueError("data is too short for RollingForecastCV(h=170, step=30): no folds produced")

    return np.mean(fold_rmses), model_fit



Converting the data to supervised regression format
All predictor variables are converted to lagged variables, because the value at t-1 of each variable is used to predict the value at t.

In [None]:


# Build sliding windows for the LSTM: each sample holds the previous 60 values of
# column 0 (x_train) and its label is the single value that follows (y_train).
# NOTE(review): `train_data` is indexed as a 2-D array here, and `scaled_data`,
# `training_data_len` and `values` are not defined in the surrounding cells.
x_train = []
y_train = []

for i in range(60, len(train_data)):
    x_train.append(train_data[i-60:i, 0])
    y_train.append(train_data[i, 0])
    
x_train, y_train = np.array(x_train), np.array(y_train)
# Add a trailing feature axis: (samples, 60, 1).
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

#Extract the closing prices from our normalized dataset (the last 20% of the dataset).
# The slice starts 60 rows before `training_data_len` so that the first test
# window has a full 60-value history.
test_data = scaled_data[training_data_len-60: , : ]
x_test = []
# NOTE(review): labels are taken from `values`, not from `scaled_data` —
# presumably the unscaled series; confirm before comparing with predictions.
y_test = values[training_data_len:]

for i in range(60, len(test_data)):
  x_test.append(test_data[i-60:i, 0])

x_test = np.array(x_test)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))


def create_features_and_targets(data, feature_length):
    """Turn column 0 of ``data`` into sliding-window features and next-step targets.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; only column 0 is read.
    feature_length : int
        Number of consecutive values in each feature window.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, feature_length)
        ``X[i]`` is ``data[i:i + feature_length, 0]``.
    Y : numpy.ndarray, shape (n_samples,)
        ``Y[i]`` is ``data[i + feature_length, 0]``, the value right after the window.

    ``n_samples`` is ``len(data) - feature_length`` (0 when ``data`` is too short).
    """
    # The last usable window ends at index len(data) - 2, leaving the final row as
    # its target; stopping one step earlier would silently drop that sample.
    n_samples = max(len(data) - feature_length, 0)
    windows = [data[i:(i + feature_length), 0] for i in range(n_samples)]
    targets = [data[i + feature_length, 0] for i in range(n_samples)]

    # Explicit reshape keeps X two-dimensional even when there are no samples.
    X = np.array(windows).reshape(n_samples, feature_length)
    Y = np.array(targets)
    return X, Y

# calling the function
# NOTE(review): `dataV1` and `feature_length` are not defined in the surrounding
# cells, and this rebinds `X_train` (used by the sklearn cells) to windowed arrays.
X_train, y_train = create_features_and_targets(dataV1, feature_length)


In [None]:
# test options for regression
num_folds = 10
scoring = 'neg_mean_squared_error'
#scoring ='neg_mean_absolute_error'
#scoring = 'r2'
# spot check the algorithms
models = []
models.append(('LR', LinearRegression()))
models.append(('LASSO', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('CART', DecisionTreeRegressor()))
models.append(('SVR', SVR()))
#Neural Network
models.append(('MLP', MLPRegressor()))
models.append(('GBR', GradientBoostingRegressor()))



In [None]:
def unscale_data(pred, actual, scaler=None):
    """Map scaled predictions and actuals back through a scaler's inverse transform.

    Parameters
    ----------
    pred, actual : array-like, shape (n_samples, n_features)
        Scaled predictions and scaled observed values.
    scaler : object, optional
        Scaler that was fitted on the original target data; it must expose
        ``inverse_transform``. Other cells of this notebook pass their fitted
        target scaler as this third argument.

    Returns
    -------
    pred, actual
        Both inputs after ``inverse_transform``.

    Notes
    -----
    Without ``scaler`` the previous behaviour is kept for backward compatibility:
    a new ``MinMaxScaler`` is fitted on ``actual`` itself. That scaler never saw
    the original data, so its inverse transform does not restore the original
    units — pass the fitted scaler whenever it is available.
    """
    if scaler is None:
        scaler = MinMaxScaler()
        scaler.fit(actual)
    pred = scaler.inverse_transform(pred)
    actual = scaler.inverse_transform(actual)
    return pred, actual




In [None]:
#plot
data = stock_data.filter(['Close'])
train = data[:training_data_len]
validation = data[training_data_len:]
validation['Predictions'] = predictions
plt.figure(figsize=(16,8))
plt.title('Model')
plt.xlabel('Date')
plt.ylabel('Close Price USD ($)')
plt.plot(train)
plt.plot(validation[['Close', 'Predictions']])
plt.legend(['Train', 'Val', 'Predictions'], loc='lower right')
plt.show()

Evaluation functions

In [None]:
def evaluate(y_true, y_pred):
    """Return the root-mean-squared error of ``y_pred`` as ``{'rmse': value}``."""
    return {'rmse': np.sqrt(mean_squared_error(y_true, y_pred))}
    
# def diagnostics(model, data1, data2):
#     # perform diagnostics and return the print statements
#     print('Diagnostics:')
#     print(model.summary())
    
    


#plot results
def plot_results(cols, results, data_name):
    """For each row of ``results[cols]`` plot forecast vs. actual and the residual ACF.

    ``cols`` must name four columns, in this order: forecast, residuals, actual
    values and model name.
    """
    for _, record in results[cols].iterrows():
        yhat, resid, actual, name = record
        plt.title(f'{data_name} - {name}')
        plt.plot(actual, 'k--', alpha=0.5)
        plt.plot(yhat, 'k')
        plt.legend(['actual', 'forecast'])
        plot_acf(resid, zero=False, title=f'{data_name} - Autocorrelation')
        plt.show()

# cols = ['yhat', 'resid', 'actual', 'Model Name']
# plot_results(cols, air_results, 'Air Passengers')
    


# def plot_correlogram(x, lags=None, title=None):
#     lags = min(10, int(len(x)/5)) if lags is None else lags
#     with sns.axes_style('whitegrid'):
#         fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 8))
#         x.plot(ax=axes[0][0], title='Residuals')
#         x.rolling(21).mean().plot(ax=axes[0][0], c='k', lw=1)
#         q_p = np.max(q_stat(acf(x, nlags=lags), len(x))[1])
#         stats = f'Q-Stat: {np.max(q_p):>8.2f}\nADF: {adfuller(x)[1]:>11.2f}'
#         axes[0][0].text(x=.02, y=.85, s=stats, transform=axes[0][0].transAxes)
#         probplot(x, plot=axes[0][1])
#         mean, var, skew, kurtosis = moment(x, moment=[1, 2, 3, 4])
#         s = f'Mean: {mean:>12.2f}\nSD: {np.sqrt(var):>16.2f}\nSkew: {skew:12.2f}\nKurtosis:{kurtosis:9.2f}'
#         axes[0][1].text(x=.02, y=.75, s=s, transform=axes[0][1].transAxes)
#         plot_acf(x=x, lags=lags, zero=False, ax=axes[1][0])
#         plot_pacf(x, lags=lags, zero=False, ax=axes[1][1])
#         axes[1][0].set_xlabel('Lag')
#         axes[1][1].set_xlabel('Lag')
#         fig.suptitle(title, fontsize=14)
#         sns.despine()
#         fig.tight_layout()
#         fig.subplots_adjust(top=.9)
        
        
# #plot stats
# error = (y - y_pred).rename('Prediction Errors')
# scores = dict(
#     rmse=np.sqrt(mean_squared_error(y_true=y, y_pred=y_pred)),
#     rmsle=np.sqrt(mean_squared_log_error(y_true=y, y_pred=y_pred)),
#     mean_ae=mean_absolute_error(y_true=y, y_pred=y_pred),
#     median_ae=median_absolute_error(y_true=y, y_pred=y_pred),
#     r2score=explained_variance_score(y_true=y, y_pred=y_pred)
# )
# fig, axes = plt.subplots(ncols=3, figsize=(15, 4))
# sns.scatterplot(x=y, y=y_pred, ax=axes[0])
# axes[0].set_xlabel('Log Price')
# axes[0].set_ylabel('Predictions')
# axes[0].set_ylim(11, 16)
# axes[0].set_title('Predicted vs. Actuals')
# sns.distplot(error, ax=axes[1])
# axes[1].set_title('Residuals')
# pd.Series(scores).plot.barh(ax=axes[2], title='Error Metrics')
# fig.suptitle('In-Sample Regression Errors', fontsize=16)
# sns.despine()
# fig.tight_layout()
# fig.subplots_adjust(top=.88)

RF

In [None]:
def prepare_data_for_modeling(rt_d, x_var, y_var, test_size=0.2, max_lag=2):
    """Split, scale and reshape a time-indexed frame into supervised-learning sets.

    Parameters
    ----------
    rt_d : pandas.DataFrame
        Source data; split chronologically (no shuffling).
    x_var : str
        Name of the predictor series.
        NOTE(review): currently unused — every column of ``rt_d`` flows into the
        features, so calls that differ only in ``x_var`` return identical data.
        Confirm how predictor selection is meant to work.
    y_var : str
        Name of the target column.
    test_size : float
        Fraction of rows kept for the test set.
    max_lag : int
        Lag count forwarded to ``prepare_time_series_data``.

    Returns
    -------
    X_train, Y_train, X_test, Y_test
        Features and targets as returned by ``extract_column``.
    """
    # 1. Split train/test
    train_data, test_data = train_test_split(rt_d, test_size=test_size, shuffle=False)

    # 2. Scale based on train set. Elsewhere in this notebook scale_data also
    # yields the fitted scalers after the two arrays; they are not needed here,
    # and the starred target accepts the result with or without them.
    train_data_scaled, test_data_scaled, *_ = scale_data(train_data, test_data)

    # 3. Create supervised learning data by adding datetime features and lagged features
    train_data_df = pd.DataFrame(train_data_scaled, index=train_data.index, columns=train_data.columns)
    test_data_df = pd.DataFrame(test_data_scaled, index=test_data.index, columns=test_data.columns)

    # Set up data for supervised learning
    train_transformed = prepare_time_series_data(train_data_df, y_var, max_lag)
    test_transformed = prepare_time_series_data(test_data_df, y_var, max_lag)

    # Extract X and Y from the transformed train and test sets
    X_train, Y_train = extract_column(train_transformed, y_var)
    X_test, Y_test = extract_column(test_transformed, y_var)

    return X_train, Y_train, X_test, Y_test

# One data set per (predictor, target) pairing of the two series.
# NOTE(review): prepare_data_for_modeling never reads its `x_var` argument, so the
# two calls sharing a `y_var` produce the same data — confirm the intended behaviour.
X_train_ts1_ts1, Y_train_ts1_ts1, X_test_ts1_ts1, Y_test_ts1_ts1 = prepare_data_for_modeling(rt_d, 'ts1', 'ts1')
X_train_ts2_ts2, Y_train_ts2_ts2, X_test_ts2_ts2, Y_test_ts2_ts2 = prepare_data_for_modeling(rt_d, 'ts2', 'ts2')
X_train_ts1_ts2, Y_train_ts1_ts2, X_test_ts1_ts2, Y_test_ts1_ts2 = prepare_data_for_modeling(rt_d, 'ts1', 'ts2')
X_train_ts2_ts1, Y_train_ts2_ts1, X_test_ts2_ts1, Y_test_ts2_ts1 = prepare_data_for_modeling(rt_d, 'ts2', 'ts1')


In [None]:
def train_random_forest(train_x, train_y, test_x, test_y):
    """Fit a default random forest, print its test metrics and return them.

    Returns the ``(mse, mae, mape)`` tuple produced by ``evaluate_predictions``
    for the predictions on ``test_x``.
    """
    forest = RandomForestRegressor()
    forest.fit(train_x, train_y)
    metrics = evaluate_predictions(test_y, forest.predict(test_x))
    mse, mae, mape = metrics

    print("Random Forest")
    for label, value in (("MSE", mse), ("MAE", mae), ("MAPE", mape)):
        print(f"{label}: {value:.3f}")

    return mse, mae, mape



from sklearn.ensemble import RandomForestRegressor

# Reference snippet: random forest fitted and scored on the same data, then
# 10-fold cross-validated.
# NOTE(review): `housing_prepared`, `housing_labels`, `display_scores` and `lin_reg`
# are not defined in the surrounding cells — this looks like pasted example code
# that cannot run as-is; confirm whether it should be kept.
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(housing_prepared, housing_labels)
housing_predictions = forest_reg.predict(housing_prepared)
# In-sample RMSE (predictions are made on the data the forest was fitted on).
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse
from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
# Scores are negative MSE, so negate before taking the square root.
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)
scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
pd.Series(np.sqrt(-scores)).describe()
#####



# n_estimators : integer, optional (default=10)
#     The number of trees in the forest.
param_grid = {'n_estimators': [50,100,150,200,250,300,350,400]}
model = RandomForestRegressor()
# KFold without shuffling is deterministic, and scikit-learn raises a ValueError
# when `random_state` is set while `shuffle` is False, so no seed is passed.
# NOTE(review): the other cells use TimeSeriesSplit for this data; plain KFold
# lets later rows train a model validated on earlier rows — confirm this is intended.
kfold = KFold(n_splits=num_folds)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train, Y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# One line per candidate: mean score (std) with its parameters.
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Random-forest search over tree count, depth and leaf size (2 x 3 x 2 = 12 candidates).
# The search object is only constructed here; it is not fitted in this cell.
# NOTE(review): `rf_reg` and `cv` are not defined in the surrounding cells.
param_grid = {'n_estimators': [200, 400],
              'max_depth': [10, 15, 20],
              'min_samples_leaf': [50, 100],
              }
gridsearch_reg = GridSearchCV(estimator=rf_reg,
                              param_grid=param_grid,
                              scoring='neg_mean_squared_error',
                              n_jobs=-1,
                              cv=cv,
                              refit=True,
                              return_train_score=True,
                              verbose=1)

# Reference snippet: grid search over two separate sub-grids, then print the RMSE
# of every candidate.
# NOTE(review): `housing_prepared` and `housing_labels` are not defined in the
# surrounding cells, and this rebinds `grid_search`, which other cells define as a
# function.
param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

# Only the last expression of a cell is displayed, so these two lines show nothing.
grid_search.best_params_
grid_search.best_estimator_
cvres = grid_search.cv_results_
# Scores are negative MSE, so negate before taking the square root.
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
    


SVM

In [None]:
from sklearn.svm import SVR

# Reference snippet: linear-kernel SVR fitted and scored on the same data
# (an in-sample RMSE).
# NOTE(review): `housing_prepared` and `housing_labels` are not defined in the
# surrounding cells.
svm_reg = SVR(kernel="linear")
svm_reg.fit(housing_prepared, housing_labels)
housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels, housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

LightGBM

In [None]:

def train_lightgbm(train_x, train_y, test_x, test_y):
    """Train a LightGBM regression booster, print its test metrics and return them.

    Returns the ``(mse, mae, mape)`` tuple produced by ``evaluate_predictions``
    for the predictions on ``test_x``.
    """
    # Native LightGBM API: wrap the training data, then train with the
    # regression objective.
    booster = lgb.train({'objective': 'regression'}, lgb.Dataset(train_x, label=train_y))
    metrics = evaluate_predictions(test_y, booster.predict(test_x))
    mse, mae, mape = metrics

    print("LightGBM")
    for label, value in (("MSE", mse), ("MAE", mae), ("MAPE", mape)):
        print(f"{label}: {value:.3f}")

    return mse, mae, mape


# n_estimators:
#     The number of boosting stages to perform. Gradient boosting
#     is fairly robust to over-fitting so a large number usually
#     results in better performance.
param_grid = {'n_estimators': [50,100,150,200,250,300,350,400]}
model = GradientBoostingRegressor(random_state=seed)
# KFold without shuffling is deterministic, and scikit-learn raises a ValueError
# when `random_state` is set while `shuffle` is False, so no seed is passed.
# NOTE(review): the other cells use TimeSeriesSplit for this data; plain KFold
# lets later rows train a model validated on earlier rows — confirm this is intended.
kfold = KFold(n_splits=num_folds)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train, Y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# One line per candidate: mean score (std) with its parameters.
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Optimisation

In [None]:
from sklearn.model_selection import GridSearchCV

# Scratch cell: rolling-CV skeleton followed by a pasted KNN pipeline example.
# NOTE(review): `rt_1_D_train`, `rmse_score`, `X` and `y` are not defined in the
# surrounding cells, and no import of `Pipeline` is visible — confirm before running.
model_params = []
evaluation_results = []
models = []
# NOTE(review): `rfcv` is read here but only (re)created further down in this cell,
# so the loop depends on an earlier cell having defined it. The loop body only
# rebinds train_data / test_data; after it finishes they hold the last fold.
for train_index, test_index in rfcv.split(rt_1_D_train):
    train_data = rt_1_D_train.iloc[train_index]
    test_data = rt_1_D_train.iloc[test_index]

rfcv = RollingForecastCV(
    h=170,  # forecast horizon
    step=30,  # step size
)

##retrain model on whole dataset once done with cross validation

#example with scores
# Scale the features, then fit a k-nearest-neighbours regressor.
pipe = Pipeline([('scaler', StandardScaler()), 
                 ('knn', KNeighborsRegressor())])

n_folds = 5
# k = 5, 10, ..., 100
n_neighbors = tuple(range(5, 101, 5))

# `knn__` addresses the pipeline step named 'knn'.
param_grid = {'knn__n_neighbors': n_neighbors}

estimator = GridSearchCV(estimator=pipe,
                         param_grid=param_grid,
                         cv=n_folds,
                         scoring=rmse_score,
#                          n_jobs=-1
                        )
estimator.fit(X=X, y=y)
cv_results = estimator.cv_results_
    
    

LSTM

In [None]:
#simple example
# We do the same thing, but now instead for 12 months
# NOTE(review): `TimeseriesGenerator`, `scaled_train` and `n_features` are not
# defined or imported in the surrounding cells — confirm before running.
n_input = 12
# Each generated sample: `n_input` consecutive rows as input, the next row as
# target, delivered one sample per batch.
generator = TimeseriesGenerator(scaled_train, scaled_train, length=n_input, batch_size=1)
     

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
     

# define model
# One LSTM layer (100 units, ReLU) feeding a single-output dense layer, trained on MSE.
model = Sequential()
model.add(LSTM(100, activation='relu', input_shape=(n_input, n_features)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
     

model.summary()


In [None]:
# #data geneartor
# def create_dataset(dataset, look_back=1):
#     data_x, data_y = [], []
#     for i in range(len(dataset) - look_back):
#         data_x.append(dataset[i:(i + look_back), 0])
#         data_y.append(dataset[i + look_back, 0])
#     return np.array(data_x), np.array(data_y)

# def data_generator(data, window_size, batch_size):
#     """
#     Generator function to create batches of input-output pairs for time series forecasting.
#     :param data: A NumPy array containing the time series data.
#     :param window_size: The size of the window used to create input-output pairs.
#     :param batch_size: The number of input-output pairs per batch.
#     :return: A generator that yields batches of input-output pairs.
#     """
#     start = 0
#     while True:
#         if start + batch_size >= len(data) - window_size - 1:
#             start = 0

#         X_batch = np.zeros((batch_size, window_size, data.shape[1]))
#         y_batch = np.zeros((batch_size, 2))

#         for i in range(batch_size):
#             X_batch[i] = data[start:start + window_size]
#             y_batch[i] = data[start + window_size, :2]
#             start += 1
            
#         # X_batch = np.transpose(X_batch, (0, 2, 1))
#         # X_batch = np.where(np.isnan(X_batch), threshold+1, X_batch)
        
#         if np.any(y_batch==mask):
#             continue
        
#         if np.count_nonzero(X_batch== mask)/X_batch.size > 0.5:
#             continue

#         yield X_batch, y_batch
        
# window_size = 120  # The size of the sliding window
# batch_size = 32  # The number of samples per batch

# # Split the data into training and validation sets
# train_split = int(0.8 * len(data))
# train_data = data[:train_split]
# val_split = int(0.8 * len(train_data))
# val_data = train_data[val_split:]
# train_data = train_data[:val_split]
# test_data = data[train_split:]
# print(train_data.shape, val_data.shape, test_data.shape)


# # Create the training and validation generators
# train_gen = data_generator(train_data, window_size, batch_size)
# val_gen = data_generator(val_data, window_size, batch_size)
# test_gen = data_generator(test_data, window_size, batch_size)


In [None]:
#build sirojjidin
def get_compiled_model(train=False, lr = 1e-4):
    """Build the ``mlstm_fcn`` network and compile it with an RMSprop optimizer.

    Parameters
    ----------
    train : bool, optional
        Accepted but never read inside this function.
    lr : float, optional
        Learning rate handed to ``RMSprop``.

    Returns
    -------
    The compiled model. It is compiled with ``'mse'`` as the loss and reports
    ``'mae'`` and ``'mse'`` as metrics.

    Notes
    -----
    The input shape is ``(window_size, 8)``; ``window_size`` is read from the
    notebook's global namespace when the function is called.
    NOTE(review): the leading ``2`` given to ``mlstm_fcn`` is presumably the
    number of outputs and ``8`` the number of input features -- confirm
    against the ``mlstm_fcn`` definition.
    """
    network = mlstm_fcn(2, input_shape=(window_size, 8))
    network.compile(
        loss='mse',
        optimizer=RMSprop(learning_rate=lr),
        metrics=["mae", 'mse'],
    )
    return network
# Build the compiled network and print its layer summary.
model = get_compiled_model()
model.summary()

# Train from the batch generators.
# NOTE(review): train_gen / val_gen / test_gen, train_data / val_data /
# test_data and batch_size are not assigned by any active code visible near
# this cell (the only visible definitions are commented out above) -- confirm
# they exist before this cell runs.
history = model.fit(
    train_gen,
    validation_data=val_gen,
    steps_per_epoch=train_data.shape[0] // batch_size,
    validation_steps=val_data.shape[0] // batch_size,
    epochs=1,
    # batch_size is deliberately not passed here: the Keras Model.fit
    # documentation says not to specify it for generator inputs, because the
    # generator already yields whole batches.
    # callbacks=callback_list,
    verbose=1,
)

# Evaluate on the test generator. Floor division keeps `steps` an integer,
# matching how the step counts are computed for fit() above.
model.evaluate(test_gen, steps=test_data.shape[0] // batch_size)

In [None]:
# #build general


# def fit_lstm(train_x, train_y, test_x, test_y, lstm_units=50, look_back=1, num_epochs=10, batch_size=32):
#     # Reshape input to be [samples, time steps, features]
#     train_x = train_x.reshape((train_x.shape[0], look_back, 1))
#     test_x = test_x.reshape((test_x.shape[0], look_back, 1))

#     # Build LSTM model
#     model = Sequential()
#     model.add(LSTM(8, input_shape=(look_back, 1), return_sequences=True))
#     model.add(LSTM(4))
#     model.add(Dense(1))
#     model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mape'])

#     # Fit the model
#     model.fit(train_x, train_y, epochs=num_epochs, batch_size=batch_size, verbose=2)

#     # Make predictions
#     train_pred = model.predict(train_x)
#     test_pred = model.predict(test_x)

#     # Inverse scaling of the predictions
#     train_pred = scaler.inverse_transform(train_pred)
#     train_y = scaler.inverse_transform(train_y)
#     test_pred = scaler.inverse_transform(test_pred)
#     test_y = scaler.inverse_transform(test_y)

#     # Create a DataFrame of the predictions and actual values
#     train_results = pd.DataFrame({'Actual': train_y.flatten(), 'Predicted': train_pred.flatten()}, index=train_data.index[1:])
#     test_results = pd.DataFrame({'Actual': test_y.flatten(), 'Predicted': test_pred.flatten()}, index=test_data.index[1:])

#     # Evaluate the predictions
#     train_mse, train_mae, train_mape = evaluate_predictions(train_y, train_pred)
#     test_mse, test_mae, test_mape = evaluate_predictions(test_y, test_pred)
    
#     # Evaluate the model on the test set
#     loss = model.evaluate(X_test, y_test, verbose=0)
#     print(f"Test loss: {loss}")

#     # Print the evaluation metrics
#     print("Train MSE: {:.3f}, Train MAE: {:.3f}, Train MAPE: {:.3f}".format(train_mse, train_mae, train_mape))
#     print("Test MSE: {:.3f}, Test MAE: {:.3f}, Test MAPE: {:.3f}".format(test_mse, test_mae, test_mape))

#     return pd.concat([train_results, test_results])



# def build_lstm_model(units=50, activation='relu', optimizer='adam'):
#     model = Sequential()
#     model.add(LSTM(units=units, activation=activation))
#     model.add(Dense(1))
#     model.compile(optimizer=optimizer, loss='mean_squared_error')
#     return model

# def tune_lstm(X_train, y_train):
#     param_grid = {
#         'units': [50, 100],
#         'activation': ['relu', 'tanh'],
#         'optimizer': [Adam(learning_rate=0.001), Adam(learning_rate=0.01)]
#     }
#     model = KerasRegressor(build_fn=build_lstm_model, epochs=50, batch_size=32, verbose=0)
#     tscv = TimeSeriesSplit(n_splits=5)
#     grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=tscv, scoring='neg_mean_squared_error')
#     grid_result = grid.fit(X_train, y_train)
#     best_params = grid_result.best_params_
#     best_lstm_model = build_lstm_model(**best_params)
#     best_lstm_model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
    
#     print(f"Best LSTM model with parameters {best_params}")
#     return best_lstm_model

lstm 3

In [None]:
# #importing required libraries
# from sklearn.preprocessing import MinMaxScaler
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, LSTM

# #creating dataframe
# data = df.sort_index(ascending=True, axis=0)
# new_data = pd.DataFrame(index=range(0,len(df)),columns=['Date', 'Close'])
# for i in range(0,len(data)):
#     new_data['Date'][i] = data['Date'][i]
#     new_data['Close'][i] = data['Close'][i]

# #setting index
# new_data.index = new_data.Date
# new_data.drop('Date', axis=1, inplace=True)

# #creating train and test sets
# dataset = new_data.values

# train = dataset[0:987,:]
# valid = dataset[987:,:]

# #converting dataset into x_train and y_train
# scaler = MinMaxScaler(feature_range=(0, 1))
# scaled_data = scaler.fit_transform(dataset)

# x_train, y_train = [], []
# for i in range(60,len(train)):
#     x_train.append(scaled_data[i-60:i,0])
#     y_train.append(scaled_data[i,0])
# x_train, y_train = np.array(x_train), np.array(y_train)

# x_train = np.reshape(x_train, (x_train.shape[0],x_train.shape[1],1))

# # create and fit the LSTM network
# model = Sequential()
# model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1],1)))
# model.add(LSTM(units=50))
# model.add(Dense(1))

# model.compile(loss='mean_squared_error', optimizer='adam')
# model.fit(x_train, y_train, epochs=1, batch_size=1, verbose=2)

# #predicting 246 values, using past 60 from the train data
# inputs = new_data[len(new_data) - len(valid) - 60:].values
# inputs = inputs.reshape(-1,1)
# inputs  = scaler.transform(inputs)

# X_test = []
# for i in range(60,inputs.shape[0]):
#     X_test.append(inputs[i-60:i,0])
# X_test = np.array(X_test)

# X_test = np.reshape(X_test, (X_test.shape[0],X_test.shape[1],1))
# closing_price = model.predict(X_test)
# closing_price = scaler.inverse_transform(closing_price)
# Results
# rms=np.sqrt(np.mean(np.power((valid-closing_price),2)))
# rms

# #for plotting
# train = new_data[:987]
# valid = new_data[987:]
# valid['Predictions'] = closing_price
# plt.plot(train['Close'])
# plt.plot(valid[['Close','Predictions']])

lstm 4

In [None]:
# #Creating the scaled training data set
# train_data = scaled_data[0:training_data_len  , : ]
# #Spliting the data into x_train and y_train data sets
# x_train=[]
# y_train = []
# for i in range(60,len(train_data)):
#     x_train.append(train_data[i-60:i,0])
#     y_train.append(train_data[i,0])
#     if i<= 61:
#         print(x_train)
#         print(y_train)
#         print()
# #Here we are Converting x_train and y_train to numpy arrays
# x_train, y_train = np.array(x_train), np.array(y_train)
# # Here we are reshaping the data into the shape accepted by the LSTM
# x_train = np.reshape(x_train, (x_train.shape[0],x_train.shape[1],1))
# #now we are Building the LSTM network model
# model = Sequential()
# model.add(LSTM(units=50, return_sequences=True,input_shape=(x_train.shape[1],1)))
# model.add(LSTM(units=50, return_sequences=False))
# model.add(Dense(units=25))
# model.add(Dense(units=1))
# # here we are Compiling the model
# model.compile(optimizer='adam', loss='mean_squared_error')
# # here we are training the model
# model.fit(x_train, y_train, batch_size=1, epochs=1)
# 1756/1756 [==============================] - 78s 44ms/step - loss: 2.9184e-04
# <tensorflow.python.keras.callbacks.History at 0x7f24c4477670>
# # here we are testing data set
# test_data = scaled_data[training_data_len - 60: , : ]
# #Creating the x_test and y_test data sets
# x_test = []
# y_test =  dataset[training_data_len : , : ] #Get all of the rows from index 1603 to the rest and all of the columns (in this case it's only column 'Close'), so 2003 - 1603 = 400 rows of data
# for i in range(60,len(test_data)):
#     x_test.append(test_data[i-60:i,0])
# # here we are converting x_test to a numpy array  
# x_test = np.array(x_test)
# # here we are reshaping the data into the shape accepted by the LSTM  
# x_test = np.reshape(x_test, (x_test.shape[0],x_test.shape[1],1))
# # now we are getting the models predicted price values
# predictions = model.predict(x_test) 
# predictions = scaler.inverse_transform(predictions)#Undo scaling
# # here we are calculaing the value of RMSE 
# rmse=np.sqrt(np.mean(((predictions- y_test)**2)))
# rmse
# # here we are plotting the data
# #Plot/Create the data for the graph
# train = data[:training_data_len]
# valid = data[training_data_len:]
# valid['Predictions'] = predictions
# #Visualize the data
# plt.figure(figsize=(16,8))
# plt.title('Model')
# plt.xlabel('Date', fontsize=18)
# plt.ylabel('Close Price USD ($)', fontsize=18)
# plt.plot(train['Close'])
# plt.plot(valid[['Close', 'Predictions']])
# plt.legend(['Train', 'Val', 'Predictions'], loc='lower right')
# plt.show()

LSTM 5

In [None]:
# # Get stacked LSTM model for regression modeling
# def get_reg_model(layer_units=[100,100],dropouts=[0.2,0.2],window_size=50):
#     # build LSTM network
#     model = Sequential()
    
#     # hidden layer 1
#     model.add(LSTM(layer_units[0], 
#                    input_shape=(window_size,1), 
#                    return_sequences=True))
#     model.add(Dropout(dropouts[0]))
    
#     # hidden layer 2
#     model.add(LSTM(layer_units[1]))
#     model.add(Dropout(dropouts[1]))
    
#     # output layer
#     model.add(Dense(1))
#     model.add(Activation("linear"))
    
#     start = time.time()
#     model.compile(loss="mse", optimizer="rmsprop")
#     print("> Compilation Time : ", time.time() - start)
#     print(model.summary())
#     return model

# # Window wise prediction function
# def predict_reg_multiple(model, data, window_size=6, prediction_len=3):
#     prediction_list = []
    
#     # loop for every sequence in the dataset
#     for window in range(int(len(data)/prediction_len)):
#         _seq = data[window*prediction_len]
#         predicted = []
#         # loop till required prediction length is achieved
#         for j in range(prediction_len):
#             predicted.append(model.predict(_seq[np.newaxis,:,:])[0,0])
#             _seq = _seq[1:]
#             _seq = np.insert(_seq, [window_size-1], predicted[-1], axis=0)
#         prediction_list.append(predicted)
#     return prediction_list


# # Plot window wise 
# def plot_reg_results(predicted_data, true_data, prediction_len=3):
#     fig = plt.figure(facecolor='white')
#     ax = fig.add_subplot(111)
    
#     # plot actual data
#     ax.plot(true_data, 
#             label='True Data',
#             c='black',alpha=0.3)
    
#     # plot flattened data
#     plt.plot(np.array(predicted_data).flatten(), 
#              label='Prediction_full',
#              c='g',linestyle='--')
    
#     #plot each window in the prediction list
#     for i, data in enumerate(predicted_data):
#         padding = [None for p in range(i * prediction_len)]
#         plt.plot(padding + data, label='Prediction',c='black')

#     plt.title("Forecast Plot with Prediction Window={}".format(prediction_len))
#     plt.show()

LSTM 6

In [None]:
# Stacked two-layer LSTM regressor followed by two dense layers.
# Sequential / LSTM / Dense are the names bound by the notebook's import cell.
# NOTE(review): x_train, y_train, x_test, y_test and scaler are not assigned
# by any active code visible near this cell -- confirm they are defined by an
# earlier cell before running.
model = Sequential()
model.add(LSTM(100, return_sequences=True, input_shape=(x_train.shape[1], 1)))
model.add(LSTM(100, return_sequences=False))
model.add(Dense(25))
model.add(Dense(1))
model.summary()

model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(x_train, y_train, batch_size=1, epochs=3)

# Predict, then map the predictions back through the scaler.
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)

# RMSE = sqrt(mean(error ** 2)). The errors must be squared *before* they are
# averaged; squaring the mean instead yields |mean error|, in which positive
# and negative errors cancel out.
rmse = np.sqrt(np.mean((predictions - y_test) ** 2))
rmse

LSTM optimisation

LSTM 1

In [None]:

# #Running deep learning models and performing cross validation takes time
# #Set the following Flag to 0 if the Deep Learning Models Flag has to be disabled
# EnableDeepLearningRegreesorFlag = 0

# def create_model(neurons=12, activation='relu', learn_rate = 0.01, momentum=0):
#         # create model
#         model = Sequential()
#         model.add(Dense(neurons, input_dim=X_train.shape[1], activation=activation))
#         #The number of hidden layers can be increased
#         model.add(Dense(2, activation=activation))
#         # Final output layer
#         model.add(Dense(1, kernel_initializer='normal'))
#         # Compile model
#         optimizer = SGD(lr=learn_rate, momentum=momentum)
#         model.compile(loss='mean_squared_error', optimizer='adam')
#         return model  
#     #Add Deep Learning Regressor
# if ( EnableDeepLearningRegreesorFlag == 1):
#     models.append(('DNN', KerasRegressor(build_fn=create_model, epochs=100, batch_size=100, verbose=1)))  
    
# X_train_LSTM, X_validation_LSTM = np.array(X_train), np.array(X_validation)
# Y_train_LSTM, Y_validation_LSTM = np.array(Y_train), np.array(Y_validation)
# X_train_LSTM= X_train_LSTM.reshape((X_train_LSTM.shape[0], 1, X_train_LSTM.shape[1]))
# X_validation_LSTM= X_validation_LSTM.reshape((X_validation_LSTM.shape[0], 1, X_validation_LSTM.shape[1]))
# print(X_train_LSTM.shape, Y_train_LSTM.shape, X_validation_LSTM.shape, Y_validation_LSTM.shape)   


# def create_LSTMmodel(neurons=12, learn_rate = 0.01, momentum=0):
#         # create model
#     model = Sequential()
#     model.add(LSTM(50, input_shape=(X_train_LSTM.shape[1], X_train_LSTM.shape[2])))
#     #More number of cells can be added if needed 
#     model.add(Dense(1))
#     optimizer = SGD(lr=learn_rate, momentum=momentum)
#     model.compile(loss='mse', optimizer='adam')
#     return model
# LSTMModel = create_LSTMmodel(12, learn_rate = 0.01, momentum=0)
# LSTMModel_fit = LSTMModel.fit(X_train_LSTM, Y_train_LSTM, validation_data=(X_validation_LSTM, Y_validatio
    
# #Visual plot to check if the error is reducing
# pyplot.plot(LSTMModel_fit.history['loss'], label='train')
# pyplot.plot(LSTMModel_fit.history['val_loss'], label='test')
# pyplot.legend()
# pyplot.show()

In [None]:
# #Grid Search for LSTM Model

# # evaluate an LSTM model for a given (neurons, learn_rate, momentum) combination
# def evaluate_LSTM_model(neurons=12, learn_rate = 0.01, momentum=0):
#     #predicted = list()     
#     LSTMModel = create_LSTMmodel(neurons, learn_rate, momentum)
#     LSTMModel_fit = LSTMModel.fit(X_train_LSTM, Y_train_LSTM,epochs=50, batch_size=72, verbose=0, shuffle=False)
#     predicted = LSTMModel.predict(X_validation_LSTM)
#     error = mean_squared_error(predicted, Y_validation)
#     return error

# # evaluate combinations of different variables of LSTM Model
# def evaluate_combinations_LSTM(neurons, learn_rate, momentum): 
#     best_score, best_cfg = float("inf"), None
#     for n in neurons:
#         for l in learn_rate:
#             for m in momentum:
#                 combination = (n,l,m)                
#                 try:
#                     mse = evaluate_LSTM_model(n,l,m)                    
#                     if mse < best_score:
#                         best_score, best_cfg = mse, combination
#                     print('LSTM%s MSE=%.7f' % (combination,mse))
#                 except:
#                     continue
#     print('Best LSTM%s MSE=%.7f' % (best_cfg, best_score))
    
# # evaluate parameters
# neurons = [1, 5]
# learn_rate = [0.001, 0.3]
# momentum = [0.0, 0.9]
# #Other Parameters can be modified as well
# batch_size = [10, 20, 40, 60, 80, 100]
# epochs = [10, 50, 100]
# warnings.filterwarnings("ignore")
# evaluate_combinations_LSTM(neurons,learn_rate,momentum)  


In [None]:


# ########
# #grid search example
# # transform list into supervised learning format
# def series_to_supervised(data, n_in=1, n_out=1):
#  df = DataFrame(data)
#  cols = list()
#  # input sequence (t-n, ... t-1)
#  for i in range(n_in, 0, -1):
#  cols.append(df.shift(i))
#  # forecast sequence (t, t+1, ... t+n)
#  for i in range(0, n_out):
#  cols.append(df.shift(-i))
#  # put it all together
#  agg = concat(cols, axis=1)
#  # drop rows with NaN values
#  agg.dropna(inplace=True)
#  return agg.values
 
# # root mean squared error or rmse
# def measure_rmse(actual, predicted):
#  return sqrt(mean_squared_error(actual, predicted))
 
# # difference dataset
# def difference(data, order):
#  return [data[i] - data[i - order] for i in range(order, len(data))]
 
# # fit a model
# def model_fit(train, config):
#  # unpack config
#  n_input, n_nodes, n_epochs, n_batch, n_diff = config
#  # prepare data
#  if n_diff > 0:
#  train = difference(train, n_diff)
#  # transform series into supervised format
#  data = series_to_supervised(train, n_in=n_input)
#  # separate inputs and outputs
#  train_x, train_y = data[:, :-1], data[:, -1]
#  # reshape input data into [samples, timesteps, features]
#  n_features = 1
#  train_x = train_x.reshape((train_x.shape[0], train_x.shape[1], n_features))
#  # define model
#  model = Sequential()
#  model.add(LSTM(n_nodes, activation='relu', input_shape=(n_input, n_features)))
#  model.add(Dense(n_nodes, activation='relu'))
#  model.add(Dense(1))
#  model.compile(loss='mse', optimizer='adam')
#  # fit model
#  model.fit(train_x, train_y, epochs=n_epochs, batch_size=n_batch, verbose=0)
#  return model
 
# # forecast with the fit model
# def model_predict(model, history, config):
#  # unpack config
#  n_input, _, _, _, n_diff = config
#  # prepare data
#  correction = 0.0
#  if n_diff > 0:
#  correction = history[-n_diff]
#  history = difference(history, n_diff)
#  # reshape sample into [samples, timesteps, features]
#  x_input = array(history[-n_input:]).reshape((1, n_input, 1))
#  # forecast
#  yhat = model.predict(x_input, verbose=0)
#  return correction + yhat[0]
 
# # walk-forward validation for univariate data
# def walk_forward_validation(data, n_test, cfg):
#  predictions = list()
#  # split dataset
#  train, test = train_test_split(data, n_test)
#  # fit model
#  model = model_fit(train, cfg)
#  # seed history with training dataset
#  history = [x for x in train]
#  # step over each time-step in the test set
#  for i in range(len(test)):
#  # fit model and make forecast for history
#  yhat = model_predict(model, history, cfg)
#  # store forecast in list of predictions
#  predictions.append(yhat)
#  # add actual observation to history for the next loop
#  history.append(test[i])
#  # estimate prediction error
#  error = measure_rmse(test, predictions)
#  print(' > %.3f' % error)
#  return error
 
# # score a model, return None on failure
# def repeat_evaluate(data, config, n_test, n_repeats=10):
#  # convert config to a key
#  key = str(config)
#  # fit and evaluate the model n times
#  scores = [walk_forward_validation(data, n_test, config) for _ in range(n_repeats)]
#  # summarize score
#  result = mean(scores)
#  print('> Model[%s] %.3f' % (key, result))
#  return (key, result)
 
# # grid search configs
# def grid_search(data, cfg_list, n_test):
#  # evaluate configs
#  scores = [repeat_evaluate(data, cfg, n_test) for cfg in cfg_list]
#  # sort configs by error, asc
#  scores.sort(key=lambda tup: tup[1])
#  return scores
 
# # create a list of configs to try
# def model_configs():
#  # define scope of configs
#  n_input = [12]
#  n_nodes = [100]
#  n_epochs = [50]
#  n_batch = [1, 150]
#  n_diff = [12]
#  # create configs
#  configs = list()
#  for i in n_input:
#  for j in n_nodes:
#  for k in n_epochs:
#  for l in n_batch:
#  for m in n_diff:
#  cfg = [i, j, k, l, m]
#  configs.append(cfg)
#  print('Total configs: %d' % len(configs))
#  return configs
 
# # define dataset
# series = read_csv('monthly-airline-passengers.csv', header=0, index_col=0)
# data = series.values
# # data split
# n_test = 12
# # model configs
# cfg_list = model_configs()
# # grid search
# scores = grid_search(data, cfg_list, n_test)
# print('done')
# # list top 3 configs
# for cfg, error in scores[:3]:
#  print(cfg, error)


Cross-validation

In [None]:
# Compare every candidate model: K-fold cross-validation on the training set,
# then a fit on the full training set scored on both training and test data.
# NOTE(review): `models`, `num_folds`, `scoring`, X_train / Y_train and
# X_test / Y_test come from earlier cells. `scoring` is presumably a negated
# error metric such as 'neg_mean_squared_error' (hence the sign flip below)
# -- confirm.
names = []
kfold_results = []
train_results = []
test_results = []

for name, model in models:
    names.append(name)

    # K-fold analysis. No random_state is passed: without shuffle=True it has
    # no effect, and recent scikit-learn releases raise a ValueError for that
    # combination. The folds are contiguous blocks in row order.
    kfold = KFold(n_splits=num_folds)
    # Flip the sign so the score is a positive error: the lower the better.
    cv_results = -1 * cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    kfold_results.append(cv_results)

    # Full training period
    res = model.fit(X_train, Y_train)
    train_result = mean_squared_error(Y_train, res.predict(X_train))
    train_results.append(train_result)

    # Test results
    test_result = mean_squared_error(Y_test, res.predict(X_test))
    test_results.append(test_result)

    # Printed as: name: CV mean (CV std) train MSE test MSE
    msg = "%s: %f (%f) %f %f" % (name, cv_results.mean(), cv_results.std(), train_result, test_result)
    print(msg)

# One cross-validation pass per model is enough; `results` is kept as a second
# name for the same per-model score arrays because later cells plot it.
results = kfold_results

# K-fold score distribution, one box per model.
fig, ax = plt.subplots(figsize=(15, 8))
fig.suptitle('Algorithm Comparison: Kfold results')
ax.boxplot(kfold_results)
ax.set_xticklabels(names)
plt.show()

# Train vs. test error, one pair of bars per model.
ind = np.arange(len(names))  # the x locations for the groups
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(15, 8))
fig.suptitle('Algorithm Comparison')
ax.bar(ind - width/2, train_results, width=width, label='Train Error')
ax.bar(ind + width/2, test_results, width=width, label='Test Error')
ax.legend()
ax.set_xticks(ind)
ax.set_xticklabels(names)
plt.show()
    






In [None]:
# Compare algorithms: box plot of the cross-validation scores collected in
# `results`, one box per entry of `names`.
# `plt` is the matplotlib alias bound by the notebook's import cell.
fig, ax = plt.subplots(figsize=(15, 8))
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

Forecast


In [None]:
# Fit a random-forest regressor on the training data and score it on the
# validation set. Inputs are used unscaled; the StandardScaler steps below
# are left disabled.
#scaler = StandardScaler().fit(X_train)
#rescaledX = scaler.transform(X_train)

# random_state reuses the notebook's `seed` (the one the cross-validation cell
# reads) so the forest, and therefore the printed scores, are repeatable.
model = RandomForestRegressor(n_estimators=250, random_state=seed)
model.fit(X_train, Y_train)

# Estimate accuracy on the validation set.
# mean_squared_error and r2_score are already imported in the notebook's
# import cell, so they are not re-imported here.
#rescaledValidationX = scaler.transform(X_validation)
predictions = model.predict(X_validation)
print(mean_squared_error(Y_validation, predictions))
print(r2_score(Y_validation, predictions))

In [None]:
# def create_lagged_features(series, lag=1):
#     # Create a DataFrame from the input series
#     df = pd.DataFrame(series)
    
#     # Create lagged features
#     for i in range(1, lag + 1):
#         df[f"lag_{i}"] = df.shift(i)
    
#     # Drop rows with missing values caused by shifting
#     df.dropna(inplace=True)
    
#     return df

    

# >>> import statsmodels.api as sm
# >>> data = sm.datasets.macrodata.load()
# >>> data = data.data[['year','quarter','realgdp','cpi']]
# >>> data = sm.tsa.add_lag(data, 'realgdp', lags=2)

def series_to_supervised(data, lag=1):
    """Frame a time series as a supervised-learning table of lagged inputs.

    The first column of ``data`` is kept at time ``t`` as the target. Every
    column of ``data`` is then added shifted by ``lag``, ..., ``1`` rows,
    named ``'<column>(t-<k>)'``. Rows containing any NaN are dropped, which
    removes the first ``lag`` rows because they have no complete history.

    Parameters
    ----------
    data : pandas.DataFrame or array-like
        Observations in time order, one row per time step. Anything that
        ``pd.DataFrame`` accepts (DataFrame, Series, 1-D or 2-D ndarray) works.
    lag : int, default 1
        Number of past time steps to include; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        The target column followed by the lagged columns, oldest lag first.
        The input object is not modified.

    Raises
    ------
    ValueError
        If ``lag`` is smaller than 1.
    """
    if lag < 1:
        raise ValueError("lag must be a positive integer, got %r" % (lag,))

    # Work on the DataFrame throughout, so plain arrays and Series (which have
    # no ``.iloc`` or second dimension) are handled the same way as DataFrames.
    df = pd.DataFrame(data)

    lagged = []
    for step in range(lag, 0, -1):
        shifted = df.shift(step)
        shifted.columns = ['%s(t-%d)' % (col, step) for col in df.columns]
        lagged.append(shifted)

    # Target at time t (first column only), joined with the lags on the index.
    target = df.iloc[:, [0]]
    agg = target.join(pd.concat(lagged, axis=1))
    return agg.dropna()
# Rebind `dataset` to its lag-1 supervised framing (target column plus every
# column shifted by one row). Not idempotent: running this cell again applies
# the framing to the already-framed table.
dataset= series_to_supervised(dataset,1)
