In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

import matplotlib.pyplot as plt

# reduce display precision on numpy arrays
np.set_printoptions(precision=5)

# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
df = pd.read_csv('C:/Users/Computer/Documents/bachelor/dataset.csv')
pd.options.mode.chained_assignment=None
# Overwrite 'csp' from row position 637 onwards with the mean of row positions 0..637.
# A single positional .iloc write replaces the chained form df['csp'][637:] = ...,
# which may assign into a temporary copy and leave df unchanged.
# NOTE(review): the averaging window 0:638 includes row 637, which is itself overwritten — confirm intended.
df.iloc[637:, df.columns.get_loc('csp')] = df['csp'].iloc[0:638].mean()

# Non-numeric entries in 'Index' become NaN instead of raising.
df['Index'] = pd.to_numeric(df['Index'], errors='coerce')

In [None]:
# Show the first five rows as a quick sanity check of the loaded frame.
df.head(5)

In [None]:
# Split columns by position: the last 30 columns are the targets, column 0 is skipped,
# and everything in between is taken as features.
# NOTE(review): X and Y are not referenced again in the cells visible here.
X = df.iloc[:, 1:-30].values
Y = df.iloc[:, -30:].values

# NOTE(review): feature_names uses the fixed slice 1:18 while X uses 1:-30; the two only
# agree when the frame has exactly 17 feature columns — confirm.
feature_names = df.columns[1:18]
target_names = df.columns[-30:] 

from sklearn.preprocessing import StandardScaler

X_df = df[feature_names]
Y_df = df[target_names]

# Standardise the features; the result gets a fresh default integer index.
# NOTE(review): the scaler is fitted on the full sample, so every later training window
# sees statistics computed from future rows — confirm this is acceptable.
scaler = StandardScaler()
X_df = pd.DataFrame(scaler.fit_transform(X_df), columns=X_df.columns)

# Modelling frame: targets first, then the scaled features, joined on the index.
data = pd.concat([Y_df, X_df], axis=1)

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

def lasso_feature_selection(train, test, y_key, model_config_ols, jobs=24):
    """Return the columns of ``train`` that a cross-validated LASSO keeps for ``y_key``.

    Every column of ``train`` except ``y_key`` is offered as a predictor. The
    penalty is chosen by ``LassoCV`` over a ``TimeSeriesSplit``; columns with a
    non-zero coefficient are returned as a pandas Index.

    ``model_config_ols`` must provide ``"tscv_splits"`` and ``"n_alphas"``.
    ``test`` is accepted but not used here.
    """
    cv_splitter = TimeSeriesSplit(n_splits=model_config_ols["tscv_splits"])
    selector = LassoCV(
        n_alphas=model_config_ols["n_alphas"],
        cv=cv_splitter,
        n_jobs=jobs,
        random_state=42,
        max_iter=20000,
    )

    predictors = train.drop(y_key, axis=1)
    selector.fit(predictors, train[y_key])

    # Keep only the predictors whose coefficient was not shrunk to zero.
    nonzero_mask = selector.coef_ != 0
    return train.drop(y_key, axis=1).columns[nonzero_mask]

def ols(train, test, y_key, selected_features_ols):
    """Fit OLS of ``y_key`` on the selected columns and forecast the rows of ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing ``y_key`` and every column in ``selected_features_ols``.
    y_key : str
        Name of the target column.
    selected_features_ols : sequence or pandas.Index
        Predictor column names; may be empty.

    Returns
    -------
    pandas.Series
        Forecasts indexed like ``test`` and named ``"ols_post_lasso_y_hat"``.
    """
    y_train = train[y_key]

    # LASSO can shrink every coefficient to zero. LinearRegression cannot be fitted on
    # zero columns, so fall back to the intercept-only OLS fit: the training mean.
    if len(selected_features_ols) == 0:
        return pd.Series(y_train.mean(), index=test.index, name="ols_post_lasso_y_hat")

    X_train = train[selected_features_ols]
    X_test = test[selected_features_ols]

    ols_model = LinearRegression().fit(X_train, y_train)
    y_hat_ols = pd.Series(ols_model.predict(X_test), index=test.index).rename("ols_post_lasso_y_hat")

    return y_hat_ols

def multioutput_ols_post_lasso(train, test, y_keys, model_config_ols, jobs=24):
    """Run LASSO selection followed by OLS separately for every target in ``y_keys``.

    For each target the other target columns are removed from ``train`` and ``test``
    before selection and fitting. Otherwise the same-period values of the other
    targets would be offered as predictors and then read from the test row, which
    is information not available at forecast time.

    Returns a dict mapping each target name to the forecast Series from ``ols``.
    """
    y_keys = list(y_keys)
    preds_ols = {}
    for y_key in y_keys:
        other_targets = [key for key in y_keys if key != y_key]
        train_k = train.drop(columns=other_targets)
        test_k = test.drop(columns=other_targets)

        selected_features_ols = lasso_feature_selection(train_k, test_k, y_key, model_config_ols, jobs)
        preds_ols[y_key] = ols(train_k, test_k, y_key, selected_features_ols)
    return preds_ols

# Settings read by lasso_feature_selection: size of the LassoCV alpha grid and
# number of TimeSeriesSplit folds.
model_config_ols = {
    "n_alphas": 100,
    "tscv_splits": 20
}

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Number of leading rows that form the first training window.
train_start_size = 200

# Initialize empty dictionaries to store results
Actuals_ols = {col: pd.Series(dtype=float, name=col) for col in Y_df.columns}
Forecasts_ols = {col: pd.Series(dtype=float, name=col) for col in Y_df.columns}
Forecast_Errors_ols = {col: pd.Series(dtype=float, name=col) for col in Y_df.columns}
MAE_ols = {}
MSE_ols = {}

# Expanding window loop: train on rows [0, t), forecast the single row t.
# NOTE(review): the range stops at len(data) - 2, so the last row is never forecast — confirm intended.
for t in range(train_start_size, len(data) - 1):
    train = data.iloc[:t]
    test = data.iloc[t : t + 1]

    preds_ols = multioutput_ols_post_lasso(train, test, Y_df.columns, model_config_ols, jobs=10)

    # Append this step's forecast, realised value and error (actual - forecast) per target.
    for col in Y_df.columns:
        Forecasts_ols[col] = pd.concat([Forecasts_ols[col], preds_ols[col]])
        Actuals_ols[col] = pd.concat([Actuals_ols[col], test[col]])
        Forecast_Errors_ols[col] = pd.concat([Forecast_Errors_ols[col], test[col] - preds_ols[col]])
        
    # Print progress every 25 data points
    if t % 25 == 0:
        print(f"Progress: {t}/{len(data) - 1}")

# Calculate MAE and MSE for each target variable
for col in Y_df.columns:
    MAE_ols[col] = mean_absolute_error(Actuals_ols[col], Forecasts_ols[col])
    MSE_ols[col] = mean_squared_error(Actuals_ols[col], Forecasts_ols[col])

# Dump the full series and the two error metrics for every target.
for col in Y_df.columns:
    print(f"Actuals for {col}:")
    print(Actuals_ols[col])
    print(f"Forecasts for {col}:")
    print(Forecasts_ols[col])
    print(f"Forecast Errors for {col}:")
    print(Forecast_Errors_ols[col])
    print(f"Mean Absolute Error for {col}: {MAE_ols[col]}")
    print(f"Mean Squared Error for {col}: {MSE_ols[col]}")

In [None]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout
from scikeras.wrappers import KerasRegressor
from keras.optimizers import Adam
from tensorflow.keras.models import Model


def create_nn(hidden_layers, neurons, dropout_rate, learning_rate, alpha):
    """Build and compile a feed-forward network with 17 inputs and 30 linear outputs.

    Parameters
    ----------
    hidden_layers : int
        Total number of ReLU layers; the first one is always present.
    neurons : int
        Width of every ReLU layer.
    dropout_rate : float
        Dropout applied after each ReLU layer beyond the first.
    learning_rate : float
        Adam learning rate.
    alpha : float
        L2 penalty factor for the kernels of the ReLU layers. The string
        ``'l2'`` used previously always applied the library default, so this
        argument (and any grid search over it) had no effect.

    Returns
    -------
    A compiled ``Sequential`` model with mean-squared-error loss.
    """
    # Imported locally because the notebook's import cell does not provide it.
    from keras import regularizers

    model = Sequential()
    model.add(Dense(neurons, input_dim=17, kernel_initializer='normal', activation='relu',
                    kernel_regularizer=regularizers.l2(alpha)))

    for _ in range(hidden_layers - 1):
        model.add(Dense(neurons, activation='relu', kernel_regularizer=regularizers.l2(alpha)))
        model.add(Dropout(dropout_rate))

    # One linear output per target column.
    model.add(Dense(30, activation='linear'))

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='mean_squared_error', optimizer=optimizer)
    return model

def initial_hyperparameter_tuning(train, y_keys, model_config_ffn):
    """Grid-search the network hyperparameters on ``train`` and return the best setting.

    All columns in ``y_keys`` are the joint targets; the remaining columns are
    standardised and used as inputs. Candidates are scored by negative MSE over a
    ``TimeSeriesSplit`` with ``model_config_ffn["tscv_splits"]`` folds.

    Returns the ``best_params_`` dict of the fitted ``GridSearchCV``.
    """
    features = train.drop(y_keys, axis=1)
    targets = train[y_keys]

    cv_splitter = TimeSeriesSplit(n_splits=model_config_ffn["tscv_splits"])

    features_scaled = StandardScaler().fit_transform(features)

    search_space = {
        "hidden_layers": [1, 2, 3],
        "neurons": [16, 32, 64],
        "dropout_rate": [0.1, 0.2],
        "learning_rate": [0.01, 0.1],
        "alpha": [0.01, 0.1, 0.2]
    }

    # The keyword defaults only seed the wrapper; the grid overrides them per candidate.
    base_estimator = KerasRegressor(
        model=create_nn,
        epochs=100,
        batch_size=10,
        verbose=0,
        alpha=0.01,
        learning_rate=0.01,
        dropout_rate=0.1,
        neurons=16,
        hidden_layers=1,
    )
    searcher = GridSearchCV(
        base_estimator,
        search_space,
        cv=cv_splitter,
        scoring="neg_mean_squared_error",
        n_jobs=1,
    )
    searcher.fit(features_scaled, targets)

    return searcher.best_params_

def nn_tuned(train, test, y_keys, best_params):
    """Train the network on ``train`` with ``best_params`` and forecast ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames holding the target columns ``y_keys`` plus the input columns.
    y_keys : list-like of str
        Target column names; all other columns are inputs.
    best_params : dict
        Must contain ``hidden_layers``, ``neurons``, ``dropout_rate``,
        ``learning_rate`` and ``alpha``.

    Returns
    -------
    pandas.DataFrame
        Forecasts indexed like ``test`` with one column per target.
    """
    X_train = train.drop(y_keys, axis=1)
    X_test = test.drop(y_keys, axis=1)
    y_train = train[y_keys]

    # Fit the scaler on the training window only and reuse it for the test row.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Use the argument, not the notebook-level ``best_params_ffn`` global, so the
    # function honours whatever the caller passes in.
    model = create_nn(best_params["hidden_layers"], best_params["neurons"], best_params["dropout_rate"],
                      best_params["learning_rate"], best_params["alpha"])
    model.fit(X_train_scaled, y_train, epochs=100, batch_size=10, verbose=0)

    y_hat_ffn = pd.DataFrame(model.predict(X_test_scaled), index=test.index, columns=y_keys)

    return y_hat_ffn


# Number of TimeSeriesSplit folds used by the network's grid search.
model_config_ffn = {
    "tscv_splits": 20
}


# Perform hyperparameter tuning on the initial training set
train_start_size = 200
initial_train = data.iloc[:train_start_size]

# Tuned once here and reused unchanged for every later window.
best_params_ffn = initial_hyperparameter_tuning(initial_train, Y_df.columns, model_config_ffn)

Actuals_ffn = {col: pd.Series(dtype=float, name=col) for col in Y_df.columns}
Forecasts_ffn = {col: pd.Series(dtype=float, name=col) for col in Y_df.columns}
Forecast_Errors_ffn = {col: pd.Series(dtype=float, name=col) for col in Y_df.columns}
MAE_ffn = {}
MSE_ffn = {}

# Expanding window loop: train on rows [0, t), forecast the single row t.
# NOTE(review): the range stops at len(data) - 2, so the last row is never forecast — confirm intended.
for t in range(train_start_size, len(data) - 1):
    train = data.iloc[:t]
    test = data.iloc[t : t + 1]

    # The network is rebuilt and retrained from scratch at every step.
    preds_ffn = nn_tuned(train, test, Y_df.columns, best_params_ffn)

    for col in Y_df.columns:
        Forecasts_ffn[col] = pd.concat([Forecasts_ffn[col], preds_ffn[col]])
        Actuals_ffn[col] = pd.concat([Actuals_ffn[col], test[col]])
        Forecast_Errors_ffn[col] = pd.concat([Forecast_Errors_ffn[col], test[col] - preds_ffn[col]])

    # Print progress every 25 data points
    if t % 25 == 0:
        print(f"Progress: {t}/{len(data) - 1}")

# Calculate MAE and MSE for each target variable
for col in Y_df.columns:
    MAE_ffn[col] = mean_absolute_error(Actuals_ffn[col], Forecasts_ffn[col])
    MSE_ffn[col] = mean_squared_error(Actuals_ffn[col], Forecasts_ffn[col])

# Dump the full series and the two error metrics for every target.
for col in Y_df.columns:
    print(f"Actuals for {col}:")
    print(Actuals_ffn[col])
    print(f"Forecasts for {col}:")
    print(Forecasts_ffn[col])
    print(f"Forecast Errors for {col}:")
    print(Forecast_Errors_ffn[col])
    print(f"Mean Absolute Error for {col}: {MAE_ffn[col]}")
    print(f"Mean Squared Error for {col}: {MSE_ffn[col]}")

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

def extratrees_feature_selection(train, y_key, jobs=24):
    """Return the columns of ``train`` that ``SelectFromModel`` keeps for ``y_key``.

    An ``ExtraTreesRegressor`` (``random_state=42``) is fitted on every column
    except ``y_key``; the already fitted forest is handed to ``SelectFromModel``
    and the supported columns are returned as a pandas Index.
    """
    predictors = train.drop(y_key, axis=1)
    response = train[y_key]

    forest = ExtraTreesRegressor(n_jobs=jobs, random_state=42)
    forest.fit(predictors, response)

    # prefit=True: reuse the fitted forest rather than refitting inside the selector.
    support_mask = SelectFromModel(forest, prefit=True).get_support()

    return train.drop(y_key, axis=1).columns[support_mask]

def initial_hyperparameter_tuning(train, y_key, selected_features_svr, model_config_svr):
    """Grid-search SVR hyperparameters for one target and return the best setting.

    Note: this definition replaces the earlier notebook function of the same name.

    The search covers kernel, ``C`` and ``epsilon`` on the columns in
    ``selected_features_svr``, scored by negative MSE over a ``TimeSeriesSplit``
    with ``model_config_svr["tscv_splits"]`` folds. The returned dict uses the
    pipeline-prefixed keys ``svr__kernel``, ``svr__C`` and ``svr__epsilon``.
    """
    features = train[selected_features_svr]
    response = train[y_key]

    cv_splitter = TimeSeriesSplit(n_splits=model_config_svr["tscv_splits"])

    svr_pipeline = Pipeline([("svr", SVR())])
    search_space = {
        "svr__kernel": ["linear", "rbf"],
        "svr__C": [0.1, 1, 10],
        "svr__epsilon": [0.01, 0.1, 1, 2]
    }

    searcher = GridSearchCV(
        svr_pipeline,
        search_space,
        cv=cv_splitter,
        scoring="neg_mean_squared_error",
        n_jobs=24,
    )
    searcher.fit(features, response)

    return searcher.best_params_

def svr_tuned(train, test, y_key, selected_features_svr, best_params_svr):
    """Fit an SVR for ``y_key`` with its tuned hyperparameters and forecast ``test``.

    ``best_params_svr`` is a dict keyed by target name; each entry holds
    ``svr__kernel``, ``svr__C`` and ``svr__epsilon``. The forecast is returned as
    a Series indexed like ``test`` and named ``"svr_tuned_y_hat"``.
    """
    tuned = best_params_svr[y_key]
    regressor = SVR(
        kernel=tuned["svr__kernel"],
        C=tuned["svr__C"],
        epsilon=tuned["svr__epsilon"],
    )
    regressor.fit(train[selected_features_svr], train[y_key])

    forecast = regressor.predict(test[selected_features_svr])
    return pd.Series(forecast, index=test.index).rename("svr_tuned_y_hat")

def multioutput_svr_tuned_extratrees(train, test, y_keys, best_params_svr, jobs=24):
    """Select features with extra-trees and forecast with a tuned SVR, per target.

    For each target the other target columns are removed from ``train`` and ``test``
    first. Otherwise the same-period values of the other targets would be eligible
    as predictors and read from the test row, which is information not available
    at forecast time.

    Returns a dict mapping each target name to the forecast Series from ``svr_tuned``.
    """
    y_keys = list(y_keys)
    preds_svr = {}
    for y_key in y_keys:
        other_targets = [key for key in y_keys if key != y_key]
        train_k = train.drop(columns=other_targets)
        test_k = test.drop(columns=other_targets)

        selected_features_svr = extratrees_feature_selection(train_k, y_key, jobs)
        preds_svr[y_key] = svr_tuned(train_k, test_k, y_key, selected_features_svr, best_params_svr)
    return preds_svr

# Number of TimeSeriesSplit folds used by the SVR grid search.
model_config_svr = {
    "tscv_splits": 20    
}

from sklearn.metrics import mean_absolute_error, mean_squared_error

# Perform hyperparameter tuning on the initial training set
train_start_size = 200
initial_train = data.iloc[:train_start_size]

# Feature selection on the initial window, one target at a time.
# NOTE(review): initial_train still contains the other target columns, so they are
# eligible as features here — confirm this is intended.
initial_selected_features = {}
for y_key in Y_df.columns:
    initial_selected_features[y_key] = extratrees_feature_selection(initial_train, y_key, jobs=10)
    
# Hyperparameters are tuned once per target on the initial window; the call resolves to
# the SVR version of initial_hyperparameter_tuning defined in this cell.
best_params = {}
for y_key in Y_df.columns:
    best_params[y_key] = initial_hyperparameter_tuning(initial_train, y_key, initial_selected_features[y_key], model_config_svr)

# Initialize empty dictionaries to store results
Actuals_svr = {col: pd.Series(dtype=float, name=col) for col in Y_df.columns}
Forecasts_svr = {col: pd.Series(dtype=float, name=col) for col in Y_df.columns}
Forecast_Errors_svr = {col: pd.Series(dtype=float, name=col) for col in Y_df.columns}
MAE_svr = {}
MSE_svr = {}

# Expanding window loop: train on rows [0, t), forecast the single row t.
# Features are re-selected at every step; the hyperparameters stay fixed.
for t in range(train_start_size, len(data) - 1):
    train = data.iloc[:t]
    test = data.iloc[t : t + 1]

    preds_svr = multioutput_svr_tuned_extratrees(train, test, Y_df.columns, best_params, jobs=10)

    for col in Y_df.columns:
        Forecasts_svr[col] = pd.concat([Forecasts_svr[col], preds_svr[col]])
        Actuals_svr[col] = pd.concat([Actuals_svr[col], test[col]])
        Forecast_Errors_svr[col] = pd.concat([Forecast_Errors_svr[col], test[col] - preds_svr[col]])

    # Print progress every 25 data points
    if t % 25 == 0:
        print(f"Progress: {t}/{len(data) - 1}")

# Calculate MAE and MSE for each target variable
for col in Y_df.columns:
    MAE_svr[col] = mean_absolute_error(Actuals_svr[col], Forecasts_svr[col])
    MSE_svr[col] = mean_squared_error(Actuals_svr[col], Forecasts_svr[col])

In [None]:
from sklearn.metrics import r2_score


# Initialize dictionaries for Forecasts_ensemble, Actuals_ensemble, R-squared values and Forecast_Errors_ensemble
Forecasts_ensemble = {}
Actuals_ensemble = {}
Forecast_Errors_ensemble = {}
MAE_ensemble = {}
MSE_ensemble = {}
R2 = {}

# Ensemble model: average the forecasts already stored by the three expanding-window runs.
# Refitting every model here would repeat the most expensive part of the notebook, and the
# retrained network would produce forecasts that differ from the ones reported for the
# individual models, so the ensemble would not be the average of its stated members.
for col in Y_df.columns:
    # Series addition aligns on the index; a row missing from any member becomes NaN
    # and makes the metric calls below fail loudly instead of being averaged away.
    combined = (Forecasts_ols[col] + Forecasts_ffn[col] + Forecasts_svr[col]) / 3

    Forecasts_ensemble[col] = combined.rename(col)
    Actuals_ensemble[col] = data[col].loc[combined.index]
    Forecast_Errors_ensemble[col] = (Actuals_ensemble[col] - combined).rename(col)

# Calculate MAE, MSE, and R-squared for each target variable
for col in Y_df.columns:
    MAE_ensemble[col] = mean_absolute_error(Actuals_ensemble[col], Forecasts_ensemble[col])
    MSE_ensemble[col] = mean_squared_error(Actuals_ensemble[col], Forecasts_ensemble[col])
    R2[col] = r2_score(Actuals_ensemble[col], Forecasts_ensemble[col])

for col in Y_df.columns:
    print(f"Actuals for {col}:")
    print(Actuals_ensemble[col])
    print(f"Forecasts for {col}:")
    print(Forecasts_ensemble[col])
    print(f"Forecast Errors for {col}:")
    print(Forecast_Errors_ensemble[col])
    print(f"Mean Absolute Error for {col}: {MAE_ensemble[col]}")
    print(f"Mean Squared Error for {col}: {MSE_ensemble[col]}")
    print(f"R-squared for {col}: {R2[col]}")

In [None]:
# One row per target with the ensemble's MAE, MSE and R2, plus a final "Mean" row.
performance_metrics = pd.DataFrame(columns=['Industry', 'MAE', 'MSE', 'R2'])

# Populate the DataFrame with the existing performance metrics
for i, col in enumerate(Y_df.columns):
    performance_metrics.loc[i, 'Industry'] = col
    performance_metrics.loc[i, 'MAE'] = MAE_ensemble[col]
    performance_metrics.loc[i, 'MSE'] = MSE_ensemble[col]
    performance_metrics.loc[i, 'R2'] = R2[col]

# Calculate the mean across all industries and add it
# (computed before the "Mean" row is appended, so that row is not part of the average)
mean_mae_ensemble = performance_metrics['MAE'].mean()
mean_mse_ensemble = performance_metrics['MSE'].mean()
mean_r2_ensemble = performance_metrics['R2'].mean()

performance_metrics.loc[len(Y_df.columns), ['Industry', 'MAE', 'MSE', 'R2']] = ['Mean', mean_mae_ensemble, mean_mse_ensemble, mean_r2_ensemble]

# Display the performance metrics DataFrame
print(performance_metrics)

In [None]:
# MSE of a constant historical-mean forecast, evaluated on the first train_start_size rows.
Historical_Mean_In_Sample_MSE = {}

# Calculate in-sample historical mean forecasts and MSE for each target variable
for col in Y_df.columns:
    in_sample_data = Y_df.iloc[:train_start_size]  # Use only in-sample data
    in_sample_historical_mean = in_sample_data[col].mean()
    in_sample_historical_mean_forecast = pd.Series([in_sample_historical_mean] * len(in_sample_data), index=in_sample_data[col].index)
    
    Historical_Mean_In_Sample_MSE[col] = mean_squared_error(in_sample_data[col], in_sample_historical_mean_forecast)
    
mean_in_sample_mse = sum(Historical_Mean_In_Sample_MSE.values()) / len(Historical_Mean_In_Sample_MSE)

# Create a DataFrame with columns for each performance metric
results_df = pd.DataFrame(columns=["Historical_Mean_In_Sample_MSE"])

# Put MSE results in to dataframe
for col in Y_df.columns:
    results_df.loc[col] = [Historical_Mean_In_Sample_MSE[col]]

# Calculate and add the mean in-sample MSE
results_df.loc["Mean"] = [mean_in_sample_mse]

# 1 - (ensemble MSE / historical-mean MSE), per target.
# NOTE(review): the numerator is the ensemble's out-of-sample MSE while the denominator is
# the benchmark's in-sample MSE over the first train_start_size rows, so the two are
# measured on different periods — confirm, or use an out-of-sample benchmark MSE instead.
R2_OS_SAE = {}
for col in Y_df.columns:
    R2_OS_SAE[col] = 1 - (performance_metrics.set_index("Industry").loc[col, "MSE"] / results_df.loc[col, "Historical_Mean_In_Sample_MSE"])

# Create a DataFrame to store the out-of-sample R-squared results
out_of_sample_predictability_df = pd.DataFrame(columns=["Industry", "R2_OS_SAE"])

# put out-of-sample R-squared values in to dataframe
for i, col in enumerate(Y_df.columns):
    out_of_sample_predictability_df.loc[i, "Industry"] = col
    out_of_sample_predictability_df.loc[i, "R2_OS_SAE"] = R2_OS_SAE[col]

# Calculate the mean out-of-sample R-squared across all industries and add it
mean_r2_os = out_of_sample_predictability_df["R2_OS_SAE"].mean()
out_of_sample_predictability_df.loc[len(Y_df.columns), ["Industry", "R2_OS_SAE"]] = ["Mean", mean_r2_os]

print(out_of_sample_predictability_df)

In [None]:
import matplotlib.pyplot as plt

# One figure per target: realised values (solid) against ensemble forecasts (dashed).
for col in Y_df.columns:
    plt.figure(figsize=(12, 6))
    plt.plot(Actuals_ensemble[col].index, Actuals_ensemble[col].values, label='Actual Returns', linestyle='-', linewidth=1)
    plt.plot(Forecasts_ensemble[col].index, Forecasts_ensemble[col].values, label='Forecasted Returns', linestyle='--', linewidth=1)
    plt.title(f"{col} Actual vs. Forecasted Returns")
    # NOTE(review): the x values are the Series index, not an explicit date column — confirm the label.
    plt.xlabel('Date')
    plt.ylabel('Returns')
    plt.legend()
    plt.show()

In [None]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Number of leading rows that form the first training window.
train_start_size = 200

# Initialize empty dictionaries to store results
Actuals_historical = {col: pd.Series(dtype=float, name=col) for col in Y_df.columns}
Historical_Mean_Forecasts = {col: pd.Series(dtype=float, name=col) for col in Y_df.columns}
Historical_Mean_Forecast_Errors = {col: pd.Series(dtype=float, name=col) for col in Y_df.columns}
Historical_Mean_MAE = {}
Historical_Mean_MSE = {}
Historical_Mean_R2 = {}

# Expanding window loop: the forecast for row t is the mean of rows [0, t) of the same target.
for t in range(train_start_size, len(Y_df) - 1):
    train = Y_df.iloc[:t]
    test = Y_df.iloc[t : t + 1]

    for col in Y_df.columns:
        historical_mean = train[col].mean()
        historical_mean_forecast = pd.Series([historical_mean], index=test[col].index)
        
        Historical_Mean_Forecasts[col] = pd.concat([Historical_Mean_Forecasts[col], historical_mean_forecast])
        Actuals_historical[col] = pd.concat([Actuals_historical[col], test[col]])
        Historical_Mean_Forecast_Errors[col] = pd.concat([Historical_Mean_Forecast_Errors[col], test[col] - historical_mean_forecast])
        
    # Print progress every 25 data points
    if t % 25 == 0:
        print(f"Progress: {t}/{len(Y_df) - 1}")

# Calculate MAE, MSE, and R-squared for each target variable
for col in Y_df.columns:
    Historical_Mean_MAE[col] = mean_absolute_error(Actuals_historical[col], Historical_Mean_Forecasts[col])
    Historical_Mean_MSE[col] = mean_squared_error(Actuals_historical[col], Historical_Mean_Forecasts[col])
    Historical_Mean_R2[col] = r2_score(Actuals_historical[col], Historical_Mean_Forecasts[col])

# Dump the full series and the three metrics for every target.
for col in Y_df.columns:
    print(f"Actuals for {col}:")
    print(Actuals_historical[col])
    print(f"Historical Mean Forecasts for {col}:")
    print(Historical_Mean_Forecasts[col])
    print(f"Historical Mean Forecast Errors for {col}:")
    print(Historical_Mean_Forecast_Errors[col])
    print(f"Historical Mean MAE for {col}: {Historical_Mean_MAE[col]}")
    print(f"Historical Mean MSE for {col}: {Historical_Mean_MSE[col]}")
    print(f"Historical Mean R-squared for {col}: {Historical_Mean_R2[col]}")

In [None]:
def ols(train, test, y_key, selected_features):
    """Fit OLS of ``y_key`` on ``selected_features`` and forecast the rows of ``test``.

    Note: this definition replaces the earlier notebook function of the same name.
    The forecast Series is indexed like ``test`` and named ``"ols_y_hat"``.
    """
    regression = LinearRegression().fit(train[selected_features], train[y_key])
    forecast = regression.predict(test[selected_features])
    return pd.Series(forecast, index=test.index).rename("ols_y_hat")

def multioutput_ols(train, test, y_keys):
    """Forecast every target in ``y_keys`` with a plain OLS on all non-target columns.

    Only columns outside ``y_keys`` are used as predictors. Using every column except
    the current target would include the same-period values of the other targets,
    which are read from the test row and are not available at forecast time.

    Returns a dict mapping each target name to the forecast Series from ``ols``.
    """
    y_keys = list(y_keys)
    # The predictor set is the same for every target, so compute it once.
    selected_features = train.drop(columns=y_keys).columns
    preds = {}
    for y_key in y_keys:
        preds[y_key] = ols(train, test, y_key, selected_features)
    return preds

# NOTE(review): only Y_diff.columns is used below (identical to Y_df.columns); the train
# and test rows still come from `data`, so the differenced values themselves are never modelled.
Y_diff = Y_df.diff().dropna()

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

train_start_size = 200

# Initialize empty dictionaries to store results
# NOTE(review): Actuals_ols, MAE_ols and MSE_ols reuse names from the post-LASSO cell and
# overwrite its results; later cells that read them see the values assigned here.
Actuals_ols = {col: pd.Series(dtype=float, name=col) for col in Y_diff.columns}
Forecasts_ols_standard = {col: pd.Series(dtype=float, name=col) for col in Y_diff.columns}
Forecast_Errors_ols_standard = {col: pd.Series(dtype=float, name=col) for col in Y_diff.columns}
MAE_ols = {}
MSE_ols = {}
R2_ols = {}

# Expanding window loop: train on rows [0, t), forecast the single row t.
for t in range(train_start_size, len(data) - 1):
    train = data.iloc[:t]
    test = data.iloc[t : t + 1]

    preds_ols = multioutput_ols(train, test, Y_diff.columns)

    for col in Y_diff.columns:
        Forecasts_ols_standard[col] = pd.concat([Forecasts_ols_standard[col], preds_ols[col]])
        Actuals_ols[col] = pd.concat([Actuals_ols[col], test[col]])
        Forecast_Errors_ols_standard[col] = pd.concat([Forecast_Errors_ols_standard[col], test[col] - preds_ols[col]])
        
    # Print progress every 25 data points
    if t % 25 == 0:
        print(f"Progress: {t}/{len(data) - 1}")

# Calculate MAE, MSE and R-squared for each target variable
for col in Y_diff.columns:
    MAE_ols[col] = mean_absolute_error(Actuals_ols[col], Forecasts_ols_standard[col])
    MSE_ols[col] = mean_squared_error(Actuals_ols[col], Forecasts_ols_standard[col])
    R2_ols[col] = r2_score(Actuals_ols[col], Forecasts_ols_standard[col])

In [None]:
import numpy as np
import scipy.stats as stats

def diebold_mariano_test(errors1, errors2, h=1, alternative='two_sided'):
    """
    Perform Diebold-Mariano test for equal forecast accuracy under squared-error loss.

    The loss differential is d_t = errors1_t**2 - errors2_t**2, so a positive
    statistic means the second model has the smaller squared errors.

    errors1 : array_like
        Forecast errors from the first model.
    errors2 : array_like
        Forecast errors from the second model.
    h : int, optional
        Forecast horizon, default is 1. Autocovariances of d up to lag h - 1
        enter the variance of the mean differential.
    alternative : str, optional
        Alternative hypothesis, one of 'two_sided', 'greater', 'less', default is 'two_sided'.

    Returns
    -------
    dm_stat : float
        Diebold-Mariano test statistic.
    p_value : float
        p-value for the Diebold-Mariano test, from the standard normal distribution.
        Both values are NaN when the estimated variance of the mean differential
        is not strictly positive (for example, identical error series).
    """
    # Convert first so plain lists and pandas Series behave the same as arrays.
    errors1 = np.asarray(errors1, dtype=float)
    errors2 = np.asarray(errors2, dtype=float)

    assert len(errors1) == len(errors2), "Error series must have the same length"
    assert alternative in ['two_sided', 'greater', 'less'], "Invalid alternative hypothesis"

    d = errors1**2 - errors2**2
    T = len(d)
    d_bar = np.mean(d)
    d_centered = d - d_bar
    gamma0 = np.var(d)

    # Sample autocovariance of d at lags 1..h-1, using the same overall mean and the
    # same 1/T normalisation as gamma0 so all terms are on one scale.
    autocov_d = [np.sum(d_centered[j:] * d_centered[:-j]) / T for j in range(1, h)]

    # Calculate variance of d_bar
    variance_d_bar = (gamma0 + 2 * sum(autocov_d)) / T

    # A zero, negative or NaN variance gives no usable statistic.
    if not variance_d_bar > 0:
        return float('nan'), float('nan')

    dm_stat = d_bar / np.sqrt(variance_d_bar)

    # norm.sf is the upper-tail probability, more accurate than 1 - cdf for large statistics.
    if alternative == 'two_sided':
        p_value = 2 * stats.norm.sf(abs(dm_stat))
    elif alternative == 'greater':
        p_value = stats.norm.sf(dm_stat)
    elif alternative == 'less':
        p_value = stats.norm.cdf(dm_stat)

    return dm_stat, p_value

# Compute forecast errors for each model and target variable
# (ensemble errors vs. the standard-OLS errors)
errors_ensemble = {col: Forecast_Errors_ensemble[col].values for col in Y_df.columns}
errors_ols = {col: Forecast_Errors_ols_standard[col].values for col in Y_df.columns}

# Perform Diebold-Mariano test
# The loss differential is first argument minus second (standard OLS minus ensemble).
for col in Y_df.columns:
    dm_stat, p_value = diebold_mariano_test(errors_ols[col], errors_ensemble[col])
    print(f"Diebold-Mariano test for {col}:")
    print(f"Test statistic: {dm_stat:.5f}")
    print(f"p-value: {p_value:.8f}")

In [None]:
# Compute forecast errors for each model and target variable
# Here errors_ols holds Forecast_Errors_ols, i.e. the post-LASSO OLS errors,
# unlike the other cells where the same name holds the standard-OLS errors.
errors_svr = {col: Forecast_Errors_svr[col].values for col in Y_df.columns}
errors_ols = {col: Forecast_Errors_ols[col].values for col in Y_df.columns}

# Perform Diebold-Mariano test
# The loss differential is first argument minus second (post-LASSO OLS minus SVR).
for col in Y_df.columns:
    dm_stat, p_value = diebold_mariano_test(errors_ols[col], errors_svr[col])
    print(f"Diebold-Mariano test for {col}:")
    print(f"Test statistic: {dm_stat:.5f}")
    print(f"p-value: {p_value:.8f}")

In [None]:
# Compute forecast errors for each model and target variable
# (post-LASSO OLS errors vs. the standard-OLS errors)
errors_ols_post_lasso = {col: Forecast_Errors_ols[col].values for col in Y_df.columns}
errors_ols = {col: Forecast_Errors_ols_standard[col].values for col in Y_df.columns}

# Perform Diebold-Mariano test
# The loss differential is first argument minus second (standard OLS minus post-LASSO OLS).
for col in Y_df.columns:
    dm_stat, p_value = diebold_mariano_test(errors_ols[col], errors_ols_post_lasso[col])
    print(f"Diebold-Mariano test for {col}:")
    print(f"Test statistic: {dm_stat:.5f}")
    print(f"p-value: {p_value:.8f}")

In [None]:
# Compute forecast errors for each model and target variable
# (feed-forward network errors vs. the standard-OLS errors)
errors_ffn = {col: Forecast_Errors_ffn[col].values for col in Y_df.columns}
errors_ols = {col: Forecast_Errors_ols_standard[col].values for col in Y_df.columns}

# Perform Diebold-Mariano test
# The loss differential is first argument minus second (standard OLS minus network).
for col in Y_df.columns:
    dm_stat, p_value = diebold_mariano_test(errors_ols[col], errors_ffn[col])
    print(f"Diebold-Mariano test for {col}:")
    print(f"Test statistic: {dm_stat:.5f}")
    print(f"p-value: {p_value:.8f}")

In [None]:
# Compute forecast errors for each model and target variable
# (post-LASSO OLS errors vs. the historical-mean benchmark errors)
errors_ols_post_lasso = {col: Forecast_Errors_ols[col].values for col in Y_df.columns}
errors_historic = {col: Historical_Mean_Forecast_Errors[col].values for col in Y_df.columns}

# Perform Diebold-Mariano test
# The loss differential is first argument minus second (historical mean minus post-LASSO OLS).
for col in Y_df.columns:
    dm_stat, p_value = diebold_mariano_test(errors_historic[col], errors_ols_post_lasso[col])
    print(f"Diebold-Mariano test for {col}:")
    print(f"Test statistic: {dm_stat:.5f}")
    print(f"p-value: {p_value:.8f}")

In [None]:
# Compute forecast errors for each model and target variable
# (ensemble errors vs. the historical-mean benchmark errors)
errors_ensemble = {col: Forecast_Errors_ensemble[col].values for col in Y_df.columns}
errors_historic = {col: Historical_Mean_Forecast_Errors[col].values for col in Y_df.columns}

# Perform Diebold-Mariano test
# The loss differential is first argument minus second (historical mean minus ensemble).
for col in Y_df.columns:
    dm_stat, p_value = diebold_mariano_test(errors_historic[col], errors_ensemble[col])
    print(f"Diebold-Mariano test for {col}:")
    print(f"Test statistic: {dm_stat:.5f}")
    print(f"p-value: {p_value:.8f}")

In [None]:
# Compute forecast errors for each model and target variable
# (SVR errors vs. the historical-mean benchmark errors)
errors_svr = {col: Forecast_Errors_svr[col].values for col in Y_df.columns}
errors_historic = {col: Historical_Mean_Forecast_Errors[col].values for col in Y_df.columns}

# Perform Diebold-Mariano test
# The loss differential is first argument minus second (historical mean minus SVR).
for col in Y_df.columns:
    dm_stat, p_value = diebold_mariano_test(errors_historic[col], errors_svr[col])
    print(f"Diebold-Mariano test for {col}:")
    print(f"Test statistic: {dm_stat:.5f}")
    print(f"p-value: {p_value:.8f}")

In [None]:
# Compute forecast errors for each model and target variable
# (feed-forward network errors vs. the historical-mean benchmark errors)
errors_ffn = {col: Forecast_Errors_ffn[col].values for col in Y_df.columns}
errors_historic = {col: Historical_Mean_Forecast_Errors[col].values for col in Y_df.columns}

# Perform Diebold-Mariano test
# The loss differential is first argument minus second (historical mean minus network).
for col in Y_df.columns:
    dm_stat, p_value = diebold_mariano_test(errors_historic[col], errors_ffn[col])
    print(f"Diebold-Mariano test for {col}:")
    print(f"Test statistic: {dm_stat:.5f}")
    print(f"p-value: {p_value:.8f}")

I det sidste afsnit af koden laves der en rotationsportefølje for hver af modellerne.

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import skew

def calculate_cagr(portfolio_returns):
    """Return the compound annual growth rate of a series of monthly simple returns.

    The returns are compounded into a total growth factor, which is then
    annualised using ``len(portfolio_returns) / 12`` as the number of years.
    """
    growth_factor = np.prod(1 + portfolio_returns)
    n_years = len(portfolio_returns) / 12
    return (growth_factor ** (1 / n_years)) - 1

def calculate_annual_sharpe_ratio(portfolio_returns, risk_free_rate):
    """Return the annualised Sharpe ratio of monthly returns.

    ``risk_free_rate`` is an annual rate; one twelfth of it is subtracted from
    every monthly return. The mean-to-standard-deviation ratio of those excess
    returns is scaled by ``sqrt(12)``.
    """
    monthly_risk_free = risk_free_rate / 12
    excess = portfolio_returns - monthly_risk_free
    monthly_ratio = np.mean(excess) / np.std(excess)
    return monthly_ratio * np.sqrt(12)

def calculate_mppm(portfolio_returns, risk_free_rate, rho=3, periods_per_year=12):
    """Return the manipulation-proof performance measure (MPPM) of periodic returns.

    Implements the measure of Goetzmann, Ingersoll, Spiegel and Welch (2007):

        MPPM = 1 / ((1 - rho) * dt) * ln( mean( ((1 + r_t) / (1 + rf_t)) ** (1 - rho) ) )

    with ``dt = 1 / periods_per_year`` and ``rf_t = risk_free_rate / periods_per_year``.
    The result is an annualised, risk-adjusted excess return. The earlier expression
    ``(1 + mean) / (1 + skew)`` did not correspond to this measure.

    Parameters
    ----------
    portfolio_returns : array_like
        Simple returns per period (decimal, e.g. 0.01 for 1%).
    risk_free_rate : float
        Annual risk-free rate (decimal).
    rho : float, optional
        Relative risk-aversion coefficient; must differ from 1. Default 3.
    periods_per_year : int, optional
        Number of return periods per year. Default 12 (monthly).

    Returns
    -------
    float
    """
    returns = np.asarray(portfolio_returns, dtype=float)
    risk_free_per_period = risk_free_rate / periods_per_year

    # Gross portfolio return relative to the gross risk-free return, per period.
    relative_growth = (1 + returns) / (1 + risk_free_per_period)

    delta_t = 1 / periods_per_year
    return np.log(np.mean(relative_growth ** (1 - rho))) / ((1 - rho) * delta_t)

def calculate_max_drawdown(portfolio_returns):
    """Return the largest peak-to-trough decline of the compounded return path.

    ``portfolio_returns`` must be a pandas Series of simple returns. The result
    is the minimum of ``wealth / running_peak - 1`` and is therefore zero or negative.
    """
    wealth = (1 + portfolio_returns).cumprod()
    running_peak = wealth.expanding().max()
    return ((wealth / running_peak) - 1).min()

def rotation_portfolio_strategy(Forecasts_p, Actuals_p, n_positions=6, verbose=True):
    """Simulate a long-short rotation portfolio driven by per-period forecasts.

    Each period the industries are ranked by forecast. The portfolio is long an
    equal-weighted basket of the ``n_positions`` highest forecasts and short an
    equal-weighted basket of the ``n_positions`` lowest. The period return is the
    realised long-basket return minus the realised short-basket return.

    Parameters
    ----------
    Forecasts_p : dict of str -> pandas.Series
        Forecasts per industry, one value per period.
    Actuals_p : dict of str -> pandas.Series
        Realised returns per industry, positionally aligned with the forecasts.
        Values are divided by 100, i.e. treated as percentages.
    n_positions : int, optional
        Number of industries in each leg. Default 6.
    verbose : bool, optional
        Print a header and the selection for the first 100 periods. Default True.

    Returns
    -------
    pandas.Series
        Portfolio return per period (decimal), named ``'Portfolio Returns'``.
    """
    portfolio_returns = []

    if verbose:
        print("Month | Long Industries | Short Industries | Monthly Return")
        print("-" * 65)

    # Without any forecast series there are no periods to simulate.
    n_periods = len(next(iter(Forecasts_p.values()))) if Forecasts_p else 0

    for t in range(n_periods):
        forecasts_t = {col: Forecasts_p[col].iloc[t] for col in Forecasts_p}
        sorted_industries = sorted(forecasts_t, key=forecasts_t.get)
        top_group = sorted_industries[-n_positions:]
        bottom_group = sorted_industries[:n_positions]

        long_return = sum(Actuals_p[col].iloc[t] / 100 for col in top_group) / n_positions
        bottom_return = sum(Actuals_p[col].iloc[t] / 100 for col in bottom_group) / n_positions

        # A short position earns the negative of the basket's return. Negating the
        # basket and then subtracting it again would make the portfolio long the bottom group.
        portfolio_return = long_return - bottom_return

        portfolio_returns.append(portfolio_return)

        # Print chosen industries and the return for the current month (first 100 months only)
        if verbose and t < 100:
            print(f"{t+1:5} | {top_group} | {bottom_group} | {portfolio_return * 100:.2f}%")

    return pd.Series(portfolio_returns, name='Portfolio Returns', dtype=float)

# Use the provided risk-free rate (annual, decimal)
risk_free_rate = 0.02

# Post-LASSO OLS forecasts.
# `years` is computed only after portfolio_returns exists; evaluating it before the
# strategy has run fails on a fresh kernel.
Actuals_p = Actuals_ols
Forecasts_p = Forecasts_ols

# Calculate the portfolio returns using the rotation strategy
portfolio_returns = rotation_portfolio_strategy(Forecasts_p, Actuals_p)

cagr_ols = calculate_cagr(portfolio_returns)
sharpe_ratio_ols = calculate_annual_sharpe_ratio(portfolio_returns, risk_free_rate)
mppm_ols = calculate_mppm(portfolio_returns, risk_free_rate)
max_drawdown_ols = calculate_max_drawdown(portfolio_returns)
years = len(portfolio_returns) / 12

# Print results
# calculate_cagr already returns an annual rate, so it is printed without further scaling.
print(f"CAGR_ols: {cagr_ols:.4f}")
print(f"Annual Sharpe Ratio ols: {sharpe_ratio_ols:.4f}")
print(f"MPPM ols: {mppm_ols:.4f}")
print(f"Maximum Drawdown ols: {max_drawdown_ols:.4f}")

In [None]:
# SVR forecasts.
Actuals_p = Actuals_svr
Forecasts_p = Forecasts_svr

# Calculate the portfolio returns using the rotation strategy
portfolio_returns = rotation_portfolio_strategy(Forecasts_p, Actuals_p)

cagr_svr = calculate_cagr(portfolio_returns)
sharpe_ratio_svr = calculate_annual_sharpe_ratio(portfolio_returns, risk_free_rate)
mppm_svr = calculate_mppm(portfolio_returns, risk_free_rate)
max_drawdown_svr = calculate_max_drawdown(portfolio_returns)
years = len(portfolio_returns) / 12

# Print results
# calculate_cagr already returns an annual rate, so it is printed without further scaling.
print(f"CAGR svr: {cagr_svr:.4f}")
print(f"Annual Sharpe Ratio svr: {sharpe_ratio_svr:.4f}")
print(f"MPPM svr: {mppm_svr:.4f}")
print(f"Maximum Drawdown svr: {max_drawdown_svr:.4f}")

In [None]:
# Feed-forward network forecasts.
Actuals_p = Actuals_ffn
Forecasts_p = Forecasts_ffn

# Calculate the portfolio returns using the rotation strategy
portfolio_returns = rotation_portfolio_strategy(Forecasts_p, Actuals_p)

cagr_ffn = calculate_cagr(portfolio_returns)
sharpe_ratio_ffn = calculate_annual_sharpe_ratio(portfolio_returns, risk_free_rate)
mppm_ffn = calculate_mppm(portfolio_returns, risk_free_rate)
max_drawdown_ffn = calculate_max_drawdown(portfolio_returns)
years = len(portfolio_returns) / 12

# Print results
# calculate_cagr already returns an annual rate, so it is printed without further scaling.
print(f"CAGR ffn: {cagr_ffn:.4f}")
print(f"Annual Sharpe Ratio ffn: {sharpe_ratio_ffn:.4f}")
print(f"MPPM ffn: {mppm_ffn:.4f}")
print(f"Maximum Drawdown ffn: {max_drawdown_ffn:.4f}")

In [None]:
# Historical-mean benchmark forecasts, evaluated against the realised values in Actuals_ols.
Actuals_p = Actuals_ols
Forecasts_p = Historical_Mean_Forecasts

# Calculate the portfolio returns using the rotation strategy
portfolio_returns = rotation_portfolio_strategy(Forecasts_p, Actuals_p)

cagr_historical = calculate_cagr(portfolio_returns)
sharpe_ratio_historical = calculate_annual_sharpe_ratio(portfolio_returns, risk_free_rate)
mppm_historical = calculate_mppm(portfolio_returns, risk_free_rate)
max_drawdown_historical = calculate_max_drawdown(portfolio_returns)
years = len(portfolio_returns) / 12

# Print results
# calculate_cagr already returns an annual rate, so it is printed without further scaling.
print(f"CAGR Historic: {cagr_historical:.4f}")
print(f"Annual Sharpe Ratio Historic: {sharpe_ratio_historical:.4f}")
print(f"MPPM Historic: {mppm_historical:.4f}")
print(f"Maximum Drawdown Historic: {max_drawdown_historical:.4f}")

In [None]:
# Ensemble forecasts.
Actuals_p = Actuals_ensemble
Forecasts_p = Forecasts_ensemble

# Calculate the portfolio returns using the rotation strategy
portfolio_returns = rotation_portfolio_strategy(Forecasts_p, Actuals_p)

cagr_ensemble = calculate_cagr(portfolio_returns)
sharpe_ratio_ensemble = calculate_annual_sharpe_ratio(portfolio_returns, risk_free_rate)
mppm_ensemble = calculate_mppm(portfolio_returns, risk_free_rate)
max_drawdown_ensemble = calculate_max_drawdown(portfolio_returns)
years = len(portfolio_returns) / 12

# Print results
# calculate_cagr already returns an annual rate, so it is printed without further scaling.
print(f"CAGR Ensemble: {cagr_ensemble:.4f}")
print(f"Annual Sharpe Ratio Ensemble: {sharpe_ratio_ensemble:.4f}")
print(f"MPPM Ensemble: {mppm_ensemble:.4f}")
print(f"Maximum Drawdown Ensemble: {max_drawdown_ensemble:.4f}")

In [None]:
# Standard (all-predictor) OLS forecasts.
Actuals_p = Actuals_ols
Forecasts_p = Forecasts_ols_standard

# Calculate the portfolio returns using the rotation strategy
portfolio_returns = rotation_portfolio_strategy(Forecasts_p, Actuals_p)

cagr_ols_standard = calculate_cagr(portfolio_returns)
sharpe_ratio_ols_standard = calculate_annual_sharpe_ratio(portfolio_returns, risk_free_rate)
mppm_ols_standard = calculate_mppm(portfolio_returns, risk_free_rate)
max_drawdown_ols_standard = calculate_max_drawdown(portfolio_returns)
years = len(portfolio_returns) / 12

# Print results
# calculate_cagr already returns an annual rate, so it is printed without further scaling.
print(f"CAGR Ols Standard: {cagr_ols_standard:.4f}")
print(f"Annual Sharpe Ratio Ols Standard: {sharpe_ratio_ols_standard:.4f}")
print(f"MPPM Ols Standard: {mppm_ols_standard:.4f}")
print(f"Maximum Drawdown Ols Standard: {max_drawdown_ols_standard:.4f}")

In [None]:
def plot_metrics_comparison(models, metrics, metric_name):
    """Show a bar chart of one performance metric across models.

    Parameters
    ----------
    models : sequence
        Bar labels, used as the x positions of the bars.
    metrics : sequence
        Bar heights, given in the same order as ``models``.
    metric_name : str
        Y-axis label; also the prefix of the chart title.

    Returns
    -------
    None
        The bars are drawn on the current pyplot axes and the figure is
        displayed right away with ``plt.show()``.
    """
    axes = plt.gca()  # same axes the pyplot wrappers would draw on
    axes.bar(models, metrics, width=0.4)
    axes.set_title(f'{metric_name} Comparison')
    axes.set_xlabel('Models')
    axes.set_ylabel(metric_name)
    axes.grid(axis='y')  # horizontal guide lines only
    plt.show()

# Define the models and their corresponding metrics.
# Every metric list below is written in the same order as `models`, so the
# value at position i belongs to models[i].
models = ['OLS', 'SVR', 'FFN', 'Hist Mean', 'Ensemble', 'OLS Std']
# NOTE(review): these are the unscaled calculate_cagr results, while the
# metric cells for the historical, ensemble and OLS-standard models print
# `cagr * 12` — confirm which scale the 'CAGR' chart should show.
cagrs = [cagr_ols, cagr_svr, cagr_ffn, cagr_historical, cagr_ensemble, cagr_ols_standard]
sharpe_ratios = [sharpe_ratio_ols, sharpe_ratio_svr, sharpe_ratio_ffn, sharpe_ratio_historical, sharpe_ratio_ensemble, sharpe_ratio_ols_standard]
mppms = [mppm_ols, mppm_svr, mppm_ffn, mppm_historical, mppm_ensemble, mppm_ols_standard]
max_drawdowns = [max_drawdown_ols, max_drawdown_svr, max_drawdown_ffn, max_drawdown_historical, max_drawdown_ensemble, max_drawdown_ols_standard]

# Call the function for each metric; every call finishes with plt.show(),
# so each metric gets its own bar chart.
plot_metrics_comparison(models, cagrs, 'CAGR')
plot_metrics_comparison(models, sharpe_ratios, 'Sharpe Ratio')
plot_metrics_comparison(models, mppms, 'MPPM')
plot_metrics_comparison(models, max_drawdowns, 'Max Drawdown')

In [None]:
def plot_cumulative_returns(portfolio_returns, title):
    """Add one line of compounded returns to the current pyplot axes.

    Parameters
    ----------
    portfolio_returns : array-like supporting ``1 + x`` and ``.cumprod()``
        Per-period simple returns of a strategy.
    title : str
        Legend label for the plotted line.

    The plotted value at each step is the running product of
    ``1 + return`` minus 1. The function neither shows the figure nor
    returns anything; the caller adds titles, legend and ``plt.show()``.
    """
    growth_of_one = (1 + portfolio_returns).cumprod()
    plt.plot(growth_of_one - 1, label=title)

# Call the function for each model.
# Each call recomputes that model's strategy returns and adds one labelled
# line to the current axes, so all six curves share a single plot.
# NOTE(review): the Historical Mean and OLS Standard forecasts are paired with
# Actuals_ols — presumably the realised returns are common to these models; confirm.
plot_cumulative_returns(rotation_portfolio_strategy(Forecasts_ols, Actuals_ols), 'OLS')
plot_cumulative_returns(rotation_portfolio_strategy(Forecasts_svr, Actuals_svr), 'SVR')
plot_cumulative_returns(rotation_portfolio_strategy(Forecasts_ffn, Actuals_ffn), 'FFN')
plot_cumulative_returns(rotation_portfolio_strategy(Historical_Mean_Forecasts, Actuals_ols), 'Historical Mean')
plot_cumulative_returns(rotation_portfolio_strategy(Forecasts_ensemble, Actuals_ensemble), 'Ensemble')
plot_cumulative_returns(rotation_portfolio_strategy(Forecasts_ols_standard, Actuals_ols), 'OLS Standard')

# Customize the plot: title, axis labels, legend built from the line labels
# passed above, and a background grid.
plt.title('Cumulative Returns of Different Models')
plt.xlabel('Months')
plt.ylabel('Cumulative Returns')
plt.legend(loc='upper left')
plt.grid()
plt.show()

In [None]:
def plot_cumulative_returns_in_periods(portfolio_returns, title, start_month, end_month):
    """Plot a slice of a strategy's compounded returns on the current axes.

    Parameters
    ----------
    portfolio_returns : array-like supporting ``1 + x`` and ``.cumprod()``
        Per-period simple returns of a strategy.
    title : str
        Legend label for the plotted line.
    start_month, end_month : int
        Bounds of the slice ``[start_month:end_month]`` taken from the
        compounded series.

    Compounding always runs over the whole input and only then is the
    slice taken, so the plotted values are measured from the start of
    the full series, not from ``start_month``. Nothing is returned.
    """
    compounded = (1 + portfolio_returns).cumprod() - 1
    window = compounded[start_month:end_month]
    plt.plot(window, label=title)

def create_subplots(period_title, start_month, end_month):
    """Draw every model's cumulative-return curve for one period.

    Parameters
    ----------
    period_title : str
        Text placed in parentheses at the end of the panel title.
    start_month, end_month : int
        Slice bounds forwarded to ``plot_cumulative_returns_in_periods``.

    The strategy returns of each model are recomputed from the
    notebook-level forecast/actual variables on every call. All drawing
    goes to the current pyplot axes, so the caller selects the target
    panel (for example with ``plt.subplot``) beforehand and shows the
    figure afterwards.
    """
    # (forecasts, actuals, legend label) for each curve, in drawing order.
    model_inputs = (
        (Forecasts_ols, Actuals_ols, 'OLS'),
        (Forecasts_svr, Actuals_svr, 'SVR'),
        (Forecasts_ffn, Actuals_ffn, 'FFN'),
        (Historical_Mean_Forecasts, Actuals_ols, 'Historical Mean'),
        (Forecasts_ensemble, Actuals_ensemble, 'Ensemble'),
        (Forecasts_ols_standard, Actuals_ols, 'OLS Standard'),
    )
    for forecasts, actuals, label in model_inputs:
        strategy_returns = rotation_portfolio_strategy(forecasts, actuals)
        plot_cumulative_returns_in_periods(strategy_returns, label, start_month, end_month)

    # Panel decoration.
    plt.title(f'Cumulative Returns of Different Models ({period_title})')
    plt.xlabel('Months')
    plt.ylabel('Cumulative Returns')
    plt.legend(loc='upper left')
    plt.grid()

# Number of observations in the OLS strategy's return series. It is used as
# the end of the plotted range for every model (assumes all models produce
# series of this length — TODO confirm).
total_months = len(rotation_portfolio_strategy(Forecasts_ols, Actuals_ols))
# Integer midpoint; with an odd count the second half is one observation longer.
halfway_month = total_months // 2

# One tall figure holding three vertically stacked panels.
plt.figure(figsize=(8, 18))

# First period: observations [0, halfway_month)
plt.subplot(3, 1, 1)
create_subplots("First Half", 0, halfway_month)

# Second period: observations [halfway_month, total_months)
plt.subplot(3, 1, 2)
create_subplots("Second Half", halfway_month, total_months)

# Full period
plt.subplot(3, 1, 3)
create_subplots("Full Period", 0, total_months)

# Adjust spacing between the panels before displaying the figure.
plt.tight_layout()
plt.show()