In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

import matplotlib.pyplot as plt

# Reduce display precision on numpy arrays.
np.set_printoptions(precision=5)

# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this file exists. Consider a configurable data directory.
df = pd.read_csv('C:/Users/Computer/Documents/bachelor/dataset.csv')
# Kept so that the pandas option state seen by later cells is unchanged.
pd.options.mode.chained_assignment=None

# Overwrite 'csp' from positional row 637 onwards with the mean of positional
# rows 0..637. This is a single positional write on the frame itself; the
# previous chained form `df['csp'][637:] = ...` assigns into an intermediate
# Series and does not update `df` when pandas Copy-on-Write is active.
# NOTE(review): row 637 is both part of the averaged window and overwritten;
# confirm whether the window was meant to be rows 0..636.
csp_fill_value = df['csp'].iloc[0:638].mean()
df.iloc[637:, df.columns.get_loc('csp')] = csp_fill_value

# Coerce 'Index' to numeric; values that cannot be parsed become NaN.
df['Index'] = pd.to_numeric(df['Index'], errors='coerce')

In [None]:
# Display up to the first 865 rows of the loaded frame for visual inspection.
df.head(865)

In [None]:
# Feature array: every column after the first one, excluding the last 30.
X = df.iloc[:, 1:-30].values
# Target array: the last 30 columns.
Y = df.iloc[:, -30:].values

In [None]:
# Column positions 1..17 are the predictors, the last 30 columns the targets.
feature_names = df.columns[1:18]
target_names = df.columns[-30:]

from sklearn.preprocessing import StandardScaler

X_df = df[feature_names]
Y_df = df[target_names]

# Standardise every predictor column.
# NOTE(review): this scaler is fit on all rows, including rows that are later
# used as test observations in the expanding-window loop.
scaler = StandardScaler()
# fit_transform returns a bare array, so the original row index is passed back
# explicitly. Without it the frame gets a fresh 0..n-1 index and the
# column-wise concat below would misalign rows whenever `df` does not carry a
# default RangeIndex.
X_df = pd.DataFrame(scaler.fit_transform(X_df), columns=X_df.columns, index=X_df.index)

# Targets first, scaled predictors after, aligned on the row index.
data = pd.concat([Y_df, X_df], axis=1)

In [None]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout
from scikeras.wrappers import KerasRegressor
from keras.optimizers import Adam
from tensorflow.keras.models import Model


def create_nn(hidden_layers, neurons, dropout_rate, learning_rate, alpha):
    """Build and compile a feed-forward regression network.

    The network takes 17 inputs and produces 30 linear outputs. It is compiled
    with mean squared error loss and the Adam optimizer.

    Parameters
    ----------
    hidden_layers : int
        Total number of hidden Dense layers. The first one is always created;
        ``hidden_layers - 1`` further ones are added in the loop.
    neurons : int
        Number of units in every hidden layer.
    dropout_rate : float
        Rate of the Dropout layer placed after each *additional* hidden layer.
        With ``hidden_layers == 1`` no Dropout layer is added, so this value
        has no effect.
    learning_rate : float
        Learning rate passed to Adam.
    alpha : float
        L2 penalty factor applied to the kernels of all hidden layers.

    Returns
    -------
    The compiled Sequential model.
    """
    # Local import: keras is already a dependency of this notebook, and this
    # keeps the function self-contained.
    from keras.regularizers import L2

    model = Sequential()
    # `alpha` is now actually applied. The string identifier 'l2' used before
    # always took the library's default penalty factor, so tuning `alpha`
    # changed nothing.
    model.add(Dense(neurons, input_dim=17, kernel_initializer='normal', activation='relu',
                    kernel_regularizer=L2(alpha)))

    for _ in range(hidden_layers - 1):
        model.add(Dense(neurons, activation='relu', kernel_regularizer=L2(alpha)))
        model.add(Dropout(dropout_rate))

    # One linear output unit per target column.
    model.add(Dense(30, activation='linear'))

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='mean_squared_error', optimizer=optimizer)
    return model

In [None]:
def initial_hyperparameter_tuning(train, y_keys, model_config):
    """Grid-search the network hyperparameters with time-series cross-validation.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame holding both the target columns (``y_keys``) and the predictors.
    y_keys : list-like of column labels
        Target columns; every other column of ``train`` is used as a predictor.
    model_config : dict
        Must contain ``"tscv_splits"``, the number of TimeSeriesSplit folds.

    Returns
    -------
    dict
        Best combination found, keyed by ``hidden_layers``, ``neurons``,
        ``dropout_rate``, ``learning_rate`` and ``alpha``.
    """
    X_train = train.drop(y_keys, axis=1)
    y_train = train[y_keys]

    time_series_cv = TimeSeriesSplit(n_splits=model_config["tscv_splits"])

    param_grid = {
        "hidden_layers": [1, 2, 3],
        "neurons": [16, 32, 64],
        "dropout_rate": [0.1, 0.2],
        "learning_rate": [0.01, 0.1],
        "alpha": [0.01, 0.1, 0.2]
    }

    regressor = KerasRegressor(model=create_nn, epochs=100, batch_size=10, verbose=0, alpha=0.01,
                               learning_rate=0.01, dropout_rate=0.1, neurons=16, hidden_layers=1)

    # The scaler lives inside the pipeline so that it is re-fit on the training
    # part of every CV split. Scaling the whole frame up front would let each
    # validation fold influence the mean/variance used to scale its own
    # training data.
    step_name = "nn"
    pipeline = Pipeline([("scaler", StandardScaler()), (step_name, regressor)])
    prefixed_grid = {f"{step_name}__{name}": values for name, values in param_grid.items()}

    grid_search = GridSearchCV(pipeline, prefixed_grid, cv=time_series_cv,
                               scoring="neg_mean_squared_error", n_jobs=1)
    grid_search.fit(X_train, y_train)

    # Strip the pipeline step prefix so callers keep seeing the plain
    # hyperparameter names.
    best_params = {name.split("__", 1)[1]: value for name, value in grid_search.best_params_.items()}

    return best_params

In [None]:
def nn_tuned(train, test, y_keys, best_params):
    """Fit the network on ``train`` and forecast the targets for ``test``.

    Predictors are all columns except ``y_keys``. They are standardised with
    statistics estimated on ``train`` only, and the same transformation is then
    applied to ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing both the target columns and the predictors.
    y_keys : list-like of column labels
        Target columns.
    best_params : dict
        Hyperparameters with keys ``hidden_layers``, ``neurons``,
        ``dropout_rate``, ``learning_rate`` and ``alpha``.

    Returns
    -------
    pandas.DataFrame
        Predictions indexed like ``test`` with one column per entry of ``y_keys``.
    """
    train_features = train.drop(y_keys, axis=1)
    test_features = test.drop(y_keys, axis=1)
    train_targets = train[y_keys]

    # Fit the scaler on the training rows only, then reuse it for both sets.
    scaler = StandardScaler().fit(train_features)
    train_features_scaled = scaler.transform(train_features)
    test_features_scaled = scaler.transform(test_features)

    model = create_nn(
        hidden_layers=best_params["hidden_layers"],
        neurons=best_params["neurons"],
        dropout_rate=best_params["dropout_rate"],
        learning_rate=best_params["learning_rate"],
        alpha=best_params["alpha"],
    )
    model.fit(train_features_scaled, train_targets, epochs=100, batch_size=10, verbose=0)

    predictions = model.predict(test_features_scaled)
    y_hat = pd.DataFrame(predictions, index=test.index, columns=y_keys)

    return y_hat


# Settings read by initial_hyperparameter_tuning: "tscv_splits" is the number
# of folds handed to TimeSeriesSplit.
model_config = {
    "tscv_splits": 20
}

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Number of leading rows used for hyperparameter tuning and as the first
# training window.
train_start_size = 200
initial_train = data.iloc[:train_start_size]

best_params = initial_hyperparameter_tuning(initial_train, Y_df.columns, model_config)

MAE = {}
MSE = {}
R2 = {}

# One-row frames collected per step and concatenated once after the loop.
# Growing a Series with pd.concat on every step copies all earlier rows again
# each time, and concatenating onto an empty Series is deprecated in pandas.
forecast_rows = []
actual_rows = []

# Expanding window loop: train on rows [0, t), forecast row t.
# NOTE(review): the upper bound `len(data) - 1` means the last row of `data`
# is never forecast; confirm this is intended.
for t in range(train_start_size, len(data) - 1):
    train = data.iloc[:t]
    test = data.iloc[t : t + 1]

    preds = nn_tuned(train, test, Y_df.columns, best_params)

    forecast_rows.append(preds)
    actual_rows.append(test[Y_df.columns])

    # Print progress every 25 data points
    if t % 25 == 0:
        print(f"Progress: {t}/{len(data) - 1}")

if not forecast_rows:
    raise ValueError(
        f"No out-of-sample observations: data has {len(data)} rows but "
        f"train_start_size is {train_start_size}."
    )

forecasts_df = pd.concat(forecast_rows)
actuals_df = pd.concat(actual_rows)
# Both frames share the test-row index and the target columns, so the
# subtraction aligns element by element.
errors_df = actuals_df - forecasts_df

# Keep the per-target dict-of-Series layout used by the cells below.
Actuals = {col: actuals_df[col] for col in Y_df.columns}
Forecasts = {col: forecasts_df[col] for col in Y_df.columns}
Forecast_Errors = {col: errors_df[col] for col in Y_df.columns}

# Calculate MAE, MSE and R-squared for each target variable
for col in Y_df.columns:
    MAE[col] = mean_absolute_error(Actuals[col], Forecasts[col])
    MSE[col] = mean_squared_error(Actuals[col], Forecasts[col])
    R2[col] = r2_score(Actuals[col], Forecasts[col])

for col in Y_df.columns:
    print(f"Actuals for {col}:")
    print(Actuals[col])
    print(f"Forecasts for {col}:")
    print(Forecasts[col])
    print(f"Forecast Errors for {col}:")
    print(Forecast_Errors[col])
    print(f"Mean Absolute Error for {col}: {MAE[col]}")
    print(f"Mean Squared Error for {col}: {MSE[col]}")
    print(f"R-squared for {col}: {R2[col]}")

In [None]:
# Report the hyperparameter combination stored in best_params, one per line.
print("Best parameters:")
for param_name, param_value in best_params.items():
    print(param_name, ":", param_value)

In [None]:
# One record per target column, built in a single pass. Filling an empty frame
# cell by cell with .loc leaves the metric columns as object dtype and
# reallocates the frame on every new row.
metric_columns = ['Industry', 'MAE', 'MSE', 'R2']
per_industry_metrics = pd.DataFrame(
    [
        {'Industry': col, 'MAE': MAE[col], 'MSE': MSE[col], 'R2': R2[col]}
        for col in Y_df.columns
    ],
    columns=metric_columns,
)

# Calculate the mean across all industries and add it as a final 'Mean' row.
mean_mae = per_industry_metrics['MAE'].mean()
mean_mse = per_industry_metrics['MSE'].mean()
mean_r2 = per_industry_metrics['R2'].mean()

mean_row = pd.DataFrame(
    [{'Industry': 'Mean', 'MAE': mean_mae, 'MSE': mean_mse, 'R2': mean_r2}],
    columns=metric_columns,
)

# ignore_index keeps the 0..n row labels, with the 'Mean' row last.
performance_metrics = pd.concat([per_industry_metrics, mean_row], ignore_index=True)

print(performance_metrics)

In [None]:
Historical_Mean_In_Sample_MSE = {}

# The in-sample window is the same for every target, so it is sliced once.
in_sample_data = Y_df.iloc[:train_start_size]  # Use only in-sample data

# Calculate in-sample historical mean forecasts and MSE for each target variable
for col in Y_df.columns:
    in_sample_historical_mean = in_sample_data[col].mean()
    # Constant forecast: the in-sample mean repeated for every in-sample row.
    in_sample_historical_mean_forecast = pd.Series(in_sample_historical_mean, index=in_sample_data.index)

    Historical_Mean_In_Sample_MSE[col] = mean_squared_error(in_sample_data[col], in_sample_historical_mean_forecast)

mean_in_sample_mse = sum(Historical_Mean_In_Sample_MSE.values()) / len(Historical_Mean_In_Sample_MSE)

# One row per target plus a final "Mean" row, built in one step rather than
# by enlarging an empty frame row by row.
results_df = pd.Series(
    {**Historical_Mean_In_Sample_MSE, "Mean": mean_in_sample_mse},
    name="Historical_Mean_In_Sample_MSE",
).to_frame()

# Model MSE per industry, looked up once instead of re-indexing inside the loop.
model_mse_by_industry = performance_metrics.set_index("Industry")["MSE"]

# NOTE(review): the denominator is the MSE of a constant mean over the first
# `train_start_size` rows, while the numerator is the model MSE stored in
# performance_metrics. An out-of-sample R-squared is usually computed with both
# errors measured over the same evaluation rows - confirm this benchmark is
# the intended one.
R2_OS_FFN = {}
for col in Y_df.columns:
    R2_OS_FFN[col] = 1 - (model_mse_by_industry.loc[col] / Historical_Mean_In_Sample_MSE[col])

# Calculate the mean out-of-sample R-squared across all industries
mean_r2_os = pd.Series(R2_OS_FFN, dtype=float).mean()

# Store the per-industry values followed by the cross-industry mean
r2_os_rows = [{"Industry": col, "R2_OS_FFN": R2_OS_FFN[col]} for col in Y_df.columns]
r2_os_rows.append({"Industry": "Mean", "R2_OS_FFN": mean_r2_os})
out_of_sample_predictability_df = pd.DataFrame(r2_os_rows, columns=["Industry", "R2_OS_FFN"])

# Display the out-of-sample predictability DataFrame
print(out_of_sample_predictability_df)