In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

import matplotlib.pyplot as plt

# Reduce display precision on numpy arrays.
np.set_printoptions(precision=5)

# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv('C:/Users/Computer/Documents/bachelor/dataset.csv')

# Retained so any chained assignment elsewhere in the notebook stays silent as before;
# the fill below no longer relies on it.
pd.options.mode.chained_assignment=None

# Overwrite 'csp' from positional row 637 onward with the mean of positional rows 0..637.
# A single .iloc write replaces the former chained assignment (df['csp'][637:] = ...),
# which only worked through a view and is a silent no-op under pandas Copy-on-Write.
# NOTE(review): the averaging window (0:638) includes row 637, which is itself
# overwritten — confirm whether 0:637 was intended.
csp_fill_value = df['csp'].iloc[0:638].mean()
df.iloc[637:, df.columns.get_loc('csp')] = csp_fill_value

# Non-numeric entries in 'Index' become NaN instead of raising.
df['Index'] = pd.to_numeric(df['Index'], errors='coerce')

In [None]:
# Show the first 865 rows of df as the cell output.
# NOTE(review): 865 is presumably the dataset's row count — confirm; a plain
# df.head() would be a lighter preview.
df.head(865)

In [None]:
# Positional split of df into raw arrays: every column except the first and the
# trailing 30 goes to X; the trailing 30 columns go to Y.
predictor_columns = slice(1, -30)
target_columns = slice(-30, None)
X, Y = df.iloc[:, predictor_columns].values, df.iloc[:, target_columns].values

In [None]:
# Predictors are df columns 1..17 by position; targets are the trailing 30 columns.
# NOTE(review): the X/Y arrays built earlier slice predictors as 1:-30 — the two only
# agree if df has exactly 48 columns; confirm.
feature_names = df.columns[1:18]
target_names = df.columns[-30:]

from sklearn.preprocessing import StandardScaler

X_df = df[feature_names]
Y_df = df[target_names]

# Standardise every predictor column.
# NOTE(review): the scaler is fit on ALL rows, including rows later used as
# out-of-sample test points, so test-period statistics leak into the training
# features. Consider fitting the scaler on the training window only.
scaler = StandardScaler()
# Keep the original row index: fit_transform returns a bare array, and without
# index= the concat below would misalign rows whenever df's index is not 0..n-1.
X_df = pd.DataFrame(scaler.fit_transform(X_df), columns=X_df.columns, index=X_df.index)

# Targets first, then scaled predictors, aligned on the row index.
data = pd.concat([Y_df, X_df], axis=1)

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

In [None]:
def extratrees_feature_selection(train, y_key, jobs=24):
    """Choose predictor columns for ``y_key`` from ExtraTrees importances.

    Every column of ``train`` other than ``y_key`` is offered as a candidate
    predictor. An ``ExtraTreesRegressor`` (``random_state=42``) is fitted on
    them and ``SelectFromModel`` (no explicit threshold passed) decides which
    columns to keep.

    Parameters
    ----------
    train : DataFrame containing ``y_key`` and the candidate columns.
    y_key : name of the target column.
    jobs : forwarded to ``ExtraTreesRegressor(n_jobs=...)``.

    Returns
    -------
    The column labels of the retained candidates.
    """
    candidates = train.drop(y_key, axis=1)
    target = train[y_key]

    forest = ExtraTreesRegressor(n_jobs=jobs, random_state=42)
    forest.fit(candidates, target)

    # prefit=True: reuse the already fitted forest instead of refitting it.
    keep_mask = SelectFromModel(forest, prefit=True).get_support()
    return candidates.columns[keep_mask]

In [None]:
def initial_hyperparameter_tuning(train, y_key, selected_features, model_config, jobs=24):
    """Grid-search SVR hyperparameters for one target with time-ordered CV.

    Parameters
    ----------
    train : DataFrame holding ``y_key`` and the ``selected_features`` columns.
    y_key : name of the target column.
    selected_features : column labels used as predictors.
    model_config : mapping; ``model_config["tscv_splits"]`` sets the number of
        ``TimeSeriesSplit`` folds.
    jobs : number of parallel workers for ``GridSearchCV``. Defaults to 24, the
        value that was previously hard-coded, so existing calls are unaffected.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search, keyed ``svr__kernel``,
        ``svr__C`` and ``svr__epsilon``.
    """
    X_train = train[selected_features]
    y_train = train[y_key]

    # Folds respect row order: each validation fold comes after its training rows.
    time_series_cv = TimeSeriesSplit(n_splits=model_config["tscv_splits"])

    # Single-step pipeline; the "svr__" prefix in the grid addresses that step.
    pipeline = Pipeline([("svr", SVR())])
    param_grid = {
        "svr__kernel": ["linear", "rbf"],
        "svr__C": [0.1, 1, 10],
        "svr__epsilon": [0.01, 0.1, 1, 2]
    }

    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=time_series_cv,
        scoring="neg_mean_squared_error",
        n_jobs=jobs,
    )
    grid_search.fit(X_train, y_train)

    return grid_search.best_params_

In [None]:
def svr_tuned(train, test, y_key, selected_features, best_params):
    """Fit an SVR for ``y_key`` with its tuned settings and forecast ``test``.

    Parameters
    ----------
    train, test : DataFrames containing the ``selected_features`` columns;
        ``train`` must also contain ``y_key``.
    y_key : name of the target column.
    selected_features : column labels used as predictors.
    best_params : mapping from target name to a dict with the keys
        ``svr__kernel``, ``svr__C`` and ``svr__epsilon``.

    Returns
    -------
    pandas.Series
        Predictions indexed like ``test`` and named ``"svr_tuned_y_hat"``.
    """
    train_features = train[selected_features]
    test_features = test[selected_features]
    train_target = train[y_key]

    tuned = best_params[y_key]
    regressor = SVR(
        kernel=tuned["svr__kernel"],
        C=tuned["svr__C"],
        epsilon=tuned["svr__epsilon"],
    )
    regressor.fit(train_features, train_target)

    forecast = pd.Series(regressor.predict(test_features), index=test.index)
    return forecast.rename("svr_tuned_y_hat")

In [None]:
def multioutput_svr_tuned_extratrees(train, test, y_keys, best_params, jobs=24):
    """Forecast every target in ``y_keys`` with its own feature set and SVR.

    For each target, features are re-selected on ``train`` with
    ``extratrees_feature_selection`` and an SVR is fitted and applied to
    ``test`` via ``svr_tuned``.

    The other target columns are removed before feature selection. Previously
    they stayed in the candidate set, so a target could be predicted from the
    same-row values of its sibling targets, which are read from the test row
    itself (look-ahead leakage).

    Parameters
    ----------
    train, test : DataFrames containing all ``y_keys`` plus the predictors.
    y_keys : iterable of target column names.
    best_params : mapping from target name to tuned SVR settings.
    jobs : forwarded to ``extratrees_feature_selection``.

    Returns
    -------
    dict
        Target name -> prediction returned by ``svr_tuned``.
    """
    y_keys = list(y_keys)
    preds = {}
    for y_key in y_keys:
        # Offer only genuine predictors (plus the current target) to the selector.
        sibling_targets = [key for key in y_keys if key != y_key]
        candidates = train.drop(columns=sibling_targets)

        selected_features = extratrees_feature_selection(candidates, y_key, jobs)
        preds[y_key] = svr_tuned(train, test, y_key, selected_features, best_params)
    return preds

In [None]:
# Settings read by initial_hyperparameter_tuning: "tscv_splits" is the number of
# TimeSeriesSplit folds used during the grid search.
model_config = dict(tscv_splits=20)

In [None]:
def print_current_values_and_errors():
    """Print stored actuals, forecasts, errors and the running MAE/MSE per target.

    Reads the notebook globals ``Y_df``, ``Actuals``, ``Forecasts``,
    ``Forecast_Errors`` and ``train_start_size``, plus ``mean_absolute_error``
    and ``mean_squared_error``.

    Rows are filtered by index label (``>= train_start_size``). The previous
    ``series[train_start_size:]`` was a positional slice; because the stored
    series hold only out-of-sample rows, it discarded the first
    ``train_start_size`` forecasts (or everything, early in the run).
    """
    for col in Y_df.columns:
        actuals = Actuals[col]
        forecasts = Forecasts[col]
        forecast_errors = Forecast_Errors[col]

        actuals_no_nan = actuals[actuals.index >= train_start_size].dropna()
        forecasts_no_nan = forecasts[forecasts.index >= train_start_size].dropna()
        forecast_errors_no_nan = forecast_errors[forecast_errors.index >= train_start_size].dropna()

        print(f"Actuals for {col}:\n{actuals_no_nan}")
        print(f"Forecasts for {col}:\n{forecasts_no_nan}")
        print(f"Forecast_Errors for {col}:\n{forecast_errors_no_nan}")

        # Score only rows where both an actual and a forecast survive dropna, so
        # the two inputs always have the same length and order.
        scored_rows = actuals_no_nan.index.intersection(forecasts_no_nan.index)
        mae = mean_absolute_error(actuals_no_nan.loc[scored_rows], forecasts_no_nan.loc[scored_rows])
        mse = mean_squared_error(actuals_no_nan.loc[scored_rows], forecasts_no_nan.loc[scored_rows])

        print(f"Current MAE for {col}: {mae}")
        print(f"Current MSE for {col}: {mse}")

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Perform hyperparameter tuning on the initial training set
train_start_size = 200
initial_train = data.iloc[:train_start_size]

# NOTE(review): initial_train still contains every target column, so the other
# targets are offered to the selector as candidate predictors here — confirm
# whether that is intended.
initial_selected_features = {}
for y_key in Y_df.columns:
    initial_selected_features[y_key] = extratrees_feature_selection(initial_train, y_key, jobs=10)

# Hyperparameters are tuned once, on the initial window, and reused for every step.
best_params = {}
for y_key in Y_df.columns:
    best_params[y_key] = initial_hyperparameter_tuning(initial_train, y_key, initial_selected_features[y_key], model_config)

# Initialize empty dictionaries to store results
Actuals = {col: pd.Series(dtype=float, name=col) for col in Y_df.columns}
Forecasts = {col: pd.Series(dtype=float, name=col) for col in Y_df.columns}
Forecast_Errors = {col: pd.Series(dtype=float, name=col) for col in Y_df.columns}
MAE = {}
MSE = {}
R2 = {}

# One-row pieces are collected per target and concatenated once after the loop;
# re-concatenating the full series on every step is quadratic in the number of steps.
actual_parts = {col: [] for col in Y_df.columns}
forecast_parts = {col: [] for col in Y_df.columns}
error_parts = {col: [] for col in Y_df.columns}

# Expanding window loop: train on rows [0, t), forecast row t.
# NOTE(review): the upper bound len(data) - 1 means the last row is never
# forecast — confirm this is intended.
for t in range(train_start_size, len(data) - 1):
    train = data.iloc[:t]
    test = data.iloc[t : t + 1]

    preds = multioutput_svr_tuned_extratrees(train, test, Y_df.columns, best_params, jobs=10)

    for col in Y_df.columns:
        forecast_parts[col].append(preds[col])
        actual_parts[col].append(test[col])
        error_parts[col].append(test[col] - preds[col])

    # Print progress every 25 data points
    if t % 25 == 0:
        print(f"Progress: {t}/{len(data) - 1}")

# Assemble the full result series (left empty if the loop never ran).
for col in Y_df.columns:
    if forecast_parts[col]:
        Forecasts[col] = pd.concat(forecast_parts[col]).rename(col)
        Actuals[col] = pd.concat(actual_parts[col]).rename(col)
        Forecast_Errors[col] = pd.concat(error_parts[col]).rename(col)

# Calculate MAE and MSE for each target variable
for col in Y_df.columns:
    MAE[col] = mean_absolute_error(Actuals[col], Forecasts[col])
    MSE[col] = mean_squared_error(Actuals[col], Forecasts[col])
    R2[col] = r2_score(Actuals[col], Forecasts[col])

for col in Y_df.columns:
    print(f"Actuals for {col}:")
    print(Actuals[col])
    print(f"Forecasts for {col}:")
    print(Forecasts[col])
    print(f"Forecast Errors for {col}:")
    print(Forecast_Errors[col])
    print(f"Mean Absolute Error for {col}: {MAE[col]}")
    print(f"Mean Squared Error for {col}: {MSE[col]}")
    print(f"R-squared for {col}: {R2[col]}")

In [None]:
# Lift pandas display limits so the parameter dicts are printed in full.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.expand_frame_repr', False),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

# One row per target: its name and the tuned SVR settings.
best_params_list = [
    {'Industry': y_key, 'Best Parameters': best_params[y_key]}
    for y_key in Y_df.columns
]

best_params_df = pd.DataFrame(best_params_list)

print(best_params_df)

In [None]:
# Build the per-industry metrics table in one shot from the MAE / MSE / R2 dicts.
# Constructing from records keeps the metric columns numeric instead of the
# object dtype produced by filling an empty frame cell by cell.
metric_columns = ['Industry', 'MAE', 'MSE', 'R2']
metric_rows = [
    {'Industry': col, 'MAE': MAE[col], 'MSE': MSE[col], 'R2': R2[col]}
    for col in Y_df.columns
]
performance_metrics = pd.DataFrame(metric_rows, columns=metric_columns)

# Calculate the mean across all industries and add it to the DataFrame
mean_mae = performance_metrics['MAE'].mean()
mean_mse = performance_metrics['MSE'].mean()
mean_r2 = performance_metrics['R2'].mean()

# The summary row takes the next integer label, i.e. len(Y_df.columns).
mean_row = pd.DataFrame(
    [['Mean', mean_mae, mean_mse, mean_r2]],
    columns=metric_columns,
    index=[len(Y_df.columns)],
)
performance_metrics = pd.concat([performance_metrics, mean_row])

print(performance_metrics)

In [None]:
# Benchmark: MSE of a constant historical-mean forecast, evaluated on the
# in-sample window (the first train_start_size rows of each target).
# NOTE(review): this benchmark MSE is computed in-sample, while the SVR MSE it is
# compared with below is out-of-sample. The two cover different periods, so the
# resulting "R2_OS" is not a like-for-like comparison. Consider evaluating the
# historical-mean forecast over the same out-of-sample rows.
in_sample_data = Y_df.iloc[:train_start_size]  # Use only in-sample data

Historical_Mean_In_Sample_MSE = {}
for col in Y_df.columns:
    in_sample_historical_mean = in_sample_data[col].mean()
    in_sample_historical_mean_forecast = pd.Series(in_sample_historical_mean, index=in_sample_data.index)
    Historical_Mean_In_Sample_MSE[col] = mean_squared_error(in_sample_data[col], in_sample_historical_mean_forecast)

mean_in_sample_mse = sum(Historical_Mean_In_Sample_MSE.values()) / len(Historical_Mean_In_Sample_MSE)

# One row per industry plus a trailing "Mean" row, indexed by industry name.
results_df = pd.DataFrame.from_dict(
    {**Historical_Mean_In_Sample_MSE, "Mean": mean_in_sample_mse},
    orient="index",
    columns=["Historical_Mean_In_Sample_MSE"],
)

# R2_OS = 1 - MSE(SVR) / MSE(historical-mean benchmark), per industry.
svr_mse_by_industry = performance_metrics.set_index("Industry")["MSE"]
R2_OS_SVR = {
    col: 1 - (svr_mse_by_industry.loc[col] / results_df.loc[col, "Historical_Mean_In_Sample_MSE"])
    for col in Y_df.columns
}

# Create a DataFrame to store the out-of-sample R-squared results
out_of_sample_predictability_df = pd.DataFrame(
    [{"Industry": col, "R2_OS_SVR": R2_OS_SVR[col]} for col in Y_df.columns],
    columns=["Industry", "R2_OS_SVR"],
)

# Calculate the mean out-of-sample R-squared across all industries and add it to the DataFrame
mean_r2_os = out_of_sample_predictability_df["R2_OS_SVR"].mean()
mean_r2_os_row = pd.DataFrame(
    [["Mean", mean_r2_os]],
    columns=["Industry", "R2_OS_SVR"],
    index=[len(Y_df.columns)],
)
out_of_sample_predictability_df = pd.concat([out_of_sample_predictability_df, mean_r2_os_row])

# Display the out-of-sample predictability DataFrame
print(out_of_sample_predictability_df)