In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

import matplotlib.pyplot as plt

# Limit the number of digits shown when NumPy arrays are printed to 5
# (affects display only, not the stored values).
np.set_printoptions(precision=5)

In [None]:
# Load the raw dataset.
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this exact file exists. Consider a configurable, relative data directory.
df = pd.read_csv('C:/Users/Computer/Documents/bachelor/dataset.csv')
pd.options.mode.chained_assignment=None

# Overwrite 'csp' from row position 637 onward with the mean of the first 638
# rows. This is done as ONE positional .iloc assignment on the frame: the
# previous chained form (df['csp'][637:] = ...) assigns into an intermediate
# Series, which under pandas Copy-on-Write leaves `df` unchanged.
csp_position = df.columns.get_loc('csp')
df.iloc[637:, csp_position] = df['csp'].iloc[0:638].mean()

# Non-numeric 'Index' entries become NaN instead of raising.
df['Index'] = pd.to_numeric(df['Index'], errors='coerce')

In [None]:
# Preview the first 200 rows of the loaded frame.
df.head(200)

In [None]:
# Split the frame by column position into two NumPy arrays: X holds every
# column after the first one up to (not including) the last 30, and Y holds
# the last 30 columns.
X, Y = (
    df.iloc[:, column_span].values
    for column_span in (slice(1, -30), slice(-30, None))
)

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV, LinearRegression, MultiTaskLassoCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, TimeSeriesSplit
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
from sklearn.preprocessing import StandardScaler

# Column labels chosen by position: columns 1..17 are used as features and
# the last 30 columns as targets.
feature_names = df.columns[1:18]
target_names = df.columns[-30:]

X_df = df[feature_names]
Y_df = df[target_names]

# Standardise the features (fit on all rows of X_df). The scaler returns a
# bare array, so the original row index is re-attached explicitly; without it
# the rebuilt frame gets a fresh 0..n-1 index and the index-aligned concat
# below would pair the wrong rows whenever `df` does not use that index.
scaler = StandardScaler()
X_df = pd.DataFrame(
    scaler.fit_transform(X_df),
    columns=X_df.columns,
    index=X_df.index,
)

# Modelling frame: target columns first, then the standardised features.
data = pd.concat([Y_df, X_df], axis=1)

In [None]:
def ols(train, test, y_key, selected_features):
    """Fit a linear regression on ``train`` and predict ``y_key`` for ``test``.

    Parameters
    ----------
    train : pandas.DataFrame
        Rows used for fitting; must contain ``y_key`` and ``selected_features``.
    test : pandas.DataFrame
        Rows to predict; must contain ``selected_features``.
    y_key : column label
        Target column in ``train``.
    selected_features : sequence of column labels
        Regressor columns read from both frames.

    Returns
    -------
    pandas.Series
        Predictions indexed like ``test`` and named ``"ols_y_hat"``.
    """
    fitted = LinearRegression().fit(train[selected_features], train[y_key])
    predictions = fitted.predict(test[selected_features])
    return pd.Series(predictions, index=test.index, name="ols_y_hat")

def multioutput_ols(train, test, y_keys):
    """Run one OLS regression per target and collect the predictions.

    Every column of ``train`` that is NOT listed in ``y_keys`` is used as a
    regressor. Previously only the current target was dropped, so the other
    targets' same-row values (including those in ``test``) were fed in as
    features, leaking information the forecast should not have.

    Parameters
    ----------
    train : pandas.DataFrame
        Fitting rows; contains all target columns plus the feature columns.
    test : pandas.DataFrame
        Rows to predict; must contain the feature columns.
    y_keys : iterable of column labels
        Target columns; each gets its own regression.

    Returns
    -------
    dict
        Maps each target label to the Series returned by ``ols``.

    Raises
    ------
    KeyError
        If a label in ``y_keys`` is not a column of ``train``.
    """
    target_columns = list(y_keys)
    # The regressor set is identical for every target, so compute it once.
    selected_features = train.drop(columns=target_columns).columns
    return {
        y_key: ols(train, test, y_key, selected_features)
        for y_key in target_columns
    }

In [None]:
# Take the first difference of Y_df to ensure a stationary process
# NOTE(review): in the cells visible here only Y_diff.columns is read
# afterwards, while the modelling frame `data` is built from the undifferenced
# Y_df. Confirm whether the differenced values were meant to be the targets.
Y_diff = Y_df.diff().dropna()

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

train_start_size = 200

# Only the column labels of Y_diff are used; the values come from `data`.
target_columns = list(Y_diff.columns)

# range() below stops at len(data) - 1, so at least train_start_size + 2 rows
# are needed for a single forecast step. Fail early with a clear message
# instead of letting the metric functions choke on empty inputs.
if len(data) - 1 <= train_start_size:
    raise ValueError(
        f"Need more than {train_start_size + 1} rows for the expanding window, "
        f"got {len(data)}"
    )

# One-row Series are collected per target and concatenated once after the
# loop; concatenating inside the loop copies the growing Series every step.
forecast_chunks = {col: [] for col in target_columns}
actual_chunks = {col: [] for col in target_columns}
error_chunks = {col: [] for col in target_columns}

# Expanding window loop: fit on rows [0, t), predict row t.
# NOTE(review): the upper bound len(data) - 1 means the final row of `data`
# is never forecast. Confirm that this is intended.
for t in range(train_start_size, len(data) - 1):
    train = data.iloc[:t]
    test = data.iloc[t : t + 1]

    preds = multioutput_ols(train, test, target_columns)

    for col in target_columns:
        forecast_chunks[col].append(preds[col])
        actual_chunks[col].append(test[col])
        error_chunks[col].append(test[col] - preds[col])

    # Print progress every 25 data points
    if t % 25 == 0:
        print(f"Progress: {t}/{len(data) - 1}")

# Assemble the per-target result Series; each one is named after its target.
Actuals = {col: pd.concat(actual_chunks[col]).rename(col) for col in target_columns}
Forecasts = {col: pd.concat(forecast_chunks[col]).rename(col) for col in target_columns}
Forecast_Errors = {col: pd.concat(error_chunks[col]).rename(col) for col in target_columns}

# Out-of-sample error metrics for each target variable
MAE = {col: mean_absolute_error(Actuals[col], Forecasts[col]) for col in target_columns}
MSE = {col: mean_squared_error(Actuals[col], Forecasts[col]) for col in target_columns}
R2 = {col: r2_score(Actuals[col], Forecasts[col]) for col in target_columns}

for col in target_columns:
    print(f"Actuals for {col}:")
    print(Actuals[col])
    print(f"Forecasts for {col}:")
    print(Forecasts[col])
    print(f"Forecast Errors for {col}:")
    print(Forecast_Errors[col])
    print(f"Mean Absolute Error for {col}: {MAE[col]}")
    print(f"Mean Squared Error for {col}: {MSE[col]}")
    print(f"R-squared for {col}: {R2[col]}")

In [None]:
from statsmodels.stats.diagnostic import acorr_ljungbox

significance_level = 0.05
# Lags at which the Ljung-Box statistic is reported.
reported_lags = [2, 12, 24, 36, 48, 60]

train = data.iloc[:train_start_size]

# Calculate in-sample actuals and forecasts
in_sample_preds = multioutput_ols(train, train, Y_df.columns)

# One record per (industry, lag); the DataFrame is built once at the end
# instead of being re-concatenated for every row.
ljungbox_records = []

for col in Y_df.columns:
    actuals = train[col]
    residuals = actuals - in_sample_preds[col]
    # Passing the lag list makes statsmodels report exactly those lags, and
    # the returned frame is indexed by the lag itself. The earlier code asked
    # for lags 1..60 and matched on `index + 1`, which labelled the lag-1
    # statistic as lag 2, lag 11 as lag 12, and so on, and never reported the
    # true lag 60.
    lb_test_result = acorr_ljungbox(residuals, lags=reported_lags, return_df=True)

    for lag, row in lb_test_result.iterrows():
        lb_stat, p_value = row['lb_stat'], row['lb_pvalue']
        evidence = "Yes" if p_value < significance_level else "No"
        ljungbox_records.append(
            {
                'Industry': col,
                'Lag': int(lag),
                'Test Statistic': lb_stat,
                'p-value': p_value,
                'Evidence': evidence,
            }
        )

autocorrelation_results = pd.DataFrame(
    ljungbox_records,
    columns=['Industry', 'Lag', 'Test Statistic', 'p-value', 'Evidence'],
)

In [None]:
def highlight_significant(val):
    """Return the CSS for one table cell: green background when ``val`` is
    below 0.05, otherwise an empty string (no styling)."""
    return 'background-color: green' if val < 0.05 else ''

# Pivot the DataFrame to have industries as rows and lags as columns
reshaped_autocorrelation_results = autocorrelation_results.pivot_table(
    index='Industry',
    columns='Lag',
    values=['Test Statistic', 'p-value']
)

# Apply the custom formatting to the p-value part of the reshaped DataFrame.
# Styler.applymap is deprecated since pandas 2.1 in favour of Styler.map, so
# use `map` when the installed pandas has it and fall back otherwise.
pvalue_styler = reshaped_autocorrelation_results['p-value'].style
apply_cellwise = getattr(pvalue_styler, 'map', None) or pvalue_styler.applymap
highlighted_pvalues = apply_cellwise(highlight_significant)

# Display the highlighted p-values
display(highlighted_pvalues)

In [None]:
from statsmodels.stats.diagnostic import het_breuschpagan
import statsmodels.api as sm

# Breusch-Pagan test on the initial training window.
# `train` is set here explicitly so the cell does not depend on whichever
# earlier cell last assigned it (the expanding-window loop also writes it).
train = data.iloc[:train_start_size]

# Regressors are all non-target columns. Dropping only the current target
# would leave the other targets' same-row values in the design matrix.
selected_features = train.drop(columns=list(Y_diff.columns)).columns
X_train = train[selected_features]
# The design matrix (with intercept) is the same for every target: build once.
exog = sm.add_constant(X_train)

bp_test_results = {}

for col in Y_diff.columns:
    actuals = train[col]
    ols_model = sm.OLS(actuals, exog).fit()
    # het_breuschpagan returns (LM stat, LM p-value, F stat, F p-value).
    bp_test_results[col] = het_breuschpagan(ols_model.resid, exog)

# Build the results table in one go, indexed by industry.
bp_test_df = pd.DataFrame(
    [(col, *bp_result) for col, bp_result in bp_test_results.items()],
    columns=["Industry", "LM Statistic", "LM-Test p-value", "F-Statistic", "F-Test p-value"],
).set_index("Industry")

# Print the DataFrame as a table
print("Breusch-Pagan Test Results:")
print("----------------------------")
print(bp_test_df)

In [None]:
def highlight_bp(val):
    """Return the CSS for one table cell: green background when ``val`` is
    below 0.05, otherwise an empty string (no styling)."""
    return 'background-color: green' if val < 0.05 else ''

# Highlight the two p-value columns cell by cell.
# Styler.applymap is deprecated since pandas 2.1 in favour of Styler.map, so
# use `map` when the installed pandas has it and fall back otherwise.
bp_styler = bp_test_df.style
apply_cellwise = getattr(bp_styler, 'map', None) or bp_styler.applymap
styled_bp_test_df = apply_cellwise(highlight_bp, subset=['LM-Test p-value', 'F-Test p-value'])

styled_bp_test_df

In [None]:
# Build the per-industry metrics table in a single constructor call so the
# metric columns are numeric (filling an empty frame cell by cell with .loc
# leaves them as object dtype).
industry_metrics = pd.DataFrame(
    {
        'Industry': list(Y_df.columns),
        'MAE': [MAE[col] for col in Y_df.columns],
        'MSE': [MSE[col] for col in Y_df.columns],
        'R2': [R2[col] for col in Y_df.columns],
    }
)

# Calculate the mean across all industries and add it as a final 'Mean' row
mean_mae = industry_metrics['MAE'].mean()
mean_mse = industry_metrics['MSE'].mean()
mean_r2 = industry_metrics['R2'].mean()

mean_row = pd.DataFrame(
    {'Industry': ['Mean'], 'MAE': [mean_mae], 'MSE': [mean_mse], 'R2': [mean_r2]}
)
performance_metrics = pd.concat([industry_metrics, mean_row], ignore_index=True)

# Display the performance metrics DataFrame
print(performance_metrics)