In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

import matplotlib.pyplot as plt

# Limit how many decimal places numpy shows when arrays are printed
# (precision=5); set once here so it applies to the rest of the notebook.
np.set_printoptions(precision=5)

In [None]:
# Load the raw dataset.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where that file exists; consider a path relative to the notebook.
df = pd.read_csv('C:/Users/Computer/Documents/bachelor/dataset.csv')

# Kept from the original cell: globally silences pandas' chained-assignment
# warning. The assignment below no longer needs it.
pd.options.mode.chained_assignment=None

# Overwrite 'csp' from row position 637 to the end with the mean of the first
# 638 rows (positions 0..637; .mean() skips NaN).
# The original wrote through chained indexing (df['csp'][637:] = ...), which
# assigns into an intermediate Series and is not guaranteed to update df
# (it does not under pandas Copy-on-Write). A single positional .iloc
# assignment on the frame performs the same write reliably.
csp_fill_value = df['csp'].iloc[0:638].mean()
df.iloc[637:, df.columns.get_loc('csp')] = csp_fill_value

# Force 'Index' to numeric; entries that cannot be parsed become NaN.
df['Index'] = pd.to_numeric(df['Index'], errors='coerce')

In [None]:
# Display the first 200 rows of the loaded frame as this cell's output.
df.head(200)

In [None]:
# Positional split of the frame into raw arrays: X takes every column after
# the first and before the last 30, Y takes the last 30 columns.
predictor_block = df.iloc[:, 1:-30]
target_block = df.iloc[:, -30:]
X, Y = predictor_block.values, target_block.values

In [None]:
# Column labels: positions 1..17 are used as features, the last 30 columns
# as targets.
feature_names = df.columns[1:18]
target_names = df.columns[-30:]

from sklearn.preprocessing import StandardScaler

X_df = df[feature_names]
Y_df = df[target_names]

# Standardise every feature column and wrap the result back into a DataFrame.
# The original index is passed explicitly: fit_transform returns a bare array,
# and without index= the new frame would get a fresh 0..n-1 index, so the
# index-aligned concat below would misalign rows (and add NaN rows) whenever
# df does not carry the default RangeIndex.
# NOTE(review): the scaler is fit on all rows, including rows beyond
# train_start_size that later cells exclude from the in-sample OLS fit —
# confirm this look-ahead is acceptable.
scaler = StandardScaler()
X_df = pd.DataFrame(scaler.fit_transform(X_df), columns=X_df.columns, index=X_df.index)

# Targets first, then the scaled features, joined column-wise on the index.
data = pd.concat([Y_df, X_df], axis=1)

In [None]:
# Second name for the target column labels (the columns of Y_df).
industry_names = Y_df.columns

In [None]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

train_start_size = 200

# Row positions that receive a one-step-ahead forecast. For position t the
# forecast is the mean of rows [0, t) of the same column.
# NOTE(review): the upper bound len(Y_df) - 1 is kept from the original, so the
# final row is never forecast or scored — confirm this is intended.
eval_stop = max(train_start_size, len(Y_df) - 1)
eval_positions = range(train_start_size, eval_stop)

# Collect forecasts in plain lists and build each Series once afterwards.
# The original appended to Series with pd.concat inside the loop, which
# re-copies the data on every step (quadratic) and starts from empty Series,
# a concat pattern pandas has deprecated.
forecast_values = {col: [] for col in Y_df.columns}

# Expanding window loop
for t in eval_positions:
    train = Y_df.iloc[:t]

    for col in Y_df.columns:
        forecast_values[col].append(train[col].mean())

    # Print progress every 25 data points
    if t % 25 == 0:
        print(f"Progress: {t}/{len(Y_df) - 1}")

# Result containers, keyed by target column.
Actuals = {}
Historical_Mean_Forecasts = {}
Historical_Mean_Forecast_Errors = {}
Historical_Mean_MAE = {}
Historical_Mean_MSE = {}
Historical_Mean_R2 = {}

eval_index = Y_df.index[train_start_size:eval_stop]

# Assemble the per-target series and calculate MAE, MSE, and R-squared
for col in Y_df.columns:
    Actuals[col] = Y_df[col].iloc[train_start_size:eval_stop].astype(float)
    Historical_Mean_Forecasts[col] = pd.Series(
        forecast_values[col], index=eval_index, dtype=float, name=col
    )
    # Forecast error = actual - forecast, aligned on the shared evaluation index.
    Historical_Mean_Forecast_Errors[col] = Actuals[col] - Historical_Mean_Forecasts[col]

    Historical_Mean_MAE[col] = mean_absolute_error(Actuals[col], Historical_Mean_Forecasts[col])
    Historical_Mean_MSE[col] = mean_squared_error(Actuals[col], Historical_Mean_Forecasts[col])
    Historical_Mean_R2[col] = r2_score(Actuals[col], Historical_Mean_Forecasts[col])

for col in Y_df.columns:
    print(f"Actuals for {col}:")
    print(Actuals[col])
    print(f"Historical Mean Forecasts for {col}:")
    print(Historical_Mean_Forecasts[col])
    print(f"Historical Mean Forecast Errors for {col}:")
    print(Historical_Mean_Forecast_Errors[col])
    print(f"Historical Mean MAE for {col}: {Historical_Mean_MAE[col]}")
    print(f"Historical Mean MSE for {col}: {Historical_Mean_MSE[col]}")
    print(f"Historical Mean R-squared for {col}: {Historical_Mean_R2[col]}")

In [None]:
# One record per target column with its out-of-sample metrics. Building the
# frame from records in one go keeps MAE/MSE/R2 as float columns; filling an
# empty frame cell by cell with .loc leaves them as object dtype.
metric_rows = [
    {
        'Industry': col,
        'MAE': Historical_Mean_MAE[col],
        'MSE': Historical_Mean_MSE[col],
        'R2': Historical_Mean_R2[col],
    }
    for col in Y_df.columns
]
performance_metrics = pd.DataFrame(metric_rows, columns=['Industry', 'MAE', 'MSE', 'R2'])

# Calculate the mean across all industries and add it
mean_mae = performance_metrics['MAE'].mean()
mean_mse = performance_metrics['MSE'].mean()
mean_r2 = performance_metrics['R2'].mean()

# The summary row gets the next integer label after the per-industry rows
# (0..n-1), matching the original row label.
mean_row = pd.DataFrame(
    [{'Industry': 'Mean', 'MAE': mean_mae, 'MSE': mean_mse, 'R2': mean_r2}],
    index=[len(Y_df.columns)],
)
performance_metrics = pd.concat([performance_metrics, mean_row])

# Display the performance metrics DataFrame
print(performance_metrics)

In [None]:
# The in-sample window (first train_start_size rows) is the same for every
# target, so it is sliced once up front.
in_sample_data = Y_df.iloc[:train_start_size]

# For each target: forecast every in-sample row with the in-sample mean and
# record the resulting MSE.
Historical_Mean_In_Sample_MSE = {}
for col in Y_df.columns:
    in_sample_target = in_sample_data[col]
    in_sample_historical_mean = in_sample_target.mean()
    in_sample_historical_mean_forecast = pd.Series(
        [in_sample_historical_mean] * len(in_sample_data),
        index=in_sample_target.index,
    )
    Historical_Mean_In_Sample_MSE[col] = mean_squared_error(
        in_sample_target, in_sample_historical_mean_forecast
    )

# Report the per-target in-sample MSE
for col in Y_df.columns:
    in_sample_mse = Historical_Mean_In_Sample_MSE[col]
    print(f"Historical Mean In-Sample MSE for {col}: {in_sample_mse}")

# Plain average of the per-target values
mse_total = sum(Historical_Mean_In_Sample_MSE.values())
mean_in_sample_mse = mse_total / len(Historical_Mean_In_Sample_MSE)

print(f"Mean In-Sample MSE for Historical Mean Model: {mean_in_sample_mse}")

In [None]:
# Single-column table of in-sample MSE values, one row per target, followed by
# a final "Mean" row.
results_df = pd.DataFrame(columns=["Historical_Mean_In_Sample_MSE"])

labelled_mse = [(col, Historical_Mean_In_Sample_MSE[col]) for col in Y_df.columns]
labelled_mse.append(("Mean", mean_in_sample_mse))

# Rows are added in order: targets first, the mean last.
for row_label, mse_value in labelled_mse:
    results_df.loc[row_label] = [mse_value]

print(results_df)

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Raw array views: X = columns after the first and before the last 30,
# Y = the last 30 columns.
X = df.iloc[:, 1:-30].values
Y = df.iloc[:, -30:].values

feature_names = df.columns[1:18]  # Extract the feature names (columns 1 to 17)
target_names = df.columns[-30:]  # Extract the target names (last 30 columns)

X_df = df[feature_names]
Y_df = df[target_names]

# Standardise the features. index= keeps the scaled frame on the same row
# labels as df; without it the frame would get a fresh 0..n-1 index and the
# index-aligned concat below would misalign rows for any non-default index.
scaler = StandardScaler()
X_df = pd.DataFrame(scaler.fit_transform(X_df), columns=X_df.columns, index=X_df.index)

data = pd.concat([Y_df, X_df], axis=1)

train_start_size = 200
coefficients_list = []

# The training features are identical for every target, so slice them once.
train_X = X_df.iloc[:train_start_size]

# Fit one OLS regression per target on the first train_start_size rows and
# keep the slope coefficients (the intercept is not stored).
for target in target_names:
    train_Y = Y_df.iloc[:train_start_size][target]

    ols_model = LinearRegression().fit(train_X, train_Y)
    coefficients = ols_model.coef_
    coefficients_list.append(coefficients)

# Rows = targets, columns = features.
coefficients_df = pd.DataFrame(coefficients_list, columns=feature_names, index=target_names)

print(coefficients_df)

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import summary_table

def hac_standard_errors(X, y, maxlags=1):
    """Return HAC standard errors of the slope coefficients of an OLS fit.

    Fits ``y`` on ``X`` plus a constant added with ``sm.add_constant`` and
    requests the 'HAC' covariance type from statsmodels.

    Parameters
    ----------
    X : regressors, passed to ``sm.add_constant``.
    y : dependent variable.
    maxlags : int, default 1
        Lag count forwarded to the HAC covariance estimator. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    The fitted model's ``bse`` with its first entry dropped, i.e. the
    standard errors without the constant term.

    NOTE(review): the ``[1:]`` slice assumes the constant is the first column;
    confirm X never already contains a constant column, in which case
    ``add_constant`` may not prepend one.
    """
    model = sm.OLS(y, sm.add_constant(X)).fit(cov_type='HAC', cov_kwds={'maxlags': maxlags})
    return model.bse[1:]

# The regressor frame is the same for every target, so it is selected once.
# NOTE(review): despite the name, X_train here is every row of `data`, not
# just the first train_start_size rows — confirm that is intended.
X_train = data[feature_names]

# HAC standard errors of the slope coefficients, one entry per target column.
hac_se = {}
for col in Y_df.columns:
    y_train = data[col]
    hac_se[col] = hac_standard_errors(X_train, y_train)

# Rows = features, columns = targets.
hac_se_df = pd.DataFrame(hac_se, index=feature_names)

In [None]:
from scipy import stats

def hypothesis_tests(X, y, feature_names, standard_errors, alpha=0.05, maxlags=1):
    """Fit OLS with HAC covariance and tabulate per-feature test results.

    Parameters
    ----------
    X : regressors; a constant is added with ``sm.add_constant``.
        Presumably a pandas DataFrame — the confidence intervals are sliced
        with ``.iloc``, which a plain array result would not support.
    y : dependent variable.
    feature_names : labels used as the index of the returned frame.
    standard_errors : unused; kept so existing callers keep working.
    alpha : float, default 0.05
        Significance level passed to ``conf_int``.
    maxlags : int, default 1
        Lag count forwarded to the HAC covariance estimator. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame indexed by ``feature_names`` with columns ``t_stat``,
    ``p_value``, ``lower_ci`` and ``upper_ci``; the constant term is excluded.
    """
    model = sm.OLS(y, sm.add_constant(X)).fit(cov_type='HAC', cov_kwds={'maxlags': maxlags})
    t_stats = model.tvalues[1:]  # exclude the constant term
    p_values = model.pvalues[1:]  # exclude the constant term
    # The original also computed a t critical value here that was never used;
    # it has been removed.
    confidence_intervals = model.conf_int(alpha).iloc[1:]  # exclude the constant term
    results = pd.DataFrame({'t_stat': t_stats,
                            'p_value': p_values,
                            'lower_ci': confidence_intervals[0],
                            'upper_ci': confidence_intervals[1]},
                           index=feature_names)
    return results

# Same regressor frame for every target, selected once.
X_train = data[feature_names]
design_matrix = sm.add_constant(X_train)

hypothesis_results = {}       # target -> fitted OLS results (HAC covariance)
hypothesis_test_results = {}  # target -> per-feature test table

# Single pass per target: fit the model, then tabulate its test statistics.
for col in Y_df.columns:
    y_train = data[col]
    model = sm.OLS(y_train, design_matrix).fit(cov_type='HAC', cov_kwds={'maxlags': 1})
    hypothesis_results[col] = model

    # Drop the first row/entry everywhere: that is the constant term.
    t_stats = model.tvalues[1:]
    p_values = model.pvalues[1:]
    confidence_intervals = model.conf_int().iloc[1:]

    per_feature_columns = {
        't_stat': t_stats,
        'p_value': p_values,
        'lower_ci': confidence_intervals[0],
        'upper_ci': confidence_intervals[1],
    }
    hypothesis_test_results[col] = pd.DataFrame(per_feature_columns, index=feature_names)

# Side-by-side tables with two column levels: (target, statistic).
hypothesis_results_df = pd.concat(hypothesis_test_results, axis=1)

In [None]:
# Print the combined table: features as rows, (target, statistic) as columns.
print(hypothesis_results_df)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Take the 't_stat' entry of every target from the second column level.
t_stats_df = hypothesis_results_df.xs('t_stat', level=1, axis=1)

# Two-sided t critical value at level alpha, using the smallest residual
# degrees of freedom among the fitted models.
# NOTE(review): critical_value is computed but not used by the plot below.
alpha = 0.05
min_df_resid = min(fitted.df_resid for fitted in hypothesis_results.values())
critical_value = stats.t.ppf(1 - alpha / 2, min_df_resid)

fig, ax = plt.subplots(figsize=(18, 10))
sns.heatmap(t_stats_df, cmap="coolwarm", annot=True, cbar_kws={'label': 't-statistic'}, ax=ax)

# Dashed red lines at y=0 and y=number of rows.
for y_position in (0, len(t_stats_df)):
    ax.axhline(y=y_position, color='red', linestyle='--')

ax.set_title('T-Statistics for All Coefficients')
plt.show()

In [None]:
# Take the 'p_value' entry of every target from the second column level.
p_values_df = hypothesis_results_df.xs('p_value', level=1, axis=1)

# Create a boolean DataFrame on the p-value
significant_coefficients_df = p_values_df < 0.05

plt.figure(figsize=(12, 6))
# Plot a 0/1 integer view rather than the boolean frame: colour normalisation
# subtracts the data limits, and boolean arrays do not support subtraction in
# numpy, so passing booleans can fail depending on library versions.
# vmin/vmax pin the colour scale so it stays the same even when every cell
# has the same value.
sns.heatmap(significant_coefficients_df.astype(int), cmap="coolwarm", vmin=0, vmax=1,
            cbar_kws={'label': 'Significance (True: Significant, False: Not Significant)'})

plt.title('Significance of Coefficients (p-value < 0.05)')
plt.show()