#MGT3011 – ECONOMETRICS
#Econometrics Experiments Assignment
##Homadhitya J P (21MIA1096)


#BANK OF BARODA STOCKS DATA

In [None]:
import pandas as pd

df = pd.read_csv('bankofbarodamain.csv')

df = df.dropna()

df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2019-08-22,94.199997,94.949997,89.75,90.550003,83.042824,1523095.0
1,2019-08-23,90.599998,94.550003,90.0,93.449997,85.702393,1604894.0
2,2019-08-26,99.75,99.800003,91.5,95.050003,87.169746,2701594.0
3,2019-08-27,97.199997,99.199997,96.599998,97.25,89.187347,2123031.0
4,2019-08-28,97.400002,97.650002,93.800003,94.5,86.665337,1273207.0


#1. DW - Durbin-Watson Statistic

In [None]:
import statsmodels.api as sm
from statsmodels.stats.stattools import durbin_watson

close_prices = df['Close']
X = sm.add_constant(range(len(close_prices)))
model = sm.OLS(close_prices, X).fit()
dw_statistic = durbin_watson(model.resid)

# Hypotheses
print("H0 (Null Hypothesis): No Autocorrelation.")
print("HA (Alternative Hypothesis): Autocorrelation exists.")

print(f"Durbin-Watson Statistic: {dw_statistic}")
if dw_statistic < 1.5 or dw_statistic > 2.5:
    print("Reject H0: Autocorrelation exists.")
else:
    print("Fail to Reject H0: No Autocorrelation.")


H0 (Null Hypothesis): No Autocorrelation.
HA (Alternative Hypothesis): Autocorrelation exists.
Durbin-Watson Statistic: 0.012467857417160819
Reject H0: Autocorrelation exists.


#2. BPG - Breusch-Pagan Test (Autocorrelation)

In [None]:
from statsmodels.stats.diagnostic import acorr_breusch_godfrey

bg_test = acorr_breusch_godfrey(model, nlags=1)

# Hypotheses
print("H0 (Null Hypothesis): No Autocorrelation.")
print("HA (Alternative Hypothesis): Autocorrelation exists.")

print(f"Breusch-Godfrey Test Statistic: {bg_test[0]}")
print(f"P-Value: {bg_test[1]}")

alpha = 0.05
if bg_test[1] < alpha:
    print("Reject H0: Autocorrelation exists.")
else:
    print("Fail to Reject H0: No Autocorrelation.")


H0 (Null Hypothesis): No Autocorrelation.
HA (Alternative Hypothesis): Autocorrelation exists.
Breusch-Godfrey Test Statistic: 1196.1546466875134
P-Value: 4.178330568201085e-262
Reject H0: Autocorrelation exists.


#3. ARCH - Autoregressive Conditional Heteroskedasticity Lagrange Multiplier

In [None]:
from statsmodels.stats.diagnostic import het_arch

arch_test = het_arch(model.resid)

# Hypotheses
print("H0 (Null Hypothesis): No Heteroscedasticity.")
print("HA (Alternative Hypothesis): Heteroscedasticity exists.")

print(f"ARCH LM Test Statistic: {arch_test[0]}")
print(f"P-Value: {arch_test[1]}")

alpha = 0.05
if arch_test[1] < alpha:
    print("Reject H0: Heteroscedasticity exists.")
else:
    print("Fail to Reject H0: No Heteroscedasticity.")


H0 (Null Hypothesis): No Heteroscedasticity.
HA (Alternative Hypothesis): Heteroscedasticity exists.
ARCH LM Test Statistic: 1160.6002663039778
P-Value: 4.531609738997045e-243
Reject H0: Heteroscedasticity exists.


#4. BP - Breusch-Pagan Test (Heteroscedasticity)

In [None]:
from statsmodels.stats.diagnostic import het_breuschpagan

bp_test = het_breuschpagan(model.resid, model.model.exog)

# Hypotheses
print("H0 (Null Hypothesis): No Heteroscedasticity.")
print("HA (Alternative Hypothesis): Heteroscedasticity exists.")

print(f"Breusch-Pagan Test Statistic: {bp_test[0]}")
print(f"P-Value: {bp_test[1]}")

alpha = 0.05
if bp_test[1] < alpha:
    print("Reject H0: Heteroscedasticity exists.")
else:
    print("Fail to Reject H0: No Heteroscedasticity.")


H0 (Null Hypothesis): No Heteroscedasticity.
HA (Alternative Hypothesis): Heteroscedasticity exists.
Breusch-Pagan Test Statistic: 104.40807757546123
P-Value: 1.646579242051769e-24
Reject H0: Heteroscedasticity exists.


#5. VIF - Variance Inflation Factor (Multicollinearity)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

X = df[['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']]
X = sm.add_constant(X)
vif_data = pd.DataFrame()
vif_data['Variable'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Hypotheses
print("H0 (Null Hypothesis): No Multicollinearity (acceptable VIF values).")
print("HA (Alternative Hypothesis): Multicollinearity exists (high VIF values).")

print(vif_data)

threshold = 10
for i, row in vif_data.iterrows():
    if row['VIF'] > threshold:
        print(f"Reject H0: Variable '{row['Variable']}' has high VIF, indicating multicollinearity.")
    else:
        print(f"Fail to Reject H0: Variable '{row['Variable']}' has acceptable VIF.")


H0 (Null Hypothesis): No Multicollinearity (acceptable VIF values).
HA (Alternative Hypothesis): Multicollinearity exists (high VIF values).
    Variable          VIF
0      const    16.402930
1       Open  2779.040979
2       High  4325.320059
3        Low  2533.525264
4      Close  4261.626178
5  Adj Close  1031.792378
6     Volume     1.387907
Reject H0: Variable 'const' has high VIF, indicating multicollinearity.
Reject H0: Variable 'Open' has high VIF, indicating multicollinearity.
Reject H0: Variable 'High' has high VIF, indicating multicollinearity.
Reject H0: Variable 'Low' has high VIF, indicating multicollinearity.
Reject H0: Variable 'Close' has high VIF, indicating multicollinearity.
Reject H0: Variable 'Adj Close' has high VIF, indicating multicollinearity.
Fail to Reject H0: Variable 'Volume' has acceptable VIF.


#6. Z1 - Z-Test: One Sample

In [None]:
import numpy as np
import scipy.stats as stats

population_mean = 95
sample_mean = np.mean(df['Close'])
sample_std = np.std(df['Close'], ddof=1)
n = len(df['Close'])

z_statistic = (sample_mean - population_mean) / (sample_std / np.sqrt(n))
p_value = 2 * (1 - stats.norm.cdf(abs(z_statistic)))

# Hypotheses
print("H0 (Null Hypothesis): The sample mean is equal to the population mean.")
print("HA (Alternative Hypothesis): The sample mean is different from the population mean.")

print(f"One-Sample Z Test Statistic: {z_statistic}")
print(f"P-Value: {p_value}")

alpha = 0.05
if p_value < alpha:
    print("Reject H0: The sample mean is different from the population mean.")
else:
    print("Fail to Reject H0: The sample mean is equal to the population mean.")


H0 (Null Hypothesis): The sample mean is equal to the population mean.
HA (Alternative Hypothesis): The sample mean is different from the population mean.
One-Sample Z Test Statistic: 17.037106863446187
P-Value: 0.0
Reject H0: The sample mean is different from the population mean.


#7. Z2 - Z-Test: Two Sample

In [None]:
import numpy as np
import scipy.stats as stats

# Define two independent samples (Example: dividing the data into two time periods)
sample1 = df['Close'][:len(df)//2]  # First half of the 'Close' prices
sample2 = df['Close'][len(df)//2:]   # Second half of the 'Close' prices

# Calculate means and standard deviations for both samples
mean1, mean2 = np.mean(sample1), np.mean(sample2)
std1, std2 = np.std(sample1, ddof=1), np.std(sample2, ddof=1)
n1, n2 = len(sample1), len(sample2)

# Calculate the Z statistic
z_statistic = (mean1 - mean2) / np.sqrt((std1**2 / n1) + (std2**2 / n2))

# Calculate the two-tailed p-value
p_value = 2 * (1 - stats.norm.cdf(abs(z_statistic)))

# Hypotheses
print("H0 (Null Hypothesis): The means of the two samples are equal.")
print("HA (Alternative Hypothesis): The means of the two samples are not equal.")

# Output results
print(f"Two-Sample Z-Test Statistic: {z_statistic}")
print(f"P-Value: {p_value}")

# Decision based on significance level
alpha = 0.05
if p_value < alpha:
    print("Reject H0: The means of the two samples are significantly different.")
else:
    print("Fail to Reject H0: The means of the two samples are not significantly different.")


H0 (Null Hypothesis): The means of the two samples are equal.
HA (Alternative Hypothesis): The means of the two samples are not equal.
Two-Sample Z-Test Statistic: -45.70520848045105
P-Value: 0.0
Reject H0: The means of the two samples are significantly different.


#8. T1 - One-Sample T-Test

In [None]:
import scipy.stats as stats

population_mean = 129  # Example value
t_statistic, p_value = stats.ttest_1samp(df['Close'], population_mean)

# Hypotheses
print("H0 (Null Hypothesis): The sample mean is equal to the population mean.")
print("HA (Alternative Hypothesis): The sample mean is not equal to the population mean.")

print(f"One-Sample T-Test Statistic: {t_statistic}")
print(f"P-Value: {p_value}")

alpha = 0.05
if p_value < alpha:
    print("Reject H0: The sample mean is not equal to the population mean.")
else:
    print("Fail to Reject H0: The sample mean is equal to the population mean.")


H0 (Null Hypothesis): The sample mean is equal to the population mean.
HA (Alternative Hypothesis): The sample mean is not equal to the population mean.
One-Sample T-Test Statistic: -0.06685368681416654
P-Value: 0.9467091820042363
Fail to Reject H0: The sample mean is equal to the population mean.


#9. T2 - Two-Sample T-Test

In [None]:
sample1 = df['Close'][:len(df)//2]
sample2 = df['Close'][len(df)//2:]
t_statistic, p_value = stats.ttest_ind(sample1, sample2)

# Hypotheses
print("H0 (Null Hypothesis): The means of the two samples are equal.")
print("HA (Alternative Hypothesis): The means of the two samples are not equal.")

print(f"Two-Sample T-Test Statistic: {t_statistic}")
print(f"P-Value: {p_value}")

alpha = 0.05
if p_value < alpha:
    print("Reject H0: The means of the two samples are not equal.")
else:
    print("Fail to Reject H0: The means of the two samples are equal.")


H0 (Null Hypothesis): The means of the two samples are equal.
HA (Alternative Hypothesis): The means of the two samples are not equal.
Two-Sample T-Test Statistic: -45.70520848045103
P-Value: 4.026854988285564e-266
Reject H0: The means of the two samples are not equal.


#10. χ² - Chi-Square Test Statistic

In [None]:
import numpy as np
expected_frequencies = np.full_like(df['Close'], np.mean(df['Close']))
chi2_statistic, p_value = stats.chisquare(df['Close'], f_exp=expected_frequencies)

# Hypotheses
print("H0 (Null Hypothesis): The observed distribution fits the expected distribution.")
print("HA (Alternative Hypothesis): The observed distribution does not fit the expected distribution.")

print(f"Chi-Square Test Statistic: {chi2_statistic}")
print(f"P-Value: {p_value}")

alpha = 0.05
if p_value < alpha:
    print("Reject H0: The observed distribution does not fit the expected distribution.")
else:
    print("Fail to Reject H0: The observed distribution fits the expected distribution.")


H0 (Null Hypothesis): The observed distribution fits the expected distribution.
HA (Alternative Hypothesis): The observed distribution does not fit the expected distribution.
Chi-Square Test Statistic: 45303.55068020599
P-Value: 0.0
Reject H0: The observed distribution does not fit the expected distribution.


# 11. ANOVA - One Way ANOVA F Test

In [None]:
group1 = df['Close'][:len(df)//3]
group2 = df['Close'][len(df)//3:2*len(df)//3]
group3 = df['Close'][2*len(df)//3:]
f_statistic, p_value = stats.f_oneway(group1, group2, group3)

# Hypotheses
print("H0 (Null Hypothesis): The means of the groups are equal.")
print("HA (Alternative Hypothesis): At least one group mean is different.")

print(f"ANOVA F-Test Statistic: {f_statistic}")
print(f"P-Value: {p_value}")

alpha = 0.05
if p_value < alpha:
    print("Reject H0: At least one group mean is different.")
else:
    print("Fail to Reject H0: The means of the groups are equal.")


H0 (Null Hypothesis): The means of the groups are equal.
HA (Alternative Hypothesis): At least one group mean is different.
ANOVA F-Test Statistic: 2877.0576728157307
P-Value: 0.0
Reject H0: At least one group mean is different.


#12. ADF - Dickey-Fuller Test (with Drift)

In [None]:
from statsmodels.tsa.stattools import adfuller

result = adfuller(df['Close'], regression='c')

# Hypotheses
print("H0 (Null Hypothesis): The time series has a unit root (non-stationary).")
print("HA (Alternative Hypothesis): The time series does not have a unit root (stationary).")

print(f"Dickey-Fuller Test Statistic: {result[0]}")
print(f"P-Value: {result[1]}")

alpha = 0.05
if result[1] < alpha:
    print("Reject H0: The time series is stationary.")
else:
    print("Fail to Reject H0: The time series is non-stationary.")


H0 (Null Hypothesis): The time series has a unit root (non-stationary).
HA (Alternative Hypothesis): The time series does not have a unit root (stationary).
Dickey-Fuller Test Statistic: 0.3771965209735524
P-Value: 0.9806218511778408
Fail to Reject H0: The time series is non-stationary.


#13. ADF-DriftTrend - Augmented Dickey-Fuller Test (with Drift and Trend)

In [None]:
result = adfuller(df['Close'], regression='ct')

print("H0 (Null Hypothesis): The time series has a unit root (non-stationary).")
print("HA (Alternative Hypothesis): The time series does not have a unit root (stationary).")

print(f"Augmented Dickey-Fuller Test Statistic: {result[0]}")
print(f"P-Value: {result[1]}")

if result[1] < alpha:
    print("Reject H0: The time series is stationary.")
else:
    print("Fail to Reject H0: The time series is non-stationary.")


H0 (Null Hypothesis): The time series has a unit root (non-stationary).
HA (Alternative Hypothesis): The time series does not have a unit root (stationary).
Augmented Dickey-Fuller Test Statistic: -2.329390923715192
P-Value: 0.41776620829006583
Fail to Reject H0: The time series is non-stationary.


#14. ADF-NC/NT - Augmented Dickey-Fuller Test (No Constant and No Trend)

In [None]:
result = adfuller(df['Close'], regression='n')

print("H0 (Null Hypothesis): The time series has a unit root (non-stationary).")
print("HA (Alternative Hypothesis): The time series does not have a unit root (stationary).")

print(f"Augmented Dickey-Fuller Test Statistic: {result[0]}")
print(f"P-Value: {result[1]}")

if result[1] < alpha:
    print("Reject H0: The time series is stationary.")
else:
    print("Fail to Reject H0: The time series is non-stationary.")


H0 (Null Hypothesis): The time series has a unit root (non-stationary).
HA (Alternative Hypothesis): The time series does not have a unit root (stationary).
Augmented Dickey-Fuller Test Statistic: 1.6478978308483072
P-Value: 0.9761106143152962
Fail to Reject H0: The time series is non-stationary.


#15. EVALUATION METRICS

In [None]:
import statsmodels.api as sm

# Define X and y (features and target)
X = df[['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']]
X = sm.add_constant(X)  # Add constant if the model needs an intercept
y = df['Close']

# Train the model
model = sm.OLS(y, X).fit()

# Generate predictions
predictions = model.predict(X)

# a) Mean Absolute Deviation (MAD)
mad = abs(df['Close'] - predictions).mean()
print(f"MAD: {mad}")

# b) Mean Absolute Percentage Error (MAPE)
mape = (abs((df['Close'] - predictions) / df['Close'])).mean() * 100
print(f"MAPE: {mape}%")

# c) Mean Absolute Error (MAE)
mae = abs(df['Close'] - predictions).mean()
print(f"MAE: {mae}")

# d) Mean Squared Error (MSE)
mse = ((df['Close'] - predictions) ** 2).mean()
print(f"MSE: {mse}")

# e) Root Mean Squared Error (RMSE)
rmse = ((df['Close'] - predictions) ** 2).mean() ** 0.5
print(f"RMSE: {rmse}")


MAD: 3.889660523495159e-11
MAPE: 4.51850396963058e-11%
MAE: 3.889660523495159e-11
MSE: 2.543762178506042e-21
RMSE: 5.043572323766203e-11


#16. ARIMA - Autoregressive Integrated Moving Average

In [None]:
from statsmodels.tsa.arima.model import ARIMA

order = (1, 1, 1)  # Set ARIMA order based on your data
arima_model = ARIMA(df['Close'], order=order)
arima_result = arima_model.fit()

# Print the summary
print(arima_result.summary())

# Interpretation based on model significance
print("H0 (Null Hypothesis): The ARIMA model adequately fits the time series data.")
print("HA (Alternative Hypothesis): The ARIMA model does not adequately fit the time series data.")

# Using AIC as a model fit indicator (lower AIC suggests a better fit)
aic_threshold = 1000  # Define an AIC threshold (or compare with alternative models)
print(f"AIC: {arima_result.aic}")
if arima_result.aic < aic_threshold:
    print("Reject H0: The ARIMA model fits the time series data adequately.")
else:
    print("Fail to Reject H0: The ARIMA model does not adequately fit the time series data.")


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                               SARIMAX Results                                
Dep. Variable:                  Close   No. Observations:                 1216
Model:                 ARIMA(1, 1, 1)   Log Likelihood               -3240.714
Date:                Sun, 03 Nov 2024   AIC                           6487.428
Time:                        12:13:35   BIC                           6502.735
Sample:                             0   HQIC                          6493.190
                               - 1216                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.2515      0.159      1.581      0.114      -0.060       0.563
ma.L1         -0.3719      0.159     -2.336      0.019      -0.684      -0.060
sigma2        12.1413      0.196     61.881      0.0

#17. VAR - Vector Autoregression

In [None]:
from statsmodels.tsa.api import VAR

data = df[['Open', 'Close']].dropna()
var_model = VAR(data)
var_result = var_model.fit(maxlags=15, ic='aic')

print(var_result.summary())

print("H0 (Null Hypothesis): No Granger causality between the time series.")
print("HA (Alternative Hypothesis): Granger causality exists between the time series.")

# Using p-value to interpret Granger causality for each lag
for col in var_result.pvalues.columns:
    p_value = var_result.pvalues[col].max()
    if p_value < 0.05:
        print(f"Reject H0 for {col}: Evidence of Granger causality at p-value {p_value}")
    else:
        print(f"Fail to Reject H0 for {col}: No evidence of Granger causality.")


  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Sun, 03, Nov, 2024
Time:                     12:13:35
--------------------------------------------------------------------
No. of Equations:         2.00000    BIC:                    3.12363
Nobs:                     1214.00    HQIC:                   3.09743
Log likelihood:          -5305.72    FPE:                    21.7935
AIC:                      3.08161    Det(Omega_mle):         21.6150
--------------------------------------------------------------------
Results for equation Open
              coefficient       std. error           t-stat            prob
---------------------------------------------------------------------------
const            0.012158         0.087120            0.140           0.889
L1.Open          0.014048         0.030904            0.455           0.649
L1.Close         0.981413         0.012673           77.440           0.000
L2.Op

  self._init_dates(dates, freq)


#18. GC - Granger Causality

In [None]:
from statsmodels.tsa.stattools import grangercausalitytests

max_lag = 5
granger_test_result = grangercausalitytests(data[['Open', 'Close']], max_lag, verbose=True)

print("H0 (Null Hypothesis): Variable X does not Granger-cause Variable Y.")
print("HA (Alternative Hypothesis): Variable X Granger-causes Variable Y.")

# Check if p-values for any lag are below significance level
significance_level = 0.05
for lag in range(1, max_lag + 1):
    p_value = granger_test_result[lag][0]['ssr_chi2test'][1]
    if p_value < significance_level:
        print(f"Reject H0 at lag {lag}: Evidence of Granger causality (p-value {p_value}).")
    else:
        print(f"Fail to Reject H0 at lag {lag}: No evidence of Granger causality.")





Granger Causality
number of lags (no zero) 1
ssr based F test:         F=6070.7904, p=0.0000  , df_denom=1212, df_num=1
ssr based chi2 test:   chi2=6085.8172, p=0.0000  , df=1
likelihood ratio test: chi2=2178.7893, p=0.0000  , df=1
parameter F test:         F=6070.7904, p=0.0000  , df_denom=1212, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=2998.9587, p=0.0000  , df_denom=1209, df_num=2
ssr based chi2 test:   chi2=6022.7226, p=0.0000  , df=2
likelihood ratio test: chi2=2167.2908, p=0.0000  , df=2
parameter F test:         F=2998.9587, p=0.0000  , df_denom=1209, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=1998.9785, p=0.0000  , df_denom=1206, df_num=3
ssr based chi2 test:   chi2=6031.7436, p=0.0000  , df=3
likelihood ratio test: chi2=2167.8488, p=0.0000  , df=3
parameter F test:         F=1998.9785, p=0.0000  , df_denom=1206, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=15

#19. GARCH - Generalized Autoregressive Conditional Heteroskedasticity

In [None]:
# Install the arch package if not already installed
!pip install arch

# Import arch_model from arch
from arch import arch_model

# Define and fit the GARCH model
garch_model = arch_model(df['Close'], vol='Garch', p=1, q=1)
garch_result = garch_model.fit()

# Print the model summary
print(garch_result.summary())

# Hypothesis statements
print("H0 (Null Hypothesis): No ARCH effects (constant variance).")
print("HA (Alternative Hypothesis): ARCH effects are present (changing variance).")

# Interpret the p-value of the ARCH term
p_value = garch_result.pvalues['omega']
if p_value < 0.05:
    print("Reject H0: ARCH effects are present.")
else:
    print("Fail to Reject H0: No ARCH effects detected.")


Iteration:      1,   Func. Count:      6,   Neg. LLF: 34978.414789355134
Iteration:      2,   Func. Count:     13,   Neg. LLF: 13738351.713982517
Iteration:      3,   Func. Count:     20,   Neg. LLF: 7543.543855733757
Iteration:      4,   Func. Count:     27,   Neg. LLF: 6113.4668087909595
Iteration:      5,   Func. Count:     33,   Neg. LLF: 6035.247476292059
Iteration:      6,   Func. Count:     38,   Neg. LLF: 6016.914140745712
Iteration:      7,   Func. Count:     43,   Neg. LLF: 6012.748430103173
Iteration:      8,   Func. Count:     48,   Neg. LLF: 6009.873105214272
Iteration:      9,   Func. Count:     53,   Neg. LLF: 5996.221157007655
Iteration:     10,   Func. Count:     58,   Neg. LLF: 5988.552589765737
Iteration:     11,   Func. Count:     63,   Neg. LLF: 5982.4801110669105
Iteration:     12,   Func. Count:     68,   Neg. LLF: 5967.687615162202
Iteration:     13,   Func. Count:     73,   Neg. LLF: 10999.17631698264
Iteration:     14,   Func. Count:     79,   Neg. LLF: 9546.2

#20. FEM - Fixed Effects Model

In [None]:
# Import necessary libraries
import pandas as pd
from linearmodels.panel import PanelOLS

# Load the dataset from a CSV file
# Ensure to update the file path to where your CSV file is located
file_path = 'bankofbarodamain.csv'  # Update this path
df = pd.read_csv(file_path)

# Convert 'Date' to datetime format
df['Date'] = pd.to_datetime(df['Date'])  # This will infer the format automatically

# Add an entity column (assuming we're dealing with one stock)
df['entity'] = 'Stock_A'  # You can modify this as necessary

# Set multi-index with entity and Date
df_panel = df.set_index(['entity', 'Date'])

# Ensure we have enough data for the model
if df_panel.shape[0] < 2:  # Check if there are at least two observations
    print("Not enough data points for the fixed effects model.")
else:
    # Define the fixed effects model without entity effects since there's only one entity
    fixed_effect_model = PanelOLS(df_panel['Close'], df_panel[['Open', 'High', 'Low', 'Volume']], entity_effects=False)

    # Fit the model
    fixed_effect_result = fixed_effect_model.fit()

    # Print the results
    print(fixed_effect_result.summary)  # Access without parentheses

    print("H0 (Null Hypothesis): No significant individual effects.")
    print("HA (Alternative Hypothesis): Significant individual effects present.")

    # Use F-statistic and p-value
    f_stat = fixed_effect_result.f_statistic

    # Correctly retrieve the p-value from the f_stat object
    f_stat_pvalue = f_stat.pval  # Use pval instead of pvalue

    if f_stat_pvalue < 0.05:
        print("Reject H0: Significant individual effects present.")
    else:
        print("Fail to Reject H0: No significant individual effects.")

                          PanelOLS Estimation Summary                           
Dep. Variable:                  Close   R-squared:                        0.9999
Estimator:                   PanelOLS   R-squared (Between):              1.0000
No. Observations:                1216   R-squared (Within):               0.9997
Date:                Sun, Nov 03 2024   R-squared (Overall):              0.9999
Time:                        12:13:46   Log-likelihood                   -1951.0
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   4.476e+06
Entities:                           1   P-value                           0.0000
Avg Obs:                       1216.0   Distribution:                  F(4,1212)
Min Obs:                       1216.0                                           
Max Obs:                       1216.0   F-statistic (robust):          4.476e+06
                            

#21. REM - Random Effects Model

In [None]:
# Import necessary libraries
import pandas as pd
from linearmodels.panel import RandomEffects

# Convert 'Date' to datetime format
df['Date'] = pd.to_datetime(df['Date'])  # This will infer the format automatically

# Add an entity column (assuming we're dealing with one stock)
df['entity'] = 'Stock_A'  # You can modify this as necessary

# Set multi-index with entity and Date
df_panel = df.set_index(['entity', 'Date'])

# Ensure we have enough data for the model
if df_panel.shape[0] < 2:  # Check if there are at least two observations
    print("Not enough data points for the random effects model.")
else:
    # Define the Random Effects model
    random_effect_model = RandomEffects(df_panel['Close'], df_panel[['Open', 'High', 'Low', 'Volume']])

    # Fit the model
    random_effect_result = random_effect_model.fit()

    # Print the results
    print(random_effect_result.summary)  # Access the summary directly

    print("H0 (Null Hypothesis): No significant individual effects.")
    print("HA (Alternative Hypothesis): Significant individual effects present.")

    # Display the R-squared value
    print(f"R-squared: {random_effect_result.rsquared}")

    # For random effects, we usually look at the p-values of the coefficients directly
    # To check for individual significance
    p_values = random_effect_result.pvalues
    print("P-values of the coefficients:")
    print(p_values)

    if (p_values < 0.05).any():  # Check if any p-value is significant
        print("Reject H0: Significant individual effects present.")
    else:
        print("Fail to Reject H0: No significant individual effects.")

                        RandomEffects Estimation Summary                        
Dep. Variable:                  Close   R-squared:                        0.9999
Estimator:              RandomEffects   R-squared (Between):              1.0000
No. Observations:                1216   R-squared (Within):               0.9997
Date:                Sun, Nov 03 2024   R-squared (Overall):              0.9999
Time:                        12:13:46   Log-likelihood                   -1951.0
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   4.476e+06
Entities:                           1   P-value                           0.0000
Avg Obs:                       1216.0   Distribution:                  F(4,1212)
Min Obs:                       1216.0                                           
Max Obs:                       1216.0   F-statistic (robust):          4.476e+06
                            

#22. ECM - Error Correction Model

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller

# Convert 'Date' to datetime format (no format specified for automatic inference)
df['Date'] = pd.to_datetime(df['Date'])

# Check for missing values in the original dataset
print("Initial missing values in the dataset:")
print(df.isnull().sum())

# Option 1: Drop rows with any missing values
df.dropna(inplace=True)

# Verify if there are still missing values
if df.isnull().values.any():
    print("Warning: There are still NaN values in the dataset after dropping.")
else:
    print("No missing values in the dataset after dropping.")

# Check for stationarity and perform differencing if necessary
def check_stationarity(series):
    result = adfuller(series)
    return result[1] <= 0.05  # Return True if p-value is less than 0.05 (stationary)

# Assume we're focusing on Close prices
if not check_stationarity(df['Close']):
    print("The 'Close' series is non-stationary; differencing will be applied.")
    df['Close_diff'] = df['Close'].diff()
else:
    df['Close_diff'] = df['Close']

# Create differenced independent variables
df[['Open_diff', 'High_diff', 'Low_diff', 'Volume_diff']] = df[['Open', 'High', 'Low', 'Volume']].diff()

# Drop NaN values resulting from differencing
df.dropna(inplace=True)

# Ensure that the dependent variable (Close_diff) and independent variables are aligned
Y = df['Close_diff']
X = df[['Open_diff', 'High_diff', 'Low_diff', 'Volume_diff']]

# Check if there are still any NaN values
if Y.isnull().any() or X.isnull().any().any():
    print("Warning: There are still NaN values in the data after dropping.")
else:
    # Add a constant term for the intercept
    X = sm.add_constant(X)

    # Fit the Error Correction Model (ECM)
    ecm_model = sm.OLS(Y, X).fit()

    # Print the results
    print(ecm_model.summary())

    # Interpretation of results
    print("H0 (Null Hypothesis): No significant relationship.")
    print("HA (Alternative Hypothesis): Significant relationship present.")

    # Check the p-values
    p_values = ecm_model.pvalues
    print("P-values of the coefficients:")
    print(p_values)

    if (p_values < 0.05).any():  # Check if any p-value is significant
        print("Reject H0: Significant relationship present.")
    else:
        print("Fail to Reject H0: No significant relationship.")

Initial missing values in the dataset:
Date          0
Open         20
High         20
Low          20
Close        20
Adj Close    20
Volume       20
entity        0
dtype: int64
No missing values in the dataset after dropping.
The 'Close' series is non-stationary; differencing will be applied.
                            OLS Regression Results                            
Dep. Variable:             Close_diff   R-squared:                       0.773
Model:                            OLS   Adj. R-squared:                  0.773
Method:                 Least Squares   F-statistic:                     1032.
Date:                Sun, 03 Nov 2024   Prob (F-statistic):               0.00
Time:                        12:13:47   Log-Likelihood:                -2347.4
No. Observations:                1215   AIC:                             4705.
Df Residuals:                    1210   BIC:                             4730.
Df Model:                           4                                  