In [10]:
import pandas as pd
from statsmodels.tsa.api import VAR
import numpy as np
import pickle

In [11]:
# Load training data
train = pd.read_csv('./splits_data/weather_data_train.csv', index_col=0, parse_dates=True)

In [12]:
# Basic data checks
print("Data shape:", train.shape)
print("Unique values per column:\n", train.nunique())
print("Correlation matrix (numeric columns only):\n", train.select_dtypes(include='number').corr())

Data shape: (103236, 5)
Unique values per column:
 city                  30
rainfall             823
precipitation_sum    823
temperature          142
wind_speed           437
dtype: int64
Correlation matrix (numeric columns only):
                    rainfall  precipitation_sum  temperature  wind_speed
rainfall           1.000000           1.000000    -0.235527   -0.083794
precipitation_sum  1.000000           1.000000    -0.235527   -0.083794
temperature       -0.235527          -0.235527     1.000000    0.314445
wind_speed        -0.083794          -0.083794     0.314445    1.000000


In [13]:
# Drop constant columns (only one unique value)
constant_cols = [col for col in train.columns if train[col].nunique() == 1]
if constant_cols:
    print(f"Dropping constant columns: {constant_cols}")
    train = train.drop(columns=constant_cols)

In [14]:
# Check remaining columns count
if train.shape[1] < 2:
    print("⚠️ Warning: Very few columns remain after dropping constants.")

In [15]:
# Drop rows with missing values
train = train.dropna()

# Select numeric columns only
train_numeric = train.select_dtypes(include='number')

# Drop any remaining missing values
train_numeric = train_numeric.dropna()

In [16]:
# Initialize VAR model with numeric data
model = VAR(train_numeric)

try:
    # Select lag order (maxlags=5)
    lag_order_results = model.select_order(maxlags=5)
    print("Lag order selection results:")
    print(lag_order_results.summary())

    # Choose lag with lowest AIC
    optimal_lag = lag_order_results.aic.idxmin()
    print(f"Optimal lag order by AIC: {optimal_lag}")

    # Fit VAR model
    var_model = model.fit(optimal_lag)
except np.linalg.LinAlgError:
    print("Warning: Singular matrix error during lag order selection.")
    print("Fitting VAR with lag=1 as fallback.")
    var_model = model.fit(1)

try:
    print(var_model.summary())
except np.linalg.LinAlgError:
    print("Singular matrix error during model summary printing. Summary skipped.")



  self._init_dates(dates, freq)



Fitting VAR with lag=1 as fallback.
Singular matrix error during model summary printing. Summary skipped.


In [17]:
# Optional: Check residual autocorrelation with Durbin-Watson
try:
    from statsmodels.stats.stattools import durbin_watson
    print("\nDurbin-Watson statistics for residuals:")
    dw_stats = durbin_watson(var_model.resid)
    for col_name, dw_val in zip(train_numeric.columns, dw_stats):
        print(f"{col_name}: {dw_val:.2f}")
except ImportError:
    print("statsmodels not installed; skipping Durbin-Watson test.")
except Exception as e:
    print(f"Error during Durbin-Watson test: {e}")


Durbin-Watson statistics for residuals:
rainfall: 2.42
precipitation_sum: 2.42
temperature: 2.08
wind_speed: 2.27


In [18]:
# Save trained VAR model with pickle
try:
    with open("var_model.pkl", "wb") as f:
        pickle.dump(var_model, f)
    print("\n VAR model saved as 'var_model.pkl'")
except Exception as e:
    print(f" Failed to save VAR model: {e}")

print("\n VAR model training completed.")


 VAR model saved as 'var_model.pkl'

 VAR model training completed.
