### Forecasting Models on КРС 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import kpss
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA
from pmdarima import auto_arima
from statsmodels.graphics.tsaplots import plot_acf
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

from pylab import rcParams
from IPython.display import display
import math
from prophet import Prophet


import warnings
from statsmodels.tools.sm_exceptions import InterpolationWarning
warnings.simplefilter("ignore", category=InterpolationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)



#### Data preparation

In [2]:
# Read the КРС dataset from the Excel workbook and preview its first five rows.
df = pd.read_excel(io="Датасет по КРС.xlsx")
df.head(n=5)

Unnamed: 0,Регион,Период,КРС,Осадки,Поголовье: КРС,Температура
0,АКМОЛИНСКАЯ ОБЛАСТЬ,2015-01,4455.35,9.8,372560.0,-12.490323
1,АКМОЛИНСКАЯ ОБЛАСТЬ,2015-02,3654.2,9.8,399442.0,-10.192857
2,АКМОЛИНСКАЯ ОБЛАСТЬ,2015-03,4287.08,8.3,425605.0,-5.870968
3,АКМОЛИНСКАЯ ОБЛАСТЬ,2015-04,3923.21,8.8,440023.0,4.49
4,АКМОЛИНСКАЯ ОБЛАСТЬ,2015-05,3849.7,42.8,444647.0,14.574194


In [None]:
# Keep only the rows whose region is "АКМОЛИНСКАЯ ОБЛАСТЬ" (original index labels are retained).
df_akmola = df.loc[df["Регион"].eq("АКМОЛИНСКАЯ ОБЛАСТЬ")]
df_akmola

Unnamed: 0,Регион,Период,КРС,Осадки,Поголовье: КРС,Температура
0,АКМОЛИНСКАЯ ОБЛАСТЬ,2015-01,4455.35,9.8,372560.0,-12.490323
1,АКМОЛИНСКАЯ ОБЛАСТЬ,2015-02,3654.20,9.8,399442.0,-10.192857
2,АКМОЛИНСКАЯ ОБЛАСТЬ,2015-03,4287.08,8.3,425605.0,-5.870968
3,АКМОЛИНСКАЯ ОБЛАСТЬ,2015-04,3923.21,8.8,440023.0,4.490000
4,АКМОЛИНСКАЯ ОБЛАСТЬ,2015-05,3849.70,42.8,444647.0,14.574194
...,...,...,...,...,...,...
115,АКМОЛИНСКАЯ ОБЛАСТЬ,2024-08,2412.70,49.0,403674.0,17.338710
116,АКМОЛИНСКАЯ ОБЛАСТЬ,2024-09,2909.66,15.0,398450.0,11.300000
117,АКМОЛИНСКАЯ ОБЛАСТЬ,2024-10,2608.27,15.6,392916.0,3.832258
118,АКМОЛИНСКАЯ ОБЛАСТЬ,2024-11,3649.19,23.7,387565.0,-4.343333


In [None]:
# Per-column count of missing values in the regional slice.
df_akmola.isnull().sum()

Регион            0
Период            0
КРС               0
Осадки            0
Поголовье: КРС    0
Температура       0
dtype: int64

### Multivariate Regression

#### Check statistical significance

In [7]:
# Ordinary least squares of КРС on herd size ("Поголовье: КРС"), temperature
# ("Температура") and precipitation ("Осадки") for the df_akmola slice.
# statsmodels is used here because its summary reports per-coefficient p-values.
y = df_akmola["КРС"]
X = df_akmola[["Поголовье: КРС", "Температура", "Осадки"]]
X = sm.add_constant(X)  # intercept column; shows up as `const` in the summary

model = sm.OLS(y, X).fit()

# Print model summary.
# The region name is read positionally: a label lookup such as .loc[0] only
# works while the filtered frame still carries index label 0.
print(f"\n📍 Регион: {df_akmola['Регион'].iloc[0]}")
print(model.summary())



📍 Регион: АКМОЛИНСКАЯ ОБЛАСТЬ
                            OLS Regression Results                            
Dep. Variable:                    КРС   R-squared:                       0.377
Model:                            OLS   Adj. R-squared:                  0.361
Method:                 Least Squares   F-statistic:                     23.37
Date:                Fri, 11 Apr 2025   Prob (F-statistic):           6.63e-12
Time:                        15:56:10   Log-Likelihood:                -958.51
No. Observations:                 120   AIC:                             1925.
Df Residuals:                     116   BIC:                             1936.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const        

#### Forecast using MLR (multiple linear regression)

In [None]:
# Hold-out evaluation of a per-region multiple linear regression (MLR):
# for every region, fit on all but the last `forecast_horizon` rows and score
# the predictions for those held-out rows.
#
# NOTE(review): `df_model` is not defined in any cell visible above (only `df`
# and `df_akmola` are) — confirm which cell builds it, otherwise this cell
# fails under Restart Kernel -> Run All.

# Number of trailing rows per region held out for testing
forecast_horizon = 3

# Regressors and target used by every regional model
feature_cols = ["Поголовье: КРС", "Температура", "Осадки"]
target_col = "КРС"

# One metrics record per evaluated region
results = []

for region in df_model["Регион"].unique():
    if region == "РЕСПУБЛИКА КАЗАХСТАН":
        continue  # ⛔ skip this region (presumably the national aggregate — verify)

    # LinearRegression rejects NaN inputs, so incomplete rows are dropped before
    # the length check; the hold-out is therefore the last complete rows.
    df_r = (
        df_model[df_model["Регион"] == region]
        .dropna(subset=feature_cols + [target_col])
        .sort_values("Период")
    )

    if df_r.shape[0] < forecast_horizon + 12:
        continue  # skip if not enough data

    # Train-test split: the last `forecast_horizon` rows form the test set
    df_train = df_r.iloc[:-forecast_horizon]
    df_test = df_r.iloc[-forecast_horizon:]

    X_train = df_train[feature_cols]
    y_train = df_train[target_col]
    X_test = df_test[feature_cols]
    y_test = df_test[target_col]

    # Fit model
    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate metrics (MAPE is undefined if an actual value is exactly 0)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    mape = np.abs((y_test - y_pred) / y_test).mean() * 100

    results.append({
        "Регион": region,
        "RMSE": rmse,
        "MAE": mae,
        "MAPE (%)": mape,
        "R² (Train)": model.score(X_train, y_train)
    })

# Collect results; explicit columns keep the frame usable by the summary cell
# even when no region passed the filters above.
results_df = pd.DataFrame(
    results,
    columns=["Регион", "RMSE", "MAE", "MAPE (%)", "R² (Train)"],
)
# results_df.to_excel("mlr_forecast_evaluation_by_region.xlsx", index=False)

results_df


In [None]:
# Calculate and print mean metrics
mean_mape = results_df["MAPE (%)"].mean()
mean_rmse = results_df["RMSE"].mean()
mean_mae = results_df["MAE"].mean()

print(f"Средние метрики по регионам:")
print(f"Средний MAPE: {mean_mape:.2f}%")
print(f"Средний RMSE: {mean_rmse:.2f}")
print(f"Средний MAE: {mean_mae:.2f}")

#### Regression for Kazakhstan — Поголовье (herd size) is not statistically significant

In [None]:
# Simple OLS of КРС on herd size alone for the "РЕСПУБЛИКА КАЗАХСТАН" rows,
# restricted to rows where both columns are present.
df_kazakhstan = df_model.loc[
    df_model["Регион"] == "РЕСПУБЛИКА КАЗАХСТАН", ["КРС", "Поголовье: КРС"]
].dropna()
y = df_kazakhstan["КРС"]
X = sm.add_constant(df_kazakhstan[["Поголовье: КРС"]])

model = sm.OLS(y, X).fit()

# Region label followed by the statsmodels summary table
print("Регион: РЕСПУБЛИКА КАЗАХСТАН", model.summary(), sep="\n")

In [None]:
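# NOTE: this whole cell is commented out — a disabled Ridge variant of the per-region MLR hold-out evaluation.
# from sklearn.linear_model import Ridge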

# # Forecasting horizon
# forecast_horizon = 3

# # Container for summary results
# results = []

# # Loop through each region
# for region in df_model["Регион"].unique():
#     if region == "РЕСПУБЛИКА КАЗАХСТАН":
#         continue  # ⛔ skip this region
#     df_r = df_model[df_model["Регион"] == region].sort_values("Период")

#     if df_r.shape[0] < forecast_horizon + 12:
#         continue  # skip if not enough data

#     # Train-test split
#     df_train = df_r.iloc[:-forecast_horizon]
#     df_test = df_r.iloc[-forecast_horizon:]

#     X_train = df_train[["Поголовье: КРС", "Температура", "Осадки"]]
#     y_train = df_train["КРС"]
#     X_test = df_test[["Поголовье: КРС", "Температура", "Осадки"]]
#     y_test = df_test["КРС"]

#     # Fit model
#     model = Ridge(alpha=1.0).fit(X_train, y_train)
#     y_pred = model.predict(X_test)

#     # Calculate metrics
#     rmse = np.sqrt(mean_squared_error(y_test, y_pred))
#     mae = mean_absolute_error(y_test, y_pred)
#     mape = (abs((y_test - y_pred) / y_test).mean()) * 100

#     results.append({
#         "Регион": region,
#         "RMSE": rmse,
#         "MAE": mae,
#         "MAPE (%)": mape,
#         "R² (Train)": model.score(X_train, y_train)
#     })

#     # Plot actual vs forecast
#     plt.figure(figsize=(6, 3))
#     plt.plot(df_test["Период"], y_test, marker="o", label="Факт")
#     plt.plot(df_test["Период"], y_pred, marker="s", label="Прогноз")
#     plt.title(f"{region} — Прогноз объема КРС на {forecast_horizon} месяца")
#     plt.xlabel("Месяц")
#     plt.ylabel("Объем КРС")
#     plt.legend()
#     plt.xticks(rotation=45)
#     plt.grid(True)
#     plt.tight_layout()
#     plt.show()

# # Save results
# results_df = pd.DataFrame(results)
# results_df.to_excel("ridge_forecast_evaluation_by_region.xlsx", index=False)

# print("✅ Готово! Метрики и графики прогноза рассчитаны.")


In [None]:
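# NOTE: this whole cell is commented out — a disabled 12-month-horizon variant of the per-region MLR hold-out evaluation.
# # Forecasting horizon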
# forecast_horizon = 12

# # Container for summary results
# results = []

# # Loop through each region
# for region in df_model["Регион"].unique():
#     if region == "РЕСПУБЛИКА КАЗАХСТАН":
#         continue  # ⛔ skip this region
#     df_r = df_model[df_model["Регион"] == region].sort_values("Период")

#     if df_r.shape[0] < forecast_horizon + 12:
#         continue  # skip if not enough data

#     # Train-test split
#     df_train = df_r.iloc[:-forecast_horizon]
#     df_test = df_r.iloc[-forecast_horizon:]

#     X_train = df_train[["Поголовье: КРС", "Температура", "Осадки"]]
#     y_train = df_train["КРС"]
#     X_test = df_test[["Поголовье: КРС", "Температура", "Осадки"]]
#     y_test = df_test["КРС"]

#     # Fit model
#     model = LinearRegression().fit(X_train, y_train)
#     y_pred = model.predict(X_test)

#     # Calculate metrics
#     rmse = np.sqrt(mean_squared_error(y_test, y_pred))
#     mae = mean_absolute_error(y_test, y_pred)
#     mape = (abs((y_test - y_pred) / y_test).mean()) * 100

#     results.append({
#         "Регион": region,
#         "RMSE": rmse,
#         "MAE": mae,
#         "MAPE (%)": mape,
#         "R² (Train)": model.score(X_train, y_train)
#     })

#     # 🟦 Optional: Plot actual vs forecast
#     plt.figure(figsize=(8, 4))
#     plt.plot(df_test["Период"], y_test, marker="o", label="Факт")
#     plt.plot(df_test["Период"], y_pred, marker="s", label="Прогноз")
#     plt.title(f"{region} — Прогноз объема КРС на {forecast_horizon} месяца")
#     plt.xlabel("Месяц")
#     plt.ylabel("Объем КРС")
#     plt.legend()
#     plt.xticks(rotation=45)
#     plt.grid(True)
#     plt.tight_layout()
#     plt.show()

# # Save results
# results_df = pd.DataFrame(results)
# results_df.to_excel("mlr_forecast_evaluation_by_region_12m.xlsx", index=False)

# print("✅ Готово! Метрики и графики прогноза рассчитаны.")


In [None]:
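# NOTE: this whole cell is commented out. Despite the name, the training slice below is
# df_r.iloc[:train_end], i.e. an expanding window that starts at 12 rows, not a fixed-width sliding one.
# def sliding_window_evaluation(df_r, forecast_horizon=3, window_count=9):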
#     metrics = []

#     # Ensure sorted by time
#     df_r = df_r.sort_values("Период").reset_index(drop=True)

#     for i in range(window_count):
#         # Define train and test split
#         train_end = i + 12  # require 12+ months for training
#         test_start = train_end
#         test_end = test_start + forecast_horizon

#         if test_end > len(df_r):
#             break

#         df_train = df_r.iloc[:train_end]
#         df_test = df_r.iloc[test_start:test_end]

#         X_train = df_train[["Поголовье: КРС", "Температура", "Осадки"]]
#         y_train = df_train["КРС"]
#         X_test = df_test[["Поголовье: КРС", "Температура", "Осадки"]]
#         y_test = df_test["КРС"]

#         # Fit model
#         model = LinearRegression().fit(X_train, y_train)
#         y_pred = model.predict(X_test)

#         # Calculate metrics
#         rmse = np.sqrt(mean_squared_error(y_test, y_pred))
#         mae = mean_absolute_error(y_test, y_pred)
#         mape = (np.abs((y_test - y_pred) / y_test).mean()) * 100

#         metrics.append({"RMSE": rmse, "MAE": mae, "MAPE": mape})

#     # Aggregate metrics over all sliding windows
#     return pd.DataFrame(metrics).mean().to_dict()

# results = []

# for region in df_model["Регион"].unique():
#     if region == "РЕСПУБЛИКА КАЗАХСТАН":
#         continue  # Optional: exclude a region

#     df_r = df_model[df_model["Регион"] == region][["Период", "КРС", "Поголовье: КРС", "Температура", "Осадки"]].dropna()

#     if df_r.shape[0] < 24:
#         continue  # need enough data for multiple windows

#     metrics = sliding_window_evaluation(df_r)

#     results.append({
#         "Регион": region,
#         "Avg RMSE": metrics["RMSE"],
#         "Avg MAE": metrics["MAE"],
#         "Avg MAPE (%)": metrics["MAPE"]
#     })

# # Save result
# results_df = pd.DataFrame(results)
# results_df.to_excel("sliding_window_forecast_evaluation.xlsx", index=False)

# print("✅ Готово! Средние метрики по скользящим окнам сохранены.")



### Autoregressive Integrated Moving Average (ARIMA)

In [None]:
# Container for forecast evaluation results
results = []

# Loop over each region
for region in df_model["Регион"].unique():
    df_r = df_model[df_model["Регион"] == region].sort_values("Период")
    y = df_r["КРС"].reset_index(drop=True)

    if y.isna().sum() > 0 or len(y) < 24:  # Need at least 2 full seasons
        continue

    # Split data
    test_size = 3
    y_train = y[:-test_size]
    y_test = y[-test_size:]

    try:
        # Auto SARIMA model selection
        model = auto_arima(
            y_train,
            seasonal=True,        # Enable SARIMA
            m=12,                 # Season length (12 months)
            stepwise=True,
            suppress_warnings=True,
            error_action="ignore"
        )

        # Forecast
        forecast = model.predict(n_periods=test_size)

        # Evaluation
        rmse = np.sqrt(mean_squared_error(y_test, forecast))
        mae = mean_absolute_error(y_test, forecast)
        mape = (np.abs((y_test - forecast) / y_test).mean()) * 100

        results.append({
            "Регион": region,
            "SARIMA order": model.order,
            "Seasonal order": model.seasonal_order,
            "RMSE": rmse,
            "MAE": mae,
            "MAPE (%)": mape
        })

        # Plot actual vs forecast
        plt.figure(figsize=(8, 4))
        plt.plot(y.index, y, color="lightgray", label="Все данные")
        plt.plot(y_test.index, y_test, marker="o", color="black", label="Факт (тест)")
        plt.plot(y_test.index, forecast, marker="s", linestyle="--", color="red", label="Прогноз")
        plt.axvline(len(y_train) - 1, color="gray", linestyle="--", label="Граница Train/Test")
        plt.title(f"{region} — SARIMA{model.order}x{model.seasonal_order} Прогноз КРС")
        plt.legend()
        plt.grid(True)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    except Exception as e:
        print(f"⚠️ Ошибка в {region}: {e}")

# Save results to Excel
results_df = pd.DataFrame(results)
results_df
# results_df.to_excel("sarima_forecast_evaluation_all_regions.xlsx", index=False)


In [None]:
# Calculate and print mean metrics
mean_mape = results_df["MAPE (%)"].mean()
mean_rmse = results_df["RMSE"].mean()
mean_mae = results_df["MAE"].mean()

print(f"Средние метрики по регионам:")
print(f"Средний MAPE: {mean_mape:.2f}%")
print(f"Средний RMSE: {mean_rmse:.2f}")
print(f"Средний MAE: {mean_mae:.2f}")

In [None]:
# Step 1: Select one region's time series
region = "АКМОЛИНСКАЯ ОБЛАСТЬ"
df_r = df_model[df_model["Регион"] == region].sort_values("Период")
y = df_r["КРС"].reset_index(drop=True)

# Step 2: Check stationarity (ADF test)
result = adfuller(y)
print("ADF Statistic:", result[0])
print("p-value:", result[1])
if result[1] > 0.05:
    print("❌ Non-stationary — differencing needed.")
else:
    print("✅ Stationary")



In [None]:
# Step 3: first-order difference of the series, for use if the ADF check reports non-stationarity
y_diff = y.diff(periods=1).dropna()

In [None]:
# Step 4: fit ARIMA with order (p=1, d=1, q=1) as a starting point — adjust based on AIC later
model = ARIMA(y, order=(1, 1, 1))
model_fit = model.fit()

# Step 5: forecast the next 3 months
forecast = model_fit.forecast(steps=3)
print("🔮 Forecast for next 3 months:")
print(forecast)

# Step 6: plot the observed series and the forecast placed right after its last point
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y, label="Actual")
forecast_positions = np.arange(len(y), len(y) + 3)
ax.plot(forecast_positions, forecast, label="Forecast", marker="o")
ax.set_title(f"ARIMA Forecast — {region}")
ax.legend()
ax.grid(True)
plt.show()