In [1]:
import pandas as pd


In [2]:
# Read the cleaned dataset from the working directory and preview
# its first five rows as the cell's rich output.
df = pd.read_csv("cleaned_dataset.csv")
df.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2012-01-03,58.485714,58.92857,58.42857,58.747143,50.765709,75555200
1,2012-01-04,58.57143,59.240002,58.468571,59.062859,51.038536,65005500
2,2012-01-05,59.278572,59.792858,58.952858,59.718571,51.605175,67817400
3,2012-01-06,59.967144,60.392857,59.888573,60.342857,52.14463,79573200
4,2012-01-09,60.785713,61.107143,60.192856,60.247143,52.061932,98506100


In [5]:
%%writefile app.py
import streamlit as st
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import numpy as np

# ---------------------------------------
# Streamlit Page Config
# ---------------------------------------
# Wide layout plus the app title. The leading chart emoji had been mangled
# (its UTF-8 bytes were decoded as cp1252); it is written as an escape so the
# source stays ASCII and cannot be re-mangled when the file is written out.
st.set_page_config(page_title="SARIMAX & ARIMAX Forecast", layout="wide")
st.title("\U0001F4C8 Stock Forecasting: SARIMAX & ARIMAX Comparison")

# ---------------------------------------
# 1. Load Dataset
# ---------------------------------------
@st.cache_data
def load_data(path="cleaned_dataset.csv"):
    """Load the dataset and return it as a daily-frequency, date-indexed frame.

    The first CSV column is parsed as dates and becomes the index. Rows whose
    date cannot be parsed are dropped, rows are sorted chronologically,
    duplicate dates keep their first occurrence, and the index is expanded to
    calendar-daily frequency with gaps forward-filled from the previous row.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to ``"cleaned_dataset.csv"``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by a daily ``DatetimeIndex``.

    Raises
    ------
    ValueError
        If no row has a parsable date in the first column.
    """
    raw = pd.read_csv(path)
    date_col = raw.columns[0]

    # Assign by column label rather than `raw.iloc[:, 0] = ...`: positional
    # assignment writes into the existing column and can leave it as object
    # dtype, whereas `asfreq` below needs a real DatetimeIndex.
    # NOTE(review): dayfirst=True is kept from the original; the previewed rows
    # look ISO formatted (YYYY-MM-DD) - confirm whether it is still wanted.
    raw[date_col] = pd.to_datetime(raw[date_col], errors="coerce", dayfirst=True)

    frame = (
        raw.dropna(subset=[date_col])
        .sort_values(by=date_col)
        .set_index(date_col)
    )
    frame = frame[~frame.index.duplicated(keep="first")]

    if frame.empty:
        # Fail here with a clear message instead of later on NaT index bounds.
        raise ValueError(f"No rows with a parsable date in column {date_col!r} of {path!r}.")

    # Calendar-daily index; missing days are padded with the previous row.
    return frame.asfreq("D", method="pad")

# Fetch the (cached) dataset and show its first five rows for a sanity check.
df = load_data()
st.subheader("Dataset Preview")
st.write(df.iloc[:5])

# ---------------------------------------
# 2. User Inputs
# ---------------------------------------
# Target series to model; every column of the loaded frame is offered.
st.subheader("Select Target Column")
target_column = st.selectbox("Target Column", df.columns)

# Optional regressors for the ARIMAX model (the target itself is excluded).
st.subheader("Select Exogenous Variables (for ARIMAX)")
exog_columns = st.multiselect("Select Exogenous Columns",
                              [col for col in df.columns if col != target_column])

st.subheader("Select Training Date Range")
data_start = df.index.min().date()
data_end = df.index.max().date()
# Bound both pickers to the dates actually present in the data. Without
# explicit bounds Streamlit limits the picker to a window around `value`
# (ten years either side per its docs), which need not match the dataset.
start_date = st.date_input("Start Date", value=data_start,
                           min_value=data_start, max_value=data_end)
end_date = st.date_input("End Date", value=data_end,
                         min_value=data_start, max_value=data_end)

# A reversed range would otherwise slice to an empty frame and be reported
# with the unrelated "at least 20 days" message below.
if start_date > end_date:
    st.error("Start Date must not be after End Date.")
    st.stop()

# Label-based slice on the DatetimeIndex; both endpoints are inclusive.
filtered_df = df.loc[str(start_date):str(end_date)]

if len(filtered_df) < 20:
    st.error("Please select at least 20 days of data.")
    st.stop()

st.write("Filtered Data:")
st.write(filtered_df.head())

# ---------------------------------------
# 3. SARIMAX Parameters
# ---------------------------------------
st.subheader("Define SARIMAX Parameters")
# Non-seasonal (p, d, q) order; also reused as the ARIMAX order further down.
p = st.number_input("SARIMAX p", min_value=0, value=1)
d = st.number_input("SARIMAX d", min_value=0, value=1)
q = st.number_input("SARIMAX q", min_value=0, value=1)
# Seasonal (P, D, Q) order, used by the SARIMAX model only.
P = st.number_input("SARIMAX P", min_value=0, value=1)
D = st.number_input("SARIMAX D", min_value=0, value=1)
Q = st.number_input("SARIMAX Q", min_value=0, value=1)
# The minimum is 2 because statsmodels rejects a seasonal period of 1
# ("Seasonal periodicity must be greater than 1"), so offering 1 could only
# ever produce a failed fit.
m = st.number_input("Seasonal Period (m)", min_value=2, value=7)

# Forecast horizon in days (the data is at daily frequency).
forecast_steps = st.number_input("Days to Forecast", min_value=1, value=30)

# ---------------------------------------
# 4. Train Models & Forecast
# ---------------------------------------
st.subheader("Train and Compare Models")


def _in_sample_rmse(actual, fitted, burn_in):
    """Return the in-sample RMSE, ignoring the first `burn_in` observations.

    The first d + D*m one-step-ahead fitted values of a differenced model come
    from the diffuse initialisation rather than from the data, so including
    them inflates the error. If `burn_in` is zero or would discard the whole
    sample, the full series is used.
    """
    if 0 < burn_in < len(actual):
        actual = actual.iloc[burn_in:]
        fitted = fitted.iloc[burn_in:]
    return np.sqrt(mean_squared_error(actual, fitted))


if st.button("Train and Forecast"):
    y = filtered_df[target_column]
    results = []

    sarimax_fit = None
    arimax_fit = None
    exog_data = None

    # ---------- SARIMAX ----------
    # Seasonal model on the target alone (no exogenous regressors).
    with st.spinner("Training SARIMAX..."):
        try:
            sarimax_model = SARIMAX(
                y,
                order=(p, d, q),
                seasonal_order=(P, D, Q, m),
                enforce_stationarity=False,
                enforce_invertibility=False
            )
            sarimax_fit = sarimax_model.fit()
            sarimax_rmse = _in_sample_rmse(y, sarimax_fit.fittedvalues, d + D * m)

            results.append({
                "Model": "SARIMAX",
                "AIC": sarimax_fit.aic,
                "BIC": sarimax_fit.bic,
                "RMSE": sarimax_rmse
            })

            st.success(f"SARIMAX trained successfully! RMSE: {sarimax_rmse:.4f}")

        except Exception as e:
            # Best effort: report the failure and still try ARIMAX below.
            st.warning(f"SARIMAX failed: {e}")

    # ---------- ARIMAX ----------
    # Non-seasonal model with the selected exogenous columns.
    if exog_columns:
        with st.spinner("Training ARIMAX..."):
            exog_data = filtered_df[exog_columns]
            try:
                arimax_model = SARIMAX(
                    y,
                    order=(p, d, q),
                    exog=exog_data,
                    enforce_stationarity=False,
                    enforce_invertibility=False
                )
                arimax_fit = arimax_model.fit()
                arimax_rmse = _in_sample_rmse(y, arimax_fit.fittedvalues, d)

                results.append({
                    "Model": "ARIMAX",
                    "AIC": arimax_fit.aic,
                    "BIC": arimax_fit.bic,
                    "RMSE": arimax_rmse
                })

                st.success(f"ARIMAX trained successfully! RMSE: {arimax_rmse:.4f}")

            except Exception as e:
                st.error(f"ARIMAX failed: {e}")
    else:
        st.info("No exogenous variables selected. ARIMAX skipped.")

    # ---------------------------------------
    # Compare & Select Best Model
    # ---------------------------------------
    if results:

        results_df = pd.DataFrame(results).sort_values("AIC")

        # CASE 1 -> only one model was trained (skip the comparison table)
        if len(results_df) == 1:
            only_model = results_df.iloc[0]

            st.info("Only one model was trained \u2014 skipping model comparison.")
            st.write(f"*Model:* {only_model['Model']}")
            st.write(f"*AIC:* {only_model['AIC']:.4f}")
            st.write(f"*BIC:* {only_model['BIC']:.4f}")
            st.write(f"*RMSE:* {only_model['RMSE']:.4f}")

            best_model_name = only_model["Model"]

        # CASE 2 -> multiple models: the lowest AIC wins
        else:
            st.write("### Model Comparison (sorted by AIC)")
            st.dataframe(results_df)

            best_model_name = results_df.loc[results_df["AIC"].idxmin(), "Model"]
            st.success(f"Best Model Selected: {best_model_name}")

        # Daily dates immediately following the training window; shared by the
        # exogenous frame and the forecast table.
        future_dates = pd.date_range(
            filtered_df.index[-1] + pd.Timedelta(days=1),
            periods=forecast_steps
        )

        # ---------- SELECT MODEL FOR FORECAST ----------
        if best_model_name == "SARIMAX":
            forecast_model = sarimax_fit
            exog_forecast = None

        else:
            forecast_model = arimax_fit
            # Future regressor values are unknown, so the last observed row is
            # held constant over the whole horizon (a naive assumption).
            last_exog = exog_data.iloc[-1:]
            exog_forecast = pd.concat([last_exog] * forecast_steps)
            exog_forecast.index = future_dates

        # ---------- FORECAST ----------
        forecast = forecast_model.get_forecast(steps=forecast_steps, exog=exog_forecast)
        forecast_mean = forecast.predicted_mean
        conf_int = forecast.conf_int()

        forecast_df = pd.DataFrame({
            "Date": future_dates,
            "Prediction": forecast_mean.values,
            "Lower": conf_int.iloc[:, 0].values,
            "Upper": conf_int.iloc[:, 1].values
        }).set_index("Date")

        st.subheader("Forecast")
        st.write(forecast_df)

        # ---------- PLOT ----------
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(filtered_df[target_column], label="Training Data")
        ax.plot(forecast_df["Prediction"], label=f"Forecast ({best_model_name})", linestyle="--")
        ax.fill_between(forecast_df.index, forecast_df["Lower"], forecast_df["Upper"],
                        color='orange', alpha=0.2)

        ax.legend()
        ax.set_xlabel("Date")
        ax.set_ylabel(target_column)
        ax.set_title(f"Forecast using {best_model_name}")

        st.pyplot(fig)
        # pyplot keeps every figure alive until it is closed; the script is
        # re-run on each interaction, so release the figure once rendered.
        plt.close(fig)

    else:
        st.error("No model could be trained. Please check your data or parameters.")

Overwriting app.py


In [4]:
%%writefile requirements.txt
streamlit
pandas
numpy
matplotlib
statsmodels
# app.py imports sklearn.metrics.mean_squared_error
scikit-learn

Writing requirements.txt


In [43]:
# List the column labels of the frame currently bound to `df`.
print(df.columns)

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')


In [45]:
# Count missing values in the chosen target column.
# NOTE(review): `target_column` is not assigned in any notebook cell visible
# here (only inside app.py), so this relies on existing kernel state - confirm.
n_missing = df[target_column].isna().sum()
print(n_missing)


0


In [46]:
# Keep only the rows where the target column has a value (rebinds `df`).
df = df[df[target_column].notna()]


In [47]:
# Re-count missing target values after the drop; should print 0.
remaining_missing = df[target_column].isna().sum()
print(remaining_missing)


0
