# 02 ARIMA Modeling
- ARIMA/SARIMA model for TSLA
- Parameter tuning (auto_arima)
- Model evaluation (MAE, RMSE, MAPE)

# 02 ARIMA Modeling
This notebook demonstrates how to build, tune, and evaluate an ARIMA/SARIMA model for TSLA stock price forecasting.

In [17]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pmdarima import auto_arima
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [20]:
# Load processed TSLA data with robust date parsing and validation
file_path = '../data/processed/TSLA_processed.csv'
df = pd.read_csv(file_path, parse_dates=['Date'])  # Ensure Date is parsed as datetime

# Check for missing or malformed dates
if df['Date'].isnull().any():
    print("Warning: There are missing or malformed dates in your data.")
    df = df.dropna(subset=['Date'])

df = df.set_index('Date').sort_index()
print("Date range in data:", df.index.min(), "to", df.index.max())
print(df.head())

# Choose a split_date that is within your data's date range
split_date = '2024-01-01'
if split_date < str(df.index.min()) or split_date > str(df.index.max()):
    print(f"split_date {split_date} is outside the data range. Please adjust it.")
else:
    # Split data into train and test sets
    train = df.loc[df.index < split_date, 'Adj Close']
    test = df.loc[df.index >= split_date, 'Adj Close']
    print(f"Train shape: {train.shape}, Test shape: {test.shape}")
    print(f"Train range: {train.index.min()} to {train.index.max()}")
    print(f"Test range: {test.index.min()} to {test.index.max()}")

    # Only proceed if test set is not empty
    n_test = len(test)
    if n_test > 0:
        # Use auto_arima to find the best (p,d,q) parameters
        print('Running auto_arima for parameter selection...')
        stepwise_model = auto_arima(train, start_p=1, start_q=1, max_p=3, max_q=3, m=1,
                                    start_P=0, seasonal=False, d=None, trace=True,
                                    error_action='ignore', suppress_warnings=True, stepwise=True)
        print(f'Best ARIMA order: {stepwise_model.order}')

        # Fit SARIMAX model with the best parameters
        order = stepwise_model.order
        model = SARIMAX(train, order=order, enforce_stationarity=False, enforce_invertibility=False)
        model_fit = model.fit(disp=False)
        print(model_fit.summary())

        # Forecast over the test set period
        forecast = model_fit.forecast(steps=n_test)
        forecast = pd.Series(forecast, index=test.index)

        # Plot actual vs forecast
        plt.figure(figsize=(12,6))
        plt.plot(train.index, train, label='Train')
        plt.plot(test.index, test, label='Test', color='orange')
        plt.plot(forecast.index, forecast, label='Forecast', color='green')
        plt.title('ARIMA Forecast vs Actual')
        plt.xlabel('Date')
        plt.ylabel('Adj Close')
        plt.legend()
        plt.show()

        # Evaluate forecast performance
        mae = mean_absolute_error(test, forecast)
        rmse = np.sqrt(mean_squared_error(test, forecast))
        mape = np.mean(np.abs((test - forecast) / test)) * 100
        print(f'MAE: {mae:.4f}')
        print(f'RMSE: {rmse:.4f}')
        print(f'MAPE: {mape:.2f}%')

        # Plot residuals to check for patterns
        residuals = test - forecast
        plt.figure(figsize=(12,4))
        plt.plot(residuals)
        plt.title('Forecast Residuals (Test - Forecast)')
        plt.xlabel('Date')
        plt.ylabel('Residual')
        plt.show()

        plt.figure(figsize=(6,4))
        plt.hist(residuals, bins=30, edgecolor='k')
        plt.title('Distribution of Residuals')
        plt.xlabel('Residual')
        plt.ylabel('Frequency')
        plt.show()
    else:
        print("Test set is empty after splitting. Adjust your split_date or check your data.")

Date range in data: 1970-01-01 00:00:00.000000021 to 1970-01-01 00:00:00.000002536
                                    Price      Close       High        Low  \
Date                                                                         
1970-01-01 00:00:00.000000021  2015-07-29  17.587999  17.859333  17.466667   
1970-01-01 00:00:00.000000022  2015-07-30  17.785999  17.796000  17.474001   
1970-01-01 00:00:00.000000023  2015-07-31  17.743334  17.957333  17.674667   
1970-01-01 00:00:00.000000024  2015-08-03  17.332666  17.780666  17.138000   
1970-01-01 00:00:00.000000025  2015-08-04  17.752001  17.781334  17.222668   

                                    Open      Volume  Adj Close    Return  \
Date                                                                        
1970-01-01 00:00:00.000000021  17.618000  41851500.0  17.587999 -0.003776   
1970-01-01 00:00:00.000000022  17.512667  30519000.0  17.785999  0.011258   
1970-01-01 00:00:00.000000023  17.840000  33339000.0  17.74333

## Discussion
- Summarize the model's performance and discuss any patterns or issues observed in the residuals.
- Consider next steps: further tuning, adding exogenous variables, or comparing with LSTM.