# MLflow Experiment Setup

In [None]:
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import mlflow.pyfunc
from mlflow.models.signature import infer_signature
import joblib
import pickle

# --- Configure MLflow tracking (same store and experiment as the ARIMA notebook) ---
TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "model_experiment"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
print(f"MLflow experiment '{EXPERIMENT_NAME}' is set up and ready for tracking.")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import register_matplotlib_converters
from sklearn import metrics

from utils import *
from model import walk_forward_validation

register_matplotlib_converters()

Load Data

In [None]:
# Read the raw price file, move the parsed 'date' column into the index,
# and coerce every remaining column to float64.
price_frame = pd.read_csv('./vgi2.csv')
price_frame.index = pd.to_datetime(price_frame.pop('date'), format='%Y-%m-%d')
data = pd.DataFrame(price_frame, dtype=np.float64)


# train_split = data.index.get_loc('2021-10-29')
# close = data.pop('close')
# data.insert(5, 'close', close)
# data1 = data.iloc[809:, 0]  #3501, 5
# residuals = pd.read_csv('./ARIMA_residuals1.csv')
# residuals.index = pd.to_datetime(residuals['date'])  #trade_date
# residuals.pop('date')
# merge_data = pd.merge(data, residuals, on='date')
# #merge_data = merge_data.drop(labels='2007-01-04', axis=0)
# time = pd.Series(data.index[810:]) # 810 instead of 809: the first row is removed because of its NaN value, so the index shifts back by one step


Load ARIMA residuals and merge with close, open, high, low, nmVolume

In [None]:
# Read the ARIMA residuals and index them by their parsed 'date' column.
residuals = pd.read_csv('./ARIMA_residuals1.csv')
residuals.index = pd.to_datetime(residuals.pop('date'))

# Join prices and residuals on the shared 'date' index level
# (default inner join: only dates present in both frames are kept).
merge_data = data.merge(residuals, on='date')

In [None]:
residuals.head()

In [None]:
# Give the merged residual column (labelled '0' after the merge) a readable name,
# then display the resulting frame.
merge_data = merge_data.rename(columns={'0':'Residual'})
merge_data

Splitting Train - Validation - Test

In [None]:
# Chronological cut-off dates (each bound is inclusive on the right).
TRAIN_END = '2020-11-24'
VALID_END = '2021-10-29'
TEST_END = '2021-12-31'

dates = merge_data.index
train = merge_data[dates <= TRAIN_END]
valid = merge_data[(dates > TRAIN_END) & (dates <= VALID_END)]
test_set = merge_data[(dates > VALID_END) & (dates <= TEST_END)]

# Train + validation together form the training data for the final test run.
training_set = pd.concat([train, valid], axis=0)

print('train shape:', train.shape)
print('validation shape:', valid.shape)
print('test shape:', test_set.shape)

Load the Predictions from ARIMA model on Test set

In [None]:
# ARIMA predictions for the test period; the 'date' column is dropped in the next cell.
Lt = pd.read_csv('./ARIMA.csv')
Lt

In [None]:
# Keep only the prediction values and flatten them into a plain Python list.
# NOTE: not idempotent - re-running this cell without re-reading the CSV fails,
# because Lt is then already a list.
Lt = Lt.drop(columns='date').to_numpy().ravel().tolist()

Load the Predictions from ARIMA model on Validation set

In [None]:
# ARIMA predictions for the validation period; the 'date' column is dropped in the next cell.
Vt = pd.read_csv('./ARIMA_Validation.csv')
Vt

In [None]:
# Keep only the prediction values and flatten them into a plain Python list.
# NOTE: not idempotent - re-running this cell without re-reading the CSV fails,
# because Vt is then already a list.
Vt = Vt.drop(columns='date').to_numpy().ravel().tolist()

In [None]:
training_set


Convert to supervised data

In [None]:
def prepare_data_valiation(train_series, n_valid, n_in, n_out):
    """Convert a frame to supervised form and split off the last rows for validation.

    Parameters:
    - train_series: DataFrame holding the combined train + validation rows.
    - n_valid: number of rows of `train_series` that belong to the validation period.
    - n_in: number of lag steps passed to `series_to_supervised`.
    - n_out: number of forecast steps passed to `series_to_supervised`.

    Returns:
    - (train, valid, supervised_data): the two disjoint parts and the full
      supervised frame they were cut from.
    """
    values = train_series.values
    supervised_data = series_to_supervised(values, n_in, n_out)
    print('supervised_data', supervised_data)

    # Row label at which the validation period starts.
    # assumes series_to_supervised keeps integer row labels of the input
    # (dropping the first n_in rows without resetting the index) - TODO confirm
    idx = train_series.shape[0] - n_valid

    # `.loc[:idx]` and `.loc[idx:]` are both label-inclusive, which put row `idx`
    # into train AND valid. Boolean masks keep the two parts disjoint.
    in_valid = supervised_data.index >= idx
    train, valid = supervised_data.loc[~in_valid, :], supervised_data.loc[in_valid, :]
    return train, valid, supervised_data

In [None]:
def prepare_data_test(series, n_test, n_in, n_out):
    """Convert a frame to supervised form and split off the last rows for testing.

    Parameters:
    - series: DataFrame holding all rows (training followed by test period).
    - n_test: number of rows of `series` that belong to the test period.
    - n_in: number of lag steps passed to `series_to_supervised`.
    - n_out: number of forecast steps passed to `series_to_supervised`.

    Returns:
    - (train, test, supervised_data): the two disjoint parts and the full
      supervised frame they were cut from.
    """
    values = series.values
    supervised_data = series_to_supervised(values, n_in, n_out)
    print('supervised_data', supervised_data)

    # Row label at which the test period starts.
    # assumes series_to_supervised keeps integer row labels of the input
    # (dropping the first n_in rows without resetting the index) - TODO confirm
    idx = series.shape[0] - n_test

    # `.loc[:idx]` and `.loc[idx:]` are both label-inclusive, which put row `idx`
    # into train AND test. Boolean masks keep the two parts disjoint.
    in_test = supervised_data.index >= idx
    train, test = supervised_data.loc[~in_test, :], supervised_data.loc[in_test, :]
    return train, test, supervised_data

# Train and validation sets are split from training_set for the validation process

In [None]:
# Number of lagged time steps used as model input.
n_timestamp = 1

(
    train_supervised,
    valid_supervised,
    supervised_data_validation,
) = prepare_data_valiation(
    training_set,
    n_valid=valid.shape[0],
    n_in=n_timestamp,
    n_out=1,
)

In [None]:
print(train_supervised.shape)
print(valid_supervised.shape)


# Splitting for the testing process

In [None]:
# Number of lagged time steps used as model input (same value as in the validation split).
n_timestamp = 1

(
    training_set_supervised,
    test_supervised,
    supervised_data_test,
) = prepare_data_test(
    merge_data,
    n_test=test_set.shape[0],
    n_in=n_timestamp,
    n_out=1,
)

In [None]:

print(training_set_supervised.shape)
print(test_supervised.shape)

In [None]:
merge_data.tail(10)

In [None]:
supervised_data_test.tail(1).T

In [None]:
training_set_supervised.head()

Min Max Scale

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Validation scaler: min/max are learned from the training part only,
# then the same transform is applied to the validation part.
data_sc = MinMaxScaler(feature_range=(0, 1))
data_sc.fit(train_supervised)
train_scaled = data_sc.transform(train_supervised)
valid_scaled = data_sc.transform(valid_supervised)

In [None]:
# Test scaler: min/max are learned from train + validation,
# then the same transform is applied to the test part.
data_sc2 = MinMaxScaler(feature_range=(0, 1))
data_sc2.fit(training_set_supervised)
training_set_scaled = data_sc2.transform(training_set_supervised)
test_set_scaled = data_sc2.transform(test_supervised)

In [None]:
# Wrap the scaled arrays back into DataFrames carrying the supervised column
# labels (row labels restart at 0).
def _as_frame(scaled_values, template):
    """Build a DataFrame from `scaled_values` with the columns of `template`."""
    return pd.DataFrame(scaled_values, columns=template.columns)

train_scaled = _as_frame(train_scaled, train_supervised)
valid_scaled = _as_frame(valid_scaled, valid_supervised)

training_set_scaled = _as_frame(training_set_scaled, training_set_supervised)
test_set_scaled = _as_frame(test_set_scaled, test_supervised)


XGBoost Model with the data supervised from merge_data (close, open, high, low, nmVolume, Residual)

# ARIMA-XGBoost Model: XGBoost predicts the residuals, which are then added to the ARIMA predictions to obtain the final predictions

## Validation Process

In [None]:
# Dates of the validation period, used as the x-axis of the validation plots.
# NOTE(review): `time` is also the name of a standard-library module; a later
# `import time` would silently replace this variable.
time = valid.index
time

In [None]:
# Walk-forward evaluation on the validation part (values are still MinMax-scaled).
y, yhat = walk_forward_validation(train_scaled, valid_scaled)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(time, y, label='Residuals')
ax.plot(time, yhat, label='Predicted Residuals')
ax.set_title('ARIMA+XGBoost: Residuals Prediction')
ax.set_xlabel('Time', fontsize=12, verticalalignment='top')
ax.set_ylabel('Residuals', fontsize=14, horizontalalignment='center')
ax.legend()
plt.show()

In [None]:
# Column position of the residual used to undo the MinMax scaling
# ("5 means residual position" in close, open, high, low, nmVolume, Residual).
# NOTE(review): the supervised frame presumably also holds current-step columns,
# and the final XGBoost cell trains on the LAST column as its target - confirm
# that position 5 (rather than -1) is the column whose scale the predictions use.
residual_pos = 5

# `.iloc` makes the lookup explicitly positional; plain `Series[5]` on a
# label-indexed Series relies on a deprecated positional fallback in recent pandas.
train_min = train_supervised.min(axis=0).iloc[residual_pos]
train_max = train_supervised.max(axis=0).iloc[residual_pos]
print(train_min)
print(train_max)

In [None]:
# Validation set: invert the MinMax scaling, x * (max - min) + min.
train_span = train_max - train_min
y_hat_valid_unscaled = np.asarray(yhat) * train_span + train_min
y_valid_unscaled = np.asarray(y) * train_span + train_min

In [None]:
# Residual-level validation scores in original (unscaled) units.
# Signature reminder: evaluation_metric(y_test, y_hat) - actual values first.
evaluation_metric(y_valid_unscaled, y_hat_valid_unscaled )

# Signature reminder: GetMAPE(y_hat, y_test) - predictions first (opposite order).
GetMAPE(y_hat_valid_unscaled, y_valid_unscaled)



## MLflow Tracking: ARIMA-XGBoost Hybrid Model (Validation)

In [None]:
# Open the MLflow run for the ARIMA-XGBoost hybrid model; later cells resume
# the same run through `mlflow_hybrid_run_id`.
mlflow_hybrid_run_id = None

with mlflow.start_run(run_name="ARIMA_XGBoost_Hybrid") as run:
    mlflow_hybrid_run_id = run.info.run_id

    # Run configuration
    mlflow.log_params({
        "model_type": "ARIMA_XGBoost_Hybrid",
        "arima_order": "(0,1,0)",
        "n_timestamp": n_timestamp,
        "scaler_type": "MinMaxScaler",
        "scaler_range": "(0, 1)",
        "features": "close, open, high, low, nmVolume, Residual",
    })

    # Quality of the XGBoost residual predictions on the validation set
    val_residual_mse = metrics.mean_squared_error(y_valid_unscaled, y_hat_valid_unscaled)
    val_residual_rmse = np.sqrt(val_residual_mse)
    val_residual_mae = metrics.mean_absolute_error(y_valid_unscaled, y_hat_valid_unscaled)
    val_residual_r2 = metrics.r2_score(y_valid_unscaled, y_hat_valid_unscaled)
    val_residual_mape = GetMAPE(y_hat_valid_unscaled, y_valid_unscaled)

    mlflow.log_metrics({
        "val_residual_mse": val_residual_mse,
        "val_residual_rmse": val_residual_rmse,
        "val_residual_mae": val_residual_mae,
        "val_residual_r2": val_residual_r2,
        "val_residual_mape": val_residual_mape,
    })

    print(f"✅ ARIMA-XGBoost Hybrid (Residual Validation) logged to MLflow - Run ID: {run.info.run_id}")
    print(f"   Residual Validation RMSE: {val_residual_rmse:.5f}, MAPE: {val_residual_mape:.3f}%")

In [None]:
# Final validation forecast = ARIMA prediction + predicted residual, element by element.
finalpredicted_stock_price = [
    arima_pred + residual_pred
    for arima_pred, residual_pred in zip(Vt, y_hat_valid_unscaled)
]
evaluation_metric(valid.iloc[:,0], finalpredicted_stock_price)

print('MAPE_ARIMA-XGBoost_Validation:', GetMAPE(finalpredicted_stock_price, valid.iloc[:,0]), '%')

fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(time, valid.iloc[:,0], label='Stock Price')
ax.plot(time, finalpredicted_stock_price, label='Predicted Stock Price')
ax.set_title(f'ARIMA+XGBoost: Stock Price Prediction on Validation, n_timestamp = {n_timestamp}')
ax.set_xlabel('Time', fontsize=12, verticalalignment='top')
ax.set_ylabel('Close', fontsize=14, horizontalalignment='center')
ax.legend()
plt.show()

# n_timestamp = 0
# MSE: 0.92375
# RMSE: 0.96112
# MAE: 0.56679
# R2: 0.95247
# MAPE_ARIMA-XGBoost_Validation: 1.5584695259033352 %

# n_timestamp = 1
# MSE: 0.83644
# RMSE: 0.91457
# MAE: 0.57544
# R2: 0.95696
# MAPE_ARIMA-XGBoost_Validation: 1.5858401261736605 %

# n_timestamp = 6
# MSE: 1.11843
# RMSE: 1.05756
# MAE: 0.65351
# R2: 0.94245
# MAPE_ARIMA-XGBoost_Validation: 1.8010548292247548 %

In [None]:
# Resume the hybrid-model run and record the final (ARIMA + XGBoost) validation metrics.
with mlflow.start_run(run_id=mlflow_hybrid_run_id) as run:

    val_mse = metrics.mean_squared_error(valid.iloc[:,0], finalpredicted_stock_price)
    val_rmse = np.sqrt(val_mse)
    val_mae = metrics.mean_absolute_error(valid.iloc[:,0], finalpredicted_stock_price)
    val_r2 = metrics.r2_score(valid.iloc[:,0], finalpredicted_stock_price)
    val_mape = GetMAPE(finalpredicted_stock_price, valid.iloc[:,0])

    mlflow.log_metrics({
        "val_mse": val_mse,
        "val_rmse": val_rmse,
        "val_mae": val_mae,
        "val_r2": val_r2,
        "val_mape": val_mape,
    })

    print(f"✅ ARIMA-XGBoost Hybrid (Final Validation) logged to MLflow")
    print(f"   Final Validation RMSE: {val_rmse:.5f}, MAPE: {val_mape:.3f}%")

### Add the XGBoost residual predictions to the ARIMA predictions

## Testing Process

In [None]:
# Walk-forward evaluation on the test part (values are still MinMax-scaled).
time_test = test_set.index
y_test, yhat_test = walk_forward_validation(training_set_scaled, test_set_scaled)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(time_test, y_test, label='Residuals')
ax.plot(time_test, yhat_test, label='Predicted Residuals')
ax.set_title('ARIMA+XGBoost: Residuals Prediction')
ax.set_xlabel('Time', fontsize=12, verticalalignment='top')
ax.set_ylabel('Residuals', fontsize=14, horizontalalignment='center')
ax.legend()
plt.show()

In [None]:
# Column position of the residual used to undo the MinMax scaling
# ("5 means residual position" in close, open, high, low, nmVolume, Residual).
# NOTE(review): the supervised frame presumably also holds current-step columns,
# and the final XGBoost cell trains on the LAST column as its target - confirm
# that position 5 (rather than -1) is the column whose scale the predictions use.
residual_pos = 5

# `.iloc` makes the lookup explicitly positional; plain `Series[5]` on a
# label-indexed Series relies on a deprecated positional fallback in recent pandas.
training_set_min = training_set_supervised.min(axis=0).iloc[residual_pos]
training_set_max = training_set_supervised.max(axis=0).iloc[residual_pos]
print(training_set_min)
print(training_set_max)

# Test set: invert the MinMax scaling, x * (max - min) + min.
y_hat_test_unscaled = np.asarray(yhat_test)*(training_set_max - training_set_min) + training_set_min
y_test_unscaled = np.asarray(y_test)*(training_set_max - training_set_min) + training_set_min

In [None]:
# Residual-level test scores in original (unscaled) units.
# Signature reminder: evaluation_metric(y_test, y_hat) - actual values first.
evaluation_metric(y_test_unscaled, y_hat_test_unscaled)

# Signature reminder: GetMAPE(y_hat, y_test) - predictions first (opposite order).
GetMAPE(y_hat_test_unscaled, y_test_unscaled)

# MSE: 0.36510
# RMSE: 0.60423
# MAE: 0.44403
# R2: 0.27840
# 157.67322619794388

In [None]:
# Resume the hybrid-model run and record the residual-level test metrics.
with mlflow.start_run(run_id=mlflow_hybrid_run_id) as run:

    test_residual_mse = metrics.mean_squared_error(y_test_unscaled, y_hat_test_unscaled)
    test_residual_rmse = np.sqrt(test_residual_mse)
    test_residual_mae = metrics.mean_absolute_error(y_test_unscaled, y_hat_test_unscaled)
    test_residual_r2 = metrics.r2_score(y_test_unscaled, y_hat_test_unscaled)
    test_residual_mape = GetMAPE(y_hat_test_unscaled, y_test_unscaled)

    mlflow.log_metrics({
        "test_residual_mse": test_residual_mse,
        "test_residual_rmse": test_residual_rmse,
        "test_residual_mae": test_residual_mae,
        "test_residual_r2": test_residual_r2,
        "test_residual_mape": test_residual_mape,
    })

    print(f"✅ ARIMA-XGBoost Hybrid (Residual Test) logged to MLflow")
    print(f"   Residual Test RMSE: {test_residual_rmse:.5f}, MAPE: {test_residual_mape:.3f}%")

### Add the XGBoost residual predictions to the ARIMA predictions

In [None]:

# Final test forecast = ARIMA prediction + predicted residual, element by element.
finalpredicted_stock_price2 = [
    arima_pred + residual_pred
    for arima_pred, residual_pred in zip(Lt, y_hat_test_unscaled)
]
evaluation_metric(test_set.iloc[:, 0], finalpredicted_stock_price2)

print('MAPE_ARIMA-XGBoost_Testing:', GetMAPE(finalpredicted_stock_price2, test_set.iloc[:, 0]), '%')

fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(time_test, test_set.iloc[:, 0], label='Stock Price')
ax.plot(time_test, finalpredicted_stock_price2, label='Predicted Stock Price')
ax.set_title(f'ARIMA+XGBoost: Stock Price Prediction on Test set, n_timestamp = {n_timestamp}')
ax.set_xlabel('Time', fontsize=12, verticalalignment='top')
ax.set_ylabel('Close', fontsize=14, horizontalalignment='center')
ax.legend()
plt.show()

# n_timestamp = 6
# MSE: 0.36696
# RMSE: 0.60577
# MAE: 0.44506
# R2: 0.93019

# n_timestamp = 1
# MSE: 0.20460
# RMSE: 0.45233
# MAE: 0.32653
# R2: 0.96108
# MAPE_ARIMA-XGBoost_Testing: 0.9469896700985768 %

In [None]:
# Resume the hybrid-model run: record the final test metrics, then persist and
# attach the scalers, metadata, ARIMA model and residual file as artifacts.
import json

with mlflow.start_run(run_id=mlflow_hybrid_run_id) as run:

    # Final (ARIMA + XGBoost) test metrics against the first column of test_set
    actual_test_close = test_set.iloc[:, 0]
    test_mse = metrics.mean_squared_error(actual_test_close, finalpredicted_stock_price2)
    test_rmse = np.sqrt(test_mse)
    test_mae = metrics.mean_absolute_error(actual_test_close, finalpredicted_stock_price2)
    test_r2 = metrics.r2_score(actual_test_close, finalpredicted_stock_price2)
    test_mape = GetMAPE(finalpredicted_stock_price2, actual_test_close)

    mlflow.log_metrics({
        "test_mse": test_mse,
        "test_rmse": test_rmse,
        "test_mae": test_mae,
        "test_r2": test_r2,
        "test_mape": test_mape,
    })

    # Scalers: written to the working directory, then attached to the run
    scaler_files = (
        (data_sc, 'scaler_validation.pkl'),
        (data_sc2, 'scaler_test.pkl'),
    )
    for scaler_obj, scaler_file in scaler_files:
        joblib.dump(scaler_obj, scaler_file)
    for _, scaler_file in scaler_files:
        mlflow.log_artifact(scaler_file, artifact_path='scalers')

    # Metadata needed to rebuild the preprocessing at inference time
    metadata = {
        'n_timestamp': n_timestamp,
        'train_min': float(train_min),
        'train_max': float(train_max),
        'training_set_min': float(training_set_min),
        'training_set_max': float(training_set_max),
        'feature_columns': list(train_supervised.columns),
        'arima_order': '(0,1,0)'
    }
    with open('model_metadata.json', 'w') as f:
        json.dump(metadata, f, indent=4)
    mlflow.log_artifact('model_metadata.json', artifact_path='metadata')

    # ARIMA model and residual file; both must already exist in the working directory
    mlflow.log_artifact('model_ARIMA.pkl', artifact_path='model')
    mlflow.log_artifact('ARIMA_residuals1.csv', artifact_path='data')

    print(f"✅ ARIMA-XGBoost Hybrid (Final Test & Artifacts) logged to MLflow - Run ID: {run.info.run_id}")
    print(f"   Final Test RMSE: {test_rmse:.5f}, MAPE: {test_mape:.3f}%")
    print(f"   Artifacts saved: scalers, metadata, ARIMA model, residuals")

## Save Complete Hybrid Model as Custom MLflow Model

In [None]:
import xgboost as xgb
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMAResults

class ARIMAXGBoostHybridModel(mlflow.pyfunc.PythonModel):
    """
    Custom MLflow model for ARIMA-XGBoost Hybrid

    Input: DataFrame with columns ['close', 'open', 'high', 'low', 'nmVolume'] and DatetimeIndex
    Output: Array of predicted stock prices

    NOTE(review): `predict` calls `series_to_supervised`, which is not defined in
    this cell (presumably it comes from `from utils import *`); it must be
    importable wherever the logged model is loaded - verify.
    """

    def load_context(self, context):
        """Load the ARIMA model, XGBoost model, scaler and metadata from the run artifacts.

        Parameters:
        - context: MLflow model context; `context.artifacts` maps the keys
          "arima_model", "xgboost_model", "scaler" and "metadata" to local paths.
        """
        import json

        # Load ARIMA model
        # NOTE(review): stored on the instance but not referenced by `predict`,
        # which refits a fresh ARIMA on the input instead.
        self.arima_model = ARIMAResults.load(context.artifacts["arima_model"])

        # Load XGBoost model. The context manager closes the file handle, which
        # the former bare `open(...)` never did.
        # SECURITY: unpickling executes arbitrary code - only load trusted artifacts.
        with open(context.artifacts["xgboost_model"], 'rb') as model_file:
            self.xgb_model = pickle.load(model_file)

        # Load scaler
        self.scaler = joblib.load(context.artifacts["scaler"])

        # Load metadata
        with open(context.artifacts["metadata"]) as f:
            self.metadata = json.load(f)

        self.n_timestamp = self.metadata['n_timestamp']
        # Min/max used to undo the MinMax scaling of the predicted residuals
        self.train_min = self.metadata['training_set_min']
        self.train_max = self.metadata['training_set_max']

    def predict(self, context, model_input):
        """
        Predict stock prices using ARIMA + XGBoost

        Parameters:
        - context: MLflow model context (not used here)
        - model_input: DataFrame with ['close', 'open', 'high', 'low', 'nmVolume']

        Returns:
        - Array of predicted close prices; the first `n_timestamp` ARIMA
          predictions are dropped before the residual predictions are added
        """
        # Step 1: Generate ARIMA predictions (walk-forward)
        arima_predictions = []
        history = list(model_input['close'].values)

        for i in range(len(model_input)):
            # Fit ARIMA on history up to and including the current point.
            # NOTE(review): the one-step forecast is for the step AFTER index i,
            # yet it is stored as the prediction for index i, so the value being
            # predicted is part of the fitting data (look-ahead). Confirm the
            # intended alignment (e.g. history[:i]) and how i == 0 should be handled.
            temp_arima = sm.tsa.ARIMA(history[:i+1], order=(0, 1, 0))
            temp_arima_fit = temp_arima.fit()
            forecast = temp_arima_fit.forecast(steps=1)
            arima_predictions.append(float(forecast[0]))

        # Step 2: Calculate residuals
        residuals = model_input['close'].values - np.array(arima_predictions)
        residuals_df = pd.DataFrame(residuals, columns=['Residual'], index=model_input.index)

        # Step 3: Merge with features
        merge_data = pd.concat([model_input, residuals_df], axis=1)

        # Step 4: Convert to supervised format
        values = merge_data.values
        supervised_data = series_to_supervised(values, n_in=self.n_timestamp, n_out=1)

        # Step 5: Scale
        scaled_data = self.scaler.transform(supervised_data)

        # Step 6: XGBoost predicts residuals
        X_features = scaled_data[:, :-1]  # All except last column
        # Use .predict() directly with XGBRegressor (no DMatrix needed for pickled models)
        residual_predictions_scaled = self.xgb_model.predict(X_features)

        # Step 7: Unscale residuals
        residual_predictions = (residual_predictions_scaled * (self.train_max - self.train_min) + self.train_min)

        # Step 8: Final prediction = ARIMA + XGBoost residuals
        # Adjust arima_predictions to match length after supervised conversion
        arima_preds_adjusted = arima_predictions[self.n_timestamp:]
        final_predictions = np.array(arima_preds_adjusted) + residual_predictions

        return final_predictions

print("✅ ARIMAXGBoostHybridModel class defined")

In [None]:
# Train final XGBoost model and save complete hybrid model
from model import walk_forward_validation

# We need to extract the trained XGBoost model from walk_forward_validation
# For now, let's train a final XGBoost model on all training data

from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb

# Features are every scaled column except the last; the last column is the target.
train_X = training_set_scaled.iloc[:, :-1]
train_y = training_set_scaled.iloc[:, -1]

# NOTE(review): this model is fitted separately from walk_forward_validation, so
# the metrics logged above were not produced by this exact model - confirm the
# hyperparameters match what walk_forward_validation uses.
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 5,
    'random_state': 42,
}
final_xgb_model = xgb.XGBRegressor(**xgb_params)
final_xgb_model.fit(train_X, train_y)

# Persist the fitted model so it can be attached to the MLflow pyfunc model
with open('xgboost_model.pkl', 'wb') as f:
    pickle.dump(final_xgb_model, f)

print("✅ Final XGBoost model trained and saved")

In [None]:
# Resume the hybrid-model run and log the complete pyfunc model with its artifacts.
with mlflow.start_run(run_id=mlflow_hybrid_run_id) as run:

    # Local files bundled with the model, keyed by the names load_context expects
    artifacts = {
        "arima_model": "model_ARIMA.pkl",
        "xgboost_model": "xgboost_model.pkl",
        "scaler": "scaler_test.pkl",
        "metadata": "model_metadata.json"
    }

    # Sample input for a signature
    # NOTE(review): not passed to log_model below, so no signature is recorded.
    sample_input = test_set[['close', 'open', 'high', 'low', 'nmVolume']].head(10)

    # Environment specification stored alongside the model
    hybrid_pip_requirements = [
        'mlflow',
        'pandas',
        'numpy',
        'scikit-learn',
        'xgboost',
        'statsmodels',
        'joblib'
    ]
    hybrid_conda_env = {
        'channels': ['defaults', 'conda-forge'],
        'dependencies': [
            'python=3.10',
            'pip',
            {'pip': hybrid_pip_requirements}
        ],
        'name': 'arima_xgboost_env'
    }

    mlflow.pyfunc.log_model(
        artifact_path="arima_xgboost_hybrid_model",
        python_model=ARIMAXGBoostHybridModel(),
        artifacts=artifacts,
        conda_env=hybrid_conda_env
    )

    print(f"✅ Complete ARIMA-XGBoost Hybrid Model saved to MLflow!")
    print(f"   Run ID: {run.info.run_id}")
    print(f"   Model URI: runs:/{run.info.run_id}/arima_xgboost_hybrid_model")
    print(f"\n📊 Model Summary:")
    print(f"   - Input: DataFrame with ['close', 'open', 'high', 'low', 'nmVolume']")
    print(f"   - Output: Array of predicted close prices")
    print(f"   - Components: ARIMA(0,1,0) + XGBoost + MinMaxScaler")
    print(f"   - Test RMSE: {test_rmse:.5f}")
    print(f"   - Test MAPE: {test_mape:.3f}%")

## Test Loading and Using the Saved Model

In [None]:
# Example: reload the logged hybrid model and run it on the test period
print("🔄 Loading saved hybrid model from MLflow...")

loaded_model = mlflow.pyfunc.load_model(f"runs:/{mlflow_hybrid_run_id}/arima_xgboost_hybrid_model")

# Rebuild the input exactly as at the top of the notebook (vgi2.csv format)
test_input = pd.read_csv('./vgi2.csv')
test_input.index = pd.to_datetime(test_input.pop('date'), format='%Y-%m-%d')
test_input = pd.DataFrame(test_input, dtype=np.float64)

# Restrict to the test period and to the columns the model expects
in_test_period = (test_input.index > '2021-10-29') & (test_input.index <= '2021-12-31')
test_input_sample = test_input.loc[in_test_period, ['close', 'open', 'high', 'low', 'nmVolume']]

print(f"📥 Input shape: {test_input_sample.shape}")
print(f"   Predicting for period: {test_input_sample.index[0]} to {test_input_sample.index[-1]}")

predictions_loaded = loaded_model.predict(test_input_sample)

print(f"\n📤 Output shape: {predictions_loaded.shape}")
print(f"   Sample predictions: {predictions_loaded[:5]}")

# No comparison against the earlier predictions is made here; this only
# confirms that loading and predicting run end to end.
print(f"\n✅ Model loaded and predictions generated successfully!")
print(f"   Use this model with any vgi2.csv formatted data")

## 📊 MLflow Experiment Summary

All models have been tracked in the **"model_experiment"** experiment with consistent metric names for easy comparison.

### **Models Saved:**

#### **1. Persistence Model (Baseline)**
- **Metrics**: `val_mse`, `val_rmse`, `val_mae`, `val_r2`, `val_mape`, `test_mse`, `test_rmse`, `test_mae`, `test_r2`, `test_mape`
- **Artifacts**: None

#### **2. ARIMA Model**
- **Metrics**: Same as above
- **Artifacts**: 
  - `model_ARIMA.pkl` - Fitted ARIMA model
  - `ARIMA_residuals1.csv` - Residuals for training
  - `ARIMA_Validation.csv` - Validation predictions
  - `ARIMA.csv` - Test predictions

#### **3. ARIMA-XGBoost Hybrid Model** ⭐
- **Metrics**: 
  - Residual metrics: `val_residual_mse`, `val_residual_rmse`, etc.
  - Final metrics: `val_mse`, `val_rmse`, `val_mae`, `val_r2`, `val_mape`, `test_mse`, `test_rmse`, `test_mae`, `test_r2`, `test_mape`
- **Artifacts**:
  - `arima_xgboost_hybrid_model/` - Complete deployable model (MLflow PythonModel)
  - `model_ARIMA.pkl` - ARIMA component
  - `xgboost_model.pkl` - XGBoost component
  - `scaler_validation.pkl`, `scaler_test.pkl` - Scalers
  - `model_metadata.json` - Configuration metadata

### **Saved Hybrid Model Usage:**

```python
# Load model
model = mlflow.pyfunc.load_model("runs:/<run_id>/arima_xgboost_hybrid_model")

# Input: DataFrame with columns ['close', 'open', 'high', 'low', 'nmVolume']
input_data = pd.read_csv('vgi2.csv')
# ... preprocess to match format ...

# Predict
predictions = model.predict(input_data)

# Output: Array of predicted close prices
```

### **View Results:**
```bash
mlflow ui --backend-store-uri sqlite:///mlflow.db
```
Then open: http://localhost:5000