In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import os
import matplotlib.pyplot as plt

def train_and_evaluate_xgboost_model(input_data: pd.DataFrame, output_model_file: str) -> None:
    """
    Train an XGBoost regression model on the given data, evaluate it, and save it.

    The function takes an 80/20 random split of ``input_data``, standardizes the
    features (scaler fitted on the training split only), trains a booster with
    early stopping, prints MSE / MAE / R^2 on the held-out split, shows a
    feature-importance plot, and writes the trained booster to disk.

    Parameters:
        input_data (pd.DataFrame): The DataFrame containing the feature columns
            'Sentiment Score', 'crude_close', 'vix_close', 'sp_close' and the
            target column 'CLOSE'.
        output_model_file (str): Path where the trained XGBoost model will be saved.
            Missing parent directories are created; a bare file name (no
            directory component) is saved relative to the current directory.

    Raises:
        KeyError: If any required feature or target column is missing.
        ValueError: If ``input_data`` has no rows.
    """
    feature_columns = ['Sentiment Score', 'crude_close', 'vix_close', 'sp_close']
    target_column = 'CLOSE'

    # Fail early with an actionable message instead of an opaque pandas error.
    missing_columns = [
        column
        for column in feature_columns + [target_column]
        if column not in input_data.columns
    ]
    if missing_columns:
        raise KeyError(f"input_data is missing required column(s): {missing_columns}")
    if input_data.empty:
        raise ValueError("input_data contains no rows; cannot train a model")

    # Select features and target
    X = input_data[feature_columns]
    y = input_data[target_column]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale features; the scaler is fitted on the training split only so that
    # no statistics from the test split leak into training.
    # NOTE(review): the fitted scaler is not persisted alongside the model;
    # callers presumably need to re-apply the same scaling at inference time - confirm.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert data to DMatrix format for XGBoost
    dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
    dtest = xgb.DMatrix(X_test_scaled, label=y_test)

    # Define parameters for XGBoost model
    params = {
        'objective': 'reg:squarederror',  # Regression objective
        'eval_metric': 'rmse',            # Evaluation metric
        'eta': 0.1,                       # Learning rate
        'max_depth': 6,                   # Maximum depth of a tree
    }

    # Train the model
    # NOTE(review): the test split doubles as the early-stopping validation set,
    # so the metrics printed below are likely optimistic - consider a separate
    # validation split.
    xgboost_model = xgb.train(
        params,
        dtrain,
        num_boost_round=100,
        evals=[(dtest, 'test')],
        early_stopping_rounds=10,
    )

    # Predict on the test set
    y_pred = xgboost_model.predict(dtest)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"R^2 Score: {r2}")

    # Optional: Plotting feature importance.
    # The DMatrix was built from a bare array, so the booster only knows the
    # features as f0, f1, ...; remap those keys to the real column names for the
    # plot only, leaving the booster (and therefore the saved model) untouched.
    placeholder_to_name = {f"f{index}": name for index, name in enumerate(feature_columns)}
    importance_by_feature = {
        placeholder_to_name.get(key, key): score
        for key, score in xgboost_model.get_score(importance_type='weight').items()
    }
    xgb.plot_importance(importance_by_feature)
    plt.show()

    # Save the trained model to a file.
    # os.path.dirname returns '' for a bare file name and os.makedirs('') raises,
    # so only create the directory when the path actually has one.
    output_directory = os.path.dirname(output_model_file)
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)
    xgboost_model.save_model(output_model_file)
    print(f"Model saved to {output_model_file}")






