In [1]:
import os
import pandas as pd

# Directory containing CSV files
directory = "/home/jesse/Projects/CWP_RL/03_XGBoost_Return_Prediction/SP500_15_22"

# Get a list of all CSV files
csv_files = [f for f in os.listdir(directory) if f.endswith(".csv")]

# Initialize an empty list to store individual stock DataFrames
dfs = []

# Read and store each CSV file
for file in csv_files:
    stock_df = pd.read_csv(os.path.join(directory, file), parse_dates=["Date"])
    stock_df["Stock_ID"] = file.replace(".csv", "")  # Add stock ticker as an identifier
    dfs.append(stock_df)

# Concatenate all stock DataFrames
training_df = pd.concat(dfs, ignore_index=True)

# Sort by Date first, then by Stock_ID
training_df = training_df.sort_values(by=["Date", "Stock_ID"]).reset_index(drop=True)

# Reorder columns: Date → Stock_ID → Other Columns
cols = ["Date", "Stock_ID"] + [col for col in training_df.columns if col not in ["Date", "Stock_ID"]]
training_df = training_df[cols]

# Save merged data to a new CSV file
training_csv_path = os.path.join(directory, "15_22_merged_stocks.csv")
training_df.to_csv(training_csv_path, index=False)

print(f"Merging completed. Merged file saved as '{training_csv_path}'.")

training_df.head(10)

Merging completed. Merged file saved as '/home/jesse/Projects/CWP_RL/03_XGBoost_Return_Prediction/SP500_15_22/15_22_merged_stocks.csv'.


Unnamed: 0,Date,Stock_ID,Open,High,Low,Close,Volume
0,2015-01-02,A,37.923997,38.043719,37.17804,37.35302,1529200
1,2015-01-02,AAPL,24.778673,24.789796,23.879976,24.320427,212818400
2,2015-01-02,ABBV,42.861468,43.490242,42.861468,43.156204,5086100
3,2015-01-02,ABT,37.202783,37.367216,36.701264,36.915028,3216600
4,2015-01-02,ACGL,18.764398,18.884845,18.472788,18.539352,1101600
5,2015-01-02,ACN,76.016365,76.372412,74.965176,75.312744,2021300
6,2015-01-02,ADBE,72.699997,73.199997,71.889999,72.339996,2349200
7,2015-01-02,ADI,45.003796,45.464503,44.429934,44.89064,1323200
8,2015-01-02,ADM,38.834026,39.131603,38.380218,38.700115,2039800
9,2015-01-02,ADP,66.412449,67.372858,66.044286,66.660553,1866600


In [2]:
import pandas as pd
import numpy as np

# Load the merged dataset
file_path = "/home/jesse/Projects/CWP_RL/03_XGBoost_Return_Prediction/SP500_15_22/15_22_merged_stocks.csv"
df = pd.read_csv(file_path, parse_dates=["Date"])

# Ensure sorting by Date and Stock_ID
df = df.sort_values(by=["Stock_ID", "Date"]).reset_index(drop=True)

# Function to calculate technical indicators
def calculate_features(df):
    df["Return_1d"] = df.groupby("Stock_ID")['Close'].pct_change(1)
    df["Return_5d"] = df.groupby("Stock_ID")['Close'].pct_change(5)
    df["Return_10d"] = df.groupby("Stock_ID")['Close'].pct_change(10)
    df["Return_50d"] = df.groupby("Stock_ID")['Close'].pct_change(50)
    
    # Rolling volatility
    df["Volatility_5d"] = df.groupby("Stock_ID")["Return_1d"].rolling(5).std().reset_index(level=0, drop=True)
    df["Volatility_10d"] = df.groupby("Stock_ID")["Return_1d"].rolling(10).std().reset_index(level=0, drop=True)
    df["Volatility_20d"] = df.groupby("Stock_ID")["Return_1d"].rolling(20).std().reset_index(level=0, drop=True)
    
    # Momentum indicators
    df["SMA_10"] = df.groupby("Stock_ID")["Close"].rolling(10).mean().reset_index(level=0, drop=True)
    df["SMA_50"] = df.groupby("Stock_ID")["Close"].rolling(50).mean().reset_index(level=0, drop=True)
    df["SMA_200"] = df.groupby("Stock_ID")["Close"].rolling(200).mean().reset_index(level=0, drop=True)
    df["RSI_14"] = 100 - (100 / (1 + df.groupby("Stock_ID")["Return_1d"].rolling(14).apply(lambda x: np.mean(x[x > 0]) / np.mean(-x[x < 0]) if np.mean(-x[x < 0]) != 0 else np.inf).reset_index(level=0, drop=True)))
    
    # Volume-based features
    df["Volume_Change_5d"] = df.groupby("Stock_ID")["Volume"].pct_change(5)
    df["Volume_Change_10d"] = df.groupby("Stock_ID")["Volume"].pct_change(10)
    
    return df

# Apply feature calculations
df = calculate_features(df)

# Save the new dataset with features
output_path = "/home/jesse/Projects/CWP_RL/03_XGBoost_Return_Prediction/SP500_15_22/15_22_merged_stocks_features.csv"
df.to_csv(output_path, index=False)
print(f"Feature engineering complete. Saved to {output_path}")

df.head(10)

Feature engineering complete. Saved to /home/jesse/Projects/CWP_RL/03_XGBoost_Return_Prediction/SP500_15_22/15_22_merged_stocks_features.csv


Unnamed: 0,Date,Stock_ID,Open,High,Low,Close,Volume,Return_1d,Return_5d,Return_10d,Return_50d,Volatility_5d,Volatility_10d,Volatility_20d,SMA_10,SMA_50,SMA_200,RSI_14,Volume_Change_5d,Volume_Change_10d
0,2015-01-02,A,37.923997,38.043719,37.17804,37.35302,1529200,,,,,,,,,,,,,
1,2015-01-05,A,37.131988,37.260918,36.561011,36.653103,2041800,-0.018738,,,,,,,,,,,,
2,2015-01-06,A,36.662315,36.85571,35.934778,36.082127,2080600,-0.015578,,,,,,,,,,,,
3,2015-01-07,A,36.395256,36.662327,36.183441,36.561024,3359700,0.013272,,,,,,,,,,,,
4,2015-01-08,A,37.058314,37.739802,37.003057,37.656918,2116300,0.029974,,,,,,,,,,,,
5,2015-01-09,A,37.758228,37.758228,37.104367,37.380646,1644900,-0.007337,0.00074,,,0.020747,,,,,,,0.07566,
6,2015-01-12,A,37.399065,37.500368,36.79125,36.938599,2770800,-0.011826,0.007789,,,0.019342,,,,,,,0.357038,
7,2015-01-13,A,37.27013,37.481944,36.220268,36.422871,2013100,-0.013962,0.009444,,,0.018992,,,,,,,-0.032443,
8,2015-01-14,A,35.943981,36.008446,35.188817,35.971611,5134000,-0.012389,-0.016121,,,0.018657,,,,,,,0.528113,
9,2015-01-15,A,36.220261,36.293934,34.986212,35.004627,2628900,-0.026882,-0.070433,,,0.007357,,,36.602454,,,,0.242215,


In [3]:
# Drop rows with NaN values
df = df.dropna().reset_index(drop=True)

# Save the cleaned dataset
training_csv_path = "/home/jesse/Projects/CWP_RL/03_XGBoost_Return_Prediction/SP500_15_22/15_22_merged_stocks_features.csv"
df.to_csv(training_csv_path, index=False)

print(f"Cleaned dataset saved as '{training_csv_path}'.")

df.head(10)

Cleaned dataset saved as '/home/jesse/Projects/CWP_RL/03_XGBoost_Return_Prediction/SP500_15_22/15_22_merged_stocks_features.csv'.


Unnamed: 0,Date,Stock_ID,Open,High,Low,Close,Volume,Return_1d,Return_5d,Return_10d,Return_50d,Volatility_5d,Volatility_10d,Volatility_20d,SMA_10,SMA_50,SMA_200,RSI_14,Volume_Change_5d,Volume_Change_10d
0,2015-10-16,A,33.127384,33.433691,32.830361,33.210922,1754300,0.005621,-0.012421,0.032016,-0.105587,0.012645,0.013504,0.015683,33.048491,33.453844,36.613815,54.321687,-0.45736,-0.501534
1,2015-10-19,A,33.127385,33.628613,33.034566,33.628613,3685800,0.012577,0.006668,0.025184,-0.091394,0.013907,0.012759,0.015913,33.1311,33.386192,36.595193,54.59602,1.730424,1.199952
2,2015-10-20,A,33.517234,33.897795,33.442976,33.712154,2635800,0.002484,0.019366,0.040688,-0.09994,0.012383,0.011631,0.015275,33.262904,33.311326,36.580488,51.671767,0.105992,0.437422
3,2015-10-21,A,33.91635,33.972039,33.229482,33.322304,2886400,-0.011564,0.024251,0.01013,-0.086652,0.010498,0.011392,0.015544,33.29632,33.248099,36.566689,54.01857,0.752094,-0.064497
4,2015-10-22,A,33.442976,34.29692,33.415131,33.498669,3696200,0.005293,0.014334,0.002222,-0.069304,0.00889,0.010707,0.015252,33.303746,33.198209,36.551377,48.912329,1.579344,0.422765
5,2015-10-23,A,33.879222,34.593935,33.582199,34.445423,2732600,0.028262,0.037172,0.024289,-0.03481,0.01458,0.013873,0.016288,33.385427,33.173363,36.53532,51.091822,0.557658,-0.154753
6,2015-10-26,A,34.324755,34.445422,34.055577,34.185528,1948400,-0.007545,0.016561,0.023339,-0.044323,0.015543,0.013943,0.014333,33.463394,33.141654,36.519344,53.340019,-0.471377,0.443366
7,2015-10-27,A,34.037012,34.399013,33.851376,34.389729,2557200,0.005973,0.020099,0.039854,-0.042825,0.01557,0.013263,0.014265,33.595197,33.110881,36.5066,50.276063,-0.02982,0.073011
8,2015-10-28,A,34.399015,34.909523,34.129837,34.825985,1780100,0.012686,0.045125,0.070471,-0.035651,0.01305,0.011372,0.014079,33.824462,33.085132,36.498615,50.13326,-0.38328,0.080551
9,2015-10-29,A,34.779576,35.058035,34.603216,34.993061,1352300,0.004797,0.044611,0.059584,-0.015379,0.013087,0.011004,0.013119,34.021239,33.0742,36.493723,49.782379,-0.634138,-0.056315


In [4]:
import pandas as pd

# Load the dataframe
csv_path = "/home/jesse/Projects/CWP_RL/03_XGBoost_Return_Prediction/SP500_15_22/15_22_merged_stocks_features.csv"
df = pd.read_csv(csv_path)

# Normalize 'Volume_Change_5d' and 'Volume_Change_10d' to range 1-1000
def normalize_column(df, column_name, new_min=1, new_max=1000):
    min_val = df[column_name].min()
    max_val = df[column_name].max()
    df[column_name] = (df[column_name] - min_val) * (new_max - new_min) / (max_val - min_val) + new_min

# Normalize the two columns
normalize_column(df, 'Volume_Change_5d')
normalize_column(df, 'Volume_Change_10d')

# Save the normalized dataframe to a new file
normalized_csv_path = "/home/jesse/Projects/CWP_RL/03_XGBoost_Return_Prediction/SP500_15_22/15_22_merged_stocks_normalized.csv"
df.to_csv(normalized_csv_path, index=False)

print(f"Normalized dataframe saved at {normalized_csv_path}")


Normalized dataframe saved at /home/jesse/Projects/CWP_RL/03_XGBoost_Return_Prediction/SP500_15_22/15_22_merged_stocks_normalized.csv


In [5]:
df.head(10)

Unnamed: 0,Date,Stock_ID,Open,High,Low,Close,Volume,Return_1d,Return_5d,Return_10d,Return_50d,Volatility_5d,Volatility_10d,Volatility_20d,SMA_10,SMA_50,SMA_200,RSI_14,Volume_Change_5d,Volume_Change_10d
0,2015-10-16,A,33.127384,33.433691,32.830361,33.210922,1754300,0.005621,-0.012421,0.032016,-0.105587,0.012645,0.013504,0.015683,33.048491,33.453844,36.613815,54.321687,1.0,1.0
1,2015-10-19,A,33.127385,33.628613,33.034566,33.628613,3685800,0.012577,0.006668,0.025184,-0.091394,0.013907,0.012759,0.015913,33.1311,33.386192,36.595193,54.59602,1.0,1.0
2,2015-10-20,A,33.517234,33.897795,33.442976,33.712154,2635800,0.002484,0.019366,0.040688,-0.09994,0.012383,0.011631,0.015275,33.262904,33.311326,36.580488,51.671767,1.0,1.0
3,2015-10-21,A,33.91635,33.972039,33.229482,33.322304,2886400,-0.011564,0.024251,0.01013,-0.086652,0.010498,0.011392,0.015544,33.29632,33.248099,36.566689,54.01857,1.0,1.0
4,2015-10-22,A,33.442976,34.29692,33.415131,33.498669,3696200,0.005293,0.014334,0.002222,-0.069304,0.00889,0.010707,0.015252,33.303746,33.198209,36.551377,48.912329,1.0,1.0
5,2015-10-23,A,33.879222,34.593935,33.582199,34.445423,2732600,0.028262,0.037172,0.024289,-0.03481,0.01458,0.013873,0.016288,33.385427,33.173363,36.53532,51.091822,1.0,1.0
6,2015-10-26,A,34.324755,34.445422,34.055577,34.185528,1948400,-0.007545,0.016561,0.023339,-0.044323,0.015543,0.013943,0.014333,33.463394,33.141654,36.519344,53.340019,1.0,1.0
7,2015-10-27,A,34.037012,34.399013,33.851376,34.389729,2557200,0.005973,0.020099,0.039854,-0.042825,0.01557,0.013263,0.014265,33.595197,33.110881,36.5066,50.276063,1.0,1.0
8,2015-10-28,A,34.399015,34.909523,34.129837,34.825985,1780100,0.012686,0.045125,0.070471,-0.035651,0.01305,0.011372,0.014079,33.824462,33.085132,36.498615,50.13326,1.0,1.0
9,2015-10-29,A,34.779576,35.058035,34.603216,34.993061,1352300,0.004797,0.044611,0.059584,-0.015379,0.013087,0.011004,0.013119,34.021239,33.0742,36.493723,49.782379,1.0,1.0


In [6]:
import pandas as pd

# Load the merged dataset
file_path = "/home/jesse/Projects/CWP_RL/03_XGBoost_Return_Prediction/SP500_15_22/15_22_merged_stocks_normalized.csv"
df = pd.read_csv(file_path, parse_dates=["Date"])

# Define split dates
train_end_date = "2021-12-31"
valid_start_date = "2022-01-01"

# Split into training and validation sets
train_df = df[df["Date"] <= train_end_date]
valid_df = df[df["Date"] >= valid_start_date]

# Save to CSV
train_path = "/home/jesse/Projects/CWP_RL/03_XGBoost_Return_Prediction/SP500_15_22/15_22_training_set.csv"
valid_path = "/home/jesse/Projects/CWP_RL/03_XGBoost_Return_Prediction/SP500_15_22/15_22_validation_set.csv"

train_df.to_csv(train_path, index=False)
valid_df.to_csv(valid_path, index=False)

print(f"Training set saved: {train_path} ({len(train_df)} rows)")
print(f"Validation set saved: {valid_path} ({len(valid_df)} rows)")


Training set saved: /home/jesse/Projects/CWP_RL/03_XGBoost_Return_Prediction/SP500_15_22/15_22_training_set.csv (751921 rows)
Validation set saved: /home/jesse/Projects/CWP_RL/03_XGBoost_Return_Prediction/SP500_15_22/15_22_validation_set.csv (124284 rows)


In [7]:
train_df.head(10)

Unnamed: 0,Date,Stock_ID,Open,High,Low,Close,Volume,Return_1d,Return_5d,Return_10d,Return_50d,Volatility_5d,Volatility_10d,Volatility_20d,SMA_10,SMA_50,SMA_200,RSI_14,Volume_Change_5d,Volume_Change_10d
0,2015-10-16,A,33.127384,33.433691,32.830361,33.210922,1754300,0.005621,-0.012421,0.032016,-0.105587,0.012645,0.013504,0.015683,33.048491,33.453844,36.613815,54.321687,1.0,1.0
1,2015-10-19,A,33.127385,33.628613,33.034566,33.628613,3685800,0.012577,0.006668,0.025184,-0.091394,0.013907,0.012759,0.015913,33.1311,33.386192,36.595193,54.59602,1.0,1.0
2,2015-10-20,A,33.517234,33.897795,33.442976,33.712154,2635800,0.002484,0.019366,0.040688,-0.09994,0.012383,0.011631,0.015275,33.262904,33.311326,36.580488,51.671767,1.0,1.0
3,2015-10-21,A,33.91635,33.972039,33.229482,33.322304,2886400,-0.011564,0.024251,0.01013,-0.086652,0.010498,0.011392,0.015544,33.29632,33.248099,36.566689,54.01857,1.0,1.0
4,2015-10-22,A,33.442976,34.29692,33.415131,33.498669,3696200,0.005293,0.014334,0.002222,-0.069304,0.00889,0.010707,0.015252,33.303746,33.198209,36.551377,48.912329,1.0,1.0
5,2015-10-23,A,33.879222,34.593935,33.582199,34.445423,2732600,0.028262,0.037172,0.024289,-0.03481,0.01458,0.013873,0.016288,33.385427,33.173363,36.53532,51.091822,1.0,1.0
6,2015-10-26,A,34.324755,34.445422,34.055577,34.185528,1948400,-0.007545,0.016561,0.023339,-0.044323,0.015543,0.013943,0.014333,33.463394,33.141654,36.519344,53.340019,1.0,1.0
7,2015-10-27,A,34.037012,34.399013,33.851376,34.389729,2557200,0.005973,0.020099,0.039854,-0.042825,0.01557,0.013263,0.014265,33.595197,33.110881,36.5066,50.276063,1.0,1.0
8,2015-10-28,A,34.399015,34.909523,34.129837,34.825985,1780100,0.012686,0.045125,0.070471,-0.035651,0.01305,0.011372,0.014079,33.824462,33.085132,36.498615,50.13326,1.0,1.0
9,2015-10-29,A,34.779576,35.058035,34.603216,34.993061,1352300,0.004797,0.044611,0.059584,-0.015379,0.013087,0.011004,0.013119,34.021239,33.0742,36.493723,49.782379,1.0,1.0


In [8]:
valid_df.head(10)

Unnamed: 0,Date,Stock_ID,Open,High,Low,Close,Volume,Return_1d,Return_5d,Return_10d,Return_50d,Volatility_5d,Volatility_10d,Volatility_20d,SMA_10,SMA_50,SMA_200,RSI_14,Volume_Change_5d,Volume_Change_10d
1564,2022-01-03,A,155.740608,156.171591,150.774533,153.272263,1606300,-0.018565,-0.012939,0.034778,-0.001618,0.010788,0.012413,0.014346,154.043616,153.078423,146.84647,36.814534,1.0,1.0
1565,2022-01-04,A,152.302601,152.43973,146.631282,148.090744,2234000,-0.033806,-0.048944,0.006443,-0.041659,0.016928,0.017004,0.016179,154.13842,152.949674,146.984801,35.325437,1.0,1.0
1566,2022-01-05,A,147.7381,149.961571,145.485246,145.553818,2370500,-0.017131,-0.073789,-0.033543,-0.060277,0.013181,0.015774,0.016196,153.633244,152.762948,147.118386,33.602793,1.0,1.0
1567,2022-01-06,A,145.798702,146.885948,142.59573,146.063156,2298300,0.003499,-0.071877,-0.047606,-0.056095,0.013846,0.014128,0.015698,152.903145,152.589342,147.260072,33.329199,1.0,1.0
1568,2022-01-07,A,146.063134,146.66063,142.115747,142.174515,2058600,-0.026623,-0.089626,-0.078953,-0.063885,0.014019,0.015044,0.016531,151.684406,152.395288,147.377151,32.621562,1.0,1.0
1569,2022-01-10,A,140.35265,142.331245,137.97247,142.184326,2548100,6.9e-05,-0.072341,-0.084345,-0.066826,0.016297,0.01454,0.016238,150.374692,152.191647,147.476061,27.632004,1.0,1.0
1570,2022-01-11,A,142.027622,143.927856,140.862014,143.634003,2194200,0.010196,-0.030095,-0.077565,-0.067667,0.015318,0.015356,0.016509,149.166905,151.983153,147.58329,23.688397,1.0,1.0
1571,2022-01-12,A,144.770207,147.30711,144.525332,146.445145,2250800,0.019572,0.006124,-0.068118,-0.046817,0.017313,0.016909,0.016119,148.096439,151.839295,147.707401,24.077675,1.0,1.0
1572,2022-01-13,A,146.396189,146.474536,141.88069,142.194122,1741800,-0.029028,-0.026489,-0.096462,-0.079118,0.021824,0.017953,0.016809,146.578369,151.594961,147.799053,22.94689,1.0,1.0
1573,2022-01-14,A,141.087292,142.174539,139.441738,141.714172,2225400,-0.003375,-0.003238,-0.092574,-0.084257,0.018301,0.018064,0.016147,145.132626,151.334181,147.885626,25.566777,1.0,1.0


In [11]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

# Assuming 'training_df' is your dataframe
# Split the data into X (features) and y (target: 1-day return)
X_train = train_df.drop(columns=["Date", "Stock_ID", "Return_1d"])  # Drop non-feature columns
y_train = train_df["Return_1d"]

# Convert DataFrame to DMatrix for XGBoost (faster training)
train_dmatrix = xgb.DMatrix(X_train, label=y_train)


# Set up hyperparameters
params = {
    'learning_rate': 0.1,  # Learning rate
    'n_estimators': 1000,  # Number of trees
    'max_depth': 12,  # Max depth of trees
    'min_child_weight': 1,  # Minimum child weight
    'gamma': 0,  # No regularization
    'subsample': 0.8,  # Subsample ratio
    'colsample_bytree': 0.8,  # Column sample ratio
    'lambda': 0,  # L2 regularization
    'alpha': 0,  # L1 regularization
    'objective': 'reg:squarederror',  # Regression problem
    'booster': 'gbtree',  # Use tree-based booster
    'eval_metric': 'rmse',  # Evaluation metric: RMSE
    'tree_method': 'gpu_hist',  # Use GPU for training
    'predictor': 'gpu_predictor'  # Use GPU for prediction
}

# Train the model with the parameters
model = xgb.train(
    params=params, 
    dtrain=train_dmatrix, 
    num_boost_round=1000,  # Number of boosting rounds
    evals=[(train_dmatrix, 'train')],  # Evaluate on training set (you can add validation set here)
    early_stopping_rounds=50  # Stop training if no improvement after 50 rounds
)

# Save the trained model
model_path = '/home/jesse/Projects/CWP_RL/03_XGBoost_Return_Prediction/xgboost_model.bin'
model.save_model(model_path)

print(f"Model trained successfully and saved at {model_path}")


[0]	train-rmse:0.02043
[1]	train-rmse:0.01996
[2]	train-rmse:0.01957
[3]	train-rmse:0.01920
[4]	train-rmse:0.01893



    E.g. tree_method = "hist", device = "cuda"

  self.starting_round = model.num_boosted_rounds()
Parameters: { "n_estimators", "predictor" } are not used.

  self.starting_round = model.num_boosted_rounds()


[5]	train-rmse:0.01870
[6]	train-rmse:0.01846
[7]	train-rmse:0.01822
[8]	train-rmse:0.01806
[9]	train-rmse:0.01787
[10]	train-rmse:0.01771
[11]	train-rmse:0.01755
[12]	train-rmse:0.01744
[13]	train-rmse:0.01734
[14]	train-rmse:0.01723
[15]	train-rmse:0.01712
[16]	train-rmse:0.01704
[17]	train-rmse:0.01696
[18]	train-rmse:0.01690
[19]	train-rmse:0.01684
[20]	train-rmse:0.01678
[21]	train-rmse:0.01670
[22]	train-rmse:0.01663
[23]	train-rmse:0.01654
[24]	train-rmse:0.01650
[25]	train-rmse:0.01643
[26]	train-rmse:0.01638
[27]	train-rmse:0.01634
[28]	train-rmse:0.01629
[29]	train-rmse:0.01623
[30]	train-rmse:0.01619
[31]	train-rmse:0.01615
[32]	train-rmse:0.01611
[33]	train-rmse:0.01605
[34]	train-rmse:0.01601
[35]	train-rmse:0.01594
[36]	train-rmse:0.01592
[37]	train-rmse:0.01587
[38]	train-rmse:0.01585
[39]	train-rmse:0.01582
[40]	train-rmse:0.01577
[41]	train-rmse:0.01574
[42]	train-rmse:0.01569
[43]	train-rmse:0.01564
[44]	train-rmse:0.01560
[45]	train-rmse:0.01557
[46]	train-rmse:0.015


    E.g. tree_method = "hist", device = "cuda"

  model.save_model(model_path)
  model.save_model(model_path)


Model trained successfully and saved at /home/jesse/Projects/CWP_RL/03_XGBoost_Return_Prediction/xgboost_model.bin


In [None]:
# Check for infinite values in X_train
print(X_train.isin([np.inf, -np.inf]).sum())

# Check for values that are too large (e.g., larger than a specific threshold)
print((X_train > 1e10).sum())  # Replace 1e10 with a threshold that fits your case


In [None]:
# Check for missing or infinite values in the features
print(X_train.isna().sum())  # Check for missing values
print(np.isinf(X_train).sum())  # Check for infinite values

# Check for missing or infinite values in the target
print(y_train.isna().sum())  # Check for missing values
print(np.isinf(y_train).sum())  # Check for infinite values
