# Real Estate ETF XGBoost Price Predictions

Using data from https://www.kaggle.com/datasets/stefanoleone992/mutual-funds-and-etfs?select=ETF+prices.csv

Data from November 2021

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


Data loading, cleaning, and preparation

In [2]:
# Load data
# NOTE(review): this is a machine-specific absolute path; point it at the
# folder holding the local copy of the Kaggle dataset before running.
base_path = 'C:/Users/jacke/Documents/Data Projects/'

filenames = ["MutualFund prices - A-E.csv", "MutualFund prices - F-K.csv", "MutualFund prices - L-P.csv", "MutualFund prices - Q-Z.csv", "MutualFunds.csv", "ETF prices.csv", "ETFs.csv"]

# Name the two files this notebook actually reads instead of using bare indices.
etf_prices_file = filenames[5]
etf_info_file = filenames[6]

# Fund metadata: used only to find the symbols belonging to the target sector.
desc_data = pd.read_csv(base_path + etf_info_file)

target_sector = ['Real Estate']
in_target_sector = desc_data['fund_category'].isin(target_sector)
target_symbols = desc_data.loc[in_target_sector, 'fund_symbol'].unique()

# Daily prices, reshaped to one row per date and one column per fund symbol.
price_data = pd.read_csv(base_path + etf_prices_file)
price_data['price_date'] = pd.to_datetime(price_data['price_date'])
pivot_data = price_data.pivot(index='price_date', columns='fund_symbol', values='adj_close')

# Keep 2018 onwards, then drop every fund (column) that has any missing price
# in that window. Done as one non-mutating chain rather than an in-place
# dropna on a filtered frame, which can trigger SettingWithCopyWarning.
pivot_data = pivot_data.loc[pivot_data.index >= '2018-01-01'].dropna(axis=1, how='any')

# Only target symbols that survived the cleaning step can be predicted.
target_symbols = [x for x in target_symbols if x in pivot_data.columns]
if not target_symbols:
    raise ValueError(
        f"No {target_sector} fund has complete price history since 2018-01-01; "
        "nothing to predict."
    )

# Train test split
# NOTE(review): this shuffles rows of a date-indexed frame, so test dates are
# interleaved with training dates. If the goal is to judge forecasting on unseen
# future dates, consider a chronological split (shuffle=False) -- TODO confirm.
train_x, test_x = train_test_split(pivot_data, test_size=0.2, random_state=0)

# Separate target and features
train_y = train_x[target_symbols]
test_y = test_x[target_symbols]

train_x = train_x.drop(columns=target_symbols)
test_x = test_x.drop(columns=target_symbols)

# Prepare data for XGBoost
dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x, label=test_y)

Initial Boosted tree model as a baseline

In [3]:
# Baseline configuration: every booster setting is left at its XGBoost default;
# the only explicit choice is to request the CUDA device.
params = {'device': 'cuda'}

# Number of boosting rounds; reused by the later training cell.
num_boost_round = 100

# Fit the baseline booster on the training matrix.
bst = xgb.train(params, dtrain, num_boost_round=num_boost_round)

Train RMSE is significantly lower than Test RMSE, indicating overfitting

In [4]:
# Score the current booster on the held-out matrix.
preds = bst.predict(dtest)

# Mean squared error on each split (training predictions are computed on the fly).
train_mse = mean_squared_error(train_y, bst.predict(dtrain))
test_mse = mean_squared_error(test_y, preds)

# Take the square root so the errors are RMSE values.
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)

# Report both errors side by side to expose any train/test gap.
print(f"Train RMSE: {train_rmse}, Test RMSE: {test_rmse}")

Train RMSE: 0.006372558891469219, Test RMSE: 0.3064820501898771


Utilizing new parameters to minimize overfitting

In [14]:
# Settings that differ from the baseline run: row and column subsampling,
# an explicit learning rate, and a cap on tree depth.
tuned_overrides = {
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eta': 0.2,
    'max_depth': 5,
}

# Same device request as the baseline, plus the overrides above.
params = {'device': 'cuda', **tuned_overrides}

# Refit with the same number of boosting rounds as the baseline
# (num_boost_round comes from the baseline training cell).
bst = xgb.train(params, dtrain, num_boost_round=num_boost_round)

In [15]:
# Score the retrained booster on the held-out matrix.
preds = bst.predict(dtest)

# Mean squared error on each split (training predictions are computed on the fly).
train_mse = mean_squared_error(train_y, bst.predict(dtrain))
test_mse = mean_squared_error(test_y, preds)

# Take the square root so the errors are RMSE values.
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)

# Report both errors for comparison with the baseline run.
print(f"Train RMSE: {train_rmse}, Test RMSE: {test_rmse}")

Train RMSE: 0.030515088199989257, Test RMSE: 0.2841305624385981


A reduction in Test RMSE is achieved. Further improvements to the model should use cross-validation to refine the parameters introduced above.