In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.dummy import DummyRegressor  # Import DummyRegressor
from xgboost import XGBRegressor

# Load the workbook and prepare the modelling table in one pass:
#   * read sheet "Sheet1" with the openpyxl engine (update the sheet name if needed),
#   * use the first spreadsheet row as the header and parse at most 6109 data rows,
#   * discard the leftmost column by position, keeping everything from index 1 on.
df = pd.read_excel(
    "Updated_BMS_Data11.xlsm",
    sheet_name="Sheet1",
    engine="openpyxl",
    header=0,
    skiprows=0,
    nrows=6109,
).iloc[:, 1:]

# "tbsagrft" is the regression target; all remaining columns are the features.
y = df["tbsagrft"]
X = df.drop(columns=["tbsagrft"])

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the min-max scaler on the training rows only, then apply that same
# fitted scaler to both partitions (the test rows never influence the fit).
scaler = MinMaxScaler().fit(X_train)
X_train_normalized, X_test_normalized = (
    scaler.transform(partition) for partition in (X_train, X_test)
)

def _print_scores(heading, y_true, y_pred):
    """Print ``heading`` followed by the MSE and R² of ``y_pred`` against ``y_true``."""
    print(heading)
    print("MSE:", mean_squared_error(y_true, y_pred))
    print("R² Score:", r2_score(y_true, y_pred))


# --- XGBoost regressor ------------------------------------------------------
# Hyperparameters are collected in one mapping so they can be read at a glance.
xgb_params = {
    "objective": "reg:squarederror",
    "booster": "gbtree",       # tree-based booster
    "n_estimators": 100,       # number of trees
    "learning_rate": 0.1,      # shrinkage to prevent overfitting
    "max_depth": 6,            # controls tree complexity
    "random_state": 42,
}
model_xgb = XGBRegressor(**xgb_params)
model_xgb.fit(X_train_normalized, y_train)

# Score the fitted model on the held-out, scaled test rows.
y_pred_xgb = model_xgb.predict(X_test_normalized)
_print_scores("XGBoost Model:", y_test, y_pred_xgb)

# --- Baseline ---------------------------------------------------------------
# DummyRegressor(strategy="mean") always predicts the mean of the training
# target, giving a reference point for the scores above.
dummy_model = DummyRegressor(strategy="mean").fit(X_train_normalized, y_train)

y_pred_dummy = dummy_model.predict(X_test_normalized)
_print_scores("\nDummy Model:", y_test, y_pred_dummy)


XGBoost Model:
MSE: 68.9565914588794
R² Score: 0.8064371771308242

Dummy Model:
MSE: 356.24961715870364
R² Score: -1.3063932040591197e-06
