In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold, LeaveOneGroupOut
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR


In [2]:
def report_metrics(name, y_true, y_pred):
    """
    Print a one-line regression summary (RMSE, MAE, R²) for a set of predictions.

    Nothing is returned; the formatted line is written to stdout.

    Args:
        name (str): label shown at the start of the line
            (e.g. 'Train', 'Val', 'Test'); padded to 25 characters.
        y_true (array-like): ground-truth target values.
        y_pred (array-like): model predictions, aligned with ``y_true``.
    """
    # RMSE is the square root of the mean squared error.
    squared_error = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(squared_error)

    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Fixed-width fields keep consecutive calls aligned in the cell output.
    summary = f"{name:25s} | RMSE: {rmse:7.4f} | MAE: {mae:7.4f} | R²: {r2:7.4f}"
    print(summary)


In [3]:
# === Configuration ===
# Selects which "<n>x<n>" dataset variant is loaded; can be 1, 2, 3, 4, or 5.
grid_size = 1
file_prefix = f"{grid_size}x{grid_size}"

# === Load pre-split datasets ===
# The three CSVs share one naming pattern: "<prefix>_<split>_apple.csv".
df_train, df_val, df_test = (
    pd.read_csv(f"{file_prefix}_{split}_apple.csv")
    for split in ("train", "val", "test")
)

# Quick sanity check on the number of rows in each split.
print(f"Loaded datasets — Train: {len(df_train)}, Val: {len(df_val)}, Test: {len(df_test)}")


In [4]:
# Separate the regression target from the remaining (feature) columns of each split.
target_col = "apple_content"

X_train, y_train = df_train.drop(columns=[target_col]), df_train[target_col]
X_val,   y_val   = df_val.drop(columns=[target_col]),   df_val[target_col]
X_test,  y_test  = df_test.drop(columns=[target_col]),  df_test[target_col]


In [None]:
# === Scale input features with Min-Max normalization ===
# The scaler is fitted on the training features only; the validation and test
# features are then transformed with that same fitted scaler.
input_scaler = MinMaxScaler().fit(X_train)

X_train_scaled, X_val_scaled, X_test_scaled = (
    input_scaler.transform(features)
    for features in (X_train, X_val, X_test)
)


In [6]:
# === Convert to XGBoost DMatrix format ===
# `xgb` is the xgboost package, imported as `xgb` in the notebook's import cell.
# Train and validation matrices carry labels (needed for fitting and for the
# evaluation metric); the test matrix is built without labels because it is
# only used for prediction.
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
dval   = xgb.DMatrix(X_val_scaled, label=y_val)
dtest  = xgb.DMatrix(X_test_scaled)

# === Training parameters ===
params = {
    'objective': 'reg:squarederror',   # regression with squared-error loss
    'eval_metric': 'rmse',             # metric computed on the evaluation sets
    'max_depth': 2,                    # maximum depth of each tree
    'eta': 0.15,                       # learning rate (step-size shrinkage)
    'subsample': 0.8,                  # fraction of training rows sampled per boosting round
    'colsample_bytree': 1.0,           # fraction of columns sampled per tree (1.0 = all)
    'lambda': 2.0,                     # L2 regularization on leaf weights
    'alpha': 0.1,                      # L1 regularization on leaf weights
    'seed': 42                         # random seed, for reproducible row subsampling
}


In [7]:
# === Train the model with early stopping ===
# Both sets are scored every round. When several evaluation sets are given,
# XGBoost uses the last one ('val' here) to decide when to stop early.
evals = [(dtrain, 'train'), (dval, 'val')]

# Up to 400 boosting rounds; training halts once the validation metric has not
# improved for 40 consecutive rounds. Per-round logging is disabled.
train_options = dict(
    num_boost_round=400,
    evals=evals,
    early_stopping_rounds=40,
    verbose_eval=False,
)
model = xgb.train(params, dtrain, **train_options)

print(f"✅ Model trained successfully for grid size: {grid_size}×{grid_size}")


In [8]:
# === Generate predictions ===
# The model was trained with early stopping, so the booster records the round
# with the best validation score in `best_iteration`. `xgb.train` returns the
# model from the *last* round, not the best one, so prediction is restricted
# explicitly to the trees built up to and including the best round. This keeps
# the reported metrics independent of how a given XGBoost version treats the
# extra rounds trained after the optimum.
best_rounds = (0, model.best_iteration + 1)  # half-open range, hence the +1

train_preds = model.predict(dtrain, iteration_range=best_rounds)
test_preds  = model.predict(dtest, iteration_range=best_rounds)
print("✅ Predictions generated.")


In [None]:
# === Evaluate performance on Train and Test sets ===
# One summary line is printed per split, in the order listed here.
for split_label, targets, predictions in (
    ("Train", y_train, train_preds),
    ("Test", y_test, test_preds),
):
    report_metrics(split_label, targets, predictions)
