In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Load the spectral measurements (path is relative to the notebook's
# working directory).
df = pd.read_csv('spectral_feature_data_notaveraged.csv')

# Targets: every column whose name starts with "p".
# NOTE(review): this also matches 'p1.pH.index' and 'p1.EC.ds_m', which are
# added to the model inputs below, and any 'p4...' column — confirm that all
# of these are really meant to be predicted.
target_cols = [name for name in df.columns if name.startswith("p")]

# Columns whose name starts with 'p4' are excluded from `feature_cols`.
non_feature_cols = [name for name in df.columns if name.startswith('p4')]

# Everything except the 'p4...' columns.
feature_cols = [name for name in df.columns if not name.startswith('p4')]

# Model inputs: the remaining columns that do NOT start with "p",
# plus pH and EC, which are appended explicitly.
prediction_columns = (
    [name for name in feature_cols if not name.startswith("p")]
    + ["p1.pH.index", 'p1.EC.ds_m']
)

# Summary counts.
# NOTE(review): the first number counts `feature_cols`, not the columns
# actually used as model inputs (`prediction_columns`), and the second is a
# difference of list lengths rather than len(target_cols) — verify these
# messages report what is intended.
print(f"Features selected: {len(feature_cols)} columns (including spectra pH and EC if present).")
print(f"Features going to be predicted: {len(feature_cols) - len(prediction_columns)}")



Features selected: 122 columns (including spectra pH and EC if present).
Features going to be predicted: 10


In [2]:
# Hyper-parameter search space for the grid search:
# 3 * 3 * 4 * 3 * 3 = 324 candidate combinations.
# Key order matters: it also defines the order of the parameter columns
# in the results table.
param_grid = dict(
    n_estimators=[100, 300, 500],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7, 9],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

# Input matrix: all rows, restricted to the selected input columns.
X = df.loc[:, prediction_columns]
print(X.columns)

Index(['400', '410', '420', '430', '440', '450', '460', '470', '480', '490',
       ...
       '1420', '1430', '1440', '1450', '1460', '1470', '1480', '1490',
       'p1.pH.index', 'p1.EC.ds_m'],
      dtype='object', length=112)


In [3]:
import time
# Wall-clock start of the run (seconds since the epoch); read again later
# to report the total processing time.
start_time = time.time()
started_at = time.ctime(start_time)
print(f"Process started at: {started_at}")

Process started at: Tue Dec 30 20:18:08 2025


In [None]:
print("Starting Grid Search with XGBoost...")

# Results table layout. The header is deliberately kept as the first *data*
# row (with default integer column labels) so that `df_save`, and the CSV
# later written from it, keep the same layout as before.
result_header = ['feature', 'mae', 'rmse', 'r2_score'] + list(param_grid.keys())
result_rows = [result_header]
df_save = pd.DataFrame(result_rows)

# Tune, fit and evaluate one XGBoost regressor per target column.
for target in target_cols:
    print(f"\n--- Processing Target: {target} ---")

    # Guard against a target name that is not present in the data.
    if target not in df.columns:
        print(f"Warning: Column '{target}' not found in CSV. Skipping.")
        continue

    # Keep only rows where this target is known; the same boolean mask is
    # applied to X and y so they stay row-aligned.
    mask = df[target].notna()
    y_clean = df.loc[mask, target]

    # Never feed the target to the model as an input. Some columns can be
    # both a predictor in X and an entry of target_cols; leaving the column
    # in X would make the task trivial and the metrics meaningless.
    # errors='ignore' makes this a no-op for targets that are not in X.
    X_clean = X.loc[mask].drop(columns=[target], errors='ignore')

    print(f"Data points available for {target}: {len(y_clean)}")
    if len(y_clean) < 5:
        print(f"Skipping {target}: Not enough data points (found {len(y_clean)}).")
        continue

    # Hold out 20% of the rows for the final evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)

    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

    # Exhaustive search over param_grid, scored by (negated) MSE.
    grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2,      # 2-fold cross-validation
        n_jobs=10, # 10 parallel workers (n_jobs=-1 would use every core)
        verbose=1
    )

    # A failure on one target is reported and must not abort the whole run.
    try:
        grid_search.fit(X_train, y_train)
    except Exception as e:
        print(f"Error training {target}: {e}")
        continue

    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Evaluate the refitted best model on the held-out rows.
    y_pred = best_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"Best Params for {target}: {best_params}")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R2 Score: {r2:.4f}")

    # Record the row in header order. The 'mae' column now really holds the
    # MAE (it previously received the MSE).
    result_rows.append(
        [target, float(mae), float(rmse), float(r2)]
        + [best_params[key] for key in param_grid.keys()]
    )
    # Rebuild the table after every target so partial results are available
    # in `df_save` if this long-running cell is interrupted.
    df_save = pd.DataFrame(result_rows)




Starting Grid Search with XGBoost...

--- Processing Target: p1.pH.index ---
Data points available for p1.pH.index: 44590
Fitting 2 folds for each of 324 candidates, totalling 648 fits


In [None]:
# Report when the run finished; time.ctime() with no argument formats the
# current time.
print(f"Processes ended at: {time.ctime()}")

# Seconds elapsed since `start_time` was recorded.
print(f"Total processing time: {time.time() - start_time} seconds")

Processes ended at: Tue Dec 30 19:01:10 2025
Total processing time: 2296.3321101665497 seconds


In [None]:
# Write the collected per-target results to a CSV file in the working
# directory; the DataFrame's row index is not written.
output_filename = 'soil_properties_xgboost_results(using more wavelengths).csv'
df_save.to_csv(path_or_buf=output_filename, index=False)

print(f"\nProcessing complete. Results saved to '{output_filename}'.")


Processing complete. Results saved to 'soil_properties_xgboost_results(averaged_cols).csv'.
