#Code Required for All Models

Imports

In [None]:
# Relevant imports
!pip install scikit-optimize
import xgboost as xgb
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import uniform, randint
from skopt import BayesSearchCV
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from xgboost import plot_importance
%matplotlib inline



Regressor Object

In [None]:
model = xgb.XGBRegressor()

Loading data

In [None]:
# Loading csv file
dataframe = pd.read_csv("./test_data.csv")
dataframe.head()


k = 10  # Number of folds
n = 10000 # Number of iterations

# Model 1 - Coordinates Only

Selecting our features and targets, and converting them into DMatrices

In [None]:
# Defining features and targets
X, y = dataframe.iloc[:, 1:3], dataframe[['LST']]

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.1)

# Converting the training and testing datasets into DMatrices
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)


Creating evaluation set and training model

Testing model on the test data set after training


In [None]:
# Optimal parameters for Model 1 following Bayesian Search
params = {"objective": "reg:squarederror", "learning_rate":0.0990999161353275, "max_depth": 3, "min_child_weight": 10, "subsample": 1, "lambda": 5, "gamma": 0, "alpha": 5, "tree_method":"exact", "eta":0}

# K-FOLD CROSS VALIDATION
results = xgb.cv(
   params,
   dtrain_reg,
   num_boost_round = n,
   nfold = k,
   early_stopping_rounds = 100,
   metrics = ['rmse', 'mae']
)

print(f"Average RMSE: {results['test-rmse-mean'].mean()}")
print(f"Average MAE: {results['test-mae-mean'].mean()}")


# Initialize KFold
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Convert the data to DMatrix format
X_array = X.values
y_array = y.values.flatten()

r2_scores = []

# Perform manual cross-validation
for train_index, test_index in kf.split(X_array):
    X_train, X_test = X_array[train_index], X_array[test_index]
    y_train, y_test = y_array[train_index], y_array[test_index]

    dtrain = xgb.DMatrix(X_train, y_train)
    dtest = xgb.DMatrix(X_test, y_test)

    # Train the model
    booster = xgb.train(params, dtrain, num_boost_round=n)

    # Make predictions
    y_pred = booster.predict(dtest)

    # Calculate R-squared for this fold
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

# Calculate the average R-squared value
average_r2 = np.mean(r2_scores)

print(f"Average R-Squared: {average_r2}")

Average RMSE: 2.856462669051263
Average MAE: 2.256796093009686
Average R-Squared: 0.8211318065016574


#Model 2 - Coordinates and GLOBE Observer Labels only

Defining features and targets from csv

In [None]:
# Definining features and targets
X, y = dataframe.iloc[:, 1:11], dataframe[['LST']] # USES ALL LAND COVER

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.1)

# Converting the training and testing datasets into DMatrices
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

Cross-validation

In [None]:
params = {"objective": "reg:squarederror", "max_depth": 50, "lambda": 5, "alpha": 4, "tree_method": "exact", "eta": 0, "gamma": 4, "learning_rate": 0.2351916922866857, "min_child_weight": 10, "subsample": 0.5789593986704742}

results = xgb.cv(
   params, dtrain_reg,
   num_boost_round=n,
   nfold = k,
   early_stopping_rounds = 100,
   metrics = {'rmse','mae'},
)

print(f"Average RMSE: {results['test-rmse-mean'].mean()}")
print(f"Average MAE: {results['test-mae-mean'].mean()}")

kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Convert the data to DMatrix format
X_array = X.values
y_array = y.values.flatten()

r2_scores = []

# Perform manual cross-validation
for train_index, test_index in kf.split(X_array):
    X_train, X_test = X_array[train_index], X_array[test_index]
    y_train, y_test = y_array[train_index], y_array[test_index]

    dtrain = xgb.DMatrix(X_train, y_train)
    dtest = xgb.DMatrix(X_test, y_test)

    # Train the model
    booster = xgb.train(params, dtrain, num_boost_round=n)

    # Make predictions
    y_pred = booster.predict(dtest)

    # Calculate R-squared for this fold
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

# Calculate the average R-squared value
average_r2 = np.mean(r2_scores)

print(f"Average R-Squared: {average_r2}")


Average RMSE: 3.0139600952821866
Average MAE: 2.326997339874897
Average R-Squared: 0.7868356040375915


#Model 3 - Coordinates, GLOBE Observer, CEO Labels

In [None]:
X, y = dataframe.iloc[:, 1:26].drop(dataframe.columns[11], axis=1), dataframe[['LST']]

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.1)

# Converting the training and testing datasets into DMatrices
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

Cross-validation

In [None]:
params = {"objective": "reg:squarederror"}#, "max_depth": 3, "lambda": 1, "alpha": 0, "tree_method": "exact", "eta": 0, "gamma": 0, "learning_rate": 0.09447565846479973, "min_child_weight": 10, "subsample": 1.0}

results = xgb.cv(
   params, dtrain_reg,
   num_boost_round=n,
   nfold = k,
   early_stopping_rounds = 100,
   metrics = {'rmse','mae'},
)


print(f"Average RMSE: {results['test-rmse-mean'].mean()}")
print(f"Average MAE: {results['test-mae-mean'].mean()}")

kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Convert the data to DMatrix format
X_array = X.values
y_array = y.values.flatten()

r2_scores = []

# Perform manual cross-validation
for train_index, test_index in kf.split(X_array):
    X_train, X_test = X_array[train_index], X_array[test_index]
    y_train, y_test = y_array[train_index], y_array[test_index]

    dtrain = xgb.DMatrix(X_train, y_train)
    dtest = xgb.DMatrix(X_test, y_test)

    # Train the model
    booster = xgb.train(params, dtrain, num_boost_round=n)

    # Make predictions
    y_pred = booster.predict(dtest)

    # Calculate R-squared for this fold
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

# Calculate the average R-squared value
average_r2 = np.mean(r2_scores)
print(f"Average R-Squared: {average_r2}")

Average RMSE: 2.9003299179770363
Average MAE: 2.1310074205354397
Average R-Squared: 0.8030478361103602


#Sample of Bayesian Search used to optimize all three models.
Move code cell accordingly depending on which model you would like to optimize

In [None]:
param_space = {
    'learning_rate': (0.01, 0.3, 'uniform'),
    'max_depth': (3, 50),
    'subsample': (0.1, 1.0),
    'min_child_weight': (1, 10),
    'gamma': (0, 10),
    'alpha': (0, 10),
    'eta': (0, 1),
    'lambda': (0, 5),

}

bayes_search = BayesSearchCV(
    estimator=xgb.XGBRegressor(),
    search_spaces=param_space,
    n_iter=100,
    cv=10,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

bayes_search.fit(X_train, y_train)

print(f"{bayes_search.best_params_}")

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac