# Evaluate with train test split on train

In [1]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
import pandas as pd
import numpy as np
import cupy as cp



# Load data
X_train = pd.read_csv('X_train_preprocessed.csv', sep=',', index_col='row_index')
y_train = pd.read_csv('y_train_preprocessed.csv', sep=',', index_col='row_index')

In [2]:
# Split the dataset
X_train_subset, X_test, y_train_subset, y_test = train_test_split(X_train, y_train, test_size=0.05, random_state=0)
X_train_subset, X_val, y_train_subset, y_val = train_test_split(X_train_subset, y_train_subset, test_size=0.03, random_state=0)

# Convert data to GPU-compatible format
dtrain = xgb.DMatrix(X_train_subset, label=y_train_subset)
dval = xgb.DMatrix(X_val, label=y_val)

In [3]:
# Define initial parameters
# Set up GPU parameters in XGBoost
params = {
    'objective': 'multi:softmax',  # Use 'multi:softmax' or 'multi:softprob' for multiclass problems
    'num_class': len(np.unique(y_train)),  # Number of classes in your target
    'tree_method': 'hist',  # GPU-accelerated histogram algorithm
    'device' : 'cuda',
    'predictor': 'gpu_predictor',  # Use GPU for predictions
    'eval_metric': 'mlogloss',  # Evaluation metric for multiclass problems
    'max_depth': 15,
    'learning_rate': 0.13,
    'n_estimators': 100,
    'max_bin': 256,
}




"""
# Cross-validation with early stopping
cv_results = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=500,
    nfold=3,  # 5-fold cross-validation
    early_stopping_rounds=10,
    verbose_eval=True
)

# Get the best number of boosting rounds
best_num_boost_round = len(cv_results['train-mlogloss-mean'])
print(f"Best number of boosting rounds: {best_num_boost_round}")
"""


# Train the final model
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=12000,
    evals=[(dtrain, 'train'), (dval, 'validation')],
    early_stopping_rounds=10,
    verbose_eval=False
)


Parameters: { "n_estimators", "predictor" } are not used.



In [4]:
# train evaluation
dtrain = xgb.DMatrix(X_train)
train_predictions = bst.predict(dtrain)

# Convert predictions and true labels back to CPU for metrics
train_predictions = cp.asnumpy(train_predictions)
y_train = y_train.to_numpy()

# Evaluate
accuracy = accuracy_score(y_train, train_predictions)
f1 = f1_score(y_train, train_predictions, average='weighted')

print(f"train Accuracy: {accuracy:.4f}")
print(f"train F1 Score: {f1:.4f}")




# Test evaluation
dtest = xgb.DMatrix(X_test)
test_predictions = bst.predict(dtest)

# Convert predictions and true labels back to CPU for metrics
test_predictions = cp.asnumpy(test_predictions)
y_test = y_test.to_numpy()

# Evaluate
accuracy = accuracy_score(y_test, test_predictions)
f1 = f1_score(y_test, test_predictions, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1 Score: {f1:.4f}")

train Accuracy: 0.9933
train F1 Score: 0.9933
Test Accuracy: 0.9200
Test F1 Score: 0.9199


# Fully train on X_train

In [5]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
import pandas as pd
import numpy as np
import cupy as cp

# Load data
X_train = pd.read_csv('X_train_preprocessed.csv', sep=',', index_col='row_index')
y_train = pd.read_csv('y_train_preprocessed.csv', sep=',', index_col='row_index')

In [6]:
# Split data
X_train_subset, X_val, y_train_subset, y_val = train_test_split(X_train, y_train, test_size=0.05, random_state=0)

# Convert data to GPU-compatible format
dtrain = xgb.DMatrix(X_train_subset, label=y_train_subset)
dval = xgb.DMatrix(X_val, label=y_val)

In [7]:
# Define initial parameters
params = {
    'objective': 'multi:softmax',  # Use 'multi:softmax' or 'multi:softprob' for multiclass problems
    'num_class': len(np.unique(y_train)),  # Number of classes in your target
    'tree_method': 'hist',  # GPU-accelerated histogram algorithm
    'device' : 'cuda',
    'predictor': 'gpu_predictor',  # Use GPU for predictions
    'eval_metric': 'mlogloss',  # Evaluation metric for multiclass problems
    'max_depth': 15,
    'learning_rate': 0.13,
    'n_estimators': 100,
    'max_bin': 256,
}


# Train the final model
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=12000,
    evals=[(dtrain, 'train'), (dval, 'validation')],
    early_stopping_rounds=10,
    verbose_eval=False
)

Parameters: { "n_estimators", "predictor" } are not used.



In [8]:
X_test = pd.read_csv('X_test_preprocessed.csv', sep=',', index_col='row_index')

In [9]:
# Test evaluation
dtest = xgb.DMatrix(X_test)
test_predictions = bst.predict(dtest)

# Convert predictions and true labels back to CPU for metrics
test_predictions = cp.asnumpy(test_predictions)

In [10]:
# Function to decode y_prediction_encoded

def decode_y(df):
    forward = {'Very Low': 0, 'Low': 1, 'Average': 2, 'High': 3, 'Very High': 4}
    backward = {v: k for k, v in forward.items()}
    df['piezo_groundwater_level_category'] = df['piezo_groundwater_level_category'].map(backward)
    return df

In [11]:
ix = X_test.index

df_pred = pd.DataFrame(test_predictions, index=ix, columns=['piezo_groundwater_level_category'])
df_pred
df_pred = decode_y(df_pred)
df_pred.to_csv('predictionsXX.csv', index_label="row_index")
