**Actual XGBoost Classifier Code**

In [19]:
%%time

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

# Step 1: Read the dataset from the CSV file into a DataFrame
data = pd.read_csv('Sample_Data_4228.csv')

# Step 2: Preprocess the data
# Pull the six selected columns out as the feature matrix (X)
# and the 'Decimal_Label' column out as the target vector (y)
feature_columns = ['pH', 'EC', 'W_Temp', 'A_Temp', 'Humid', 'CO2']
X = data.loc[:, feature_columns]
y = data.loc[:, 'Decimal_Label']

# Step 3: Split the data into training and testing sets
# 80% of the data is used for training and 20% for testing.
# random_state fixes the shuffle so the split is reproducible.
# The split is done BEFORE scaling so the test rows stay unseen during preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Normalize the features using z-score normalization
# The scaler learns mean/std from the training split only; the test split is
# transformed with those same statistics, which avoids leaking test-set
# information into the preprocessing step.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 5: Collect the (regularized) hyperparameters in one place
params = dict(
    max_depth=4,                 # Upper bound on tree depth (limits model complexity)
    learning_rate=0.08,          # Step size applied at each boosting iteration
    n_estimators=150,            # Number of boosting rounds, i.e. trees
    min_child_weight=0.0625,     # Minimum sum of instance weight (Hessian) required in a child
    subsample=0.8,               # Fraction of training rows sampled for each tree
    colsample_bytree=0.8,        # Fraction of columns sampled when building each tree
    reg_alpha=0.1,               # L1 (Lasso) regularization strength
    reg_lambda=1.0,              # L2 (Ridge) regularization strength
    objective='multi:softmax',   # Multiclass classification objective
)

# Step 6: Instantiate the XGBoost classifier from the settings above
model = xgb.XGBClassifier(**params)

# Step 7: Fit the classifier on the training split
model.fit(X_train, y_train)

# Step 8: Predict class labels for the held-out test split
y_pred = model.predict(X_test)

# Step 9: Evaluate the model
# Accuracy is the plain fraction of correct predictions.
accuracy = accuracy_score(y_test, y_pred)
# Precision, recall and F1 are averaged over classes weighted by class support
# ('weighted'), to account for class imbalance.
# zero_division=0 states explicitly that a class whose metric is undefined
# (e.g. a class that is never predicted) contributes 0; this is the value the
# default setting falls back to, but without the UndefinedMetricWarning noise.
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.9018912529550828
Precision: 0.9075368932868014
Recall: 0.9018912529550828
F1 Score: 0.8956718772301853
CPU times: total: 29.6 s
Wall time: 29.9 s


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Cross-Validation of XGBoost Classifier Model**

In [None]:
%%time

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Step 1: Read the dataset from the CSV file into a DataFrame
data = pd.read_csv('Sample_Data_4228.csv')

# Step 2: Preprocess the data
# Pull the six selected columns out as the feature matrix (X)
# and the 'Decimal_Label' column out as the target vector (y)
feature_columns = ['pH', 'EC', 'W_Temp', 'A_Temp', 'Humid', 'CO2']
X = data.loc[:, feature_columns]
y = data.loc[:, 'Decimal_Label']

# Step 3: Split the data into training and testing sets
# 80% of the data is used for training and 20% for testing.
# random_state fixes the shuffle so the split is reproducible.
# The split is done BEFORE scaling so the test rows stay unseen during preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Normalize the features using z-score normalization
# The scaler learns mean/std from the training split only; the test split is
# transformed with those same statistics, so no test-set information leaks
# into preprocessing.
# NOTE: the cross-validation folds below are still carved out of an
# already-scaled training split; wrapping scaler + model in a Pipeline would
# refit the scaler per fold if that matters for the analysis.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 5: Describe the search space: each hyperparameter maps to the list of
# candidate values the grid search will try
params = dict(
    max_depth=[3, 4, 5],                  # Upper bound on tree depth (limits model complexity)
    learning_rate=[0.05, 0.1, 0.15],      # Step size applied at each boosting iteration
    n_estimators=[100, 150, 200],         # Number of boosting rounds, i.e. trees
    min_child_weight=[0.05, 0.1, 0.2],    # Minimum sum of instance weight (Hessian) required in a child
    subsample=[0.7, 0.8, 0.9],            # Fraction of training rows sampled for each tree
    colsample_bytree=[0.7, 0.8, 0.9],     # Fraction of columns sampled when building each tree
    reg_alpha=[0.1, 0.5, 1.0],            # L1 (Lasso) regularization strength
    reg_lambda=[0.1, 0.5, 1.0],           # L2 (Ridge) regularization strength
    objective=['multi:softmax'],          # Multiclass classification objective (single choice)
)

# Step 6: Instantiate the base XGBoost classifier; the search fills in the hyperparameters
model = xgb.XGBClassifier()

# Step 7: Perform grid search cross-validation
# Candidates are ranked by weighted F1 using 2-fold cross-validation, with all
# available cores (n_jobs=-1).
# Cost warning: eight hyperparameters with three values each give
# 3**8 = 6561 candidates, i.e. 13122 fits with cv=2, so this cell is slow.
# verbose=1 makes GridSearchCV report how many candidates/fits it is running.
# (A tqdm bar wrapped around fit() cannot track progress: nothing updates it,
# so it would stay at 0 for the entire search.)
grid_search = GridSearchCV(estimator=model, param_grid=params, scoring='f1_weighted', cv=2, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Step 8: Get the best parameters and the model refitted with them
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Step 9: Predict class labels for the held-out test split using the best model
y_pred = best_model.predict(X_test)

# Step 10: Evaluate the model
# Accuracy is the plain fraction of correct predictions.
accuracy = accuracy_score(y_test, y_pred)
# Precision, recall and F1 are averaged over classes weighted by class support
# ('weighted'), to account for class imbalance.
# zero_division=0 states explicitly that a class whose metric is undefined
# (e.g. a class that is never predicted) contributes 0, instead of relying on
# the default, which does the same but emits UndefinedMetricWarning.
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print evaluation metrics
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

  0%|          | 0/9 ETA: 0