In [5]:
pip install xgboost scikit-learn

Mean Squared Error: 0.25


In [17]:
import xgboost as xgb
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# ---------------------------------------------------------------------------
# Data: Iris features come from `.data`, class labels from `.target`.
# ---------------------------------------------------------------------------
iris = load_iris()
X, y = iris.data, iris.target

# Keep 20% of the rows aside for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ---------------------------------------------------------------------------
# Model: histogram tree method on a CUDA device. Only the four arguments
# below are set explicitly; every other hyperparameter keeps its default.
# ---------------------------------------------------------------------------
xgb_classifier = xgb.XGBClassifier(
    tree_method='hist',
    device='cuda',
    n_jobs=-1,
    random_state=42,
)

# Quick reference for the main XGBoost hyperparameters:
#   n_estimators     - number of boosting rounds (trees). More trees may improve the fit
#                      but can also overfit.
#   learning_rate    - shrinkage applied to each boosting step. Smaller values need more
#                      rounds but can generalise better.
#   max_depth        - maximum depth of each tree. Deeper trees capture more structure
#                      at the risk of overfitting.
#   subsample        - fraction of training rows sampled for each tree. Lower values
#                      reduce overfitting but may increase bias.
#   colsample_bytree - fraction of feature columns sampled for each tree. Lower values
#                      reduce overfitting but may increase bias.
#   gamma            - minimum loss reduction required to split a leaf further. Higher
#                      values make the algorithm more conservative.
#   reg_alpha        - L1 regularisation on the weights; encourages sparsity.
#   reg_lambda       - L2 regularisation on the weights; keeps the weights smooth.
#   scale_pos_weight - balance between positive and negative weights, useful for
#                      imbalanced datasets.
#   objective        - learning task and its loss function; 'binary:logistic' and
#                      'multi:softmax' are common for classification.
#   eval_metric      - metric evaluated during training, e.g. 'error' (classification
#                      error) or 'logloss' (negative log-likelihood).
#   random_state     - random seed for reproducibility.

# Fit on the training split, then predict labels for the held-out rows.
xgb_classifier.fit(X_train, y_train)
y_pred = xgb_classifier.predict(X_test)

# Report the share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 1.00


# Hyperparameter tuning with GridSearchCV


In [21]:
import xgboost as xgb
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples as a test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base estimator. XGBoost parallelises each individual fit on its own
# (n_jobs=-1 for CPU threads, plus the CUDA device requested here).
xgb_classifier = xgb.XGBClassifier(tree_method='hist', device='cuda', n_jobs=-1, random_state=42)

# Hyperparameter grid: 8 parameters x 2 values each = 256 candidate combinations
param_grid = {
    'learning_rate': [0.1, 0.2],
    'max_depth': [3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.3, 0.7],
    'gamma': [0, 0.1],
    'reg_alpha': [0, 1],
    'reg_lambda': [0, 1],
    'n_estimators': [100, 200],
}

# Setup GridSearchCV.
# n_jobs=1 evaluates candidates one at a time. The estimator above already
# parallelises every fit, so also fanning the search out with n_jobs=-1 would
# start one worker per core, each running a fully threaded XGBoost fit against
# the same device. That nested parallelism causes thread contention and slows
# both levels down. The scores and selected parameters do not depend on this setting.
grid_search = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_grid,
    cv=2,
    scoring='accuracy',
    n_jobs=1,
    verbose=1,
)

# Fit the model using grid search (cross-validation runs on the training split only)
grid_search.fit(X_train, y_train)

# Best parameter combination and its score.
# best_score_ is the mean cross-validated accuracy of that combination on the
# training split; it is not a test-set score.
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Accuracy:", best_score)

# best_estimator_ is the winning configuration refit on the whole training split
# (GridSearchCV's default refit=True); use it to predict the held-out test data.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Accuracy on the test set, which played no part in the search
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on Test Set:", accuracy)


Fitting 2 folds for each of 256 candidates, totalling 512 fits
Best Parameters: {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 0, 'subsample': 0.8}
Best Accuracy: 0.9333333333333333
Accuracy on Test Set: 1.0
