# Using Gradient Boosting Classifier

In [9]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.datasets import load_breast_cancer

# Load the Breast Cancer Wisconsin dataset from the UCI repository
data = load_breast_cancer(as_frame=True)
df = data.frame

# Split the dataset into features (X) and target variable (y)
X = df.drop("target", axis=1)
y = df["target"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier()

# Train the classifier on the training data
gb_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = gb_classifier.predict(X_test)

# Calculate the confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_mat)

# Evaluate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Confusion Matrix:
[[40  3]
 [ 2 69]]
Accuracy: 0.956140350877193


# With K-Fold Cross Validation

In [10]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_breast_cancer

# Load the Breast Cancer Wisconsin dataset from the UCI repository
data = load_breast_cancer(as_frame=True)
df = data.frame

# Split the dataset into features (X) and target variable (y)
X = df.drop("target", axis=1)
y = df["target"]

# Initialize the Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier()

# Perform k-fold cross-validation and obtain predicted labels
k = 5  # Number of folds
y_pred = cross_val_predict(gb_classifier, X, y, cv=k)

# Calculate the confusion matrix
confusion_mat = confusion_matrix(y, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_mat)

Confusion Matrix:
[[200  12]
 [ 10 347]]


# With Extreme Gradient Boosting

In [13]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_breast_cancer

# Load the Breast Cancer Wisconsin dataset from the UCI repository
data = load_breast_cancer(as_frame=True)
df = data.frame

# Split the dataset into features (X) and target variable (y)
X = df.drop("target", axis=1)
y = df["target"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost classifier
xgb_classifier = xgb.XGBClassifier()

# Train the classifier on the training data
xgb_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = xgb_classifier.predict(X_test)

# Calculate the confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_mat)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred_labels)

# Print the accuracy score
print("Accuracy:", accuracy)

Confusion Matrix:
[[40  3]
 [ 2 69]]
Accuracy: 0.9649122807017544


# With Light GBM

In [12]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_breast_cancer
import warnings

# Disable all warnings
warnings.filterwarnings("ignore")

# Load the Breast Cancer Wisconsin dataset from the UCI repository
data = load_breast_cancer(as_frame=True)
df = data.frame

# Split the dataset into features (X) and target variable (y)
X = df.drop("target", axis=1)
y = df["target"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the data into LightGBM's Dataset format
train_data = lgb.Dataset(X_train, label=y_train)

# Set the parameters for LightGBM
params = {
    'objective': 'binary',
    'metric': 'binary_logloss'
}

# Train the LightGBM model
gbm = lgb.train(params, train_data)

# Make predictions on the test data
y_pred = gbm.predict(X_test)
y_pred_labels = np.round(y_pred)

# Calculate the confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred_labels)

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_mat)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred_labels)

# Print the accuracy score
print("Accuracy:", accuracy)

[LightGBM] [Info] Number of positive: 286, number of negative: 169
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4548
[LightGBM] [Info] Number of data points in the train set: 455, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.628571 -> initscore=0.526093
[LightGBM] [Info] Start training from score 0.526093
Confusion Matrix:
[[40  3]
 [ 1 70]]
Accuracy: 0.9649122807017544
