In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Load the modelling table from CSV (the path is relative to the kernel's working directory).
data = pd.read_csv(filepath_or_buffer="04.reduced_telco_data.csv")
# For a quick look at the first rows while developing, run: data.head()

In [3]:
# The label is the column at positional index 6; every remaining column is a feature.
TARGET_POSITION = 6
target_label = data.columns[TARGET_POSITION]

y = data.iloc[:, TARGET_POSITION].values   # 1-D array of labels
tmp = data.drop(columns=target_label)      # feature frame without the label column
X = tmp.values                             # 2-D feature matrix

In [4]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions equal in both partitions; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [9]:
# Baseline gradient-boosting classifier with hand-picked hyper-parameters.
model1 = GradientBoostingClassifier(
    n_estimators=100,      # Number of boosting stages (trees)
    learning_rate=0.1,     # Shrinks the contribution of each tree
    max_depth=3,           # Shallow trees (3 is the scikit-learn default)
    subsample=0.8,         # < 1.0 => stochastic gradient boosting: each tree is fit on 80% of the rows
    random_state=42,
)

model1.fit(X_train, y_train)

# `model` is the estimator evaluated below; keeping the alias lets the same
# evaluation code be pointed at another fitted classifier.
model = model1

# Accuracy on the data the model was fit on, to compare against the held-out score.
y_train_pred = model.predict(X_train)
train_accuracy_explicit = accuracy_score(y_train, y_train_pred)

# Metrics on the held-out test split.
y_test_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1score = f1_score(y_test, y_test_pred)
# ROC-AUC ranks by score, so it is fed the predicted probability of the
# second class (column 1 of predict_proba) rather than the hard labels.
auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

print(f"Training_Accuracy: {train_accuracy_explicit}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1score}")
# Previously computed but never shown.
print(f"ROC-AUC: {auc}")
print(classification_report(y_test, y_test_pred))

Training_Accuracy: 0.7674107142857143
Accuracy: 0.775625
Precision: 0.7035879112699122
Recall: 0.9005335365853658
F1-Score: 0.7899707480150439
              precision    recall  f1-score   support

           0       0.88      0.67      0.76      5952
           1       0.70      0.90      0.79      5248

    accuracy                           0.78     11200
   macro avg       0.79      0.78      0.77     11200
weighted avg       0.80      0.78      0.77     11200



In [6]:
# Exhaustive hyper-parameter search to pick the best configuration automatically.
# The grid below has 3 * 3 * 3 * 2 * 2 * 2 = 216 combinations; with 5-fold CV
# that is 1080 model fits, so this cell is expensive to re-run.
gbr = GradientBoostingClassifier(random_state=42)


gbr_param_grid = {
    # Number of boosting stages/trees
    "n_estimators": [100, 200, 300],
    # Contribution of each tree to the final prediction (key tuning parameter)
    "learning_rate": [0.01, 0.1, 0.2],
    # Maximum depth of the individual regression estimators
    "max_depth": [3, 5, 8],
    # Fraction of samples used to fit each base learner.
    # Values below 1.0 give Stochastic Gradient Boosting (SGB), which reduces variance.
    "subsample": [0.8, 1.0],
    # Minimum number of samples required to split an internal node
    "min_samples_split": [2, 5],
    # Number of features considered at each split (None = all features)
    "max_features": ["sqrt", None],
}
# Every metric listed here is recorded in cv_results_ for each candidate.
scoring = ["accuracy", "precision", "recall", "f1", "roc_auc", "neg_log_loss"]

grid_search = GridSearchCV(
    estimator=gbr,  # model to tune
    param_grid=gbr_param_grid,  # candidate hyper-parameter values
    # 5-fold cross-validation: the training data is divided into 5 parts and
    # each candidate is trained 5 times, each time on 4 parts and validated on
    # the remaining one. The mean score over the 5 folds rates the candidate.
    cv=5,
    scoring=scoring,
    # With several scoring metrics, refit must name the one used to choose the
    # best candidate and refit it on the whole training set.
    refit="accuracy",
    verbose=2,
    n_jobs=-1,  # number of CPU cores to use (-1 = all)
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


0,1,2
,estimator,GradientBoost...ndom_state=42)
,param_grid,"{'learning_rate': [0.01, 0.1, ...], 'max_depth': [3, 5, ...], 'max_features': ['sqrt', None], 'min_samples_split': [2, 5], ...}"
,scoring,"['accuracy', 'precision', ...]"
,n_jobs,-1
,refit,'accuracy'
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,200
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [7]:
# Winning hyper-parameters and the mean cross-validated score of the refit metric.
best_params, best_cv_score = grid_search.best_params_, grid_search.best_score_
print(best_params)
print(f"{best_cv_score}")

{'learning_rate': 0.1, 'max_depth': 3, 'max_features': None, 'min_samples_split': 5, 'n_estimators': 200, 'subsample': 1.0}
0.7678571428571429


According to GridSearchCV, the best hyper-parameter combination and its mean cross-validated accuracy are:

{'learning_rate': 0.1, 'max_depth': 3, 'max_features': None, 'min_samples_split': 5, 'n_estimators': 200, 'subsample': 1.0}
0.7678571428571429

In [20]:
# Classifier rebuilt with the hyper-parameters selected by the grid search.
# NOTE(review): because the search ran with refit="accuracy",
# grid_search.best_estimator_ should already be this configuration fit on the
# training split; building it explicitly avoids having to re-run the search.
model2 = GradientBoostingClassifier(
    learning_rate=0.1,     # Shrinks the contribution of each tree
    max_depth=3,           # Shallow trees (3 is the scikit-learn default)
    max_features=None,     # Consider every feature at each split
    min_samples_split=5,   # A node needs at least 5 samples to be split
    n_estimators=200,      # Number of boosting stages (trees)
    subsample=1.0,         # 1.0 = every tree sees all rows (no stochastic subsampling)
    random_state=42,
)

model2.fit(X_train, y_train)

# `model` is the estimator evaluated below; keeping the alias lets the same
# evaluation code be pointed at another fitted classifier.
model = model2

# Accuracy on the data the model was fit on, to compare against the held-out score.
y_train_pred = model.predict(X_train)
train_accuracy_explicit = accuracy_score(y_train, y_train_pred)

# Metrics on the held-out test split.
y_test_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1score = f1_score(y_test, y_test_pred)
# ROC-AUC ranks by score, so it is fed the predicted probability of the
# second class (column 1 of predict_proba) rather than the hard labels.
auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

print(f"Training_Accuracy: {train_accuracy_explicit}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1score}")
# Previously computed but never shown.
print(f"ROC-AUC: {auc}")
print(classification_report(y_test, y_test_pred))

Training_Accuracy: 0.7699776785714286
Accuracy: 0.7775892857142858
Precision: 0.7058999253174011
Recall: 0.9005335365853658
F1-Score: 0.7914259398811019
              precision    recall  f1-score   support

           0       0.88      0.67      0.76      5952
           1       0.71      0.90      0.79      5248

    accuracy                           0.78     11200
   macro avg       0.80      0.78      0.78     11200
weighted avg       0.80      0.78      0.78     11200



In [21]:
# Persist the tuned classifier so it can be reloaded later without retraining.
# The file is written to the kernel's working directory (bare file name, no path).
model_filename = 'GradientBoostingClassifier_model.joblib'

# joblib.dump returns the list of file names it wrote; as the last expression
# of the cell that list is echoed as the output.
# Security note: joblib files are pickle-based; only joblib.load files you trust.
joblib.dump(model2, model_filename)

['GradientBoostingClassifier_model.joblib']