In [1]:
import pandas as pd

blood_transfusion = pd.read_csv("../datasets/blood_transfusion.csv")
data = blood_transfusion.drop(columns="Class")
target = blood_transfusion["Class"]

## Dummy classifier

In [25]:
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_validate, cross_val_score

dummy = DummyClassifier(strategy="most_frequent")
cv_result = cross_validate(dummy, data, target, cv=10, scoring=["accuracy", "balanced_accuracy"])

print(f"Average accuracy: {cv_result['test_accuracy'].mean():.3f}")
print(f"Average balanced accuracy: "
      f"{cv_result['test_balanced_accuracy'].mean():.3f}")

Average accuracy: 0.762
Average balanced accuracy: 0.500


In [9]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
cv_result_tree = cross_validate(tree, data, target, cv=10, scoring=["accuracy", "balanced_accuracy"])

print(f"Average accuracy: {cv_result_tree['test_accuracy'].mean():.3f}")
print(f"Average balanced accuracy: "
      f"{cv_result_tree['test_balanced_accuracy'].mean():.3f}")


Average accuracy: 0.627
Average balanced accuracy: 0.509


In [20]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

random = RandomForestClassifier(n_estimators=300, n_jobs=-1)
cv_result_random = cross_validate(random, data, target, cv=10, scoring=["accuracy", "balanced_accuracy"],
                            n_jobs=-1)

print(f"Average accuracy: {cv_result_random['test_accuracy'].mean():.3f}")
print(f"Average balanced accuracy: "
      f"{cv_result_random['test_balanced_accuracy'].mean():.3f}")


Average accuracy: 0.669
Average balanced accuracy: 0.530


In [54]:
boosting = GradientBoostingClassifier(n_estimators=300, random_state=0)
cv_result_boosting = cross_validate(boosting, data, target, cv=10, scoring=["accuracy", "balanced_accuracy"],
                            n_jobs=-1)

print(f"Average accuracy: {cv_result_boosting['test_accuracy'].mean():.3f}")
print(f"Average balanced accuracy: "
      f"{cv_result_boosting['test_balanced_accuracy'].mean():.3f}")


Average accuracy: 0.691
Average balanced accuracy: 0.537


In [27]:
from sklearn.model_selection import KFold
from sklearn.

n_try = 10
scores_rf, scores_gbdt = [], []
for seed in range(n_try):
    cv = KFold(n_splits=10, shuffle=True, random_state=seed)

    rf = RandomForestClassifier(n_estimators=300, n_jobs=-1)
    scores = cross_val_score(
        rf, data, target, cv=cv, scoring="balanced_accuracy", n_jobs=-1
    )
    scores_rf.append(scores.mean())

    gbdt = GradientBoostingClassifier(n_estimators=300)
    scores = cross_val_score(
        gbdt, data, target, cv=cv, scoring="balanced_accuracy", n_jobs=-1
    )
    scores_gbdt.append(scores.mean())




compare = [s_gbdt > s_rf for s_gbdt, s_rf in zip(scores_gbdt, scores_rf)]
sum(compare)

10

In [30]:
scores_gbdt

[0.6116416991509557,
 0.6082180772939754,
 0.5974543503674362,
 0.6166012516291297,
 0.5948119960417708,
 0.5987164058221208,
 0.5981547417134458,
 0.5995651060915913,
 0.6195456729019899,
 0.602401617037815]

In [29]:
scores_rf

[0.5967984312894067,
 0.5952372773691187,
 0.5898847668914124,
 0.5972254909875587,
 0.5930820018261497,
 0.5923257447082079,
 0.5846185210443695,
 0.5867442886420675,
 0.6140910708871163,
 0.5897907363740011]

In [59]:
ghist = HistGradientBoostingClassifier( max_iter=1000, early_stopping=True, random_state=0)
score_ghist = cross_validate(ghist, data, target, cv=10, scoring="balanced_accuracy",
                            n_jobs=-1, return_estimator=True)
score_ghist["test_score"].mean()

0.5546955624355004

In [75]:
score_ghist["estimator"][1].n_iter_

27

In [76]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

n_try = 10
scores_rf, scores_gbdt, scores_ghist = [], [], []
for seed in range(n_try):
    cv = KFold(n_splits=10, shuffle=True, random_state=seed)

    rf = RandomForestClassifier(n_estimators=300, n_jobs=-1)
    scores = cross_val_score(
        rf, data, target, cv=cv, scoring="balanced_accuracy", n_jobs=-1
    )
    scores_rf.append(scores.mean())

    gbdt = GradientBoostingClassifier(n_estimators=300)
    scores = cross_val_score(
        gbdt, data, target, cv=cv, scoring="balanced_accuracy", n_jobs=-1
    )
    scores_gbdt.append(scores.mean())

    ghist = HistGradientBoostingClassifier(max_iter=1000, early_stopping=True)
    scores = cross_val_score(ghist, data, target, cv=cv, scoring="balanced_accuracy",
                            n_jobs=-1)
    scores_ghist.append(scores.mean())


In [78]:
pd.DataFrame({"Random":scores_rf, "Boosting":scores_gbdt, "HistBoosting":scores_ghist})

Unnamed: 0,Random,Boosting,HistBoosting
0,0.598795,0.611642,0.623253
1,0.59243,0.608218,0.611034
2,0.585083,0.596592,0.60785
3,0.600522,0.616601,0.619452
4,0.598287,0.591419,0.601229
5,0.598516,0.598716,0.599582
6,0.580766,0.598155,0.62721
7,0.579632,0.596232,0.620644
8,0.598771,0.619546,0.619333
9,0.598349,0.602402,0.607141


In [79]:
compare = [s_gbdt > s_rf for s_gbdt, s_rf in zip(scores_gbdt, scores_ghist)]
sum(compare)

1

In [85]:
from imblearn.ensemble import BalancedBaggingClassifier

base_clase = HistGradientBoostingClassifier(max_iter=1000, early_stopping=True)
balanced_bagging = BalancedBaggingClassifier(base_clase, n_estimators=50)

cv_result = cross_validate(balanced_bagging, data, target, cv=10, n_jobs=-1, scoring="balanced_accuracy")
cv_result["test_score"].mean()

0.5944702442380461