# 📝 Exercise M3.01

The goal is to write an exhaustive search to find the best parameters
combination maximizing the model generalization performance.

Here we use a small subset of the Adult Census dataset to make the code
faster to execute. Once your code works on the small subset, try to
change `train_size` to a larger value (e.g. 0.8 for 80% instead of
20%).

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

adult_census = pd.read_csv("../datasets/adult-census.csv")

target_name = "class"
target = adult_census[target_name]
data = adult_census.drop(columns=[target_name, "education-num"])

data_train, data_test, target_train, target_test = train_test_split(
    data, target, train_size=0.8, random_state=42)

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)
preprocessor = ColumnTransformer(
    [('cat_preprocessor', categorical_preprocessor,
      selector(dtype_include=object))],
    remainder='passthrough', sparse_threshold=0)

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", HistGradientBoostingClassifier(random_state=42))
])


Use the previously defined model (called `model`) and using two nested `for`
loops, make a search of the best combinations of the `learning_rate` and
`max_leaf_nodes` parameters. In this regard, you will need to train and test
the model by setting the parameters. The evaluation of the model should be
performed using `cross_val_score` on the training set. We will use the
following parameters search:
- `learning_rate` for the values 0.01, 0.1, 1 and 10. This parameter controls
  the ability of a new tree to correct the error of the previous sequence of
  trees
- `max_leaf_nodes` for the values 3, 10, 30. This parameter controls the
  depth of each tree.

In [3]:
for parameter in model.get_params():
    print(parameter)

memory
steps
verbose
preprocessor
classifier
preprocessor__n_jobs
preprocessor__remainder
preprocessor__sparse_threshold
preprocessor__transformer_weights
preprocessor__transformers
preprocessor__verbose
preprocessor__verbose_feature_names_out
preprocessor__cat_preprocessor
preprocessor__cat_preprocessor__categories
preprocessor__cat_preprocessor__dtype
preprocessor__cat_preprocessor__handle_unknown
preprocessor__cat_preprocessor__unknown_value
classifier__categorical_features
classifier__early_stopping
classifier__l2_regularization
classifier__learning_rate
classifier__loss
classifier__max_bins
classifier__max_depth
classifier__max_iter
classifier__max_leaf_nodes
classifier__min_samples_leaf
classifier__monotonic_cst
classifier__n_iter_no_change
classifier__random_state
classifier__scoring
classifier__tol
classifier__validation_fraction
classifier__verbose
classifier__warm_start


In [7]:
model.get_params()['classifier__max_leaf_nodes']

31

In [8]:
model.get_params()["classifier__learning_rate"]

0.1

In [4]:
# Write your code here.
from sklearn.model_selection import cross_validate
best_score = 0
best_params = {}
for learning_rate in [0.01,0.1,1,10]:
    for max_leaf in [3,10,30]:
        model.set_params(classifier__learning_rate = learning_rate,classifier__max_leaf_nodes = max_leaf)
        cv_results = cross_validate(model, data, target, cv=2) #without trainsets
        scores = cv_results["test_score"]
        mean_score = scores.mean()
        print(f"score: {mean_score:.3f}")
        if mean_score > best_score:
            best_score = mean_score
            best_params = {'learning-rate': learning_rate, 'max leaf nodes': max_leaf}
            print(f"Found new best model with score {best_score:.3f}!\n"
                 f"with learning_rate = {learning_rate} and max_leaf ={max_leaf}")

print(f"The best accuracy obtained is {best_score:.3f}")
print(f"The best parameters found are:\n {best_params}")

score: 0.799
Found new best model with score 0.799!
with learning_rate = 0.01 and max_leaf =3
score: 0.820
Found new best model with score 0.820!
with learning_rate = 0.01 and max_leaf =10
score: 0.847
Found new best model with score 0.847!
with learning_rate = 0.01 and max_leaf =30
score: 0.857
Found new best model with score 0.857!
with learning_rate = 0.1 and max_leaf =3
score: 0.869
Found new best model with score 0.869!
with learning_rate = 0.1 and max_leaf =10
score: 0.872
Found new best model with score 0.872!
with learning_rate = 0.1 and max_leaf =30
score: 0.868
score: 0.861
score: 0.859
score: 0.281
score: 0.436
score: 0.480
The best accuracy obtained is 0.872
The best parameters found are:
 {'learning-rate': 0.1, 'max leaf nodes': 30}



Now use the test set to score the model using the best parameters
that we found using cross-validation in the training set.

In [6]:
# Write your code here.
from sklearn.model_selection import cross_validate
best_score = 0
best_params = {}
for learning_rate in [0.01,0.1,1,10]:
    for max_leaf in [3,10,30]:
        model.set_params(classifier__learning_rate = learning_rate,classifier__max_leaf_nodes = max_leaf)
        cv_results = cross_validate(model, data_train, target_train, cv=2) #with train sets
        scores = cv_results["test_score"]
        mean_score = scores.mean()
        print(f"score: {mean_score:.3f}")
        if mean_score > best_score:
            best_score = mean_score
            best_params = {'learning-rate': learning_rate, 'max leaf nodes': max_leaf}
            print(f"Found new best model with score {best_score:.3f}!\n"
                 f"with learning_rate = {learning_rate} and max_leaf ={max_leaf}")

print(f"The best accuracy obtained is {best_score:.3f}")
print(f"The best parameters found are:\n {best_params}")

score: 0.798
Found new best model with score 0.798!
with learning_rate = 0.01 and max_leaf =3
score: 0.818
Found new best model with score 0.818!
with learning_rate = 0.01 and max_leaf =10
score: 0.847
Found new best model with score 0.847!
with learning_rate = 0.01 and max_leaf =30
score: 0.853
Found new best model with score 0.853!
with learning_rate = 0.1 and max_leaf =3
score: 0.867
Found new best model with score 0.867!
with learning_rate = 0.1 and max_leaf =10
score: 0.870
Found new best model with score 0.870!
with learning_rate = 0.1 and max_leaf =30
score: 0.864
score: 0.860
score: 0.853
score: 0.282
score: 0.528
score: 0.373
The best accuracy obtained is 0.870
The best parameters found are:
 {'learning-rate': 0.1, 'max leaf nodes': 30}


In [7]:
# Alternative code..
import itertools

learning_rate = [0.01,0.1,1,10]
max_leaf = [3,10,30]


hyper_paramcomb = list(itertools.product(learning_rate, max_leaf))
hyper_paramcomb

[(0.01, 3),
 (0.01, 10),
 (0.01, 30),
 (0.1, 3),
 (0.1, 10),
 (0.1, 30),
 (1, 3),
 (1, 10),
 (1, 30),
 (10, 3),
 (10, 10),
 (10, 30)]

In [9]:
for lr, mln in hyper_paramcomb:
    model.set_params(
         classifier__learning_rate=lr, 
        classifier__max_leaf_nodes=mln)
    
hyper_paramcomb = list((lr,mln)
                  for lr in learning_rate
                  for mln in max_leaf)
hyper_paramcomb

[(0.01, 3),
 (0.01, 10),
 (0.01, 30),
 (0.1, 3),
 (0.1, 10),
 (0.1, 30),
 (1, 3),
 (1, 10),
 (1, 30),
 (10, 3),
 (10, 10),
 (10, 30)]


Done