In [1]:
import os
from pathlib import Path

import pandas as pd

# Root directory of the course material. The default is the original author's
# local checkout; set the SKLEARN_COURSE_ROOT environment variable to run the
# notebook on another machine without editing this cell.
_DEFAULT_COURSE_ROOT = "/home/luba/Documents/DS/projects-courses-ongoing/sklearn-course-inria-[doing]"
course_root = Path(os.environ.get("SKLEARN_COURSE_ROOT", _DEFAULT_COURSE_ROOT))

data_folder = course_root / "datasets"
figure_folders = course_root / "figures"

# Load the census table and discard "education-num" before splitting the
# frame into the prediction target and the feature columns.
adult_census = pd.read_csv(data_folder / "adult-census.csv")
adult_census = adult_census.drop(columns="education-num")

target_name = "class"
target = adult_census[target_name]
data = adult_census.drop(columns=[target_name])

In [2]:
from sklearn.compose import make_column_selector as selector

# Partition the feature columns by dtype: columns with object dtype are
# selected as categorical, every other column as numerical.
categorical_columns_selector = selector(dtype_include=object)
numerical_columns_selector = selector(dtype_exclude=object)

# Apply each selector to the feature frame to get the matching column names.
categorical_columns = categorical_columns_selector(data)
numerical_columns = numerical_columns_selector(data)

In [3]:
import time

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

# Baseline preprocessing: integer-encode the categorical columns and pass the
# remaining columns through unchanged. Categories that were not seen during
# fit are encoded as -1 instead of raising an error.
categorical_preprocessor = OrdinalEncoder(
    handle_unknown="use_encoded_value", unknown_value=-1
)
preprocessor = ColumnTransformer(
    [("categorical", categorical_preprocessor, categorical_columns)],
    remainder="passthrough",
)

model = make_pipeline(preprocessor, HistGradientBoostingClassifier())

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
cv_results = cross_validate(model, data, target)
# Wall-clock duration of the whole cross_validate call.
elapsed_time = time.perf_counter() - start

scores = cv_results["test_score"]

print("The mean cross-validation accuracy is: "
      f"{scores.mean():.3f} +/- {scores.std():.3f} "
      f"with a fitting time of {elapsed_time:.3f}")

The mean cross-validation accuracy is: 0.873 +/- 0.002 with a fitting time of 4.585


In [4]:
from sklearn.preprocessing import StandardScaler

# Same model as the previous cell, but the numerical columns are now
# standardized instead of being passed through. The categorical columns still
# use the encoder held in `categorical_preprocessor` from the earlier cell.
scaling_preprocessor = StandardScaler()

preprocessor = ColumnTransformer([
    ("std-scaler", scaling_preprocessor, numerical_columns),
    # Step name corrected from the misspelled 'ordinal-enconder'.
    ("ordinal-encoder", categorical_preprocessor, categorical_columns),
])

model = make_pipeline(preprocessor, HistGradientBoostingClassifier())

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
cv_results = cross_validate(model, data, target)
# Wall-clock duration of the whole cross_validate call.
elapsed_time = time.perf_counter() - start

scores = cv_results["test_score"]

print("The mean cross-validation accuracy is: "
      f"{scores.mean():.3f} +/- {scores.std():.3f} "
      f"with a fitting time of {elapsed_time:.3f}")

The mean cross-validation accuracy is: 0.874 +/- 0.002 with a fitting time of 4.293


In [5]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Variant of the previous experiment: numerical columns are standardized and
# categorical columns are one-hot encoded (unknown categories are ignored).
scaling_preprocessor = StandardScaler()

# The dense-output switch was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old spelling was removed in 1.4. Try the current
# keyword first and fall back to the legacy one so the cell runs on both.
try:
    categorical_preprocessor = OneHotEncoder(handle_unknown="ignore",
                                             sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    categorical_preprocessor = OneHotEncoder(handle_unknown="ignore",
                                             sparse=False)

preprocessor = ColumnTransformer([
    ("std-scaler", scaling_preprocessor, numerical_columns),
    # Renamed from 'ordinal-enconder': this step holds a one-hot encoder.
    ("one-hot-encoder", categorical_preprocessor, categorical_columns),
])

model = make_pipeline(preprocessor, HistGradientBoostingClassifier())

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
cv_results = cross_validate(model, data, target)
# Wall-clock duration of the whole cross_validate call.
elapsed_time = time.perf_counter() - start

scores = cv_results["test_score"]

print("The mean cross-validation accuracy is: "
      f"{scores.mean():.3f} +/- {scores.std():.3f} "
      f"with a fitting time of {elapsed_time:.3f}")

The mean cross-validation accuracy is: 0.873 +/- 0.003 with a fitting time of 8.283
