In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import seaborn as sns

import pytelligence as pt

%load_ext autoreload
%autoreload 2

pd.options.display.max_columns = 2000

sns.set_theme()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Disabled alternative data source: builds `df` from scikit-learn's built-in
# breast cancer dataset (features plus a "class" target column). It is kept
# commented out because `df` is read from a CSV file in the following cell.
# bc = load_breast_cancer()
# X = pd.DataFrame(bc.data, columns=bc.feature_names)
# y = pd.Series(bc.target, name="class")
# df = pd.concat([X, y], axis=1)
# df.head()

In [3]:
# Column labels for the data file, listed in file column order. They are
# passed via `names`, so every row of the file is read as data.
bc_columns = [
    "class", "age", "menopause", "tumor-size", "inv-nodes",
    "node-caps", "deg-malig", "breast", "breast-quad", "irradiat",
]
df = pd.read_csv("./data/breast_cancer_cat/breast-cancer.data", names=bc_columns)
df.head()

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [4]:
# pt.feat_analysis.get_distribution(df)

In [5]:
# sns.jointplot(data=df, x="deg-malig", y="breast", hue="class")

## Training

In [6]:
# Display the column labels of the loaded frame (the `names` given to read_csv).
df.columns

Index(['class', 'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps',
       'deg-malig', 'breast', 'breast-quad', 'irradiat'],
      dtype='object')

In [7]:
# Run the config-driven data preparation on the raw frame. The returned
# `setup` object is what the later modelling calls receive; `X_sample` and
# `y_sample` are kept for inspection.
setup, X_sample, y_sample = pt.modelling.prepare_data(
    config_path="./config_bc_cat.yml", train_data=df
)

[32m[I 2022-08-14 19:26:56][0m %%% PREPARING DATA[0m
[32m[I 2022-08-14 19:26:56][0m Read ./config_bc_cat.yml: 
 {'modelling': {'target_clf': 'class', 'numeric_cols': ['deg-malig'], 'categorical_cols': ['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'breast', 'breast-quad', 'irradiat'], 'feature_scaling': False}}[0m
[32m[I 2022-08-14 19:26:56][0m Created preprocessing pipeline with following steps: ['ohe'][0m
[32m[I 2022-08-14 19:26:56][0m Applied preprocessing transformations[0m
[32m[I 2022-08-14 19:26:56][0m Encoded target variable using classes: [(0, 'no-recurrence-events'), (1, 'recurrence-events')][0m


In [8]:
# Display the feature sample returned by prepare_data.
X_sample

Unnamed: 0,deg-malig,age_20-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-79,menopause_ge40,menopause_lt40,menopause_premeno,tumor-size_0-4,tumor-size_10-14,tumor-size_15-19,tumor-size_20-24,tumor-size_25-29,tumor-size_30-34,tumor-size_35-39,tumor-size_40-44,tumor-size_45-49,tumor-size_5-9,tumor-size_50-54,inv-nodes_0-2,inv-nodes_12-14,inv-nodes_15-17,inv-nodes_24-26,inv-nodes_3-5,inv-nodes_6-8,inv-nodes_9-11,node-caps_?,node-caps_no,node-caps_yes,breast_left,breast_right,breast-quad_?,breast-quad_central,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up,irradiat_no,irradiat_yes
0,3,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0
1,2,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0
2,2,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0
3,2,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0
4,2,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0


In [9]:
# Baseline comparison of the selected algorithms, sorted by "f1".
# Algorithm keys currently disabled for this run: "dt", "rf", "ridge",
# "perceptron", "passive-aggressive", "extratree", "extratrees", "knn",
# "linearsvc", "rbfsvc".
compare_df, algo_list, model_list = pt.modelling.compare_algorithms(
    setup=setup, include=["lr", "nb"], sort="f1", return_models=True
)
compare_df

[32m[I 2022-08-14 19:26:58][0m %%% COMPARING ALGORITHMS[0m
[33m[W 2022-08-14 19:26:58][0m The algorithms ['lr'] work suboptimally without scaled features. Consider turning it on within the config and rerun pt.modelling.prepare_data().[0m
[32m[I 2022-08-14 19:26:58][0m Evaluating lr...[0m
[32m[I 2022-08-14 19:27:10][0m Evaluating nb...[0m


Unnamed: 0,algorithm,accuracy,precision,recall,f1,roc_auc,Fit time (s)
0,nb,0.549,0.447,0.835,0.552,0.659,0.042
1,lr,0.678,0.527,0.329,0.338,0.653,0.177


In [10]:
# Target encoder held by the setup object; also used by the next cell.
le = setup.y_clf_encoder
# Predict on the sample with the last entry of model_list.
# NOTE(review): which algorithm model_list[-1] refers to depends on the order
# in which the models were returned - confirm against algo_list.
preds = model_list[-1].predict(X_sample)
# Translate the encoded predictions back into the original class labels.
le.inverse_transform(preds)

array(['recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events'], dtype=object)

In [11]:
# Print each (index, label) pair of the encoder's classes on one line.
print(*enumerate(le.classes_))

(0, 'no-recurrence-events') (1, 'recurrence-events')


In [18]:
# Hyperparameter search for "lr" and "nb": 150 trials per algorithm,
# optimising "f1". Note that this rebinds `model_list`, replacing the models
# returned by compare_algorithms.
compare_df_tune, model_list, opt_history_dict = pt.modelling.tune_hyperparams(
    setup=setup,
    include=["lr", "nb"],
    optimize="f1",
    n_trials=150,
    return_models=True,
)
compare_df_tune

[32m[I 2022-08-14 19:29:55][0m %%% TUNING HYPERPARAMETERS[0m
[33m[W 2022-08-14 19:29:55][0m The algorithms ['lr'] work suboptimally without scaled features. Consider turning it on within the config and rerun pt.modelling.prepare_data().[0m
[32m[I 2022-08-14 19:29:55][0m Algorithms: ['lr', 'nb'][0m
[32m[I 2022-08-14 19:29:55][0m Metric: f1[0m
[32m[I 2022-08-14 19:29:55][0m Trials per algorithm: 150[0m
[32m[I 2022-08-14 19:29:58][0m Trial 0 finished with value: 0.3792 and parameters: {'C': 11.189818280707982, 'l1_ratio': 0.8296462967310544}. Best is trial 0 with value: 0.3792[0m
[32m[I 2022-08-14 19:29:58][0m Trial 1 finished with value: 0.3447 and parameters: {'C': 0.8099045984445684, 'l1_ratio': 0.6656252949892254}. Best is trial 0 with value: 0.3792[0m
[32m[I 2022-08-14 19:29:59][0m Trial 2 finished with value: 0.0000 and parameters: {'C': 8.109518418383147e-05, 'l1_ratio': 0.8086845489193774}. Best is trial 0 with value: 0.3792[0m
[32m[I 2022-08-14 19:29:59]

Unnamed: 0,algorithm,metric,hyperparams
0,nb,0.57791,"{'priors': None, 'var_smoothing': 6.7292206875..."
1,lr,0.408769,"{'C': 2.005915918230618, 'class_weight': None,..."


In [21]:
# Show the optimisation-history object stored under "lr" by tune_hyperparams.
opt_history_dict["lr"].show()

In [26]:
# Feature-space reduction for the tuned "nb" model.
#
# The tuned row is selected by algorithm name rather than by position, so the
# reference metric and hyperparameters always belong to the algorithm that is
# passed to reduce_feature_space, whatever the row order of compare_df_tune is.
# If the algorithm is missing from compare_df_tune, `.iloc[0]` raises an
# IndexError instead of silently using another algorithm's values.
fs_algorithm = "nb"
fs_tuned_row = compare_df_tune.loc[
    compare_df_tune["algorithm"] == fs_algorithm
].iloc[0]

best_feature_list = pt.modelling.reduce_feature_space(
    setup=setup,
    algorithm=fs_algorithm,
    metric="f1",
    reference_metric=fs_tuned_row["metric"],
    # The library log reports the threshold as 0.95 * reference metric.
    acceptable_loss=0.95,
    hyperparams=fs_tuned_row["hyperparams"],
)

[32m[I 2022-08-14 14:46:32][0m %%% REDUCING FEATURE SPACE[0m
[32m[I 2022-08-14 14:46:32][0m Algorithm selected for feature space reduction: nb[0m
[32m[I 2022-08-14 14:46:32][0m Metric to optimize for: f1[0m
[32m[I 2022-08-14 14:46:32][0m Minimum acceptable metric: 0.546 or 0.95 * reference metric (0.575)[0m
[32m[I 2022-08-14 14:46:49][0m Current metric: 0.579, removing worst feature: breast-quad_right_up[0m
[32m[I 2022-08-14 14:46:49][0m Updating reference metric...[0m
[32m[I 2022-08-14 14:47:03][0m Current metric: 0.59, removing worst feature: breast-quad_left_up[0m
[32m[I 2022-08-14 14:47:03][0m Updating reference metric...[0m
[32m[I 2022-08-14 14:47:18][0m Current metric: 0.593, removing worst feature: tumor-size_25-29[0m
[32m[I 2022-08-14 14:47:18][0m Updating reference metric...[0m
[32m[I 2022-08-14 14:47:28][0m Current metric: 0.593, removing worst feature: menopause_ge40[0m
[32m[I 2022-08-14 14:47:38][0m Current metric: 0.595, removing worst fe

In [28]:
# Final tuning pass for "nb", restricted to the reduced feature list.
#
# The earlier tuning cell unpacks tune_hyperparams(return_models=True) into
# three values (result frame, models, optimisation history). Unpacking only
# two here would raise "ValueError: too many values to unpack" under that
# return shape, so the starred target absorbs anything after the first two
# values and the cell works with either a 2- or 3-value return.
compare_df_final, model_list, *_unused_tuning_extras = pt.modelling.tune_hyperparams(
    setup=setup,
    include=["nb"],
    optimize="f1",
    n_trials=50,
    feature_list=best_feature_list,
    return_models=True,
)
compare_df_final

[32m[I 2022-08-14 14:51:42][0m %%% TUNING HYPERPARAMETERS[0m
[32m[I 2022-08-14 14:51:42][0m Algorithms selected for tuning: ['nb'][0m
[32m[I 2022-08-14 14:51:42][0m Metric to optimize for: f1[0m
[32m[I 2022-08-14 14:51:42][0m Trials per algorithm: 50[0m
[32m[I 2022-08-14 14:51:43][0m Trial 0 finished with value: 0.5933 and parameters: {'var_smoothing': 3.059370889442078e-05}. Best is trial 0 with value: 0.5933[0m
[32m[I 2022-08-14 14:51:43][0m Trial 1 finished with value: 0.5120 and parameters: {'var_smoothing': 0.00026776899047403573}. Best is trial 0 with value: 0.5933[0m
[32m[I 2022-08-14 14:51:43][0m Trial 2 finished with value: 0.4706 and parameters: {'var_smoothing': 0.0012182405692114067}. Best is trial 0 with value: 0.5933[0m
[32m[I 2022-08-14 14:51:43][0m Trial 3 finished with value: 0.5652 and parameters: {'var_smoothing': 3.91702322838022e-12}. Best is trial 0 with value: 0.5933[0m
[32m[I 2022-08-14 14:51:44][0m Trial 4 finished with value: 0.4610 a

Unnamed: 0,algorithm,metric,hyperparams
0,nb,0.622082,"{'priors': None, 'var_smoothing': 6.4098528065..."


In [37]:
# Display the preprocessing pipeline attribute of the setup object.
setup.prep_pipe

In [36]:
# Pass the setup object and the first entry of `model_list` to export_model,
# using the current directory as the target directory.
pt.modelling.export_model(setup=setup, model=model_list[0], target_dir="./")