In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import seaborn as sns

import pycarrot as pc

# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

# Raise the pandas column display limit so frames up to 2000 columns wide
# are not truncated when shown.
pd.options.display.max_columns = 2000

# Apply seaborn's default theme to subsequent plots.
sns.set_theme()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# bc = load_breast_cancer()
# X = pd.DataFrame(bc.data, columns=bc.feature_names)
# y = pd.Series(bc.target, name="class")
# df = pd.concat([X, y], axis=1)
# df.head()

In [2]:
# Read the raw data file, supplying the column labels explicitly via `names`.
df = pd.read_csv(
    filepath_or_buffer="./data/breast_cancer_cat/breast-cancer.data",
    names=[
        "class", "age", "menopause", "tumor-size", "inv-nodes",
        "node-caps", "deg-malig", "breast", "breast-quad", "irradiat",
    ],
)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [3]:
# pc.feat_analysis.get_distribution(df)

In [4]:
# sns.jointplot(data=df, x="deg-malig", y="breast", hue="class")

## Training

In [5]:
# List the column labels of the loaded frame.
df.columns

Index(['class', 'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps',
       'deg-malig', 'breast', 'breast-quad', 'irradiat'],
      dtype='object')

In [32]:
# Build the config through pc.init_config from ./config_bc_cat.yml and
# display the resulting object.
config = pc.init_config("./config_bc_cat.yml")
config

{'modelling': {'target_clf': 'class',
  'numeric_cols': ['deg-malig'],
  'categorical_cols': ['age',
   'menopause',
   'tumor-size',
   'inv-nodes',
   'node-caps',
   'breast',
   'breast-quad',
   'irradiat'],
  'normalization': False}}

In [33]:
# Pass the loaded frame and the config to pycarrot's prepare_data; the three
# returned values are unpacked as setup, X_sample and y_sample.
setup, X_sample, y_sample = pc.modelling.prepare_data(
    train_data=df,
    config=config,
)

In [8]:
# Inspect the feature sample returned by prepare_data.
X_sample

Unnamed: 0,deg-malig,age_20-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-79,menopause_ge40,menopause_lt40,menopause_premeno,tumor-size_0-4,tumor-size_10-14,tumor-size_15-19,tumor-size_20-24,tumor-size_25-29,tumor-size_30-34,tumor-size_35-39,tumor-size_40-44,tumor-size_45-49,tumor-size_5-9,tumor-size_50-54,inv-nodes_0-2,inv-nodes_12-14,inv-nodes_15-17,inv-nodes_24-26,inv-nodes_3-5,inv-nodes_6-8,inv-nodes_9-11,node-caps_?,node-caps_no,node-caps_yes,breast_left,breast_right,breast-quad_?,breast-quad_central,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up,irradiat_no,irradiat_yes
0,3,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0
1,2,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0
2,2,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0
3,2,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0
4,2,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0


In [10]:
# Run pycarrot's algorithm comparison over the candidate list below.
# `sort="f1"` presumably orders the result table by F1, and
# `return_models=True` makes the call return three values, unpacked here as
# the table, the algorithm names and the model objects.
compare_df, algo_list, model_list = pc.modelling.compare_algorithms(
    setup=setup,
    include=[
        "lr", "dt", "rf", "ridge", "perceptron", "passive-aggressive",
        "extratree", "extratrees", "knn", "nb", "linearsvc", "rbfsvc",
    ],
    sort="f1",
    return_models=True,
)
# Show the comparison table.
compare_df

Unnamed: 0,algorithm,accuracy,precision,recall,f1,roc_auc,Fit time (s)
0,nb,0.549,0.447,0.835,0.552,0.659,0.105
1,linearsvc,0.671,0.575,0.365,0.392,0.605,0.104
2,lr,0.685,0.581,0.376,0.382,0.634,0.268
3,ridge,0.668,0.549,0.353,0.375,0.609,0.123
4,extratree,0.636,0.387,0.376,0.358,0.561,0.091
5,rf,0.671,0.403,0.353,0.348,0.649,2.813
6,extratrees,0.661,0.357,0.365,0.342,0.613,2.41
7,knn,0.717,0.534,0.271,0.335,0.621,0.07
8,dt,0.633,0.351,0.329,0.33,0.55,0.065
9,rbfsvc,0.703,0.569,0.306,0.325,0.676,0.111


In [11]:
# Display the algorithm names returned by compare_algorithms.
algo_list

['nb',
 'linearsvc',
 'lr',
 'ridge',
 'extratree',
 'rf',
 'extratrees',
 'knn',
 'dt',
 'rbfsvc',
 'passive-aggressive',
 'perceptron']

In [12]:
# Display the model objects returned alongside the comparison table.
model_list

[GaussianNB(),
 LinearSVC(),
 LogisticRegression(),
 RidgeClassifier(),
 ExtraTreeClassifier(),
 RandomForestClassifier(),
 ExtraTreesClassifier(),
 KNeighborsClassifier(),
 DecisionTreeClassifier(),
 SVC(),
 PassiveAggressiveClassifier(),
 Perceptron()]

In [13]:
# Predict on the sample with the last entry of model_list and map the encoded
# predictions back to class labels with the setup's target encoder.
# NOTE(review): model_list[-1] is the final list entry, not the first one —
# confirm this is the model intended for inspection.
le = setup.y_clf_encoder
preds = model_list[-1].predict(X_sample)
le.inverse_transform(preds)

array(['recurrence-events', 'recurrence-events', 'recurrence-events',
       'recurrence-events', 'no-recurrence-events'], dtype=object)

In [14]:
# Print each (encoded index, class label) pair known to the target encoder.
print(*enumerate(le.classes_))

(0, 'no-recurrence-events') (1, 'recurrence-events')


In [15]:
# algorithm, reference_metric = compare_df.loc[
#     compare_df["algorithm"] == algo_list[0], ["algorithm", "f1"]
# ].values[0]
# best_feature_list = pc.modelling.reduce_feature_space(
#     setup, algorithm, "f1", reference_metric, acceptable_loss=0.5
# )
# best_feature_list

In [40]:
# Run pycarrot's hyperparameter tuning for "lr" only, with optimize="f1" and
# 10 trials. This rebinds compare_df and model_list, which the algorithm
# comparison cell also assigns.
compare_df, model_list = pc.modelling.tune_hyperparams(
    setup=setup,
    include=["lr"],
    optimize="f1",
    n_trials=10,
    return_models=True,
)
# Show the tuning result table.
compare_df

[32m[I 2022-07-13 16:09:28,690][0m A new study created in memory with name: study_lr[0m


The algorithms ['lr'] work suboptimally without normalized data. Consider turning it on within the config.


[32m[I 2022-07-13 16:09:29,092][0m Trial 0 finished with value: 0.34474168231498864 and parameters: {'C': 0.876416376055759, 'l1_ratio': 0.7659965014916003}. Best is trial 0 with value: 0.34474168231498864.[0m
[32m[I 2022-07-13 16:09:30,003][0m Trial 1 finished with value: 0.39035965801607364 and parameters: {'C': 8.358042343998237, 'l1_ratio': 0.8999417981053001}. Best is trial 1 with value: 0.39035965801607364.[0m
[32m[I 2022-07-13 16:09:30,201][0m Trial 2 finished with value: 0.0 and parameters: {'C': 4.122508868448464e-06, 'l1_ratio': 0.22485086407784372}. Best is trial 1 with value: 0.39035965801607364.[0m
[32m[I 2022-07-13 16:09:30,459][0m Trial 3 finished with value: 0.13563218390804596 and parameters: {'C': 0.09500046100744401, 'l1_ratio': 0.46084297942600483}. Best is trial 1 with value: 0.39035965801607364.[0m
[32m[I 2022-07-13 16:09:31,053][0m Trial 4 finished with value: 0.3600398671096346 and parameters: {'C': 0.7829666622477595, 'l1_ratio': 0.221885530674847

Unnamed: 0,algorithm,metric,hyperparams
0,lr,0.39036,"{'C': 8.358042343998237, 'class_weight': None,..."


In [36]:
# Display the current contents of model_list.
model_list

[LogisticRegression(C=2.150879902627906, l1_ratio=0.6131957285273479,
                    max_iter=1000, penalty='elasticnet', solver='saga')]

In [17]:
import optuna
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [18]:
def objective(trial, solver):
    """Optuna objective: mean cross-validated F1 of a logistic regression.

    Samples ``c`` (log-uniform in [1e-6, 1e2]) and a penalty that is offered
    for the given ``solver``, then scores the model on ``setup.X_train`` /
    ``setup.y_clf_train`` (read from the notebook namespace) with repeated
    stratified 10-fold cross-validation (3 repeats).

    Args:
        trial: Optuna trial used to draw the hyperparameters.
        solver: One of "newton-cg", "lbfgs", "sag", "liblinear" or "saga".

    Returns:
        The mean F1 score over all folds and repeats.

    Raises:
        ValueError: If ``solver`` is not one of the supported names.
    """
    # Penalty choices offered per solver.
    # NOTE(review): recent scikit-learn releases expect `None` instead of the
    # string "none" — confirm against the installed version.
    penalties_by_solver = {
        "newton-cg": ["l2", "none"],
        "lbfgs": ["l2", "none"],
        "sag": ["l2", "none"],
        "liblinear": ["l1", "l2"],
        "saga": ["l1", "l2", "elasticnet", "none"],
    }
    # Fail early with a clear message instead of reaching the model
    # construction with no penalty selected.
    if solver not in penalties_by_solver:
        raise ValueError(
            f"Unsupported solver {solver!r}; expected one of "
            f"{sorted(penalties_by_solver)}"
        )

    lr_c = trial.suggest_float("c", 1e-6, 1e2, log=True)
    lr_penalty = trial.suggest_categorical("penalty", penalties_by_solver[solver])

    model_kwargs = {
        "C": lr_c,
        "solver": solver,
        "penalty": lr_penalty,
        "max_iter": 1000,
    }
    # The elastic-net penalty needs a mixing ratio; only sample (and pass) it
    # in that case so every other trial keeps the same parameter set as before.
    if lr_penalty == "elasticnet":
        model_kwargs["l1_ratio"] = trial.suggest_float("l1_ratio", 0.0, 1.0)

    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    model = LogisticRegression(**model_kwargs)
    scores = cross_val_score(
        model,
        setup.X_train,
        setup.y_clf_train,
        scoring="f1",
        cv=cv,
        n_jobs=-1,
        error_score="raise",
    )
    return scores.mean()

In [19]:
def get_best_solver(solvers=("newton-cg", "lbfgs", "liblinear", "sag", "saga")):
    """Pick the logistic-regression solver with the best mean CV F1 score.

    Each candidate is fitted with ``max_iter=1000`` and otherwise default
    settings, and scored on ``setup.X_train`` / ``setup.y_clf_train`` (read
    from the notebook namespace) with repeated stratified 10-fold
    cross-validation (3 repeats). The score table and the winner are printed.

    Args:
        solvers: Solver names to compare. Defaults to the five solvers the
            notebook originally hard-coded.

    Returns:
        The name of the solver with the highest mean F1 score.

    Raises:
        ValueError: If ``solvers`` is empty.
    """
    solvers = list(solvers)
    if not solvers:
        raise ValueError("solvers must contain at least one solver name")

    # The splitter is seeded, so one instance can be shared by every solver.
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

    # Collect one record per solver and build the frame once, rather than
    # growing a DataFrame row by row.
    records = []
    for solver in solvers:
        model = LogisticRegression(solver=solver, max_iter=1000)
        scores = cross_val_score(
            model,
            setup.X_train,
            setup.y_clf_train,
            scoring="f1",
            cv=cv,
            n_jobs=-1,
            error_score="raise",
        )
        records.append({"solver": solver, "score": scores.mean()})

    log = pd.DataFrame(records, columns=["solver", "score"])
    print(log)
    best_solver = log.sort_values(by="score", ascending=False).iloc[0]["solver"]
    print(f"Best solver: {best_solver}")
    return best_solver

In [20]:
from sklearn.exceptions import ConvergenceWarning
import functools
from sklearn.utils import parallel_backend
import warnings
# Select the best solver, then tune C/penalty for it with Optuna.
# The sampler is seeded so repeated runs of this cell propose the same trials.
with parallel_backend("multiprocessing"), warnings.catch_warnings():
    # warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn")
    best_solver = get_best_solver()
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=1),
    )
    study.optimize(functools.partial(objective, solver=best_solver), n_trials=5)

[32m[I 2022-07-12 23:39:39,742][0m A new study created in memory with name: no-name-0bf23601-4d13-46a1-a37b-33b8a4c0c564[0m


      solver     score
0  newton-cg  0.360776
1      lbfgs  0.360776
2  liblinear  0.363495
3        sag  0.360776
4       saga  0.360776
Best solver: liblinear


[32m[I 2022-07-12 23:39:43,192][0m Trial 0 finished with value: 0.3366882813322442 and parameters: {'c': 15.716189191915063, 'penalty': 'l2'}. Best is trial 0 with value: 0.3366882813322442.[0m
[32m[I 2022-07-12 23:39:45,864][0m Trial 1 finished with value: 0.0 and parameters: {'c': 0.00013917466067257176, 'penalty': 'l2'}. Best is trial 0 with value: 0.3366882813322442.[0m
[32m[I 2022-07-12 23:39:49,042][0m Trial 2 finished with value: 0.3735945862416451 and parameters: {'c': 0.2757801172519201, 'penalty': 'l2'}. Best is trial 2 with value: 0.3735945862416451.[0m
[32m[I 2022-07-12 23:39:52,148][0m Trial 3 finished with value: 0.0 and parameters: {'c': 0.006448537884634547, 'penalty': 'l2'}. Best is trial 2 with value: 0.3735945862416451.[0m
[32m[I 2022-07-12 23:39:55,228][0m Trial 4 finished with value: 0.005128205128205128 and parameters: {'c': 0.08881746409157767, 'penalty': 'l1'}. Best is trial 2 with value: 0.3735945862416451.[0m


In [21]:
# Show the best trial recorded by the study.
study.best_trial

FrozenTrial(number=2, values=[0.3735945862416451], datetime_start=datetime.datetime(2022, 7, 12, 23, 39, 45, 892275), datetime_complete=datetime.datetime(2022, 7, 12, 23, 39, 49, 40267), params={'c': 0.2757801172519201, 'penalty': 'l2'}, distributions={'c': LogUniformDistribution(high=100.0, low=1e-06), 'penalty': CategoricalDistribution(choices=('l1', 'l2'))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=2, state=TrialState.COMPLETE, value=None)

In [114]:
def fx(c, penalty):
    """Print the given ``c`` and ``penalty`` values, one per line."""
    report = (f"c: {c}", f"Penalty: {penalty}")
    print(*report, sep="\n")
# Unpack the best trial's parameter dict into fx's keyword arguments.
fx(**study.best_trial.params)

c: 0.16668851711944807
Penalty: l2


In [120]:
# Hyperparameter values of the study's best trial.
study.best_trial.params

{'c': 0.16668851711944807, 'penalty': 'l2'}

In [119]:
# First entry of the best trial's objective values.
study.best_trial.values[0]

0.39335622982681806