In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_breast_cancer

import pycarrot as pc

%load_ext autoreload
%autoreload 2

pd.options.display.max_columns = 2000

sns.set_theme()

In [2]:
# bc = load_breast_cancer()
# X = pd.DataFrame(bc.data, columns=bc.feature_names)
# y = pd.Series(bc.target, name="class")
# df = pd.concat([X, y], axis=1)
# df.head()

In [3]:
# Column names assigned to the data file, in file order; the file is read
# without a header row because explicit names are supplied.
column_names = [
    "class",
    "age",
    "menopause",
    "tumor-size",
    "inv-nodes",
    "node-caps",
    "deg-malig",
    "breast",
    "breast-quad",
    "irradiat",
]

# NOTE(review): "?" entries are kept as ordinary string categories here (they
# show up later as `node-caps_?` / `breast-quad_?` features) — presumably they
# mark missing values; confirm whether they should be treated as NaN instead.
df = pd.read_csv("./data/breast_cancer_cat/breast-cancer.data", names=column_names)
df.head()

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [4]:
# pc.feat_analysis.get_distribution(df)

In [5]:
# sns.jointplot(data=df, x="deg-malig", y="breast", hue="class")

## Training

In [6]:
# Confirm the column names that were assigned when the CSV was loaded.
df.columns

Index(['class', 'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps',
       'deg-malig', 'breast', 'breast-quad', 'irradiat'],
      dtype='object')

In [27]:
# Load the modelling configuration from YAML and display it. In the recorded
# output it names the classification target plus the numeric and categorical
# feature columns.
config_path = "./config_bc_cat.yml"
config = pc.init_config(config_path)
config

{'modelling': {'target_clf': 'class',
  'numeric_cols': ['deg-malig'],
  'categorical_cols': ['age',
   'menopause',
   'tumor-size',
   'inv-nodes',
   'node-caps',
   'breast',
   'breast-quad',
   'irradiat']}}

In [28]:
# Prepare the raw frame for modelling according to `config`. Returns the setup
# object used by the later pycarrot calls, plus a feature sample and a target
# sample for quick inspection.
setup, X_sample, y_sample = pc.modelling.prepare_data(train_data=df, config=config)

In [29]:
# Inspect the feature sample returned by prepare_data; the categorical columns
# appear one-hot encoded and `deg-malig` is kept as a numeric column.
X_sample

Unnamed: 0,deg-malig,age_20-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-79,menopause_ge40,menopause_lt40,menopause_premeno,tumor-size_0-4,tumor-size_10-14,tumor-size_15-19,tumor-size_20-24,tumor-size_25-29,tumor-size_30-34,tumor-size_35-39,tumor-size_40-44,tumor-size_45-49,tumor-size_5-9,tumor-size_50-54,inv-nodes_0-2,inv-nodes_12-14,inv-nodes_15-17,inv-nodes_24-26,inv-nodes_3-5,inv-nodes_6-8,inv-nodes_9-11,node-caps_?,node-caps_no,node-caps_yes,breast_left,breast_right,breast-quad_?,breast-quad_central,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up,irradiat_no,irradiat_yes
0,3,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0
1,2,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0
2,2,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0
3,2,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0
4,2,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0


In [30]:
# Algorithms to benchmark, identified by pycarrot's short names.
candidate_algorithms = [
    "lr",
    "dt",
    "rf",
    "ridge",
    "perceptron",
    "passive-aggressive",
    "extratree",
    "extratrees",
    "knn",
    "nb",
    "linearsvc",
    "rbfsvc",
]

# Benchmark the candidates, rank the result table by F1 and also return the
# estimator objects.
# NOTE(review): the recorded table has 10 rows although 12 algorithms are
# requested — "perceptron" and "passive-aggressive" are absent. The reason is
# not visible from this notebook; confirm in pycarrot.
compare_df, model_dict = pc.modelling.compare_algorithms(
    setup=setup, include=candidate_algorithms, sort="f1", return_models=True
)
compare_df

Unnamed: 0,algorithm,accuracy,precision,recall,f1,roc_auc,Fit time (s)
0,nb,0.549,0.447,0.835,0.552,0.659,0.054
1,dt,0.651,0.397,0.424,0.402,0.577,0.041
2,linearsvc,0.671,0.575,0.365,0.392,0.605,0.051
3,lr,0.685,0.581,0.376,0.382,0.634,0.171
4,extratree,0.633,0.388,0.4,0.377,0.578,0.043
5,ridge,0.668,0.549,0.353,0.375,0.609,0.061
6,extratrees,0.664,0.36,0.365,0.344,0.635,1.31
7,rf,0.675,0.391,0.329,0.337,0.645,1.982
8,knn,0.717,0.534,0.271,0.335,0.621,0.036
9,rbfsvc,0.703,0.569,0.306,0.325,0.676,0.066


In [16]:
# Show the estimator objects returned by compare_algorithms (return_models=True),
# keyed by algorithm short name.
model_dict

{'lr': LogisticRegression(),
 'dt': DecisionTreeClassifier(),
 'rf': RandomForestClassifier(),
 'ridge': RidgeClassifier(),
 'perceptron': Perceptron(),
 'passive-aggressive': PassiveAggressiveClassifier(),
 'extratree': ExtraTreeClassifier(),
 'extratrees': ExtraTreesClassifier(),
 'knn': KNeighborsClassifier(),
 'nb': GaussianNB(),
 'linearsvc': LinearSVC(),
 'rbfsvc': SVC()}

In [17]:
# Predict encoded class ids for the feature sample with the logistic-regression model.
preds = model_dict["lr"].predict(X_sample)

# Map the encoded ids back to the original class labels using the target
# encoder stored on the setup object.
le = setup.y_clf_encoder
le.inverse_transform(preds)

array(['no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events'], dtype=object)

In [18]:
# Print each (encoded id, class label) pair of the target encoder on one line.
print(*enumerate(le.classes_))

(0, 'no-recurrence-events') (1, 'recurrence-events')


In [35]:
# Metric used both to read the reference score and to drive the feature reduction.
metric = "f1"

# Take the extra-trees row of the comparison table as the reference:
# its algorithm name and its score for `metric`.
extratrees_rows = compare_df.loc[
    compare_df["algorithm"] == "extratrees", ["algorithm", metric]
]
algorithm, reference_metric = extratrees_rows.values[0]

# NOTE(review): acceptable_loss=0.5 presumably tolerates a large drop relative
# to the reference score; in the recorded run only 'deg-malig' was kept —
# confirm this threshold is intended.
best_feature_list = pc.modelling.reduce_feature_space(
    setup, algorithm, metric, reference_metric, acceptable_loss=0.5
)
best_feature_list

['deg-malig']

In [54]:
# Fit pycarrot's one-hot encoder on the full raw frame and preview the result.
# NOTE(review): df still contains the target column "class", so it gets encoded
# as well (see the class_* columns in the output); drop it first if df_t is
# ever used as model input.
trans = pc.modelling.internals.OHE()
trans.fit(df)
df_t = trans.transform(df)
df_t.head()

Unnamed: 0,deg-malig,class_no-recurrence-events,class_recurrence-events,age_20-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-79,menopause_ge40,menopause_lt40,menopause_premeno,tumor-size_0-4,tumor-size_10-14,tumor-size_15-19,tumor-size_20-24,tumor-size_25-29,tumor-size_30-34,tumor-size_35-39,tumor-size_40-44,tumor-size_45-49,tumor-size_5-9,tumor-size_50-54,inv-nodes_0-2,inv-nodes_12-14,inv-nodes_15-17,inv-nodes_24-26,inv-nodes_3-5,inv-nodes_6-8,inv-nodes_9-11,node-caps_?,node-caps_no,node-caps_yes,breast_left,breast_right,breast-quad_?,breast-quad_central,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up,irradiat_no,irradiat_yes
0,3,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0
1,2,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0
2,2,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0
3,2,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0
4,2,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0


In [59]:
# Category frequencies for breast-quad. The recorded output includes one literal
# "?" entry — presumably a missing-value marker; verify against the data source.
df["breast-quad"].value_counts()

left_low     110
left_up       97
right_up      33
right_low     24
central       21
?              1
Name: breast-quad, dtype: int64