In [2]:
import pandas as pd
import seaborn as sns

import pycarrot as pc

# Re-import changed modules automatically before each cell executes, so
# edits to library source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Raise the column display limit so wide frames are rendered in full.
pd.set_option("display.max_columns", 2000)

# Apply seaborn's default theme to every plot drawn below.
sns.set_theme()

In [5]:
# Column names are supplied explicitly: the first row of the file is data,
# not a header (see row 0 of the preview below).
column_names = [
    "class",
    "age",
    "menopause",
    "tumor-size",
    "inv-nodes",
    "node-caps",
    "deg-malig",
    "breast",
    "breast-quad",
    "irradiat",
]

# NOTE(review): the UCI release of this dataset marks missing values with
# "?" - confirm whether they should be parsed as NaN (na_values="?") or
# kept as their own category before modelling.
df = pd.read_csv("./data/breast_cancer_cat/breast-cancer.data", names=column_names)
df.head()

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [None]:
# Presumably summarises the per-feature distributions of `df`; the output
# format is defined by pycarrot and is not shown here - TODO confirm.
pc.feat_analysis.get_distribution(df)

In [None]:
# Joint plot of `deg-malig` against `breast`, coloured by `class`.
# NOTE(review): `breast` holds strings ("left"/"right"); confirm that the
# marginal plots seaborn draws when `hue` is set accept a non-numeric axis.
sns.jointplot(data=df, x="deg-malig", y="breast", hue="class")

## Training

In [8]:
# Show the frame's column names, for comparison with the column lists in
# the modelling config loaded below.
df.columns

Index(['class', 'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps',
       'deg-malig', 'breast', 'breast-quad', 'irradiat'],
      dtype='object')

In [11]:
# Load the modelling configuration from ./config.yml and display it; it
# names the classification target and the numeric/categorical columns.
config = pc.init_config("./config.yml")
config

{'modelling': {'target_clf': 'class',
  'numeric_cols': ['deg-malig'],
  'categorical_cols': ['age',
   'menopause',
   'tumor-size',
   'inv-nodes',
   'node-caps',
   'breast',
   'breast-quad',
   'irradiat']}}

In [None]:
# Build the modelling setup from the full frame and the loaded config.
# NOTE(review): X_sample / y_sample are presumably a feature/target sample
# returned for inspection - confirm against pycarrot.modelling.prepare_data.
setup, X_sample, y_sample = pc.modelling.prepare_data(
    train_data=df,
    config=config,
)

In [None]:
# Metric name used below to read the score column of `compare_df` and
# passed on to the feature-space reduction step.
metric = "f1"

In [None]:
# Benchmark the candidate algorithms on the prepared setup.
# Results are ranked by `metric` (defined in the previous cell) rather than
# a repeated literal, so the ranking and the later score lookup always
# refer to the same metric.
compare_df, model_dict = pc.modelling.compare_algorithms(
    setup=setup,
    include=[
        "lr",
        "dt",
        "rf",
        "ridge",
        "perceptron",
        "passive-aggressive",
        "extratree",
        "extratrees",
        "knn",
        "nb",
        "linearsvc",
        "rbfsvc",
    ],
    sort=metric,
    # Also return the second value unpacked above as `model_dict`.
    return_models=True,
)
compare_df

In [None]:
# Display the second value returned by compare_algorithms (requested via
# return_models=True).
model_dict

In [None]:
# Algorithm whose feature space is reduced; it must be one of the ids that
# appear in the "algorithm" column of `compare_df`.
algorithm = "extratrees"

# Use that algorithm's benchmark score as the reference for the reduction.
algorithm_scores = compare_df.loc[compare_df["algorithm"] == algorithm, metric]
if algorithm_scores.empty:
    # Fail with a clear message instead of an opaque IndexError when the
    # algorithm was not part of the comparison run.
    raise ValueError(
        f"Algorithm {algorithm!r} not found in compare_df; "
        "was it included in compare_algorithms?"
    )
reference_metric = algorithm_scores.iloc[0]

# NOTE(review): the scale of `acceptable_loss` (absolute vs. relative to
# `reference_metric`) is defined by pycarrot and not visible here - confirm
# that 0.5 is the intended tolerance.
best_feature_list = pc.modelling.reduce_feature_space(
    setup, algorithm, metric, reference_metric, acceptable_loss=0.5
)
best_feature_list