In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import yaml, time, sys, os

from IPython.display import display, Markdown
plt.style.use("seaborn-darkgrid")
pd.set_option('display.max_columns', None)
sns.set_style("darkgrid")

DATASET = "BC_Wisconsin"

COLAB = 'google.colab' in sys.modules
if COLAB:
    ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
else:
    ROOT = "./"

DEBUG = True

In [2]:
if COLAB:
    from google.colab import drive
    if not os.path.isdir("/content/gdrive"):
        drive.mount("/content/gdrive")
        d = "/content/gdrive/MyDrive/datasets"
        if not os.path.isdir(d): os.makedirs(d)
    if not os.path.isdir(ROOT): os.makedirs(ROOT)

def makedirs(d):
    if COLAB:
        if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
    else:
        if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

for d in ['orig','data','output']: makedirs(d)

## Load Data Set ##

In [3]:
UCI = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/"

for filename in ["wdbc.data", "wdbc.names"]:
    source = f"{UCI}/{filename}"
    target = f"{ROOT}/orig/{filename}"
    if not os.path.isfile(target):
        print (f"Downloading remote file {filename}", sep="")
        import urllib.request
        urllib.request.urlretrieve(source, target)
    else:
        print(f"Using local copy of {filename}")

Using local copy of wdbc.data
Using local copy of wdbc.names


In [4]:
names = ['id_number', 'diagnosis', 'radius_mean',
         'texture_mean', 'perimeter_mean', 'area_mean',
         'smoothness_mean', 'compactness_mean', 'concavity_mean',
         'concave_points_mean', 'symmetry_mean',
         'fractal_dimension_mean', 'radius_se', 'texture_se',
         'perimeter_se', 'area_se', 'smoothness_se',
         'compactness_se', 'concavity_se', 'concave_points_se',
         'symmetry_se', 'fractal_dimension_se',
         'radius_worst', 'texture_worst', 'perimeter_worst',
         'area_worst', 'smoothness_worst',
         'compactness_worst', 'concavity_worst',
         'concave_points_worst', 'symmetry_worst',
         'fractal_dimension_worst']

if not os.path.isfile(f"{ROOT}/data/wdbc.data"):
    print("Reading original data ...")
    df = pd.read_csv(f"{ROOT}/orig/wdbc.data",header=None, names=names)

    print("Encoding target ...")
    df.diagnosis = df.diagnosis.map( {"M":0, "B":1} )

    print("Drop unique identifieer ...")
    df.drop(columns=["id_number"], inplace=True)

    print("Save to folder data/  ...")
    df.to_csv(f"{ROOT}/data/wdbc.data", index=False)

print("Load from folder data/ ...")
df = pd.read_csv(f"{ROOT}/data/wdbc.data")

print(df.shape)
df.head(10)

Load from folder data/ ...
(569, 31)


Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave_points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,0,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,0,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,0,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,0,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
5,0,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,0.3345,0.8902,2.217,27.19,0.00751,0.03345,0.03672,0.01137,0.02165,0.005082,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
6,0,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742,0.4467,0.7732,3.18,53.91,0.004314,0.01382,0.02254,0.01039,0.01369,0.002179,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
7,0,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,0.5835,1.377,3.856,50.96,0.008805,0.03029,0.02488,0.01448,0.01486,0.005412,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
8,0,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,0.3063,1.002,2.406,24.32,0.005731,0.03502,0.03553,0.01226,0.02143,0.003749,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072
9,0,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,0.2976,1.599,2.039,23.94,0.007149,0.07217,0.07743,0.01432,0.01789,0.01008,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075


## Preprocessing

In [5]:
X = df.values[:,1:]
y = df.diagnosis.values
X.shape, y.shape

((569, 30), (569,))

## Model Selection

In [6]:
from sklearn.ensemble import RandomForestClassifier

### Baseline Model

In [7]:
from sklearn.model_selection import cross_val_score

model = RandomForestClassifier()
scores = cross_val_score(model, X, y, cv=10)
scores.mean(), scores.std()

(0.9596177944862155, 0.033312023852656684)

## GridSearch
Define the parameter search space

In [8]:
from sklearn.model_selection import GridSearchCV
parameter_space = {
    "criterion": ['gini', 'entropy'],
    'max_depth': range(1,20),
    'max_features': np.linspace(0.1, 0.9, 9),
    "n_estimators": range(2,21),
}

Define the search

In [9]:
grid_search = GridSearchCV(model, parameter_space, n_jobs=-1)

Carry out the search

In [10]:
start = time.time()
grid_search.fit(X, y)
end = time.time()

Report Results

In [11]:
print("Fit Time:", end - start)
print("Best param:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fit Time: 391.5864338874817
Best param: {'criterion': 'entropy', 'max_depth': 8, 'max_features': 0.7000000000000001, 'n_estimators': 13}
Best score: 0.9736376339077782


### Optuna

In [12]:
import optuna
def objective(trial):

    # Parameter space
    parameter_space = {
        "criterion": trial.suggest_categorical('criterion', ['gini','entropy']),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "max_features": trial.suggest_float("max_features", 0.1, 0.9),
        "n_estimators": trial.suggest_int("n_estimators", 2, 10),
    }

    # Setup model using hyper-parameters values
    model = RandomForestClassifier(**parameter_space)

    # Scoring model
    score = cross_val_score(model, X, y, n_jobs=-1, cv=10)

    return score.mean()

Define Search

In [14]:
study = optuna.create_study(direction="maximize")

[32m[I 2023-03-08 15:45:31,438][0m A new study created in memory with name: no-name-2cc8ce4c-16cc-4ebf-81ae-41c369e68143[0m


Carry out the search

In [15]:
start = time.time()
study.optimize(objective, n_trials=10)
end = time.time()

[32m[I 2023-03-08 15:45:54,253][0m Trial 0 finished with value: 0.950814536340852 and parameters: {'criterion': 'gini', 'max_depth': 20, 'max_features': 0.8328654395960886, 'n_estimators': 5}. Best is trial 0 with value: 0.950814536340852.[0m
[32m[I 2023-03-08 15:45:54,331][0m Trial 1 finished with value: 0.950814536340852 and parameters: {'criterion': 'entropy', 'max_depth': 20, 'max_features': 0.11066141147667077, 'n_estimators': 5}. Best is trial 0 with value: 0.950814536340852.[0m
[32m[I 2023-03-08 15:45:54,448][0m Trial 2 finished with value: 0.9578947368421054 and parameters: {'criterion': 'gini', 'max_depth': 13, 'max_features': 0.37662625624935453, 'n_estimators': 9}. Best is trial 2 with value: 0.9578947368421054.[0m
[32m[I 2023-03-08 15:45:54,589][0m Trial 3 finished with value: 0.9490914786967419 and parameters: {'criterion': 'gini', 'max_depth': 15, 'max_features': 0.6184230327530436, 'n_estimators': 9}. Best is trial 2 with value: 0.9578947368421054.[0m
[32m[I

Report results

In [16]:
print("Fit Time:", end - start)
print("Best Param:", study.best_params)
print("Best score:", study.best_value)

Fit Time: 4.6513752937316895
Best Param: {'criterion': 'entropy', 'max_depth': 7, 'max_features': 0.20874186668407893, 'n_estimators': 6}
Best score: 0.9701441102756891


### Hyperopt

In [23]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [24]:
parmeter_space = {
    'criterion': hp.choice('criterion', ["gini", "entropy"]),
    'max_depth': hp.choice('max_depth', range(1,20)),
    'max_features': hp.uniform('max_features', 0.1,0.9),
    'n_estimators': hp.choice('n_estimators', range(1,10)),
}

Define objective function (to maximise/minimise)...

In [25]:
def objective(params):
    model = RandomForestClassifier(**params)
    return cross_val_score(model, X, y).mean()

Define search ...

In [29]:
best = 0
best_param = {}

def f(params):
    global best, best_param
    acc = objective(params)
    if acc > best:
        best = acc
        best_param = params
        print( 'new best:', best, params)
    return {'loss': -acc, 'status': STATUS_OK}
trials = Trials()

Carry out the search ...

In [30]:
start = time.time()
hyperopt_search = fmin(f, parmeter_space, algo=tpe.suggest, max_evals=300, trials=trials)
end = time.time()

new best:                                              
0.9561092997981679                                     
{'criterion': 'gini', 'max_depth': 11, 'max_features': 0.26861250019929833, 'n_estimators': 9}
new best:                                                                         
0.9596025461884802                                                                
{'criterion': 'entropy', 'max_depth': 14, 'max_features': 0.3518688955438558, 'n_estimators': 6}
new best:                                                                          
0.9630957925787922                                                                 
{'criterion': 'entropy', 'max_depth': 19, 'max_features': 0.39655671669430526, 'n_estimators': 6}
new best:                                                                           
0.9648967551622418                                                                  
{'criterion': 'entropy', 'max_depth': 19, 'max_features': 0.52272811785714, 'n_estimators': 5}

Report results ...

In [31]:
print("Fit Time:", end - start)
print("Best Param:", best_param)
print("Best score:", best)

Fit Time: 37.04037356376648
Best Param: {'criterion': 'entropy', 'max_depth': 18, 'max_features': 0.7227971374073994, 'n_estimators': 9}
Best score: 0.9666511411271541


### Ray-Tune
BAD

In [32]:
#from ray.tune.sklearn import TuneGridSearchCV