In [None]:
from Benchmark.protocol import *

In [None]:

# Cross-validation layout: K folds per KFold split, repeated R times.
K = 5
R = 5

# RHO: noise levels applied to the training datasets. The noise is symmetrical
# (the same noise is applied to all classes) and is expressed as a function of
# the minority class: noise = rho * balance.
#   rho == 0.0        -> no noise added (original imbalanced data)
#   rho in (0.0, 1.0] -> noise <= balance
#   rho in (1.0, 2.0] -> noise >  balance
RHO = [
    0.0, 0.05, 0.1, 0.20, 0.25, 0.33,
    0.5, 0.66, 0.75, 0.9, 1.0, 1.25,
]

# Functions used to preprocess the split datasets, keyed by preprocessing name.
# NOTE(review): `pd` is not imported explicitly in this notebook; it is
# presumably exposed by the star import of Benchmark.protocol -- confirm.
PREPROCESSING = {
    "regular": lambda x: x,
    "onehot": lambda x: pd.get_dummies(x),
}

p = Protocol(K, R, RHO, PREPROCESSING)

In [None]:
# Split the "breastcancer_full" dataset through the protocol, using 10 jobs.
# NOTE(review): 34.4 is presumably the minority-class share (balance, in %) of
# this dataset -- confirm against Protocol.splitDataset.
p.splitDataset(
    "breastcancer_full",
    34.4,
    n_jobs=10,
)

In [None]:
# FULL TESTS: settings shared by the model cells below.
n_trees = 100  # number of trees / boosting rounds given to each ensemble model
n_jobs = 8     # number of parallel jobs passed to Protocol.runModels
# NOTE(review): "nthread": 1 presumably keeps each XGBoost fit single-threaded
# because parallelism is handled at the runModels level -- confirm.
xgbparams = {
    "nthread": 1,
    "eta": 0.3,
    "min_child_weight": 0,
    "lambda": 1,
    "tree_method": "auto",
}


def toZeroOne(ys):
    """Rescale labels with ``(ys + 1) / 2`` so that -1 maps to 0 and +1 maps to 1.

    Used as the ``prepy`` hook of the XGBoost models whose objective needs
    labels in {0, 1} (presumably the datasets encode classes as -1 / +1 --
    TODO confirm).

    Parameters
    ----------
    ys : array-like supporting ``+`` and ``/`` (e.g. numpy array, pandas Series)
        Labels to rescale. The input is not modified: the addition already
        produces a new object, so no explicit copy is required.

    Returns
    -------
    Same container type as ``ys`` with floating-point values ``(ys + 1) / 2``.
    """
    return (ys + 1.0) / 2

In [None]:
# Model builders, grouped by the preprocessing name they require
# ("regular" / "onehot"), then keyed by model name.
skm = {
    "regular": {
        "khiops": sklearnFit(PyKhiopsClassifier, evaluateProbaKhiops),
        "khiopsRF": sklearnFit(PyKhiopsClassifier, evaluateProbaKhiops, n_trees=n_trees),
    },
    "onehot": {
        "skRF": sklearnFit(
            RandomForestClassifier, evaluateProba,
            {"n_estimators": n_trees, "n_jobs": 1}),
        "logisticRegression": sklearnFit(
            LogisticRegression, evaluateScore,
            {"solver": "lbfgs", "max_iter": 20000, "n_jobs": 1}),
        "linearSVC": sklearnFit(
            LinearSVC, evaluateScore,
            {"max_iter": 20000, "dual": False}),
        "XGB_SQUERR": xgbFit('reg:squarederror', n_trees, params=xgbparams),
        # The objectives below are fed labels rescaled by toZeroOne.
        "XGB_HINGE": xgbFit('binary:hinge', n_trees, params=xgbparams, prepy=toZeroOne),
        "XGB_UNHINGED": xgbFit(buildUnhinged(), n_trees, params=xgbparams, prepy=toZeroOne),
        "XGB_RAMP": xgbFit(buildModifiedRamp(ep=1e-16, r=0.5), n_trees, params=xgbparams, prepy=toZeroOne),
    },
}

# For each of the K*R run indices, run all models on every dataset, visiting
# the datasets in ascending order of their "cells" value.
for i in range(p.K * p.R):
    # The row labels of p.DATASETS identify the datasets; iterate them directly
    # instead of transposing the whole frame just to walk its columns.
    for f in p.DATASETS.sort_values("cells", ascending=True).index:
        p.runModels(f, skm, i, n_jobs=n_jobs)

In [None]:
# Weka model builders, grouped by preprocessing name, then keyed by model name.
wekam = {
    "regular": {
        "wekaRF": wekaFit(params={"n_estimators": n_trees, "n_jobs": n_jobs}),
    },
}

# For each of the K*R run indices, run the Weka models on every dataset,
# visiting the datasets in ascending order of their "cells" value.
for i in range(p.K * p.R):
    # The row labels of p.DATASETS identify the datasets; iterate them directly
    # instead of transposing the whole frame just to walk its columns.
    for f in p.DATASETS.sort_values("cells", ascending=True).index:
        # Weka runs cannot be parallelized by the protocol; we can only rely on
        # Weka's own parallel implementation of RF (hence n_jobs=1 here and
        # "n_jobs" in the model params above).
        p.runModels(f, wekam, i, n_jobs=1)
