In [1]:
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=20, n_classes=8, n_informative=4, n_clusters_per_class=1, flip_y=0.1, random_state=42)

In [2]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [3]:
from sklearn.ensemble import RandomForestClassifier

### Normal Random Forest

In [4]:
rfc = RandomForestClassifier(n_jobs=-1, random_state=42)
rfc.fit(X_train, y_train)
from sklearn.metrics import f1_score
f1_score(y_test, rfc.predict(X_test), average="macro")

0.6433595829865271

### PS Optimized

In [5]:
X_train_, X_valid, y_train_, y_valid = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

In [6]:
from PSO import PSO
params = {
        "n_estimators": (10, 500),
        "min_samples_split": (2, 16),
        "min_samples_leaf": (1, 10),
        "max_features": (0.01, 0.99),
    }
model, parameters, score = PSO(
    RandomForestClassifier,
    params,
    X_train_,
    y_train_,
    X_valid,
    y_valid,
    n_iter=50,
    n_particles=5,
    n_jobs=-1,
    random_state=42,
)()
f1_score(y_test, model.predict(X_test), average="macro")

0.6590776870053444

#### Times

In [7]:
from PSO import PSO
params = {
        "n_estimators": (10, 500),
        "min_samples_split": (2, 16),
        "min_samples_leaf": (1, 10),
        "max_features": (0.01, 0.99),
    }
pso = PSO(
    RandomForestClassifier,
    params,
    X_train_,
    y_train_,
    X_valid,
    y_valid,
    n_iter=50,
    n_particles=5,
    n_jobs=-1,
    random_state=42,
)

In [8]:
%%time
pso()

CPU times: user 4min 9s, sys: 22.1 s, total: 4min 31s
Wall time: 2min 3s


(RandomForestClassifier(max_features=0.41547770845826437, n_estimators=230,
                        n_jobs=-1, random_state=42),
 {'n_estimators': 230,
  'min_samples_split': 2,
  'min_samples_leaf': 1,
  'max_features': 0.41547770845826437},
 0.6765567842596312)

### RandomizedSearchCV

50 iter

In [9]:
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV

params = {
        "n_estimators": randint(10, 500),
        "min_samples_split": randint(2, 16),
        "min_samples_leaf": randint(1, 10),
        "max_features": uniform(0.01, 0.99),
    }

clf = RandomizedSearchCV(
    RandomForestClassifier(n_jobs=-1, random_state=42),
    params,
    random_state=42,
    n_jobs=-1,
    scoring="f1_macro",
    cv=3,
    n_iter=50,
)
clf.fit(X_train, y_train)
f1_score(y_test, clf.predict(X_test), average="macro")

0.6491559675364309

#### Times

In [10]:
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV

params = {
        "n_estimators": randint(10, 500),
        "min_samples_split": randint(2, 16),
        "min_samples_leaf": randint(1, 10),
        "max_features": uniform(0.01, 0.99),
    }

clf = RandomizedSearchCV(
    RandomForestClassifier(n_jobs=-1, random_state=42),
    params,
    random_state=42,
    n_jobs=-1,
    scoring="f1_macro",
    cv=3,
    n_iter=50,
)

In [11]:
%%time
clf.fit(X_train, y_train)

CPU times: user 2.01 s, sys: 146 ms, total: 2.16 s
Wall time: 32.6 s


RandomizedSearchCV(cv=3,
                   estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f055c129580>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f055c129af0>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f055c926040>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f055c926a90>},
                   random_state=42, scoring='f1_macro')

250 iter

In [12]:
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV

params = {
        "n_estimators": randint(10, 500),
        "min_samples_split": randint(2, 16),
        "min_samples_leaf": randint(1, 10),
        "max_features": uniform(0.01, 0.99),
    }

clf = RandomizedSearchCV(
    RandomForestClassifier(n_jobs=-1, random_state=42),
    params,
    random_state=42,
    n_jobs=-1,
    scoring="f1_macro",
    cv=3,
    n_iter=250,
)
clf.fit(X_train, y_train)
f1_score(y_test, clf.predict(X_test), average="macro")

0.6549530616325707

#### Times

In [13]:
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV

params = {
        "n_estimators": randint(10, 500),
        "min_samples_split": randint(2, 16),
        "min_samples_leaf": randint(1, 10),
        "max_features": uniform(0.01, 0.99),
    }

clf = RandomizedSearchCV(
    RandomForestClassifier(n_jobs=-1, random_state=42),
    params,
    random_state=42,
    n_jobs=-1,
    scoring="f1_macro",
    cv=3,
    n_iter=250,
)

In [14]:
%%time
clf.fit(X_train, y_train)

CPU times: user 1.97 s, sys: 97.7 ms, total: 2.06 s
Wall time: 2min 43s


RandomizedSearchCV(cv=3,
                   estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
                   n_iter=250, n_jobs=-1,
                   param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f055c8c4ac0>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f05993ae370>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f055c8c4a90>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f05993aacd0>},
                   random_state=42, scoring='f1_macro')

### BayesSearchCV

In [15]:
!pip3 install scikit-optimize



In [16]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer

params = {
        "n_estimators": Integer(10, 500),
        "min_samples_split": Integer(2, 16),
        "min_samples_leaf": Integer(1, 10),
        "max_features": Real(0.01, 0.99, prior="uniform"),
    }

clf = BayesSearchCV(
    RandomForestClassifier(n_jobs=-1, random_state=42),
    params,
    random_state=42,
    n_jobs=-1,
    scoring="f1_macro",
    cv=3,
    n_iter=50,
)
clf.fit(X_train, y_train)
f1_score(y_test, clf.predict(X_test), average="macro")

0.6580131289120079

#### Times

In [17]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer

params = {
        "n_estimators": Integer(10, 500),
        "min_samples_split": Integer(2, 16),
        "min_samples_leaf": Integer(1, 10),
        "max_features": Real(0.01, 0.99, prior="uniform"),
    }

clf = BayesSearchCV(
    RandomForestClassifier(n_jobs=-1, random_state=42),
    params,
    random_state=42,
    n_jobs=-1,
    scoring="f1_macro",
    cv=3,
    n_iter=50,
)

In [18]:
%%time
clf.fit(X_train, y_train)

CPU times: user 2min 47s, sys: 6min 15s, total: 9min 3s
Wall time: 2min 21s


BayesSearchCV(cv=3,
              estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
              n_jobs=-1, random_state=42, scoring='f1_macro',
              search_spaces={'max_features': Real(low=0.01, high=0.99, prior='uniform', transform='identity'),
                             'min_samples_leaf': Integer(low=1, high=10, prior='uniform', transform='identity'),
                             'min_samples_split': Integer(low=2, high=16, prior='uniform', transform='identity'),
                             'n_estimators': Integer(low=10, high=500, prior='uniform', transform='identity')})