In [None]:
%%bash

pip install optuna

In [None]:
import io
import os
import pathlib
import tarfile

import hyperopt
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import requests
from scipy import sparse
from sklearn import dummy, ensemble, feature_extraction, metrics, model_selection, pipeline, preprocessing, tree
import xgboost as xgb
import yellowbrick


In [None]:
DATA_ARCHIVE = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
DATA_DIR = pathlib.Path("./sample_data")

# Stream the gzipped tarball straight from the HTTP response into tarfile,
# so the archive is never written to disk as a whole.
# - the context manager releases the connection even if extraction fails
# - timeout stops a stalled connection from hanging the notebook forever
# - raise_for_status surfaces HTTP errors instead of feeding an error page to tarfile
# NOTE(review): extractall trusts the member paths stored in the archive; only
# use this with a source you trust.
with requests.get(DATA_ARCHIVE, stream=True, timeout=60) as response:
    response.raise_for_status()
    with tarfile.open(fileobj=response.raw, mode="r|gz") as t:
        t.extractall(DATA_DIR)


In [None]:
def _reviews_to_df(filepath, sentiment):
    d = {"text": [], "sentiment": []}
    review_filepaths = sorted(filepath.glob("*.txt"))
    for review_filepath in review_filepaths:
        with open(review_filepath, 'r') as f:
            review = f.read()
        d["text"].append(review)
        d["sentiment"].append("positive" if sentiment == "pos" else "negative")
    df = pd.DataFrame.from_dict(d)
    return df


def _combine_reviews():
    """Concatenate all labelled reviews under ``DATA_DIR / "aclImdb"``.

    Reads the ``pos`` and ``neg`` folders of both the ``train`` and ``test``
    splits (in that order) and returns them as a single DataFrame with a
    fresh 0..n-1 index.
    """
    frames = [
        _reviews_to_df(DATA_DIR / "aclImdb" / split_name / polarity, polarity)
        for split_name in ("train", "test")
        for polarity in ("pos", "neg")
    ]
    return pd.concat(frames, ignore_index=True)


def partition_reviews(seed=42, test_size=0.2):
    """Split all reviews into stratified train / validation / test frames.

    Parameters
    ----------
    seed : int
        Seed for the ``numpy.random.RandomState`` that drives both splits.
    test_size : float
        Fraction of all reviews held out from training; the held-out part is
        then divided equally between the validation and test sets.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, val_df, test_df)``.
    """
    reviews = _combine_reviews()
    # The same RandomState instance is handed to both splits, in this order.
    rng = np.random.RandomState(seed)

    def _stratified_split(frame, holdout_fraction):
        # Shuffled split that stratifies on the "sentiment" column.
        return model_selection.train_test_split(
            frame,
            test_size=holdout_fraction,
            stratify=frame["sentiment"],
            shuffle=True,
            random_state=rng,
        )

    train_df, holdout_df = _stratified_split(reviews, test_size)
    # Split the held-out reviews equally into validation and test sets.
    val_df, test_df = _stratified_split(holdout_df, 0.5)
    return train_df, val_df, test_df


In [None]:
# Build the train / validation / test DataFrames with a fixed seed.
train_df, val_df, test_df = partition_reviews(seed=42)

In [None]:
# TF-IDF settings, collected in one place so they are easy to inspect and tune.
vectorizer_hyperparams = dict(
    lowercase=True,
    max_df=0.9,  # ignore words that show up in more than 90% of reviews
    min_df=0.1,  # ignore words that show up in less than 10% of reviews
    norm="l2",  # scale each row (one review) to unit L2 norm
    dtype=np.float32,
)

tfidf_vectorizer = feature_extraction.text.TfidfVectorizer(**vectorizer_hyperparams)

# Converts the vectorizer's sparse output to a dense array; the inverse
# converts a dense array back to a CSR matrix.
sparse_to_dense = preprocessing.FunctionTransformer(
    func=lambda csr: csr.toarray(),
    inverse_func=lambda arr: sparse.csr_matrix(arr),
)

features_preprocessing = pipeline.make_pipeline(tfidf_vectorizer, sparse_to_dense)

label_encoder = preprocessing.LabelEncoder()

In [None]:
# Fit the feature pipeline and the label encoder on the training split only;
# the validation and test splits are transformed with the already-fitted objects.
X_train = features_preprocessing.fit_transform(train_df.loc[:, "text"])
y_train = label_encoder.fit_transform(train_df.loc[:, "sentiment"])

X_val = features_preprocessing.transform(val_df.loc[:, "text"])
# transform, not fit_transform: re-fitting on another split could silently
# change the label -> integer mapping if that split's label set differed.
y_val = label_encoder.transform(val_df.loc[:, "sentiment"])

X_test = features_preprocessing.transform(test_df.loc[:, "text"])
y_test = label_encoder.transform(test_df.loc[:, "sentiment"])


In [None]:
def objective(hyperparams):
    """Hyperopt objective: fit an XGBoost classifier and score it on validation data.

    Parameters
    ----------
    hyperparams : dict
        Keyword arguments for ``xgb.XGBClassifier``. ``max_depth``, when
        present, is cast to ``int``.

    Returns
    -------
    dict
        ``loss`` (the negated validation score, because hyperopt minimizes),
        ``status`` (``hyperopt.STATUS_OK``) and ``model`` (the fitted classifier).

    Notes
    -----
    Reads the module-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    """
    # Work on a copy so the cast below never mutates the caller's dict.
    params = dict(hyperparams)
    if "max_depth" in params:
        # The sampled depth may arrive as a float (e.g. 7.0); XGBoost needs an int.
        params["max_depth"] = int(params["max_depth"])

    model_fn = xgb.XGBClassifier(**params)
    model_fn.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    score = model_fn.score(X_val, y_val)

    return {"loss": -score, "status": hyperopt.STATUS_OK, "model": model_fn}


In [None]:
# Search space for hyperopt.fmin. Each key is an XGBClassifier keyword argument.
# hp.loguniform bounds are given in log space: the sampled value is exp(u) with
# u drawn uniformly from [low, high].
sampling_space = dict(
    # tree
    max_depth=hyperopt.hp.quniform("max_depth", 1, 8, 1),
    min_child_weight=hyperopt.hp.loguniform("min_child_weight", -2, 3),
    # stochastic
    subsample=hyperopt.hp.uniform("subsample", 0.5, 1),
    colsample_bytree=hyperopt.hp.uniform("colsample_bytree", 0.5, 1),
    # regularization
    reg_alpha=hyperopt.hp.uniform("reg_alpha", 0, 10),
    reg_lambda=hyperopt.hp.uniform("reg_lambda", 1, 10),
    gamma=hyperopt.hp.loguniform("gamma", -10, 10),
    # boosting
    learning_rate=hyperopt.hp.loguniform("learning_rate", -7, 0),
    # fixed values (not searched)
    random_state=42,
    early_stopping_rounds=10,
)

In [None]:
# Container passed to hyperopt.fmin (as `trials=`) to collect the per-trial results.
trials = hyperopt.Trials()

In [None]:
# Run the TPE search: stop after 10 evaluations or 5 minutes, whichever comes first.
# `objective` is passed directly; wrapping it in a lambda added nothing.
hpo_results = hyperopt.fmin(
    objective,
    space=sampling_space,
    trials=trials,
    algo=hyperopt.tpe.suggest,
    max_evals=10,
    timeout=60 * 5,
    show_progressbar=True,
)

 90%|█████████ | 9/10 [05:09<00:34, 34.44s/trial, best loss: -0.7784]


In [None]:
# Best sampled hyperparameters found by fmin. Only the searched values are
# returned; the fixed entries of the space (random_state, early_stopping_rounds)
# are not part of this dict.
hpo_results

{'colsample_bytree': 0.9376799749854374,
 'gamma': 0.017408782850783067,
 'learning_rate': 0.2506239064201415,
 'max_depth': 7.0,
 'min_child_weight': 0.19473436518367768,
 'reg_alpha': 3.455189323244122,
 'reg_lambda': 6.766646758063646,
 'subsample': 0.6198252152979623}

In [None]:
# Build the final parameter set in a new dict rather than mutating `hpo_results`,
# so this cell is idempotent and the search result displayed earlier stays intact.
best_hyperparams = {
    **hpo_results,
    # fmin reports max_depth as a float (e.g. 7.0); XGBoost needs an int
    "max_depth": int(hpo_results["max_depth"]),
    # fmin returns only the sampled values, so the fixed settings are re-added
    "early_stopping_rounds": 10,
    "random_state": 42,
}
xgb_classifier = xgb.XGBClassifier(**best_hyperparams)
xgb_classifier.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

In [None]:
# Compare the score on the data the model was fitted on with the score on the
# held-out validation split; a large gap between the two suggests overfitting.
print(f"Training accuracy: {xgb_classifier.score(X_train, y_train)}")
print(f"Validation accuracy: {xgb_classifier.score(X_val, y_val)}")

Training accuracy: 0.8729
Validation accuracy: 0.7784


In [None]:
# Draw one sample from hp.choice: returns one of the listed options.
hyperopt.pyll.stochastic.sample(hyperopt.hp.choice("value", ['a', 'b', 'c']))

'b'

In [None]:
# Draw one sample from hp.pchoice: each option is paired with its probability
# (here 'c' is drawn 90% of the time).
hyperopt.pyll.stochastic.sample(hyperopt.hp.pchoice("value", [(0.05, 'a'), (0.05, 'b'), (0.9, 'c')]))

'c'

In [None]:
# Draw one sample from hp.uniform: a float drawn uniformly between 0 and 1.
hyperopt.pyll.stochastic.sample(hyperopt.hp.uniform("value", 0, 1))

0.8514742274424599

In [None]:
# Draw one sample from hp.loguniform: the bounds are in log space, so the
# result is exp(u) with u uniform in [-6, 0], i.e. a value between exp(-6) and 1.
hyperopt.pyll.stochastic.sample(hyperopt.hp.loguniform("value", -6, 0))

0.3622469072927101

In [None]:
# NOTE: this cell redefines `objective`, shadowing the hyperopt objective defined
# earlier in the notebook. Re-run that earlier cell before re-running the
# hyperopt search.

# Build the XGBoost inputs once; every trial reuses them.
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_val, label=y_val)


# 1. Define an objective function to be maximized.
def objective(trial):
    """Optuna objective: train an XGBoost booster and return validation accuracy.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Used to suggest the booster type and the L2 (``lambda``) and
        L1 (``alpha``) regularization strengths.

    Returns
    -------
    float
        Accuracy on the validation split (the study maximizes this value).
    """
    # 2. Suggest values of the hyperparameters using a trial object.
    param = {
        'verbosity': 0,  # replaces the deprecated 'silent' flag
        'objective': 'binary:logistic',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True)
    }

    bst = xgb.train(param, dtrain)
    # binary:logistic predicts the probability of class 1; threshold at 0.5.
    predicted_labels = (bst.predict(dvalid) > 0.5).astype(int)
    accuracy = metrics.accuracy_score(y_val, predicted_labels)
    return accuracy


# 3. Create a study object and optimize the objective function.
# The sampler is seeded so the search is reproducible across runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)