In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
cd ..

/home/suriya/dcu/Group-Project


## Model Hyperparameter Search

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [4]:
from src.features.rssi_distance import extract_feature2, postproc_feature_dicts
from src.featutils import aggregate_features_from_folder

In [5]:
train_dir = "data/tc4tl_training_data_v1/tc4tl/data/train/"
train_key = pd.read_csv("data/tc4tl_training_data_v1/tc4tl/docs/tc4tl_train_key.tsv", sep="\t")
test_dir = "data/tc4tl_data_v5/tc4tl/data/test/"
test_mdata = pd.read_csv("data/tc4tl_data_v5/tc4tl/docs/tc4tl_test_metadata.tsv", sep="\t")
test_key = pd.read_csv("data/tc4tl_test_key/tc4tl/docs/tc4tl_test_key.tsv", sep="\t")

In [None]:
trainset, _ = aggregate_features_from_folder(train_dir, train_key, feat_fn=extract_feature2, postproc_fn=postproc_feature_dicts)
testset, _  = aggregate_features_from_folder(test_dir, test_key, feat_fn=extract_feature2, postproc_fn=postproc_feature_dicts)

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(trainset['PredictedDistance'].values.reshape(-1, 1), trainset['Distance'])


In [None]:
out = rf.predict(testset['PredictedDistance'].values.reshape(-1, 1))
(out == testset['Distance']).sum()

## Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

rf = RandomForestClassifier(max_features=0.01)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
trainset.CoarseGrain = trainset.CoarseGrain.replace({
    'Y' : 0., 'N' : 1.
})

In [None]:
grid_search.fit(trainset[['PredictedDistance', 'CoarseGrain']], trainset['Distance'])

In [None]:
grid_search.best_params_

In [None]:
best_estimator = grid_search.best_estimator_

In [None]:
testset.CoarseGrain = testset.CoarseGrain.replace({
    'Y' : 0., 'N' : 1.
})

In [None]:
out = best_estimator.predict(testset[['PredictedDistance', 'CoarseGrain']])
(out == testset['Distance']).sum()

In [None]:
f"Hyperparameter Search leads to {(3664 - 2853)/2853}% increase in accuracy!"

## tune-sklearn

In [None]:
from tune_sklearn import TuneGridSearchCV

_param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    # 'n_estimators': [100, 200, 300, 1000]
}

tune_search = TuneGridSearchCV(
    RandomForestClassifier(),
    _param_grid,
    early_stopping=True,
    max_iters=10
)
grid_search.fit(trainset[['PredictedDistance', 'CoarseGrain']], trainset['Distance'])

### `tune-sklearn` is just marginally faster than `GridSearchCV`

## hyperopt-sklearn

In [None]:
from hpsklearn import HyperoptEstimator

# Load Data
train_features, train_labels = trainset[['PredictedDistance', 'CoarseGrain']], trainset['Distance']
test_features, test_labels = testset[['PredictedDistance', 'CoarseGrain']], testset['Distance']

# Create the estimator object
estim = HyperoptEstimator()

# Search the space of classifiers and preprocessing steps and their
# respective hyperparameters in sklearn to fit a model to the data
estim.fit(train_features, train_labels)

# Report the accuracy of the classifier on a given set of data
score = estim.score(test_features, test_labels)

# Return instances of the classifier and preprocessing steps
model = estim.best_model()

In [None]:
(estim.predict(test_features) == test_labels).sum()

In [None]:
3770 / len(test_features), 3664/len(test_features)

## Preprocessing Tunables

In [29]:
from sklearn.tree import ExtraTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network.multilayer_perceptron import MLPClassifier
from sklearn.neighbors.classification import RadiusNeighborsClassifier
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from sklearn.linear_model.ridge import RidgeClassifier
# from sklearn.gaussian_process.gpc import GaussianProcessClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier

_classifiers = [
    ExtraTreeClassifier,
    DecisionTreeClassifier,
    MLPClassifier,
    KNeighborsClassifier,
    SGDClassifier,
    RidgeClassifier,
    # GaussianProcessClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    BaggingClassifier,
    ExtraTreesClassifier,
    RandomForestClassifier,
]

In [25]:
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

def evaluate(features, labels, classifiers=[], verbose=False):
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=69)
    scores = []
    if len(classifiers) == 0:
        classifiers = _classifiers
    iterate_through = tqdm(classifiers) if verbose else classifiers
    for _classifier in iterate_through:
        model = _classifier()
        model.fit(X_train, y_train)
        scores.append((model.predict(X_test) == y_test).mean())
        if verbose:
            iterate_through.set_description(f"{_classifier.__name__}: {scores[-1]}")
    return max(scores)

In [33]:
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

@ignore_warnings(category=ConvergenceWarning)
def objective(params):
    # TX, N = params[0][-1], params[1][-1]
    TX, N = params['TX'], params['N']
    trainset, _ = aggregate_features_from_folder(
        train_dir, train_key, feat_fn=extract_feature2, postproc_fn=postproc_feature_dicts,
        tunables={'TX' : TX, 'N' : N}, testing=True, verbose=False
    )
    trainset.CoarseGrain = trainset.CoarseGrain.replace({
        'Y' : 0., 'N' : 1.
    })
    train_features, train_labels = trainset[['PredictedDistance', 'CoarseGrain']], trainset['Distance']
    return -evaluate(train_features, train_labels)

In [34]:
from hyperopt import hp, fmin, tpe

space = {
    'TX' : hp.uniform('TX', -80, -40),
    'N' : hp.uniform('N', 0.1, 5)
}
best = fmin(fn=objective, algo=tpe.suggest,
           space=space, max_evals=20)

 30%|███       | 6/20 [00:11<00:26,  1.91s/trial, best loss: -0.6272727272727273]

  overwrite_a=True).T



 55%|█████▌    | 11/20 [00:21<00:17,  1.97s/trial, best loss: -0.6272727272727273]

  overwrite_a=True).T



100%|██████████| 20/20 [00:39<00:00,  1.96s/trial, best loss: -0.6272727272727273]


In [35]:
best

{'N': 4.115621637416503, 'TX': -43.23022252718384}

In [36]:
trainset, _ = aggregate_features_from_folder(
        train_dir, train_key, feat_fn=extract_feature2, postproc_fn=postproc_feature_dicts,
        tunables=best, testing=False, verbose=True
    )
trainset.CoarseGrain = trainset.CoarseGrain.replace({
        'Y' : 0., 'N' : 1.
    })
train_features, train_labels = trainset[['PredictedDistance', 'CoarseGrain']], trainset['Distance']
evaluate(train_features, train_labels, verbose=True, classifiers=[])

100%|██████████| 15552/15552 [00:12<00:00, 1198.79it/s]


  0%|          | 0/11 [00:00<?, ?it/s]

0.5690629261640366

## src.hyperopt

In [39]:
from src.hyperopt import optimize_preproc

In [56]:
def make_data(tunables):
    trainset, _ = aggregate_features_from_folder(
                train_dir, train_key, feat_fn=extract_feature2, postproc_fn=postproc_feature_dicts,
                tunables=tunables, testing=True, verbose=False
            )
    trainset.CoarseGrain = trainset.CoarseGrain.replace({
        'Y' : 0., 'N' : 1.
    })
    train_features, train_labels = trainset[['PredictedDistance', 'CoarseGrain']], trainset['Distance']
    return train_features, train_labels

space = {
    'TX' : hp.uniform('TX', -80, -40),
    'N' : hp.uniform('N', 0.1, 5)
}

best_params = optimize_preproc(make_data, space)

 75%|███████▌  | 15/20 [00:30<00:10,  2.07s/trial, best loss: -0.5969696969696969]

  overwrite_a=True).T



100%|██████████| 20/20 [00:40<00:00,  2.03s/trial, best loss: -0.5969696969696969]


In [42]:
from src.hyperopt import optimize_hp

In [57]:
train_features, train_labels = make_data(best_params)
estim, best_model = optimize_hp(train_features, train_labels)

100%|██████████| 1/1 [00:01<00:00,  1.13s/trial, best loss: 0.45499999999999996]
Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


100%|██████████| 2/2 [00:02<00:00,  2.24s/trial, best loss: 0.43500000000000005]
100%|██████████| 3/3 [00:02<00:00,  2.89s/trial, best loss: 0.43500000000000005]
100%|██████████| 4/4 [00:00<00:00,  2.25trial/s, best loss: 0.43500000000000005]
100%|██████████| 5/5 [00:00<00:00,  6.37trial/s, best loss: 0.43500000000000005]
100%|██████████| 6/6 [00:00<00:00,  4.56trial/s, best loss: 0.43500000000000005]
100%|██████████| 7/7 [00:13<00:00, 13.07s/trial, best loss: 0.43500000000000005]
Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but

In [58]:
best_model['learner']

XGBClassifier(base_score=0.5, booster='gbtree',
              colsample_bylevel=0.8427257640994569, colsample_bynode=1,
              colsample_bytree=0.6526318542603939, gamma=0.1546020413234132,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.005093704505070369, max_delta_step=0, max_depth=1,
              min_child_weight=22, missing=nan, monotone_constraints='()',
              n_estimators=6000, n_jobs=1, num_parallel_tree=1,
              objective='multi:softprob', random_state=1,
              reg_alpha=0.594809519556231, reg_lambda=3.5750379568729524,
              scale_pos_weight=1, seed=1, subsample=0.9958139856405576,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [59]:
testset, _ = aggregate_features_from_folder(
            test_dir, test_key, feat_fn=extract_feature2, postproc_fn=postproc_feature_dicts,
            tunables=best_params, testing=True, verbose=True
        )
testset.CoarseGrain = testset.CoarseGrain.replace({
    'Y' : 0., 'N' : 1.
})
test_features, test_labels = trainset[['PredictedDistance', 'CoarseGrain']], trainset['Distance']

100%|██████████| 1000/1000 [00:01<00:00, 642.65it/s]


In [60]:
estim.score(test_features, test_labels)

0.33359053497942387

In [61]:
estim.score(train_features, train_labels)

0.55