# OpMet Challenge 2021: Falkland Rotors - Hyperparameter Tuning in sklearn using dask

Following on from the previous basic ML using scikit-learn notebook, we now proceed to using hyperparameter tuning to select the best hyperparameters for our three model types from sklearn (decision tree, random forest and neural network). In this notebook though, we will demonstrate how to distribute training using the dask-ml library.

### Import packages

In this notebook, in addition to the standard python auxillary libraries, we are using the following:
* matplotlib - plotting 
* pandas - loading tabular data
* scikit learn - machine learning
* dask-ml - distributed hyperparameter tuning

This has been tested with the conda environment based on the requirements.yaml file in this repository, as well as the `scitools/experimental-current` Met Office managed conda environement (August 20201).

In [2]:
import pathlib
import datetime
import math
import functools
import numpy
import os

In [8]:
import pandas

In [3]:
import matplotlib

In [4]:
%matplotlib inline

In [48]:
import dask_ml

In [37]:
import sklearn
import sklearn.tree
import sklearn.preprocessing
import sklearn.ensemble
import sklearn.neural_network
import sklearn.metrics

In [3]:
try:
    root_data_dir = os.environ['OPMET_ROTORS_DATA_ROOT']
except KeyError:
    root_data_dir = '/data/users/shaddad/ds_cop/2021_opmet_challenge/ML'
root_data_dir = pathlib.Path(root_data_dir)
print(root_data_dir)

/Users/stephen.haddad/data/ml_challenges


## Load Falklands Rotor Data

In [63]:
falklands_data_path = root_data_dir.joinpath( 'Rotors')
falklands_data_path

PosixPath('/Users/stephen.haddad/data/ml_challenges/Rotors')

In [5]:
falklands_new_training_data_path = pathlib.Path(falklands_data_path, 'new_training.csv')

In [9]:
falklands_training_df = pandas.read_csv(falklands_new_training_data_path, header=0).loc[1:,:]
falklands_training_df

Unnamed: 0,DTG,air_temp_obs,dewpoint_obs,wind_direction_obs,wind_speed_obs,wind_gust_obs,air_temp_1,air_temp_2,air_temp_3,air_temp_4,...,windspd_18,winddir_19,windspd_19,winddir_20,windspd_20,winddir_21,windspd_21,winddir_22,windspd_22,Rotors 1 is true
1,01/01/2015 00:00,283.9,280.7,110.0,4.1,-9999999.0,284.000,283.625,283.250,282.625,...,5.8,341.0,6.0,334.0,6.1,330.0,6.0,329.0,5.8,
2,01/01/2015 03:00,280.7,279.7,90.0,7.7,-9999999.0,281.500,281.250,280.750,280.250,...,6.8,344.0,5.3,348.0,3.8,360.0,3.2,12.0,3.5,
3,01/01/2015 06:00,279.8,278.1,100.0,7.7,-9999999.0,279.875,279.625,279.125,278.625,...,6.0,345.0,5.5,358.0,5.0,10.0,4.2,38.0,4.0,
4,01/01/2015 09:00,279.9,277.0,120.0,7.2,-9999999.0,279.625,279.250,278.875,278.250,...,3.1,338.0,3.5,354.0,3.9,9.0,4.4,22.0,4.6,
5,01/01/2015 12:00,279.9,277.4,120.0,8.7,-9999999.0,279.250,278.875,278.375,277.875,...,1.6,273.0,2.0,303.0,2.3,329.0,2.5,338.0,2.4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20101,31/12/2020 06:00,276.7,275.5,270.0,3.6,-9999999.0,277.875,277.750,277.625,277.500,...,12.1,223.0,11.8,221.0,11.4,219.0,11.3,215.0,11.4,
20102,31/12/2020 09:00,277.9,276.9,270.0,3.1,-9999999.0,277.875,277.625,277.875,277.875,...,10.2,230.0,10.8,230.0,11.6,227.0,12.3,222.0,12.0,
20103,31/12/2020 12:00,283.5,277.1,220.0,3.6,-9999999.0,281.125,280.625,280.125,279.625,...,10.3,218.0,11.9,221.0,12.8,222.0,11.9,225.0,10.6,
20104,31/12/2020 15:00,286.1,276.9,250.0,3.6,-9999999.0,284.625,284.125,283.625,283.000,...,9.4,218.0,8.6,212.0,8.3,218.0,8.7,226.0,10.1,


In [14]:
num_levels = 22

In [22]:
temp_feature_names = [f'air_temp_{i1}' for i1 in range(1,num_levels+1)]
humidity_feature_names = [f'sh_{i1}' for i1 in range(1,num_levels+1)]
wind_direction_feature_names = [f'winddir_{i1}' for i1 in range(1,num_levels+1)]
wind_speed_feature_names = [f'windspd_{i1}' for i1 in range(1,num_levels+1)]
target_feature_name = 'rotors'
test_set_name = 'test'

In [11]:
falklands_training_df = falklands_training_df.rename({'Rotors 1 is true': 'rotors'},axis=1)
falklands_training_df.loc[falklands_training_df[falklands_training_df['rotors'].isna()].index, target_feature_name] = 0
falklands_training_df['DTG'] = pandas.to_datetime(falklands_training_df['DTG'])
falklands_training_df = falklands_training_df.drop_duplicates(subset='DTG')
falklands_training_df[target_feature_name]  = falklands_training_df[target_feature_name] .astype(bool)

In [12]:
falklands_training_df.shape

(17507, 95)

### Specify and create input features

In [17]:
def get_v_wind(wind_dir_name, wind_speed_name, row1):
    return math.cos(math.radians(row1[wind_dir_name])) * row1[wind_speed_name]

def get_u_wind(wind_dir_name, wind_speed_name, row1):
    return math.sin(math.radians(row1[wind_dir_name])) * row1[wind_speed_name]

In [24]:
u_feature_template = 'u_wind_{level_ix}'
v_feature_template = 'v_wind_{level_ix}'
u_wind_feature_names = []
v_wind_features_names = []
for wsn1, wdn1 in zip(wind_speed_feature_names, wind_direction_feature_names):
    level_ix = int( wsn1.split('_')[1])
    u_feature = u_feature_template.format(level_ix=level_ix)
    u_wind_feature_names += [u_feature]
    falklands_training_df[u_feature_template.format(level_ix=level_ix)] = falklands_training_df.apply(functools.partial(get_u_wind, wdn1, wsn1), axis='columns')
    v_feature = v_feature_template.format(level_ix=level_ix)
    v_wind_features_names += [v_feature]
    falklands_training_df[v_feature_template.format(level_ix=level_ix)] = falklands_training_df.apply(functools.partial(get_v_wind, wdn1, wsn1), axis='columns')

In [25]:
falklands_training_df[target_feature_name].value_counts()

False    17058
True       449
Name: rotors, dtype: int64

In [26]:
falklands_training_df.columns

Index(['DTG', 'air_temp_obs', 'dewpoint_obs', 'wind_direction_obs',
       'wind_speed_obs', 'wind_gust_obs', 'air_temp_1', 'air_temp_2',
       'air_temp_3', 'air_temp_4',
       ...
       'u_wind_18', 'v_wind_18', 'u_wind_19', 'v_wind_19', 'u_wind_20',
       'v_wind_20', 'u_wind_21', 'v_wind_21', 'u_wind_22', 'v_wind_22'],
      dtype='object', length=139)

### Split into traing/validate/test sets

In [27]:
test_fraction = 0.1
validation_fraction = 0.1

In [28]:
num_no_rotors = sum(falklands_training_df[target_feature_name] == False)
num_with_rotors = sum(falklands_training_df[target_feature_name] == True)

In [29]:
data_no_rotors = falklands_training_df[falklands_training_df[target_feature_name] == False]
data_with_rotors = falklands_training_df[falklands_training_df[target_feature_name] == True]

In [30]:
data_no_rotors = falklands_training_df[falklands_training_df[target_feature_name] == False]
data_with_rotors = falklands_training_df[falklands_training_df[target_feature_name] == True]

In [31]:
data_test = pandas.concat([data_no_rotors.sample(int(test_fraction * num_no_rotors)), data_with_rotors.sample(int(test_fraction * num_with_rotors))])
data_test[target_feature_name].value_counts()

False    1705
True       44
Name: rotors, dtype: int64

In [32]:
falklands_training_df[test_set_name] = False
falklands_training_df.loc[data_test.index,test_set_name] = True

In [33]:
falklands_training_df[test_set_name] = False
falklands_training_df.loc[data_test.index,test_set_name] = True

In [34]:
data_working = falklands_training_df[falklands_training_df[test_set_name] == False]
data_working_no_rotors = data_working[data_working[target_feature_name] == False]
data_working_with_rotors = data_working[data_working[target_feature_name] == True]

# Preprocess data into input for ML algorithm

In [35]:
input_feature_names = temp_feature_names + humidity_feature_names + u_wind_feature_names + v_wind_features_names

In [38]:
preproc_dict = {}
for if1 in input_feature_names:
    scaler1 = sklearn.preprocessing.StandardScaler()
    scaler1.fit(data_working[[if1]])
    preproc_dict[if1] = scaler1

In [39]:
target_encoder = sklearn.preprocessing.LabelEncoder()
target_encoder.fit(data_working[[target_feature_name]])

  return f(*args, **kwargs)


LabelEncoder()

Apply transformation to each input column

In [40]:
def preproc_input(data_subset, pp_dict):
    return numpy.concatenate([scaler1.transform(data_subset[[if1]]) for if1,scaler1 in pp_dict.items()],axis=1)

def preproc_target(data_subset, enc1):
     return enc1.transform(data_subset[[target_feature_name]])

In [41]:
X_working = preproc_input(data_working, preproc_dict)
y_working = preproc_target(data_working, target_encoder)

  return f(*args, **kwargs)


create target feature from rotors

In [42]:
y_working.shape, X_working.shape

((15758,), (15758, 88))

In [43]:
X_test = preproc_input(data_test, preproc_dict)
y_test = preproc_target(data_test, target_encoder)

  return f(*args, **kwargs)


In [44]:
train_test_tuples = [
    (X_working, y_working),
    (X_test, y_test),    
]

### Create a dask cluster
We set up a dask cluster to distribute our hyperparameter tuning. Not strictly necessary for a problem of this size, but demonstrates how it can be done very easily and can be scaled up by just increasing the number and size of your dask workers.

In this example, I am using a local cluster to demonstrate. I used the dask-labextension package, which is an add-on to Jupyter Lab, to set up the cluster parameters for me.

https://github.com/dask/dask-labextension

You will need to edit the URL of the cluster in the cell below for each cluster you create. 
If you use the dask-lab jupyter lab extension, you can drag the cluster onto the notebook and this cell


In [45]:
from dask.distributed import Client

client = Client("tcp://127.0.0.1:65258")
client

0,1
Connection method: Direct,
Dashboard: http://127.0.0.1:8787/status,

0,1
Comm: tcp://127.0.0.1:65258,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: 22 minutes ago,Total memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:65273,Total threads: 2
Dashboard: http://127.0.0.1:65275/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:65261,
Local directory: /Users/stephen.haddad/prog/data_science_cop/challenges/dask-worker-space/worker-sm67d7q4,Local directory: /Users/stephen.haddad/prog/data_science_cop/challenges/dask-worker-space/worker-sm67d7q4
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 5.2%,Last seen: Just now
Memory usage: 104.36 MiB,Spilled bytes: 0 B
Read bytes: 4.01 kiB,Write bytes: 4.01 kiB

0,1
Comm: tcp://127.0.0.1:65281,Total threads: 2
Dashboard: http://127.0.0.1:65282/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:65263,
Local directory: /Users/stephen.haddad/prog/data_science_cop/challenges/dask-worker-space/worker-scm7my6m,Local directory: /Users/stephen.haddad/prog/data_science_cop/challenges/dask-worker-space/worker-scm7my6m
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 5.4%,Last seen: Just now
Memory usage: 104.74 MiB,Spilled bytes: 0 B
Read bytes: 4.00 kiB,Write bytes: 4.00 kiB

0,1
Comm: tcp://127.0.0.1:65278,Total threads: 2
Dashboard: http://127.0.0.1:65279/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:65262,
Local directory: /Users/stephen.haddad/prog/data_science_cop/challenges/dask-worker-space/worker-lskhi7nl,Local directory: /Users/stephen.haddad/prog/data_science_cop/challenges/dask-worker-space/worker-lskhi7nl
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 5.4%,Last seen: Just now
Memory usage: 104.01 MiB,Spilled bytes: 0 B
Read bytes: 3.99 kiB,Write bytes: 3.99 kiB

0,1
Comm: tcp://127.0.0.1:65272,Total threads: 2
Dashboard: http://127.0.0.1:65274/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:65260,
Local directory: /Users/stephen.haddad/prog/data_science_cop/challenges/dask-worker-space/worker-qe0afy6l,Local directory: /Users/stephen.haddad/prog/data_science_cop/challenges/dask-worker-space/worker-qe0afy6l
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 5.4%,Last seen: Just now
Memory usage: 104.23 MiB,Spilled bytes: 0 B
Read bytes: 6.01 kiB,Write bytes: 6.01 kiB


### train some classifiers

Set up some classifiers, then set up dask objects to run distibuted hyperparameter tuning.

In [46]:
# nn_hidden_layers_specs = [(50,)*2, (50,)*4, (50,)*8, (100,)*2, (100,)*5, (100,)*8, (200,)*4, (400,)*4, (500,)*4]
nn_hidden_layers_specs = [(50,)*2, (50,)*8, (100,)*2, (100,)*5,]

classifiers_params = {
    'decision_tree': {'class': sklearn.tree.DecisionTreeClassifier, 'opts': {'max_depth':[5,10,15,20], 'class_weight':['balanced']}},
    'random_forest': {'class': sklearn.ensemble.RandomForestClassifier, 'opts': {'max_depth':[5,10,15,20], 'class_weight':['balanced']}},
     'ann': {'class': sklearn.neural_network.MLPClassifier, 'opts': {'hidden_layer_sizes': nn_hidden_layers_specs}},   
}



In [49]:
%%time
classifiers_dict = {}             
for clf_name, clf_params in classifiers_params.items():
    print(clf_name)
    clf1 = clf_params['class']()
    cv1 = sklearn.model_selection.KFold(n_splits=5, shuffle=True)
    hpt1 = dask_ml.model_selection.GridSearchCV(clf1, 
                                                clf_params['opts'],
                                                cv=cv1,
                                               )
    res1 = hpt1.fit(X_working, y_working)
    classifiers_dict[clf_name] = hpt1

decision_tree
random_forest
ann
CPU times: user 394 ms, sys: 193 ms, total: 586 ms
Wall time: 2min 55s


In [50]:
for clf_name, clf1 in classifiers_dict.items():
    for X1, y1 in train_test_tuples:
        print(sklearn.metrics.precision_recall_fscore_support(clf1.predict(X1), y1))

(array([0.91376278, 1.        ]), array([1.        , 0.23423944]), array([0.9549384 , 0.37956888]), array([14029,  1729]))
(array([0.90029326, 0.70454545]), array([0.99160207, 0.15422886]), array([0.94374424, 0.25306122]), array([1548,  201]))
(array([0.94378949, 1.        ]), array([1.        , 0.31940063]), array([0.971082  , 0.48416019]), array([14490,  1268]))
(array([0.93255132, 0.65909091]), array([0.99065421, 0.20138889]), array([0.96072508, 0.30851064]), array([1605,  144]))
(array([1., 1.]), array([1., 1.]), array([1., 1.]), array([15353,   405]))
(array([0.98592375, 0.27272727]), array([0.98131932, 0.33333333]), array([0.98361615, 0.3       ]), array([1713,   36]))


In [51]:
for clf_name, clf1 in classifiers_dict.items():
    for X1, y1 in train_test_tuples:
        print(sklearn.metrics.balanced_accuracy_score(clf1.predict(X1), y1))


0.6171197223828803
0.5729154614524278
0.6597003154574133
0.5960215472481828
1.0
0.6573263280793928


In [64]:
data_working[target_feature_name].value_counts()

False    15353
True       405
Name: rotors, dtype: int64

In [54]:
data_test[target_feature_name].value_counts()

False    1705
True       44
Name: rotors, dtype: int64

In [55]:
for clf_name, clf1 in classifiers_dict.items():
    for X1, y1 in train_test_tuples:
        print(sklearn.metrics.confusion_matrix(clf1.predict(X1), y1))

[[14029     0]
 [ 1324   405]]
[[1535   13]
 [ 170   31]]
[[14490     0]
 [  863   405]]
[[1590   15]
 [ 115   29]]
[[15353     0]
 [    0   405]]
[[1681   32]
 [  24   12]]


In this sort of classification problem, there are 4 sorts of results:
* true postive(hits) - should be positive classification and is
* true negative - should be negative and is
* false negative (miss) - should be classified positive but is classified negative
* false positive (false alarm) - should be classified negative but is classified as positive by the algorithm

Given less than 100% accuracy, changing parameters can shift results between false negatives and false positive, depending on which is more damaging for how the prediction will be used. If we decide that predicting a rotor that doesn't happen is more costly, we would penalise false positives. If we decide that a rotor event happening when not forecast is more damaging, we penalise false negatives. Tis can be done by optimising for an F-score other F1. F1 balances tese out, but instead one use a different weighting in the F-score formula for one or the other.


### Resample the data 

Our yes/no classes for classification are very unbalanced, so we can try doing a naive resampling so we have equal representation fo the two classes in our sample set.

In [56]:
data_working_resampled = pandas.concat([
    data_working[data_working[target_feature_name] == True].sample(n=int(1e4), replace=True), 
    data_working[data_working[target_feature_name] == False].sample(n=int(1e4), replace=False),],
    ignore_index=True)

In [57]:
X_working_resampled = preproc_input(data_working_resampled, preproc_dict)
y_working_resampled = preproc_target(data_working_resampled, target_encoder)

  return f(*args, **kwargs)


In [58]:
train_test_res_tuples = [
    (X_working_resampled, y_working_resampled),
    (X_test, y_test),    
]

In [59]:
%%time
classifiers_res_dict = {}                    
for clf_name, clf_params in classifiers_params.items():
    print(clf_name)
    clf1 = clf_params['class']()
    cv1 = sklearn.model_selection.KFold(n_splits=5, shuffle=True)
    hpt1 = dask_ml.model_selection.GridSearchCV(clf1, 
                                                clf_params['opts'],
                                                cv=cv1,
                                               )
    res1 = hpt1.fit(X_working, y_working)
    classifiers_res_dict[clf_name] = hpt1

decision_tree
random_forest
ann
CPU times: user 410 ms, sys: 79 ms, total: 489 ms
Wall time: 2min 51s


In [60]:
for clf_name, clf1 in classifiers_res_dict.items():
    for X1, y1 in train_test_res_tuples:
        print(sklearn.metrics.precision_recall_fscore_support(clf1.predict(X1), y1))


(array([0.9156, 1.    ]), array([1.        , 0.92216894]), array([0.9559407 , 0.95950873]), array([ 9156, 10844]))
(array([0.90146628, 0.70454545]), array([0.9916129 , 0.15577889]), array([0.94439324, 0.25514403]), array([1550,  199]))
(array([0.9441, 1.    ]), array([1.        , 0.94705938]), array([0.97124634, 0.97280996]), array([ 9441, 10559]))
(array([0.9313783 , 0.61363636]), array([0.9894081, 0.1875   ]), array([0.95951662, 0.28723404]), array([1605,  144]))
(array([1., 1.]), array([1., 1.]), array([1., 1.]), array([10000, 10000]))
(array([0.98533724, 0.25      ]), array([0.98073555, 0.30555556]), array([0.98303101, 0.275     ]), array([1713,   36]))


In [61]:
for clf_name, clf1 in classifiers_res_dict.items():
    for X1, y1 in train_test_res_tuples:
        print(sklearn.metrics.balanced_accuracy_score(clf1.predict(X1), y1))    

0.9610844706750277
0.5736958988490841
0.9735296903115825
0.5884540498442368
1.0
0.6431455536096518


In [62]:
for clf_name, clf1 in classifiers_res_dict.items():
    for X1, y1 in train_test_res_tuples:
        print(sklearn.metrics.confusion_matrix(clf1.predict(X1), y1))

[[ 9156     0]
 [  844 10000]]
[[1537   13]
 [ 168   31]]
[[ 9441     0]
 [  559 10000]]
[[1588   17]
 [ 117   27]]
[[10000     0]
 [    0 10000]]
[[1680   33]
 [  25   11]]


## Further work

Improving results
* Outer cross-validation loop
* visualising the metrics
* using an F-score to penalise false positives or false negatives
 * using that score as the optimisation criteria for the hyper parameter tuning