# OpMet Challenge 2021: Falkland Rotors - Hyperparameter Tuning in sklearn using dask

Following on from the previous basic ML using scikit-learn notebook, we now proceed to using hyperparameter tuning to select the best hyperparameters for our three model types from sklearn (decision tree, random forest and neural network). In this notebook though, we will demonstrate how to distribute training using the dask-ml library.

### Import packages

In this notebook, in addition to the standard python auxillary libraries, we are using the following:
* matplotlib - plotting 
* pandas - loading tabular data
* scikit learn - machine learning
* dask-ml - distributed hyperparameter tuning

This has been tested with the conda environment based on the requirements.yaml file in this repository, as well as the `scitools/experimental-current` Met Office managed conda environement (August 20201).

In [1]:
import pathlib
import datetime
import math
import functools
import numpy

In [2]:
import pandas

In [3]:
import matplotlib

In [4]:
%matplotlib inline

In [5]:
import dask_ml

In [6]:
import sklearn
import sklearn.tree
import sklearn.preprocessing
import sklearn.ensemble
import sklearn.neural_network
import sklearn.metrics

In [7]:
root_data_dir = pathlib.Path.home().joinpath('data','ml_challenges')
root_data_dir

PosixPath('/Users/stephen.haddad/data/ml_challenges')

## Exploring Falklands Rotor Data

In [8]:
falklands_dir = 'Rotors'
falklands_data_path = pathlib.Path(root_data_dir, falklands_dir)

In [9]:
falklands_new_training_data_path = pathlib.Path(falklands_data_path, 'new_training.csv')

In [10]:
falklands_training_df = pandas.read_csv(falklands_new_training_data_path, header=0).loc[1:,:]
falklands_training_df

Unnamed: 0,DTG,air_temp_obs,dewpoint_obs,wind_direction_obs,wind_speed_obs,wind_gust_obs,air_temp_1,air_temp_2,air_temp_3,air_temp_4,...,windspd_18,winddir_19,windspd_19,winddir_20,windspd_20,winddir_21,windspd_21,winddir_22,windspd_22,Rotors 1 is true
1,01/01/2015 00:00,283.9,280.7,110.0,4.1,-9999999.0,284.000,283.625,283.250,282.625,...,5.8,341.0,6.0,334.0,6.1,330.0,6.0,329.0,5.8,
2,01/01/2015 03:00,280.7,279.7,90.0,7.7,-9999999.0,281.500,281.250,280.750,280.250,...,6.8,344.0,5.3,348.0,3.8,360.0,3.2,12.0,3.5,
3,01/01/2015 06:00,279.8,278.1,100.0,7.7,-9999999.0,279.875,279.625,279.125,278.625,...,6.0,345.0,5.5,358.0,5.0,10.0,4.2,38.0,4.0,
4,01/01/2015 09:00,279.9,277.0,120.0,7.2,-9999999.0,279.625,279.250,278.875,278.250,...,3.1,338.0,3.5,354.0,3.9,9.0,4.4,22.0,4.6,
5,01/01/2015 12:00,279.9,277.4,120.0,8.7,-9999999.0,279.250,278.875,278.375,277.875,...,1.6,273.0,2.0,303.0,2.3,329.0,2.5,338.0,2.4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20101,31/12/2020 06:00,276.7,275.5,270.0,3.6,-9999999.0,277.875,277.750,277.625,277.500,...,12.1,223.0,11.8,221.0,11.4,219.0,11.3,215.0,11.4,
20102,31/12/2020 09:00,277.9,276.9,270.0,3.1,-9999999.0,277.875,277.625,277.875,277.875,...,10.2,230.0,10.8,230.0,11.6,227.0,12.3,222.0,12.0,
20103,31/12/2020 12:00,283.5,277.1,220.0,3.6,-9999999.0,281.125,280.625,280.125,279.625,...,10.3,218.0,11.9,221.0,12.8,222.0,11.9,225.0,10.6,
20104,31/12/2020 15:00,286.1,276.9,250.0,3.6,-9999999.0,284.625,284.125,283.625,283.000,...,9.4,218.0,8.6,212.0,8.3,218.0,8.7,226.0,10.1,


In [11]:
falklands_training_df = falklands_training_df.drop_duplicates(subset='DTG')

In [12]:
falklands_training_df.shape

(17507, 95)

### Specify and create input features

In [28]:
num_levels = 22

In [29]:
temp_feature_names = [f'air_temp_{i1}' for i1 in range(1,num_levels+1)]
humidity_feature_names = [f'sh_{i1}' for i1 in range(1,num_levels+1)]
wind_direction_feature_names = [f'winddir_{i1}' for i1 in range(1,)]
wind_speed_feature_names = [f'windspd_{i1}' for i1 in range(1,23)]
target_feature_name = 'rotors_present'
test_set_name = 'test'

In [14]:
def get_v_wind(wind_dir_name, wind_speed_name, row1):
    return math.cos(math.radians(row1[wind_dir_name])) * row1[wind_speed_name]

def get_u_wind(wind_dir_name, wind_speed_name, row1):
    return math.sin(math.radians(row1[wind_dir_name])) * row1[wind_speed_name]

In [15]:
u_feature_template = 'u_wind_{level_ix}'
v_feature_template = 'v_wind_{level_ix}'
u_wind_feature_names = []
v_wind_features_names = []
for wsn1, wdn1 in zip(wind_speed_feature_names, wind_direction_feature_names):
    level_ix = int( wsn1.split('_')[1])
    u_feature = u_feature_template.format(level_ix=level_ix)
    u_wind_feature_names += [u_feature]
    falklands_training_df[u_feature_template.format(level_ix=level_ix)] = falklands_training_df.apply(functools.partial(get_u_wind, wdn1, wsn1), axis='columns')
    v_feature = v_feature_template.format(level_ix=level_ix)
    v_wind_features_names += [v_feature]
    falklands_training_df[v_feature_template.format(level_ix=level_ix)] = falklands_training_df.apply(functools.partial(get_v_wind, wdn1, wsn1), axis='columns')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  falklands_training_df[u_feature_template.format(level_ix=level_ix)] = falklands_training_df.apply(functools.partial(get_u_wind, wdn1, wsn1), axis='columns')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  falklands_training_df[v_feature_template.format(level_ix=level_ix)] = falklands_training_df.apply(functools.partial(get_v_wind, wdn1, wsn1), axis='columns')


In [16]:
falklands_training_df[target_feature_name] =  falklands_training_df['Rotors 1 is true']
falklands_training_df.loc[falklands_training_df[falklands_training_df['Rotors 1 is true'].isna()].index, target_feature_name] = 0.0
falklands_training_df[target_feature_name]  = falklands_training_df[target_feature_name] .astype(bool)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  falklands_training_df[target_feature_name] =  falklands_training_df['Rotors 1 is true']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  falklands_training_df[target_feature_name]  = falklands_training_df[targe

In [17]:
falklands_training_df[target_feature_name].value_counts()

False    17058
True       449
Name: rotors_present, dtype: int64

In [19]:
falklands_training_df.columns

Index(['DTG', 'air_temp_obs', 'dewpoint_obs', 'wind_direction_obs',
       'wind_speed_obs', 'wind_gust_obs', 'air_temp_1', 'air_temp_2',
       'air_temp_3', 'air_temp_4',
       ...
       'v_wind_18', 'u_wind_19', 'v_wind_19', 'u_wind_20', 'v_wind_20',
       'u_wind_21', 'v_wind_21', 'u_wind_22', 'v_wind_22', 'rotors_present'],
      dtype='object', length=140)

### Split into traing/validate/test sets

In [20]:
test_fraction = 0.1
validation_fraction = 0.1

In [21]:
num_no_rotors = sum(falklands_training_df[target_feature_name] == False)
num_with_rotors = sum(falklands_training_df[target_feature_name] == True)

In [22]:
data_no_rotors = falklands_training_df[falklands_training_df[target_feature_name] == False]
data_with_rotors = falklands_training_df[falklands_training_df[target_feature_name] == True]

In [23]:
data_no_rotors = falklands_training_df[falklands_training_df[target_feature_name] == False]
data_with_rotors = falklands_training_df[falklands_training_df[target_feature_name] == True]

In [31]:
data_test = pandas.concat([data_no_rotors.sample(int(test_fraction * num_no_rotors)), data_with_rotors.sample(int(test_fraction * num_with_rotors))])
data_test[target_feature_name].value_counts()

False    1705
True       44
Name: rotors_present, dtype: int64

In [32]:
falklands_training_df[test_set_name] = False
falklands_training_df.loc[data_test.index,test_set_name] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  falklands_training_df[test_set_name] = False


In [33]:
falklands_training_df[test_set_name] = False
falklands_training_df.loc[data_test.index,test_set_name] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  falklands_training_df[test_set_name] = False


In [34]:
data_working = falklands_training_df[falklands_training_df[test_set_name] == False]
data_working_no_rotors = data_working[data_working[target_feature_name] == False]
data_working_with_rotors = data_working[data_working[target_feature_name] == True]

# Preprocess data into input for ML algorithm

In [35]:
input_feature_names = temp_feature_names + humidity_feature_names + u_wind_feature_names + v_wind_features_names

In [36]:
preproc_dict = {}
for if1 in input_feature_names:
    scaler1 = sklearn.preprocessing.StandardScaler()
    scaler1.fit(data_working[[if1]])
    preproc_dict[if1] = scaler1

In [37]:
target_encoder = sklearn.preprocessing.LabelEncoder()
target_encoder.fit(data_working[[target_feature_name]])

  return f(*args, **kwargs)


LabelEncoder()

Apply transformation to each input column

In [38]:
def preproc_input(data_subset, pp_dict):
    return numpy.concatenate([scaler1.transform(data_subset[[if1]]) for if1,scaler1 in pp_dict.items()],axis=1)

def preproc_target(data_subset, enc1):
     return enc1.transform(data_subset[[target_feature_name]])

In [39]:
X_working = preproc_input(data_working, preproc_dict)
y_working = preproc_target(data_working, target_encoder)

  return f(*args, **kwargs)


create target feature from rotors

In [40]:
y_working.shape, X_working.shape

((15758,), (15758, 88))

In [41]:
X_test = preproc_input(data_test, preproc_dict)
y_test = preproc_target(data_test, target_encoder)

  return f(*args, **kwargs)


In [42]:
train_test_tuples = [
    (X_working, y_working),
    (X_test, y_test),    
]

### Create a dask cluster
We set up a dask cluster to distribute our hyperparameter tuning. Not strictly necessary for a problem of this size, but demonstrates how it can be done very easily and can be scaled up by just increasing the number and size of your dask workers.

In this example, I am using a local cluster to demonstrate. I used the dask-labextension package, which is an add-on to Jupyter Lab, to set up the cluster parameters for me.

https://github.com/dask/dask-labextension

You will need to edit the URL of the cluster in the cell below for each cluster you create. 
If you use the dask-lab jupyter lab extension, you can drag the cluster onto the notebook and this cell


In [43]:
from dask.distributed import Client

client = Client("tcp://127.0.0.1:64826")
client

0,1
Connection method: Direct,
Dashboard: http://127.0.0.1:8787/status,

0,1
Comm: tcp://127.0.0.1:64826,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: 17 minutes ago,Total memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:64842,Total threads: 2
Dashboard: http://127.0.0.1:64849/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:64830,
Local directory: /Users/stephen.haddad/prog/data_science_cop/dask-worker-space/worker-k4s04_v8,Local directory: /Users/stephen.haddad/prog/data_science_cop/dask-worker-space/worker-k4s04_v8
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 4.6%,Last seen: Just now
Memory usage: 104.25 MiB,Spilled bytes: 0 B
Read bytes: 187.69 kiB,Write bytes: 13.98 kiB

0,1
Comm: tcp://127.0.0.1:64844,Total threads: 2
Dashboard: http://127.0.0.1:64848/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:64832,
Local directory: /Users/stephen.haddad/prog/data_science_cop/dask-worker-space/worker-u_wgc402,Local directory: /Users/stephen.haddad/prog/data_science_cop/dask-worker-space/worker-u_wgc402
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 5.5%,Last seen: Just now
Memory usage: 103.66 MiB,Spilled bytes: 0 B
Read bytes: 97.96 kiB,Write bytes: 10.00 kiB

0,1
Comm: tcp://127.0.0.1:64845,Total threads: 2
Dashboard: http://127.0.0.1:64847/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:64833,
Local directory: /Users/stephen.haddad/prog/data_science_cop/dask-worker-space/worker-g5fwg0d9,Local directory: /Users/stephen.haddad/prog/data_science_cop/dask-worker-space/worker-g5fwg0d9
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 5.1%,Last seen: Just now
Memory usage: 104.49 MiB,Spilled bytes: 0 B
Read bytes: 99.78 kiB,Write bytes: 7.98 kiB

0,1
Comm: tcp://127.0.0.1:64843,Total threads: 2
Dashboard: http://127.0.0.1:64846/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:64831,
Local directory: /Users/stephen.haddad/prog/data_science_cop/dask-worker-space/worker-x8hfwmju,Local directory: /Users/stephen.haddad/prog/data_science_cop/dask-worker-space/worker-x8hfwmju
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 5.0%,Last seen: Just now
Memory usage: 104.91 MiB,Spilled bytes: 0 B
Read bytes: 183.92 kiB,Write bytes: 13.99 kiB


### train some classifiers

Set up some classifiers, then set up dask objects to run distibuted hyperparameter tuning.

In [44]:
# nn_hidden_layers_specs = [(50,)*2, (50,)*4, (50,)*8, (100,)*2, (100,)*5, (100,)*8, (200,)*4, (400,)*4, (500,)*4]
nn_hidden_layers_specs = [(50,)*2, (50,)*8, (100,)*2, (100,)*5,]

classifiers_params = {
    'decision_tree': {'class': sklearn.tree.DecisionTreeClassifier, 'opts': {'max_depth':[5,10,15,20], 'class_weight':['balanced']}},
    'random_forest': {'class': sklearn.ensemble.RandomForestClassifier, 'opts': {'max_depth':[5,10,15,20], 'class_weight':['balanced']}},
     'ann': {'class': sklearn.neural_network.MLPClassifier, 'opts': {'hidden_layer_sizes': nn_hidden_layers_specs}},   
}



In [45]:
%%time
classifiers_dict = {}             
for clf_name, clf_params in classifiers_params.items():
    print(clf_name)
    clf1 = clf_params['class']()
    cv1 = sklearn.model_selection.KFold(n_splits=5, shuffle=True)
    hpt1 = dask_ml.model_selection.GridSearchCV(clf1, 
                                                clf_params['opts'],
                                                cv=cv1,
                                               )
    res1 = hpt1.fit(X_working, y_working)
    classifiers_dict[clf_name] = hpt1

decision_tree
random_forest
ann
CPU times: user 505 ms, sys: 109 ms, total: 615 ms
Wall time: 3min 45s


In [46]:
for clf_name, clf1 in classifiers_dict.items():
    for X1, y1 in train_test_tuples:
        print(sklearn.metrics.precision_recall_fscore_support(clf1.predict(X1), y1))

(array([0.9330424, 1.       ]), array([1.        , 0.28262387]), array([0.96536155, 0.44069641]), array([14325,  1433]))
(array([0.93372434, 0.68181818]), array([0.99128269, 0.20979021]), array([0.96164301, 0.32085561]), array([1606,  143]))
(array([0.941119, 1.      ]), array([1.        , 0.30939649]), array([0.96966647, 0.47257876]), array([14449,  1309]))
(array([0.94956012, 0.72727273]), array([0.99264255, 0.27118644]), array([0.9706235 , 0.39506173]), array([1631,  118]))
(array([0.99973946, 0.96790123]), array([0.99915376, 0.98989899]), array([0.99944652, 0.97877653]), array([15362,   396]))
(array([0.9888563 , 0.18181818]), array([0.97909408, 0.2962963 ]), array([0.98395098, 0.22535211]), array([1722,   27]))


In [47]:
for clf_name, clf1 in classifiers_dict.items():
    for X1, y1 in train_test_tuples:
        print(sklearn.metrics.balanced_accuracy_score(clf1.predict(X1), y1))


0.6413119330076762
0.6005364498515183
0.6546982429335371
0.6319144956302154
0.9945263729601707
0.6376951864756742


In [48]:
data_working['rotors_present'].value_counts()

False    15353
True       405
Name: rotors_present, dtype: int64

In [49]:
data_test['rotors_present'].value_counts()

False    1705
True       44
Name: rotors_present, dtype: int64

In [50]:
for clf_name, clf1 in classifiers_dict.items():
    for X1, y1 in train_test_tuples:
        print(sklearn.metrics.confusion_matrix(clf1.predict(X1), y1))

[[14325     0]
 [ 1028   405]]
[[1592   14]
 [ 113   30]]
[[14449     0]
 [  904   405]]
[[1619   12]
 [  86   32]]
[[15349    13]
 [    4   392]]
[[1686   36]
 [  19    8]]


In this sort of classification problem, there are 4 sorts of results:
* true postive(hits) - should be positive classification and is
* true negative - should be negative and is
* false negative (miss) - should be classified positive but is classified negative
* false positive (false alarm) - should be classified negative but is classified as positive by the algorithm

Given less than 100% accuracy, changing parameters can shift results between false negatives and false positive, depending on which is more damaging for how the prediction will be used. If we decide that predicting a rotor that doesn't happen is more costly, we would penalise false positives. If we decide that a rotor event happening when not forecast is more damaging, we penalise false negatives. Tis can be done by optimising for an F-score other F1. F1 balances tese out, but instead one use a different weighting in the F-score formula for one or the other.


### Resample the data 

Our yes/no classes for classification are very unbalanced, so we can try doing a naive resampling so we have equal representation fo the two classes in our sample set.

In [51]:
data_working_resampled = pandas.concat([
    data_working[data_working[target_feature_name] == True].sample(n=int(1e4), replace=True), 
    data_working[data_working[target_feature_name] == False].sample(n=int(1e4), replace=False),],
    ignore_index=True)

In [52]:
X_working_resampled = preproc_input(data_working_resampled, preproc_dict)
y_working_resampled = preproc_target(data_working_resampled, target_encoder)

  return f(*args, **kwargs)


In [53]:
train_test_res_tuples = [
    (X_working_resampled, y_working_resampled),
    (X_test, y_test),    
]

In [54]:
%%time
classifiers_res_dict = {}                    
for clf_name, clf_params in classifiers_params.items():
    print(clf_name)
    clf1 = clf_params['class']()
    cv1 = sklearn.model_selection.KFold(n_splits=5, shuffle=True)
    hpt1 = dask_ml.model_selection.GridSearchCV(clf1, 
                                                clf_params['opts'],
                                                cv=cv1,
                                               )
    res1 = hpt1.fit(X_working, y_working)
    classifiers_res_dict[clf_name] = hpt1

decision_tree
random_forest
ann
CPU times: user 473 ms, sys: 83 ms, total: 556 ms
Wall time: 3min 37s


In [55]:
for clf_name, clf1 in classifiers_res_dict.items():
    for X1, y1 in train_test_res_tuples:
        print(sklearn.metrics.precision_recall_fscore_support(clf1.predict(X1), y1))


(array([0.9335, 1.    ]), array([1.        , 0.93764651]), array([0.96560641, 0.96781999]), array([ 9335, 10665]))
(array([0.93372434, 0.68181818]), array([0.99128269, 0.20979021]), array([0.96164301, 0.32085561]), array([1606,  143]))
(array([0.9452, 1.    ]), array([1.        , 0.94804702]), array([0.97182809, 0.97333074]), array([ 9452, 10548]))
(array([0.95249267, 0.72727273]), array([0.99266504, 0.28318584]), array([0.97216402, 0.40764331]), array([1636,  113]))
(array([0.9999, 0.9756]), array([0.97617885, 0.99989751]), array([0.98789705, 0.98759933]), array([10243,  9757]))
(array([0.9888563 , 0.09090909]), array([0.97682503, 0.17391304]), array([0.98280385, 0.11940299]), array([1726,   23]))


In [56]:
for clf_name, clf1 in classifiers_res_dict.items():
    for X1, y1 in train_test_res_tuples:
        print(sklearn.metrics.balanced_accuracy_score(clf1.predict(X1), y1))    

0.9688232536333803
0.6005364498515183
0.9740235115661737
0.6379254386913906
0.9880381816658919
0.5753690362234873


In [57]:
for clf_name, clf1 in classifiers_res_dict.items():
    for X1, y1 in train_test_res_tuples:
        print(sklearn.metrics.confusion_matrix(clf1.predict(X1), y1))

[[ 9335     0]
 [  665 10000]]
[[1592   14]
 [ 113   30]]
[[ 9452     0]
 [  548 10000]]
[[1624   12]
 [  81   32]]
[[9999  244]
 [   1 9756]]
[[1686   40]
 [  19    4]]


## Further work

Improving results
* Outer cross-validation loop
* visualising the metrics
* using an F-score to penalise false positives or false negatives
 * using that score as the optimisation criteria for the hyper parameter tuning