In [1]:
import pathlib
import datetime
import math
import functools
import numpy

In [2]:
import pandas

In [3]:
import iris

In [4]:
import matplotlib

In [5]:
%matplotlib inline

In [6]:
import sklearn
import sklearn.tree
import sklearn.preprocessing
import sklearn.ensemble
import sklearn.neural_network
import sklearn.metrics

In [7]:
# root_data_dir = '/data/users/shaddad/ds_cop/2021_opmet_challenge/ML'
root_data_dir = '/project/informatics_lab/data_science_cop/ML_challenges/2021_opmet_challenge'

## Exploring Falklands Rotor Data

In [8]:
falklands_dir = 'Rotors'
falklands_data_path = pathlib.Path(root_data_dir, falklands_dir)

In [9]:
falklands_new_training_data_path = pathlib.Path(falklands_data_path, 'new_training.csv')

In [10]:
falklands_training_df = pandas.read_csv(falklands_new_training_data_path, header=0).loc[1:,:]
falklands_training_df

Unnamed: 0,DTG,air_temp_obs,dewpoint_obs,wind_direction_obs,wind_speed_obs,wind_gust_obs,air_temp_1,air_temp_2,air_temp_3,air_temp_4,...,windspd_18,winddir_19,windspd_19,winddir_20,windspd_20,winddir_21,windspd_21,winddir_22,windspd_22,Rotors 1 is true
1,01/01/2015 00:00,283.9,280.7,110.0,4.1,-9999999.0,284.000,283.625,283.250,282.625,...,5.8,341.0,6.0,334.0,6.1,330.0,6.0,329.0,5.8,
2,01/01/2015 03:00,280.7,279.7,90.0,7.7,-9999999.0,281.500,281.250,280.750,280.250,...,6.8,344.0,5.3,348.0,3.8,360.0,3.2,12.0,3.5,
3,01/01/2015 06:00,279.8,278.1,100.0,7.7,-9999999.0,279.875,279.625,279.125,278.625,...,6.0,345.0,5.5,358.0,5.0,10.0,4.2,38.0,4.0,
4,01/01/2015 09:00,279.9,277.0,120.0,7.2,-9999999.0,279.625,279.250,278.875,278.250,...,3.1,338.0,3.5,354.0,3.9,9.0,4.4,22.0,4.6,
5,01/01/2015 12:00,279.9,277.4,120.0,8.7,-9999999.0,279.250,278.875,278.375,277.875,...,1.6,273.0,2.0,303.0,2.3,329.0,2.5,338.0,2.4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20101,31/12/2020 06:00,276.7,275.5,270.0,3.6,-9999999.0,277.875,277.750,277.625,277.500,...,12.1,223.0,11.8,221.0,11.4,219.0,11.3,215.0,11.4,
20102,31/12/2020 09:00,277.9,276.9,270.0,3.1,-9999999.0,277.875,277.625,277.875,277.875,...,10.2,230.0,10.8,230.0,11.6,227.0,12.3,222.0,12.0,
20103,31/12/2020 12:00,283.5,277.1,220.0,3.6,-9999999.0,281.125,280.625,280.125,279.625,...,10.3,218.0,11.9,221.0,12.8,222.0,11.9,225.0,10.6,
20104,31/12/2020 15:00,286.1,276.9,250.0,3.6,-9999999.0,284.625,284.125,283.625,283.000,...,9.4,218.0,8.6,212.0,8.3,218.0,8.7,226.0,10.1,


In [11]:
falklands_training_df = falklands_training_df.drop_duplicates(subset='DTG')

In [12]:
falklands_training_df.shape

(17507, 95)

### Specify and create input features

In [13]:
temp_feature_names = [f'air_temp_{i1}' for i1 in range(1,23)]
humidity_feature_names = [f'sh_{i1}' for i1 in range(1,23)]
wind_direction_feature_names = [f'winddir_{i1}' for i1 in range(1,23)]
wind_speed_feature_names = [f'windspd_{i1}' for i1 in range(1,23)]
target_feature_name = 'rotors_present'

In [14]:
def get_v_wind(wind_dir_name, wind_speed_name, row1):
    return math.cos(math.radians(row1[wind_dir_name])) * row1[wind_speed_name]

def get_u_wind(wind_dir_name, wind_speed_name, row1):
    return math.sin(math.radians(row1[wind_dir_name])) * row1[wind_speed_name]

In [15]:
u_feature_template = 'u_wind_{level_ix}'
v_feature_template = 'v_wind_{level_ix}'
u_wind_feature_names = []
v_wind_features_names = []
for wsn1, wdn1 in zip(wind_speed_feature_names, wind_direction_feature_names):
    level_ix = int( wsn1.split('_')[1])
    u_feature = u_feature_template.format(level_ix=level_ix)
    u_wind_feature_names += [u_feature]
    falklands_training_df[u_feature_template.format(level_ix=level_ix)] = falklands_training_df.apply(functools.partial(get_u_wind, wdn1, wsn1), axis='columns')
    v_feature = v_feature_template.format(level_ix=level_ix)
    v_wind_features_names += [v_feature]
    falklands_training_df[v_feature_template.format(level_ix=level_ix)] = falklands_training_df.apply(functools.partial(get_v_wind, wdn1, wsn1), axis='columns')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [16]:
falklands_training_df[target_feature_name] =  falklands_training_df['Rotors 1 is true']
falklands_training_df.loc[falklands_training_df[falklands_training_df['Rotors 1 is true'].isna()].index, target_feature_name] = 0.0
falklands_training_df[target_feature_name]  = falklands_training_df[target_feature_name] .astype(bool)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [17]:
falklands_training_df[target_feature_name].value_counts()

False    17058
True       449
Name: rotors_present, dtype: int64

In [18]:
falklands_training_df.columns

Index(['DTG', 'air_temp_obs', 'dewpoint_obs', 'wind_direction_obs',
       'wind_speed_obs', 'wind_gust_obs', 'air_temp_1', 'air_temp_2',
       'air_temp_3', 'air_temp_4',
       ...
       'v_wind_18', 'u_wind_19', 'v_wind_19', 'u_wind_20', 'v_wind_20',
       'u_wind_21', 'v_wind_21', 'u_wind_22', 'v_wind_22', 'rotors_present'],
      dtype='object', length=140)

### SPlit into traing/validate/test sets

In [19]:
test_fraction = 0.1
validation_fraction = 0.1

In [20]:
num_no_rotors = sum(falklands_training_df[target_feature_name] == False)
num_with_rotors = sum(falklands_training_df[target_feature_name] == True)

In [21]:
data_no_rotors = falklands_training_df[falklands_training_df[target_feature_name] == False]
data_with_rotors = falklands_training_df[falklands_training_df[target_feature_name] == True]

In [22]:
data_test = pandas.concat([data_no_rotors.sample(int(test_fraction * num_no_rotors)), data_with_rotors.sample(int(test_fraction * num_with_rotors))])
data_test[target_feature_name].value_counts()

False    1705
True       44
Name: rotors_present, dtype: int64

In [23]:
falklands_training_df['test_set'] = False
falklands_training_df.loc[data_test.index,'test_set'] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [24]:
data_working = falklands_training_df[falklands_training_df['test_set'] == False]
data_working_no_rotors = data_working[data_working[target_feature_name] == False]
data_working_with_rotors = data_working[data_working[target_feature_name] == True]

In [25]:
data_validation = pandas.concat(
    [data_working_no_rotors.sample(int(validation_fraction * num_no_rotors)), 
     data_working_with_rotors.sample(int(validation_fraction * num_with_rotors))])
falklands_training_df['validation_set'] = False
falklands_training_df.loc[data_validation.index,'validation_set'] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [26]:
data_validation[target_feature_name].value_counts()

False    1705
True       44
Name: rotors_present, dtype: int64

In [27]:
data_train = falklands_training_df[(~falklands_training_df['test_set']) & (~falklands_training_df['validation_set'])]

In [28]:
data_train[target_feature_name].value_counts()

False    13648
True       361
Name: rotors_present, dtype: int64

# Preprocess data into input for ML algorithm

In [29]:
input_feature_names = temp_feature_names + humidity_feature_names + u_wind_feature_names + v_wind_features_names

In [30]:
preproc_dict = {}
for if1 in input_feature_names:
    scaler1 = sklearn.preprocessing.StandardScaler()
    scaler1.fit(data_train[[if1]])
    preproc_dict[if1] = scaler1

In [31]:
target_encoder = sklearn.preprocessing.LabelEncoder()
target_encoder.fit(data_train[[target_feature_name]])

  return f(*args, **kwargs)


LabelEncoder()

Apply transformation to each input column

In [32]:
def preproc_input(data_subset, pp_dict):
    return numpy.concatenate([scaler1.transform(data_subset[[if1]]) for if1,scaler1 in pp_dict.items()],axis=1)

def preproc_target(data_subset, enc1):
     return enc1.transform(data_subset[[target_feature_name]])

In [33]:
X_train = preproc_input(data_train, preproc_dict)
y_train = preproc_target(data_train, target_encoder)

  return f(*args, **kwargs)


create target feature from rotors

In [34]:
y_train.shape, X_train.shape

((14009,), (14009, 88))

In [35]:
X_val = preproc_input(data_validation, preproc_dict)
y_val = preproc_target(data_validation, target_encoder)

  return f(*args, **kwargs)


In [36]:
X_test = preproc_input(data_test, preproc_dict)
y_test = preproc_target(data_test, target_encoder)

  return f(*args, **kwargs)


In [37]:
train_val_test_tuples = [
    (X_train, y_train),
    (X_val, y_val),
    (X_test, y_test),    
]

### train some classifiers

In [38]:
classifiers_params = {
    'decision_tree': {'class': sklearn.tree.DecisionTreeClassifier, 'opts': {'max_depth':10, 'class_weight':'balanced'}},
    'random_forest': {'class': sklearn.ensemble.RandomForestClassifier, 'opts': {'max_depth':10, 'class_weight':'balanced'}},
     'ann_3_50': {'class': sklearn.neural_network.MLPClassifier, 'opts': {'hidden_layer_sizes':(50,50,50)}},   
     'ann_2_200': {'class': sklearn.neural_network.MLPClassifier, 'opts': {'hidden_layer_sizes':(200,200)}},   
}



In [39]:
%%time
classifiers_dict = {}             
for clf_name, clf_params in classifiers_params.items():
    print(clf_name)
    clf1 = clf_params['class'](**clf_params['opts'])
    clf1.fit(X_train, y_train)
    classifiers_dict[clf_name] = clf1

decision_tree
random_forest
ann_3_50
ann_2_200
CPU times: user 2min 48s, sys: 5.44 s, total: 2min 53s
Wall time: 52 s


In [40]:
for clf_name, clf1 in classifiers_dict.items():
    for X1, y1 in train_val_test_tuples:
        print(sklearn.metrics.precision_recall_fscore_support(clf1.predict(X1), y1))

(array([0.87815064, 0.99445983]), array([0.99983315, 0.17754698]), array([0.93504974, 0.30130088]), array([11987,  2022]))
(array([0.85571848, 0.84090909]), array([0.9952251 , 0.13074205]), array([0.92021444, 0.22629969]), array([1466,  283]))
(array([0.86392962, 0.79545455]), array([0.99392713, 0.13108614]), array([0.92438029, 0.22508039]), array([1482,  267]))
(array([0.89617526, 0.99722992]), array([0.99991825, 0.20258863]), array([0.94520866, 0.33676333]), array([12232,  1777]))
(array([0.88211144, 0.84090909]), array([0.99536731, 0.15546218]), array([0.93532338, 0.26241135]), array([1511,  238]))
(array([0.88739003, 0.79545455]), array([0.99408673, 0.15418502]), array([0.93771305, 0.25830258]), array([1522,  227]))
(array([1., 1.]), array([1., 1.]), array([1., 1.]), array([13648,   361]))
(array([0.98475073, 0.15909091]), array([0.97843823, 0.21212121]), array([0.98158433, 0.18181818]), array([1716,   33]))
(array([0.98475073, 0.09090909]), array([0.97673066, 0.13333333]), array([

In [41]:
for clf_name, clf1 in classifiers_dict.items():
    for X1, y1 in train_val_test_tuples:
        print(sklearn.metrics.balanced_accuracy_score(clf1.predict(X1), y1))


0.5886900678834646
0.5629835758946004
0.5625066339140852
0.6012534398735679
0.5754147456467696
0.5741358750079596
1.0
0.5952797202797203
0.5550319953461315
0.959005851665409
0.645681646999749
0.6313765182186235


In [42]:
for clf_name, clf1 in classifiers_dict.items():
    for X1, y1 in train_val_test_tuples:
        print(sklearn.metrics.confusion_matrix(clf1.predict(X1), y1))

[[11985     2]
 [ 1663   359]]
[[1459    7]
 [ 246   37]]
[[1473    9]
 [ 232   35]]
[[12231     1]
 [ 1417   360]]
[[1504    7]
 [ 201   37]]
[[1513    9]
 [ 192   35]]
[[13648     0]
 [    0   361]]
[[1679   37]
 [  26    7]]
[[1679   40]
 [  26    4]]
[[13616     2]
 [   32   359]]
[[1676   31]
 [  29   13]]
[[1677   33]
 [  28   11]]


In [43]:
sklearn.metrics.confusion_matrix(classifiers_dict['decision_tree'].predict(X_val), y_val)

array([[1459,    7],
       [ 246,   37]])

In [44]:
sklearn.metrics.confusion_matrix(classifiers_dict['random_forest'].predict(X_val), y_val)

array([[1504,    7],
       [ 201,   37]])

### Resample the data 

Our yes/no classes for classification are very unbalanced, so we can try doing a naive resampling so we have equal representation fo the two classes in our sample set.

In [45]:
data_train_resampled = pandas.concat([
    data_train[data_train[target_feature_name] == True].sample(n=int(1e4), replace=True), 
    data_train[data_train[target_feature_name] == False].sample(n=int(1e4), replace=False),],
    ignore_index=True)

In [46]:
X_train_resampled = preproc_input(data_train_resampled, preproc_dict)
y_train_resampled = preproc_target(data_train_resampled, target_encoder)

  return f(*args, **kwargs)


In [47]:
train_val_test_res_tuples = [
    (X_train_resampled, y_train_resampled),
    (X_val, y_val),
    (X_test, y_test),    
]

In [48]:
%%time
classifiers_res_dict = {}                    
for clf_name, clf_params in classifiers_params.items():
    print(clf_name)
    clf1 = clf_params['class'](**clf_params['opts'])
    clf1.fit(X_train_resampled, y_train_resampled)
    classifiers_res_dict[clf_name] = clf1

decision_tree
random_forest
ann_3_50
ann_2_200
CPU times: user 3min, sys: 5.88 s, total: 3min 6s
Wall time: 56.6 s


In [49]:
for clf_name, clf1 in classifiers_res_dict.items():
    for X1, y1 in train_val_test_res_tuples:
        print(sklearn.metrics.precision_recall_fscore_support(clf1.predict(X1), y1))


(array([0.885 , 0.9977]), array([0.99740787, 0.89664779]), array([0.93784772, 0.94447863]), array([ 8873, 11127]))
(array([0.87096774, 0.86363636]), array([0.99597586, 0.14728682]), array([0.92928661, 0.25165563]), array([1491,  258]))
(array([0.87155425, 0.72727273]), array([0.99198932, 0.12749004]), array([0.92788011, 0.21694915]), array([1498,  251]))
(array([0.8962, 1.    ]), array([1.        , 0.90596122]), array([0.94525894, 0.95066071]), array([ 8962, 11038]))
(array([0.87859238, 0.86363636]), array([0.99601064, 0.15510204]), array([0.93362418, 0.26297578]), array([1504,  245]))
(array([0.88739003, 0.79545455]), array([0.99408673, 0.15418502]), array([0.93771305, 0.25830258]), array([1522,  227]))
(array([0.999, 1.   ]), array([1.      , 0.999001]), array([0.99949975, 0.99950025]), array([ 9990, 10010]))
(array([0.97184751, 0.22727273]), array([0.97989355, 0.17241379]), array([0.97585395, 0.19607843]), array([1691,   58]))
(array([0.96656891, 0.29545455]), array([0.98153663, 0.1

In [50]:
for clf_name, clf1 in classifiers_res_dict.items():
    for X1, y1 in train_val_test_res_tuples:
        print(sklearn.metrics.balanced_accuracy_score(clf1.predict(X1), y1))    

0.947027830108276
0.5716313384181055
0.5597396794663801
0.9529806124297879
0.5755563395570994
0.5741358750079596
0.9995004995004995
0.5761536736067212
0.5836254573300435
1.0
0.6073267995099633
0.625363893342334


In [52]:
for clf_name, clf1 in classifiers_res_dict.items():
    for X1, y1 in train_val_test_res_tuples:
        print(sklearn.metrics.confusion_matrix(clf1.predict(X1), y1))

[[8850   23]
 [1150 9977]]
[[1485    6]
 [ 220   38]]
[[1486   12]
 [ 219   32]]
[[ 8962     0]
 [ 1038 10000]]
[[1498    6]
 [ 207   38]]
[[1513    9]
 [ 192   35]]
[[ 9990     0]
 [   10 10000]]
[[1657   34]
 [  48   10]]
[[1648   31]
 [  57   13]]
[[10000     0]
 [    0 10000]]
[[1669   33]
 [  36   11]]
[[1664   29]
 [  41   15]]


## Further work
Improviong accuracy
* more sophisticated resampling (e.g. SMOTE)
* proper hyperpamater tuning to get best results with each classifier
* try other tree and NN implementations e.g. tensorflow, pytorch, xgboost
* try dimensionality reduction step

Result presentation/investigation
* some visualisation of results
* consolidate output into data frames
* look atfeature importance measures to see which features are most important

MLOps
* do hypyerparamter tuning using dask cluster
* execute training on spice (create SPICE dask cluster)
* save experiment details in experiment using MLflow
* replicate workflow
