In [1]:
import numpy as np
from sklearn.decomposition import PCA
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split
import matplotlib
import tensorflow as tf
from tensorflow import keras as K
from tensorflow.keras.layers import Conv2D, Dense, Flatten
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import RMSprop

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
y_path = Path.cwd().parent.joinpath("Data","Split","train_labels.npy")
x_path = Path.cwd().parent.joinpath("Data","Split","train_features.npy")
x_test_path = Path.cwd().parent.joinpath("Data","Split","test_features.npy")
y_test_path = Path.cwd().parent.joinpath("Data","Split","test_labels.npy")

y_train = np.load(y_path)
x_train = np.load(x_path)
y_test = np.load(y_test_path)
x_test = np.load(x_test_path)

In [3]:
y_full = np.concatenate((y_train,y_test),axis=0)
x_full = np.vstack((x_train,x_test))

In [4]:
#Load data for CNN
path = '/p/project/training2005/HZG_Challenge/'

temperature_data = np.load(path+'tas_train.npy')
pressure_data = np.load(path+'psl_train.npy')

temp_new = np.load(path+'tas_predict.npy')
pressure_new = np.load(path+'psl_predict.npy')

x_train_full = np.concatenate([temperature_data, pressure_data], axis=1)

x_new = np.concatenate([temp_new, pressure_new], axis=1)


nao_data = np.load(path+'nao_index_train.npy')

In [5]:
from keras.utils import to_categorical
quant_down = np.quantile(nao_data,q=0.25)
quant_up = np.quantile(nao_data,q=0.75)
inds_no_extr = np.where(np.logical_and(quant_down<nao_data, nao_data<quant_up))[0]
inds_extr = np.where(np.logical_or(quant_down>nao_data, nao_data>quant_up))[0]
y_trim = nao_data[inds_extr]
y_train = np.sign(y_trim)
one_hot = np.zeros((len(y_train),2))
one_hot[y_train==1.] = [1, 0]
one_hot[y_train==-1.] = [0, 1]

#same for non extreme

y_trim_val = nao_data[inds_no_extr]
y_train_val = np.sign(y_trim_val)
one_hot_val = np.zeros((len(y_train_val),2))
one_hot_val[y_train_val==1.] = [1, 0]
one_hot_val[y_train_val==-1.] = [0, 1]

#same for all
y_train_all = np.sign(nao_data)
one_hot_all = np.zeros((len(y_train_all),2))
one_hot_all[y_train_all==1.] = [1, 0]
one_hot_all[y_train_all==-1.] = [0, 1]


Using TensorFlow backend.


In [6]:
np.shape(inds_extr)

(450,)

In [7]:
#trim x too
x_trim = x_train_full[inds_extr,]
temperature_data_trim = temperature_data[inds_extr,]
pressure_data_trim = pressure_data[inds_extr,]

In [8]:
#Take data that was not extreme (and thus not predicted as benchmark)
temperature_data_val = temperature_data[inds_no_extr,]
pressure_data_val = pressure_data[inds_no_extr,]

In [9]:
pd.DataFrame(y_train).describe()
pd.DataFrame(y_test).describe()
np.shape(temperature_data)
#matplotlib.pyplot.plot(pd.DataFrame(y_train).hist())

(900, 2322)

In [10]:
#reshape test data
temp_cnn_new = temp_new.reshape((temp_new.shape[0], 54, 43))
press_cnn_new = pressure_new.reshape((pressure_new.shape[0], 54, 43))
stacked_predict = np.array([temp_cnn_new, press_cnn_new])
stacked_predict = stacked_predict.reshape((stacked_predict.shape[1], stacked_predict.shape[2], stacked_predict.shape[3], stacked_predict.shape[0]))
print(stacked_predict.shape)

(100, 54, 43, 2)


In [11]:
#Reshape Data for validation
temp_data_val = temperature_data_val.reshape((temperature_data_val.shape[0], 54, 43))
press_data_val = pressure_data_val.reshape((pressure_data_val.shape[0], 54, 43))

stacked_data_val = np.array([temp_data_val, press_data_val])
stacked_data_val = stacked_data_val.reshape((stacked_data_val.shape[1], stacked_data_val.shape[2], stacked_data_val.shape[3], stacked_data_val.shape[0]))
print(stacked_data_val.shape)

(450, 54, 43, 2)


In [13]:
#Reshape Data for CNN
temp_data_train = temperature_data_trim.reshape((temperature_data_trim.shape[0], 54, 43))
press_data_train = pressure_data_trim.reshape((pressure_data_trim.shape[0], 54, 43))

stacked_data_train = np.array([temp_data_train, press_data_train])
stacked_data_train = stacked_data_train.reshape((stacked_data_train.shape[1], stacked_data_train.shape[2], stacked_data_train.shape[3], stacked_data_train.shape[0]))

#for all data
temp_data = temperature_data.reshape((temperature_data.shape[0], 54, 43))
press_data = pressure_data.reshape((pressure_data.shape[0], 54, 43))

stacked_data = np.array([temp_data, press_data])
stacked_data = stacked_data.reshape((stacked_data.shape[1], stacked_data.shape[2], stacked_data.shape[3], stacked_data.shape[0]))
print(stacked_data.shape)

(900, 54, 43, 2)


In [None]:
#generate test data
pressure_data_trim

In [15]:
# model simple CNN
class CNN:
    def __init__(self):
        pass
    
    def setup(self, input_shape=(54, 43, 2), n_filters=[10], kernel_size=[[5, 5]], padding='valid', activation='relu', n_neurons=10, activation_dense='relu'):
        
        Input = K.Input(shape=input_shape)
        
        tmp = []
        tmp.append(Input)
        for f in range(len(n_filters)):
            conv = Conv2D(filters=n_filters[f], kernel_size=kernel_size[f], padding=padding, activation=activation)(tmp[-1])
            tmp.append(conv)
        
        flat = Flatten()(tmp[-1])
        
        dense = Dense(n_neurons, activation_dense)(flat)
        out = Dense(units=2, activation='softmax')(dense)
        
        model = K.Model(inputs=Input, outputs=out)
        
        return model
        

In [16]:
NN=CNN()
mod = NN.setup(input_shape=stacked_data.shape[1:])
mod.compile(loss=categorical_crossentropy, optimizer=RMSprop(),metrics=["accuracy"])

Instructions for updating:
Colocations handled automatically by placer.


In [19]:
history = mod.fit(stacked_data, one_hot_all, batch_size=20, epochs=5, validation_split=0.02)

Train on 882 samples, validate on 18 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold

In [24]:
num_folds = 10
kfold = KFold(n_splits=10,random_state=1337,shuffle=True)
acc_per_fold = []
loss_per_fold = []

In [25]:
fold_no = 1
for train, test in kfold.split(stacked_data_train, one_hot):
    NN=CNN()
    mod = NN.setup(input_shape=stacked_data.shape[1:])
    mod.compile(loss=categorical_crossentropy, optimizer=RMSprop(),metrics=["accuracy"])
    history = mod.fit(stacked_data[train], one_hot_all[train], batch_size=20, epochs=5, validation_split=0.1)
    
    scores = mod.evaluate(stacked_data[test], one_hot_all[test], verbose=0)
    print(f'Score for fold {fold_no}: {mod.metrics_names[0]} of {scores[0]}; {mod.metrics_names[1]} of {scores[1]*100}%')
    acc_per_fold.append(scores[1] * 100)
    loss_per_fold.append(scores[0])

    # Increase fold number
    fold_no = fold_no + 1

Train on 364 samples, validate on 41 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 1: loss of 0.7782579700152079; acc of 40.00000059604645%
Train on 364 samples, validate on 41 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 2: loss of 0.9614929702546862; acc of 53.33333611488342%
Train on 364 samples, validate on 41 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 3: loss of 0.7287442803382873; acc of 51.11111402511597%
Train on 364 samples, validate on 41 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 4: loss of 0.7571892552905612; acc of 55.55555820465088%
Train on 364 samples, validate on 41 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 5: loss of 0.7994131220711602; acc of 55.55555820465088%
Train on 364 samples, validate on 41 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 6: loss of 0.6197706699371338; acc of 57.77778029441833

In [26]:
# == Provide average scores ==
print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')
print('------------------------------------------------------------------------')

------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.7782579700152079 - Accuracy: 40.00000059604645%
------------------------------------------------------------------------
> Fold 2 - Loss: 0.9614929702546862 - Accuracy: 53.33333611488342%
------------------------------------------------------------------------
> Fold 3 - Loss: 0.7287442803382873 - Accuracy: 51.11111402511597%
------------------------------------------------------------------------
> Fold 4 - Loss: 0.7571892552905612 - Accuracy: 55.55555820465088%
------------------------------------------------------------------------
> Fold 5 - Loss: 0.7994131220711602 - Accuracy: 55.55555820465088%
------------------------------------------------------------------------
> Fold 6 - Loss: 0.6197706699371338 - Accuracy: 57.777780294418335%
-----------------------------------------------------------------------

In [106]:
#How does it generalize?
#score = model.evaluate(input_test, target_test, verbose=0)
predictions= mod.predict(stacked_predict)
final_predictions_extr = np.argmax(predictions, axis = 1)

In [76]:
final_predictions

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0])

In [107]:
final_predictions_extr

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [113]:
score_eval = mod.evaluate(stacked_data, one_hot, verbose=0)

In [114]:
score_eval

[0.6958862484825982, 0.53333336]