In [1]:
import numpy as np
from sklearn.decomposition import PCA
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split
import matplotlib
import tensorflow as tf
from tensorflow import keras as K
from tensorflow.keras.layers import Conv2D, Dense, Flatten
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import RMSprop

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
y_path = Path.cwd().parent.joinpath("Data","Split","train_labels.npy")
x_path = Path.cwd().parent.joinpath("Data","Split","train_features.npy")
x_test_path = Path.cwd().parent.joinpath("Data","Split","test_features.npy")
y_test_path = Path.cwd().parent.joinpath("Data","Split","test_labels.npy")

y_train = np.load(y_path)
x_train = np.load(x_path)
y_test = np.load(y_test_path)
x_test = np.load(x_test_path)

In [26]:
y_full = np.concatenate((y_train,y_test),axis=0)
x_full = np.vstack((x_train,x_test))

In [40]:
#Load data for CNN
path = '/p/project/training2005/HZG_Challenge/'

temperature_data = np.load(path+'tas_train.npy')
pressure_data = np.load(path+'psl_train.npy')

temp_new = np.load(path+'tas_predict.npy')
pressure_new = np.load(path+'psl_predict.npy')

x_train_full = np.concatenate([temperature_data, pressure_data], axis=1)

x_new = np.concatenate([temp_new, pressure_new], axis=1)


nao_data = np.load(path+'nao_index_train.npy')

In [12]:
from keras.utils import to_categorical
quant_down = np.quantile(nao_data,q=0.25)
quant_up = np.quantile(nao_data,q=0.75)
inds = np.where(np.logical_and(quant_down<nao_data, nao_data<quant_up))[0]
y_trim = nao_data[inds]
y_train = np.sign(y_trim)
y_binary = to_categorical(y_train)

Using TensorFlow backend.


In [6]:
#trim x too
x_trim = x_train_full[inds,]
temperature_data_trim = temperature_data[inds,]
pressure_data_trim = pressure_data_trim[inds,]

In [7]:
pd.DataFrame(y_train).describe()
pd.DataFrame(y_test).describe()
np.shape(temperature_data)
#matplotlib.pyplot.plot(pd.DataFrame(y_train).hist())

(450, 2322)

In [None]:
#reshape training data


In [8]:
#Reshape Data for CNN
temp_data = temperature_data_trim.reshape((temperature_data_trim.shape[0], 54, 43))
press_data = pressure_data_trim.reshape((pressure_data_trim.shape[0], 54, 43))

stacked_data = np.array([temp_data, press_data])
stacked_data = stacked_data.reshape((stacked_data.shape[1], stacked_data.shape[2], stacked_data.shape[3], stacked_data.shape[0]))
print(stacked_data.shape)

(450, 54, 43, 2)


In [None]:
#generate test data
pressure_data_trim

In [14]:
# model simple CNN
class CNN:
    def __init__(self):
        pass
    
    def setup(self, input_shape=(54, 43, 2), n_filters=[10], kernel_size=[[5, 5]], padding='valid', activation='relu', n_neurons=10, activation_dense='relu'):
        
        Input = K.Input(shape=input_shape)
        
        tmp = []
        tmp.append(Input)
        for f in range(len(n_filters)):
            conv = Conv2D(filters=n_filters[f], kernel_size=kernel_size[f], padding=padding, activation=activation)(tmp[-1])
            tmp.append(conv)
        
        flat = Flatten()(tmp[-1])
        
        dense = Dense(n_neurons, activation_dense)(flat)
        out = Dense(units=2, activation='softmax')(dense)
        
        model = K.Model(inputs=Input, outputs=out)
        
        return model
        

In [15]:
NN=CNN()
mod = NN.setup(input_shape=stacked_data.shape[1:])
mod.compile(loss=categorical_crossentropy, optimizer=RMSprop(),metrics=["accuracy"])

In [23]:
history = mod.fit(stacked_data, y_binary, batch_size=20, epochs=5, validation_split=0.01)

Train on 445 samples, validate on 5 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [24]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold

In [36]:
num_folds = 10
kfold = KFold(n_splits=10,random_state=1337,shuffle=True)
acc_per_fold = []
loss_per_fold = []

In [None]:
fold_no = 1
for train, test in kfold.split(stacked_data, y_binary):
    NN=CNN()
    mod = NN.setup(input_shape=stacked_data.shape[1:])
    mod.compile(loss=categorical_crossentropy, optimizer=RMSprop(),metrics=["accuracy"])
    history = mod.fit(stacked_data[train], y_binary[train], batch_size=20, epochs=5, validation_split=0.1)
    
    scores = mod.evaluate(stacked_data[test], y_binary[test], verbose=0)
    print(f'Score for fold {fold_no}: {mod.metrics_names[0]} of {scores[0]}; {mod.metrics_names[1]} of {scores[1]*100}%')
    acc_per_fold.append(scores[1] * 100)
    loss_per_fold.append(scores[0])

    # Increase fold number
    fold_no = fold_no + 1

In [38]:
# == Provide average scores ==
print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')
print('------------------------------------------------------------------------')

------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 5.518117753429882e-06 - Accuracy: 100.0%
------------------------------------------------------------------------
> Fold 2 - Loss: 1.1920928955078125e-07 - Accuracy: 100.0%
------------------------------------------------------------------------
> Fold 3 - Loss: 1.1920928955078125e-07 - Accuracy: 100.0%
------------------------------------------------------------------------
> Fold 4 - Loss: 1.1920928955078125e-07 - Accuracy: 100.0%
------------------------------------------------------------------------
> Fold 5 - Loss: 1.1920928955078125e-07 - Accuracy: 100.0%
------------------------------------------------------------------------
> Fold 6 - Loss: 1.1920928955078125e-07 - Accuracy: 100.0%
------------------------------------------------------------------------
> Fold 7 - Loss: 1.1920928955078125e-07 - Accura

In [None]:
#How does it generalize?
score = model.evaluate(input_test, target_test, verbose=0)