In [388]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
#from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, precision_score
from matplotlib import pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from collections import Counter
%matplotlib inline
import numpy as np
import pandas as pd

In [389]:
# Generate and save Test/Train Data - 3categories
# X_tas = np.load('../Data/tas_train.npy')
# X_psl = np.load('../Data/psl_train.npy')
# y=np.load('../Data/nao_index_train.npy')

# med_ids = np.where(np.logical_and(-1.5<y, y<1.5))[0]
# y[y>=1.5] = 1 
# y[y<=-1.5] = -1
# y[med_ids] = 0
# X = np.concatenate((X_tas,X_psl),axis=1)
# y=y.reshape(-1,1)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# np.save('../Data/Split/X_train_3cat.npy', X_train)
# np.save('../Data/Split/y_train_3cat.npy', y_train)
# np.save('../Data/Split/X_test_3cat.npy', X_test)
# np.save('../Data/Split/y_test_3cat.npy', y_test)

In [411]:
#Load train/test data
X_train = np.load('../Data/Split/X_train_3cat.npy')
X_test = np.load('../Data/Split/X_test_3cat.npy')
y_train = np.load('../Data/Split/y_train_3cat.npy')
y_test = np.load('../Data/Split/y_test_3cat.npy')

In [391]:
pca_temp = PCA(n_components=200)
pca_press = PCA(n_components=200)
X_tas_pca = pca_temp.fit_transform(X_train[:,0:2321])
X_psl_pca = pca_press.fit_transform(X_train[:,2322:4643])
print('% Explained Variance (Temp): '+str(pca_temp.explained_variance_ratio_.sum()))
print('% Explained Variance (Press): '+str(pca_temp.explained_variance_ratio_.sum()))
X_train_pca = np.concatenate((X_tas_pca,X_psl_pca),axis=1)

% Explained Variance (Temp): 0.9792618859061815
% Explained Variance (Press): 0.9792618859061815


In [392]:
#np.histogram(y_train, bins=10)
#plt.hist(y_train,bins=10)

In [393]:
#y_train_bin = np.sign(y_train).reshape(-1,)
# kbin = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='kmeans')
# kbin = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')

# y_train_cat= kbin.fit_transform(y_train.reshape(-1,1)).reshape(-1,)
# print(pd.Series(y_train_cat).value_counts())
# print(kbin.bin_edges_)

In [394]:
# Try some oversampling to combat the class imbalance
x_resampled, y_resampled = SMOTE().fit_resample(X_train_pca, y_train)
print(sorted(Counter(y_resampled).items()))

#x_resampled, y_resampled = RandomOverSampler().fit_resample(X_train_pca, y_train)
#print(sorted(Counter(y_resampled).items()))

[(-1.0, 625), (0.0, 625), (1.0, 625)]


In [395]:
#rf = RandomForestClassifier(random_state=1337)
rf = RandomForestClassifier(random_state=1337, criterion='entropy', max_depth=10, min_samples_split=15)
#ad = AdaBoostClassifier(random_state=1337)

In [396]:
#train best model
clf = rf.fit(x_resampled, y_resampled)

In [387]:
# HPO
## Random Forest ##
# parameters = {'max_depth':(10, 100), 'min_samples_split':[5, 15], 'criterion':['entropy','gini']}
# clf = GridSearchCV(rf, parameters)
# #clf.fit(X_train_pca, y_train_cat)
# #clf.fit(X_train,y_train_bin)
# clf.fit(x_resampled, y_resampled)
# print(clf.best_score_)
# print(clf.best_params_)

## AdaBoost ##
# parameters = {'n_estimators':[50, 100], 'learning_rate':[0.1, 1, 5]}
# clf = GridSearchCV(ad, parameters)
# #clf.fit(X_train_pca, y_train_bin)
# clf.fit(X_train_autoencoder, y_train_cat)

- With PCA(50 components):
0.5405785123966942
{'criterion': 'gini', 'max_depth': 100, 'min_samples_split': 15}
- With PCA(100 components):
0.5324380165289255
{'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 15}
- With PCA(200 components):
0.5406749311294765
{'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 15}
- Adaboost/PCA (Multiclass):
0.44278236914600555
{'learning_rate': 0.1, 'n_estimators': 50}

# Testing

In [397]:
#transform test data
X_tas_pca_test = pca_temp.transform(X_test[:,0:2321])
X_psl_pca_test = pca_press.transform(X_test[:,2322:4643])
X_test_pca = np.concatenate((X_tas_pca_test,X_psl_pca_test),axis=1)
#y_test_bin = np.sign(y_test)
#y_test_cat = kbin.transform(y_test.reshape(-1,1)).reshape(-1,)
#print(pd.Series(y_test_cat).value_counts())

In [398]:
y_pred = clf.predict(X_test_pca)

In [399]:
#confusion_matrix(y_test_cat,y_pred)
confusion_matrix(y_test,y_pred)

array([[  0,  20,   0],
       [  0, 147,   0],
       [  0,  13,   0]])

In [400]:
print('F1 score: '+str(f1_score(y_test, y_pred, average='macro')))

F1 score: 0.2996941896024465


# Try something else

In [405]:
import keras
from keras import layers
from keras.optimizers import Adam

In [406]:
encoding_dim = 32

In [409]:
input_img = keras.Input(shape=(4644,))
encoded = layers.Dense(encoding_dim, activation='relu')(input_img)
decoded = layers.Dense(4644, activation='sigmoid')(encoded)
autoencoder = keras.Model(input_img, decoded)
encoder = keras.Model(input_img, encoded)
encoded_input = keras.Input(shape=(encoding_dim,))
decoder_layer = autoencoder.layers[-1]
decoder = keras.Model(encoded_input, decoder_layer(encoded_input))
customAdam = Adam(lr=0.001)
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

In [410]:
autoencoder.fit(X_train, X_train, epochs=100, batch_size=50, shuffle=False, validation_data=(X_test, X_test))

Train on 720 samples, validate on 180 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

<keras.callbacks.History at 0x7f65900c0ba8>

In [267]:
X_train_autoencoder = encoder.predict(X_train)