### Modeling - cnn

In [36]:
import pandas as pd
import numpy as np
import os
import glob
from pyts.image import GramianAngularField
import random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Sequential
from tensorflow.keras import Input
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dropout, Dense, BatchNormalization, Conv1D, MaxPooling1D
from tensorflow.keras import backend as K, callbacks
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical

from pyts.image import MarkovTransitionField
from sklearn.preprocessing import QuantileTransformer
# from tsaug import random_time_warp
import math

In [2]:
path = '/Users/yutingmei/Projects/APlusBernstein-Project/model'
filenames = [i for i in glob.glob(os.path.join(path, "*classification.csv"))]
df = [pd.read_csv(file, index_col=0, parse_dates=True) 
      for file in filenames]
test = df[0]
train = df[1]

In [10]:
# weighted version
path2 = '/Users/yutingmei/Projects/APlusBernstein-Project/data/processed/weight/feature/dt_weight_all.csv'
df2 = pd.read_csv(path2, index_col=110, parse_dates=True)

In [48]:
# weighted version + add moving avg, moving vol
path3 = '/Users/yutingmei/Projects/APlusBernstein-Project/data/processed/weight/feature/dt_weight_add_555.csv'
df3 = pd.read_csv(path3, index_col=110, parse_dates=True)

#### conv1d

In [19]:
train_ = np.array(train.iloc[:,:-2])
test_= np.array(test.iloc[:,:-2])

In [25]:
# split a multivariate sequence into samples
def split_sequences(sequences, n_steps):
    X, y = list(), list()
    for i in range(len(sequences)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the dataset
        if end_ix > len(sequences):
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

In [160]:
train_x, train_y = split_sequences(train_, n_steps=60)

In [79]:
n_features = train_x.shape[2]
n_steps=60
# define model
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(n_steps, n_features)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(1))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
# fit model
# _, accuracy = model.evaluate(test_x, test_y, batch_size=32, verbose=0)
model.fit(train_x, train_y,  batch_size=128, epochs=300, validation_data=(test_x, test_y), verbose=0)

<tensorflow.python.keras.callbacks.History at 0x7f89382a9250>

In [80]:
_, accuracy = model.evaluate(test_x, test_y, batch_size=32, verbose=0)

In [86]:
accuracy

0.4887794256210327

### conv2d

In [3]:
# train_ = np.array(train.iloc[:,:-3])
train_target = np.array(train.iloc[:,-3])
# test_= np.array(test.iloc[:,:-3])
test_target= np.array(test.iloc[:,-3])

In [4]:
mtf = MarkovTransitionField(image_size=30)
im_train = mtf.fit_transform(train.iloc[:,:-3])
im_test = mtf.transform(test.iloc[:,:-3])

In [5]:
train_X = im_train.reshape((im_train.shape[0], im_train.shape[1], im_train.shape[2],1))
test_X = im_test.reshape((im_test.shape[0], im_test.shape[1], im_test.shape[2],1))

In [8]:
def model2(train_data, train_target, x_test, y_test,):
    LR = 1e-3
    batch_size = 128
    n_epochs = 500
    model2d = Sequential()
    model2d.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=train_data.shape[1:]))
    model2d.add(BatchNormalization())
    # model2d.add(Conv2D(32, kernel_size=(3, 3), activation='relu'))
    # model2d.add(BatchNormalization())
    # model2d.add(Conv2D(32, kernel_size=(3, 3), strides=2, padding='same', activation='relu'))
    # model2d.add(BatchNormalization())
    # model2d.add(Dropout(0.4))
    # model2d.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
    # model2d.add(BatchNormalization()),
    # model2d.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
    # model2d.add(BatchNormalization()),
    model2d.add(Conv2D(64, kernel_size=(3, 3), strides=2, padding='same', activation='relu'))
    model2d.add(BatchNormalization()),
    model2d.add(Dropout(0.4)),
    model2d.add(Conv2D(128, kernel_size=4, activation='relu'))
    model2d.add(BatchNormalization())
    model2d.add(Flatten())
    model2d.add(Dropout(0.4))
    model2d.add(Dense(1, activation='sigmoid'))
    model2d.compile(optimizer=keras.optimizers.Adam(lr=LR), loss='binary_crossentropy', metrics=["accuracy"])
    model2d.fit(train_data, train_target, epochs=n_epochs, batch_size=batch_size, verbose=0,validation_split=0.25)
    print("Evaluate on test data")
    results = model2d.evaluate(x_test, y_test, batch_size=128)
    print("test loss, test acc:", results)
    return model2d

In [9]:
mdl2 = model2(train_X, train_target, test_X, test_target)

Evaluate on test data
test loss, test acc: [2.769523859024048, 0.49787911772727966]


In [11]:
yhat = mdl2.predict_classes(test_X)



In [23]:
path_yp = '/Users/yutingmei/Projects/APlusBernstein-Project/model/cnn/'
pd.DataFrame(yhat).to_csv(path_yp + 'ypred_1st.csv', index = False)

#### * try weighted strategy version

In [27]:
# do quantile transformation first for weighted version
X = df2.iloc[:, :-3]
quantile = QuantileTransformer(output_distribution='normal')
X_trans = quantile.fit_transform(X)

In [32]:
X_trans = pd.DataFrame(X_trans, columns = X.columns, index = X.index)

In [40]:
def traintest_split(proportion, df):
    test_size = math.ceil(df.shape[0] * proportion)

    X_train = df[:-test_size]
    X_test = df[-test_size:]
    return X_train, X_test

In [41]:
trainw, testw = traintest_split(.2, df2)

In [42]:
def trans_shape_2d(train, test, i, img_size):
    train_target = np.array(train.iloc[:,i])
    test_target= np.array(test.iloc[:,i])
    mtf = MarkovTransitionField(image_size=img_size)
    im_train = mtf.fit_transform(train.iloc[:,:i])
    im_test = mtf.transform(test.iloc[:,:i])
    train_X = im_train.reshape((im_train.shape[0], im_train.shape[1], im_train.shape[2],1))
    test_X = im_test.reshape((im_test.shape[0], im_test.shape[1], im_test.shape[2],1))
    return train_X, test_X, train_target, test_target

In [43]:
X_trainw, X_testw, y_trainw, y_testw = trans_shape_2d(trainw, testw, -3, 30)

In [44]:
# accuracy improved!!
mdl2w = model2(X_trainw, y_trainw, X_testw, y_testw)

Evaluate on test data
test loss, test acc: [1.0969626903533936, 0.534401535987854]


In [45]:
yhatw = mdl2w.predict_classes(X_testw)



In [47]:
path_yp = '/Users/yutingmei/Projects/APlusBernstein-Project/model/cnn/'
pd.DataFrame(yhatw).to_csv(path_yp + 'ypred_w_1st.csv', index = False)

* try weighted + add moving avg, moving vol

In [16]:
def q_trans(df,i):
    X = df.iloc[:, :-i]
    quantile = QuantileTransformer(output_distribution='normal')
    X_trans = quantile.fit_transform(X)
    X_trans = pd.DataFrame(X_trans, columns = X.columns, index = X.index)
    trans = pd.concat([X_trans, df.iloc[:,-i:]], axis=1)
    return trans

In [56]:
trans = q_trans(df3,3)

In [57]:
trainwa, testwa = traintest_split(.2, trans)

In [58]:
X_trainwa, X_testwa, y_trainwa, y_testwa = trans_shape_2d(trainwa, testwa, -3, 30)

In [62]:
# accuracy decrease, might be the reason of adding too much features but not enough data
mdl2wa = model2(X_trainwa, y_trainwa, X_testwa, y_testwa)

Evaluate on test data
test loss, test acc: [1.2016968727111816, 0.5014137625694275]


In [63]:
yhatwa = mdl2wa.predict_classes(X_testwa)



In [64]:
path_yp = '/Users/yutingmei/Projects/APlusBernstein-Project/model/cnn/'
pd.DataFrame(yhatwa).to_csv(path_yp + 'ypred_w_2st.csv', index = False)

* try data augmentation

In [18]:
from tsaug import TimeWarp, Crop, Quantize, Drift, Reverse
import pandas as pd
import numpy as np
import os
import glob
from pyts.image import MarkovTransitionField
from sklearn.preprocessing import QuantileTransformer

In [19]:
transag = q_trans(df2,3)

In [23]:
transag_x = transag.iloc[:,:-3]
transag_y = transag.iloc[:,-3]

In [14]:
augmenter = (
    TimeWarp() * 5  # random time warping 5 times in parallel
    + Crop(size=300)  # random crop subsequences with length 300
    + Drift(max_drift=(0.1, 0.5)) @ 0.8  # with 80% probability, random drift the signal up to 10% - 50%
    + Reverse() @ 0.2  # with 50% probability, reverse the sequence
)

In [28]:
mtf = MarkovTransitionField(image_size=30)
im_x = mtf.fit_transform(transag_x)
x = im_x.reshape((im_x.shape[0], im_x.shape[1], im_x.shape[2],1))

In [37]:
transag_x = np.array(transag.iloc[:,:-3])
X = transag_x.reshape((transag_x.shape[0], transag_x.shape[1], 1))

In [47]:
y = np.arange(len(df2))

In [32]:
# y = transag.iloc[:,-3]
# y = np.array(y)

In [49]:
# x_aug, y_aug = augmenter.augment(y, transag_x)