In [275]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import scipy.stats as sts
import bpe

import keras as K
import keras.layers as L
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

from keras.callbacks import ModelCheckpoint

In [68]:
# Load the pre-computed inputs for the RNN: the encoded feature sequences,
# the label table and the fitted BPE encoder.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files from a trusted source.
with open('./processed/rnn_features.pkl', 'rb') as fh:
    features = pickle.load(fh)
labels = pd.read_csv('./processed/labels.csv')

# The path is kept exactly as in the original ('pbe_encoder.pkl', sic).
with open('./processed/pbe_encoder.pkl', 'rb') as fh:
    bpe_encoder = pickle.load(fh)

In [119]:
# Vocabulary size of the loaded BPE encoder; build_model() uses it as the
# Embedding layer's input_dim.
bpe_encoder.vocab_size

8192

In [60]:
# Sanity check: encode the PAD and UNK tokens to see which integer ids the
# encoder assigns to them.
list(bpe_encoder.transform([bpe_encoder.PAD + ' ' + bpe_encoder.UNK]))

[[0, 1]]

In [70]:
# Show the label table's shape next to the length of the first encoded sample.
labels.shape, len(features[0])

((159571, 7), 82)

In [93]:
# Hold out 20% of the samples for evaluation. A fixed random_state makes the
# split reproducible, so train/test membership does not change between runs.
x_train, x_test, y_train, y_test = train_test_split(
    features, labels.values, test_size=0.2, random_state=42
)

In [141]:
def build_model():
    """Build and compile the bidirectional-LSTM multi-label classifier.

    Pipeline (shapes given as (batch, len, features)):
    token ids -> Embedding(10) -> per-timestep Dense(128) -> two stacked
    bidirectional LSTMs (128 units per direction) -> per-timestep Dense(128)
    -> global max pooling over time -> Dense(128, relu) -> Dense(14),
    reshaped to (7, 2) and normalised with a softmax over the last axis.

    Returns:
        A compiled Keras model (Adam optimizer, categorical cross-entropy)
        mapping (batch, len) integer token ids to a (batch, 7, 2) tensor,
        i.e. a two-way distribution for each of the 7 outputs.
    """
    # Variable-length sequences of token ids.
    # NOTE(review): no mask is requested from the Embedding layer, so padded
    # positions are processed like real tokens by the layers below.
    l_input = L.Input(shape=(None, ))
    l_in2 = L.Embedding(input_dim=bpe_encoder.vocab_size, output_dim=10)(l_input)

    # (batch, len, 10) -> (batch, len, 128)
    l_in3 = L.TimeDistributed(L.Dense(units=128))(l_in2)

    # (batch, len, 128) -> (batch, len, 256)
    l_rnn1 = L.Bidirectional(L.LSTM(units=128, return_sequences=True))(l_in3)
    # (batch, len, 256) -> (batch, len, 256)
    l_rnn2 = L.Bidirectional(L.LSTM(units=128, return_sequences=True))(l_rnn1)

    # (batch, len, 256) -> (batch, len, 128)
    l_dense1 = L.TimeDistributed(L.Dense(units=128))(l_rnn2)
    # (batch, len, 128) -> (batch, 128)
    l_comb = L.GlobalMaxPool1D()(l_dense1)
    # (batch, 128) -> (batch, 128)
    l_dense2 = L.Dense(units=128, activation='relu')(l_comb)
    # (batch, 128) -> (batch, 14)
    l_final = L.Dense(units=2 * 7)(l_dense2)
    # (batch, 14) -> (batch, 7, 2)
    l_final_reshape = L.Reshape(target_shape=(7, 2))(l_final)
    # (batch, 7, 2) -> (batch, 7, 2): softmax over the 2 classes of each output
    l_prob = L.Softmax(axis=2)(l_final_reshape)

    # Keras 2 names these keywords `inputs` / `outputs`; the singular
    # `input` / `output` spellings are the legacy Keras 1 API.
    model = K.Model(inputs=l_input, outputs=l_prob)
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    return model

In [142]:
# Instantiate the compiled network defined by build_model().
model = build_model()



In [143]:
# Print the per-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_20 (InputLayer)        (None, None)              0         
_________________________________________________________________
embedding_20 (Embedding)     (None, None, 10)          81920     
_________________________________________________________________
time_distributed_32 (TimeDis (None, None, 128)         1408      
_________________________________________________________________
bidirectional_33 (Bidirectio (None, None, 256)         263168    
_________________________________________________________________
bidirectional_34 (Bidirectio (None, None, 256)         394240    
_________________________________________________________________
time_distributed_33 (TimeDis (None, None, 128)         32896     
_________________________________________________________________
global_max_pooling1d_15 (Glo (None, 128)               0         
__________

In [215]:
# Toy check of the "sort samples by sequence length" step used by the batch
# generator: sequences and their labels must stay aligned after sorting.
X = [['a', 'a'], ['b', 'b', 'b'], ['c']]
Y = [1, 2, 3]

# Stable sort of the (sequence, label) pairs, shortest sequence first.
by_length = list(zip(X, Y))
by_length.sort(key=lambda pair: len(pair[0]))

# Unzip back into two parallel tuples.
X = tuple(seq for seq, _ in by_length)
Y = tuple(label for _, label in by_length)

print(X, Y)

(['c'], ['a', 'a'], ['b', 'b', 'b']) (3, 1, 2)


In [245]:
# Scratch check: draw a 2x2 array of 0/1 samples with success probability 0.5
# (generator_batch1 uses the same call to build its UNK replacement mask).
np.random.binomial(1, 0.5, size=(2, 2))

array([[0, 0],
       [1, 0]])

In [277]:
def generator_batch1(X, Y, batch_size=6, smooth=0.2, unk_prob=0.07):
    """Yield an endless stream of (x, y) training batches.

    The samples are sorted by sequence length once. Every batch is a window
    of ``batch_size`` consecutive samples in that order, so sequences in a
    batch have similar lengths and need little padding. On each pass all
    window start positions are visited in a fresh random order; windows that
    start at neighbouring positions overlap.

    Args:
        X: sequence of token-id sequences (lengths may differ).
        Y: label rows aligned with ``X``; must stack into a 2-D array.
            Values are assumed to be 0/1, since they are one-hot encoded
            into 2 classes.
        batch_size: number of samples per batch.
        smooth: label smoothing; one-hot 1 becomes ``1 - smooth`` and
            one-hot 0 becomes ``smooth``.
        unk_prob: probability of replacing a token with the UNK id. Only the
            first ``len_min`` positions (the length of the shortest sequence
            in the batch) are eligible, so padding is never overwritten.

    Yields:
        Tuple ``(x, y)``: ``x`` is the post-padded id matrix of the batch and
        ``y`` the smoothed one-hot labels of shape (batch_size, n_labels, 2).

    Raises:
        ValueError: if there are fewer than ``batch_size`` samples. (Raised
            on the first ``next()``, since this is a generator.)
    """
    UNK_num = list(bpe_encoder.transform([bpe_encoder.UNK]))[0][0]
    PAD_num = list(bpe_encoder.transform([bpe_encoder.PAD]))[0][0]

    print(f'unk: {UNK_num}, pad: {PAD_num}')

    # Stable sort by sequence length, keeping each label with its sequence.
    pairs = sorted(zip(X, Y), key=lambda pair: len(pair[0]))
    if len(pairs) < batch_size:
        # Without this check there would be no window to draw and the
        # `while True` loop below would spin forever without yielding.
        raise ValueError(
            f'need at least batch_size={batch_size} samples, got {len(pairs)}'
        )

    # Keep the ragged sequences in a plain list: building an ndarray from
    # sequences of different lengths is rejected by recent NumPy versions.
    X = [seq for seq, _ in pairs]
    Y = np.array([label for _, label in pairs])

    # Valid window starts are 0 .. len(X) - batch_size inclusive, hence + 1.
    n_windows = len(X) - batch_size + 1

    while True:
        indexes = np.arange(n_windows)
        np.random.shuffle(indexes)

        for ind in indexes:
            window = X[ind : ind + batch_size]

            # One-hot encode every label into 2 classes, then smooth.
            y = to_categorical(np.expand_dims(Y[ind : ind + batch_size, :], axis=2), 2)
            y[y == 1] = 1 - smooth
            y[y == 0] = smooth

            x = pad_sequences(window, padding='post', value=PAD_num)

            # The window is sorted by length, so its first sequence is the
            # shortest one; positions below len_min hold real tokens in
            # every row of the batch.
            len_min = len(window[0])
            x_unk = np.random.binomial(1, unk_prob, (batch_size, len_min))
            # x[:, :len_min] is a view, so this writes through to x.
            x[:, :len_min][x_unk == 1] = UNK_num
            yield (x, y)

In [278]:
# Checkpoint callback used during training: writes the whole model (not only
# its weights) to ./models, with the epoch number formatted into the file
# name. save_best_only=False means no metric is used to filter the saves, and
# period=2 sets the checkpoint interval to 2 epochs.
checkpointer = ModelCheckpoint(
    filepath=r"./models/model3_{epoch:02d}.hdf5", 
    save_best_only=False,
    save_weights_only=False,
    period=2
)

In [None]:
# Train from the endless batch generator: 30 epochs of 500 generator batches
# each (the generator's default batch size applies), checkpointing through
# the `checkpointer` callback.
model.fit_generator(
    generator=generator_batch1(x_train, y_train),
    steps_per_epoch=500,
    epochs=30,
    callbacks=[checkpointer]
)

Epoch 1/30
unk: 1, pad: 0
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30

In [271]:
%%time

# Predict the first 100 test samples one at a time (each as a batch of one,
# so no padding is needed) and keep, for every output, the index of the more
# probable of its two classes.
predictions = np.array([
    np.argmax(model2.predict(np.array([sample]))[0], axis=1)
    for sample in x_test[:100]
])

CPU times: user 8.13 s, sys: 745 ms, total: 8.87 s
Wall time: 3.06 s


In [272]:
# Per-output accuracy on the evaluated test samples.
# accuracy_score expects (y_true, y_pred) in that order; the number of
# samples and outputs is taken from `predictions` instead of being
# hard-coded.
n_samples, n_outputs = predictions.shape
for i in range(n_outputs):
    print(accuracy_score(y_test[:n_samples, i], predictions[:, i]))

1.0
0.99
0.99
1.0
0.97
1.0
0.99


In [195]:
# Scratch object: an instance of an anonymous, empty class created on the fly
# (class name '', no bases, no attributes).
a = type('', (), {})()

In [200]:
a.y = 1

# getattr without a default raises AttributeError for a missing attribute.
# Catch and display it so the demonstration does not abort a full
# "Run All" of the notebook.
try:
    print(getattr(a, 'y'), getattr(a, 't'))
except AttributeError as err:
    print(f'AttributeError: {err}')

AttributeError: '' object has no attribute 't'

In [270]:
# NOTE(review): this cell was executed (In [270]) before the prediction cell
# (In [271]) that uses `model2`, but it sits below that cell in the notebook,
# so a top-to-bottom run reaches the prediction cell with `model2` undefined.
# The import also belongs with the other imports at the top of the notebook.
from keras.models import load_model

# Load a previously saved model from disk for evaluation.
model2 = load_model('./models/model_7_test.hdf5')