In [68]:
from keras.layers import Dense, LSTM, Dropout, Activation, Input, Embedding, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras import regularizers, Model, Sequential, callbacks
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
import numpy as np
from sklearn.metrics import f1_score, recall_score, precision_score

In [2]:
from src.config import data_dir, models_dir
from src.helpers import calc_metrics, plot_tfidf_classfeats_h, top_feats_by_class, init_dir, save_model, load_model, print_dict
from src.pipeline import load_data, DATAFILE

In [101]:
class Metrics(callbacks.Callback):
    """Keras callback that prints validation F1, precision and recall after each epoch.

    The scores are computed with scikit-learn on the whole validation set from
    the model's predictions rounded to the nearest integer, and are accumulated
    in ``val_f1s``, ``val_recalls`` and ``val_precisions`` (one entry per epoch).

    Parameters
    ----------
    validation_data : tuple, optional
        ``(inputs, targets)`` to evaluate on. When omitted, the callback looks
        for a ``validation_data`` attribute on itself (set by the Keras 2.x
        training loop when ``fit`` receives validation data) and then on the
        model.

    Raises
    ------
    RuntimeError
        At the end of an epoch, if no validation data can be found.
    """

    def __init__(self, validation_data=None):
        super().__init__()
        self._explicit_validation_data = validation_data

    def on_train_begin(self, logs=None):
        # Start with fresh histories so the same instance can be reused across fits.
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def _validation_arrays(self):
        """Return ``(inputs, targets)`` from the first source that provides them."""
        candidates = (
            self._explicit_validation_data,
            getattr(self, "validation_data", None),
            getattr(self.model, "validation_data", None),
        )
        for candidate in candidates:
            if candidate is not None and len(candidate) >= 2:
                return candidate[0], candidate[1]
        raise RuntimeError(
            "Metrics callback found no validation data: pass validation_data to "
            "model.fit() or to Metrics(validation_data=(x_val, y_val))."
        )

    def on_epoch_end(self, epoch, logs=None):
        val_inputs, val_targ = self._validation_arrays()
        # Round the model outputs to hard 0/1 predictions before scoring.
        val_predict = np.asarray(self.model.predict(val_inputs)).round()
        _val_f1 = f1_score(val_targ, val_predict)
        _val_recall = recall_score(val_targ, val_predict)
        _val_precision = precision_score(val_targ, val_predict)
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print(f" — val_f1: {_val_f1:0.3f} — val_precision: {_val_precision:0.3f} — val_recall {_val_recall:0.3f}")


# Has no effect until it is handed to training: model.fit(..., callbacks=[metrics]).
metrics = Metrics()

In [3]:
from keras import backend as K

def f1(y_true, y_pred):
    """Batch-wise F1 score expressed with Keras backend ops.

    Counts are taken after clipping values into [0, 1] and rounding them, so
    the result is the harmonic mean of precision and recall for the current
    batch only, not for the whole epoch. ``K.epsilon()`` is added to every
    denominator to avoid division by zero.
    """
    def _count_positive(tensor):
        # Number of entries that round to 1 once clipped into [0, 1].
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _count_positive(y_true * y_pred)      # true positives
    actual = _count_positive(y_true)             # positives in the labels
    predicted = _count_positive(y_pred)          # positives in the predictions

    precision_value = hits / (predicted + K.epsilon())
    recall_value = hits / (actual + K.epsilon())

    harmonic_half = (precision_value * recall_value) / (
        precision_value + recall_value + K.epsilon()
    )
    return 2 * harmonic_half

In [99]:
# Load the dataset and separate the raw texts from their labels.
data = load_data()
X = data["text"]
y = data["label"]

In [100]:
# Character vocabulary: every distinct character found in the texts, sorted so
# that the character <-> index mappings are deterministic (indices start at 0).
chars = sorted(set("".join(X.values)))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

total chars: 202


In [101]:
# Encode every text as the list of its character indices.
X_seq = X.map(lambda text: [char_indices[ch] for ch in text]).values

In [102]:
# One one-hot matrix of shape (len(text), len(chars)) per text.
sequences = [to_categorical(x, num_classes=len(chars)) for x in X_seq]
# Texts differ in length, so the matrices cannot be stacked into one regular
# array. Calling np.array() on such a ragged list is version-dependent (recent
# NumPy raises ValueError), so fill a 1-D object array explicitly instead.
X_1 = np.empty(len(sequences), dtype=object)
for row, one_hot in enumerate(sequences):
    X_1[row] = one_hot

In [108]:
# Shape of the one-hot matrix of the third text: (characters in the text, vocabulary size).
sequences[2].shape

(6, 202)

### Try one-hot-encoded (OHE) embeddings

In [43]:
# Encode each text as character ids shifted by +1 so that id 0 never denotes a
# real character: 0 is the padding value used below and is masked by the
# embedding layer (mask_zero=True). char_indices is 0-based, so without the
# shift the first vocabulary character would be indistinguishable from padding.
X_seq = X.map(lambda text: [char_indices[ch] + 1 for ch in text]).values
# Right-pad every sequence with 0 up to the length of the longest text.
X_seq = sequence.pad_sequences(X_seq, value=0, padding="post")

In [63]:
# Hold out 30% of the samples for evaluation. The split is stratified on the
# label and uses a fixed seed so it is repeatable.
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    X_seq,
    y,
    test_size=test_size,
    random_state=42,
    stratify=y,
)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 4272, Num. of test: 1831


In [47]:
# One-hot lookup table for the embedding layer, shape (vocab + 1, vocab):
# row 0 is the all-zero padding row and row k (k >= 1) is the unit vector with
# its 1 in column k - 1.
# Built directly rather than by looping over the 0-based char_indices and
# writing embedding_matrix[i][i - 1]: for i == 0 that wrote to column -1 (the
# last column) and the final row was never filled.
embedding_matrix = np.vstack([
    np.zeros((1, len(char_indices))),
    np.eye(len(char_indices)),
])

In [52]:
# Display the lookup matrix built in the previous cell (NumPy abbreviates large arrays).
embedding_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [87]:
# Frozen embedding layer initialised from embedding_matrix. Index 0 is treated
# as padding and masked for downstream layers (mask_zero=True). No input_length
# is given, so sequences of any length are accepted.
char_embedding_layer = Embedding(
    input_dim=len(char_indices) + 1,
    output_dim=len(char_indices),
    weights=[embedding_matrix],
    trainable=False,  # a real boolean instead of the integer 0
    mask_zero=True,
)

In [88]:
# Variable-length sequences of integer character ids go in; the embedding
# layer turns each id into its vector.
sequence_input = Input(dtype='int32', shape=(None,))
embedded_sequences = char_embedding_layer(sequence_input)

In [90]:
# Recurrent head: a 100-unit LSTM summarises the embedded sequence and a single
# sigmoid unit produces the binary prediction.
# Alternative kept for reference (disabled): x = Flatten()(embedded_sequences)
x = LSTM(units=100)(embedded_sequences)
output = Dense(units=1, activation="sigmoid")(x)

In [91]:
# Assemble the embedding -> LSTM -> sigmoid graph and compile it for binary
# classification, tracking the batch-wise f1 metric and accuracy.
model = Model(inputs=sequence_input, outputs=output)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[f1, "acc"],
)
# summary() prints the table itself and returns None, so it is not wrapped in
# print(), which would append a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_6 (Embedding)      (None, None, 202)         41006     
_________________________________________________________________
lstm_9 (LSTM)                (None, 100)               121200    
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 101       
Total params: 162,307
Trainable params: 121,301
Non-trainable params: 41,006
_________________________________________________________________
None


In [None]:
# Train for 10 epochs in mini-batches of 64, evaluating on the held-out split
# after every epoch. No class_weight is passed, so the loss is unweighted.
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, y_test),
)

In [29]:
# Fixed sequence length: the 60th percentile of the text lengths. Longer texts
# are truncated; shorter ones keep all-False rows at the end.
maxlen = int(X.map(len).quantile(0.6))
print(f"Max len: {maxlen}")
print('Vectorization...')
# Builtin bool replaces the np.bool alias, which NumPy deprecated and later removed.
X_ohe = np.zeros((len(X), maxlen, len(chars)), dtype=bool)
for i, text in enumerate(X.values):
    # Vocabulary index of each of the first `maxlen` characters of this text.
    char_ids = np.array([char_indices[c] for c in text[:maxlen]], dtype=np.intp)
    # Set one cell per time step. This is also safe for an empty text, where the
    # previous X_ohe[tuple(zip(*idx))] = 1 degenerated to X_ohe[()] = 1 and
    # switched on the entire tensor.
    X_ohe[i, np.arange(len(char_ids)), char_ids] = True

Max len: 104
Vectorization...


In [30]:
# Same 70/30 stratified, seeded split as before, this time on the one-hot tensor.
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    X_ohe,
    y,
    test_size=test_size,
    random_state=42,
    stratify=y,
)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 4272, Num. of test: 1831


In [31]:
# The network consumes the one-hot tensor directly: (time steps, vocabulary size)
# per sample, with the number of time steps left unspecified.
input_text = Input(shape=(None, len(chars)))
x = LSTM(units=100, input_shape=(None, len(chars)))(input_text)
# Optional regularisation, currently disabled: x = Dropout(0.3)(x)
output = Dense(units=1, activation='sigmoid')(x)

In [32]:
# Assemble the one-hot -> LSTM -> sigmoid graph and compile it for binary
# classification, tracking the batch-wise f1 metric and accuracy.
model = Model(inputs=input_text, outputs=output)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[f1, "acc"],
)
# summary() prints the table itself and returns None, so it is not wrapped in
# print(), which would append a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, None, 202)         0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               121200    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 101       
Total params: 121,301
Trainable params: 121,301
Non-trainable params: 0
_________________________________________________________________
None


In [33]:
# Per-class weights inversely proportional to class frequency in the training
# labels, ordered like np.unique(y_train). `classes` and `y` are passed by
# keyword because newer scikit-learn releases accept them only that way.
weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train,
)
weights

array([0.62401402, 2.51590106])

In [34]:
# Train for 10 epochs in mini-batches of 64, evaluating on the held-out split
# after every epoch. The class weights computed above are not passed, so the
# loss is unweighted.
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, y_test),
)

Train on 4272 samples, validate on 1831 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7efd8bc54048>