In [2]:
from keras.layers import Dense, LSTM, Dropout, Activation, Input, Embedding, Flatten, concatenate
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import GlobalMaxPool1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras import regularizers, Model, Sequential, callbacks, activations
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from collections import Counter
from itertools import groupby, chain
import random

In [3]:
from src.config import data_dir, models_dir
from src.helpers import calc_metrics, plot_tfidf_classfeats_h, top_feats_by_class, init_dir, save_model, load_model, print_dict
from src.pipeline import load_data, DATAFILE, build_transform_pipe, TF_PARAMS

In [4]:
from keras import backend as K

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [5]:
data = load_data()
X, y = data["text"], data["label"]

#### Char CNN

In [11]:
def prepare_embeddings(X, mask_zeros=True):
    chars = sorted(list(set("".join(X.values))))
    char_indices = dict((c, i + mask_zeros) for i, c in enumerate(chars))
    indices_char = dict((i + mask_zeros, c) for i, c in enumerate(chars))
    
    X_seq = X.map(lambda x: [char_indices[char] for char in x]).values
    
    embedding_matrix = np.zeros((len(char_indices) + mask_zeros, len(char_indices)))
    for word, i in char_indices.items():
        embedding_matrix[i] = np.zeros(len(char_indices))
        embedding_matrix[i][i - mask_zeros] = 1

    sequence_input = Input(shape=(200,), dtype='int32')
    char_embedding_layer = Embedding(len(char_indices)+mask_zeros,
                                     len(char_indices),
                                    weights=[embedding_matrix],
                                    trainable=0,
                                    mask_zero=mask_zeros
                                   )
    embedded_sequences = char_embedding_layer(sequence_input)
    return sequence_input, embedded_sequences, X_seq

In [12]:
def build_model(X, train_features):
    inp, x, X_seq = prepare_embeddings(X, mask_zeros=0)
    
    features_input = Input(shape=(train_features.shape[1],))
    alpha = 1e-8
    z = Dense(100, activation=activations.tanh,
             kernel_regularizer=regularizers.l2(alpha),
             use_bias=1
             )(features_input)
    z = Dropout(0.3)(z)
    # x = Dense(50, activation=activations.tanh,
    #          kernel_regularizer=regularizers.l2(alpha))(x)
    # x = Dropout(0.25)(x)
    features_output = Dense(1, activation="sigmoid",
                   use_bias=1,
                   kernel_regularizer=regularizers.l2(alpha)
                  )(z)

    conv1 = Conv1D(filters=32, kernel_size=3, activation='relu')(x)
    drop1 = Dropout(0.3)(conv1)
    pool1 = MaxPooling1D(pool_size=2)(drop1)
    flat1 = Flatten()(pool1)
    
    conv2 = Conv1D(filters=32, kernel_size=5, activation='relu')(x)
    drop2 = Dropout(0.3)(conv2)
    pool2 = MaxPooling1D(pool_size=2)(drop2)
    flat2 = Flatten()(pool2)
    
    conv3 = Conv1D(filters=32, kernel_size=7, activation='relu')(x)
    drop3 = Dropout(0.3)(conv3)
    pool3 = MaxPooling1D(pool_size=2)(drop3)
    flat3 = Flatten()(pool3)
    
    out_conv = [flat1, flat2, flat3]
    
    x = concatenate(out_conv, axis = -1)    
    x = Dense(10, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(1, activation='sigmoid')(x)
    
    ensemble = concatenate([features_output, x])
    ensemble_output = Dense(1, activation="sigmoid", use_bias=0)(ensemble)
    
    model = Model(inputs=[features_input, inp], outputs=ensemble_output)
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=[f1, 'accuracy'])
    return model, X_seq

In [21]:
model, X_seq = build_model(X, train_features)

In [178]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_27 (InputLayer)           (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_23 (Embedding)        (None, 200, 202)     40804       input_27[0][0]                   
__________________________________________________________________________________________________
conv1d_41 (Conv1D)              (None, 198, 32)      19424       embedding_23[0][0]               
__________________________________________________________________________________________________
conv1d_42 (Conv1D)              (None, 196, 32)      32352       embedding_23[0][0]               
__________________________________________________________________________________________________
conv1d_43 

In [22]:
X_s = sequence.pad_sequences(X_seq, maxlen=200, padding="post", truncating="post")

In [23]:
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X_s, y, test_size=test_size, random_state=42,
                                                    stratify=y)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 4272, Num. of test: 1831


In [24]:
weights = class_weight.compute_class_weight('balanced',
                                             np.unique(y_train),
                                             y_train)
weights

array([0.62401402, 2.51590106])

In [25]:
batch_size = 128
model.fit([train_features, X_train], y_train, 
          batch_size=batch_size,
          validation_data=([test_features, X_test], y_test),
        epochs=50,
        shuffle=True,
        class_weight=weights
       )

Train on 4272 samples, validate on 1831 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f92281e7518>

In [95]:
probas = model.predict(X_test)

In [96]:
y_pred = np.zeros_like(probas)
y_pred[probas>=0.5] = 1
accuracy_score(y_pred, y_test)
f1_score(y_pred=y_pred, y_true=y_test)
precision_score(y_pred, y_test)
recall_score(y_pred, y_test)

0.9361004915346806

0.8169014084507042

0.717032967032967

0.9490909090909091

#### Char RNN (batches based on length)

In [15]:
X_seq = X.map(lambda x: [char_indices[char] for char in x]).values

In [374]:
sequences = [to_categorical(x, num_classes=len(chars)) for x in X_seq]

In [375]:
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(sequences, y, test_size=test_size, random_state=42,
                                                    stratify=y)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 4272, Num. of test: 1831


In [384]:
input_text = Input(shape=(None, len(chars),))
x = LSTM(100, input_shape=(None, len(chars)))(input_text)
x = Dropout(0.3)(x)
output = Dense(1, activation='sigmoid')(x)

In [385]:
model = Model(inputs=input_text, outputs=output)
model.compile(loss='binary_crossentropy', 
              optimizer="rmsprop", 
              metrics=[f1, "acc"])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, None, 202)         0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               121200    
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 101       
Total params: 121,301
Trainable params: 121,301
Non-trainable params: 0
_________________________________________________________________
None


In [454]:
gen_train = gen_bacthes(X_train, y_train)
gen_test = gen_bacthes(X_test, y_test)

In [446]:
steps_per_epoch = len(np.unique([len(el) for el in X_train]))
validation_steps = len(np.unique([len(el) for el in X_test]))

In [466]:
model.fit_generator(gen_train, validation_data=gen_test, 
                    steps_per_epoch=steps_per_epoch, 
                    validation_steps=validation_steps,
                    epochs=2,
                    shuffle=True
                    #class_weight=weights
                   )

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fe28ca93eb8>

In [463]:
probas = model.predict_generator(gen_test, steps=validation_steps)

In [464]:
y_test_batches = list(chain.from_iterable([next(gen_test)[-1] for _ in range(validation_steps)]))

In [465]:
y_pred = np.zeros_like(probas)
y_pred[probas>=0.5] = 1
f1_score(y_pred=y_pred, y_true=y_test_batches)
accuracy_score(y_pred, y_test_batches)
precision_score(y_pred, y_test_batches)
recall_score(y_pred, y_test_batches)

0.36132812499999994

0.642818132168214

0.5082417582417582

0.2803030303030303

#### Try OHE Embeddings with zero-masking and FNN on TF-IDF + custom features

In [16]:
pipe = Pipeline(build_transform_pipe(TF_PARAMS))

In [19]:
X_train_text, X_test_text, _, _ = train_test_split(X, y, test_size=test_size, random_state=42,
                                                   stratify=y)

In [20]:
train_features = pipe.fit_transform(X_train_text)
test_features = pipe.transform(X_test_text)

In [684]:
chars = sorted(list(set("".join(X.values))))
print('total chars:', len(chars))
char_indices = dict((c, i+1) for i, c in enumerate(chars))
indices_char = dict((i+1, c) for i, c in enumerate(chars))

total chars: 202


In [685]:
X_seq = X.map(lambda x: [char_indices[char] for char in x]).values
X_seq = sequence.pad_sequences(X_seq, value=0, padding="post")

In [686]:
X_seq.shape

(6103, 710)

In [687]:
X_train, X_test, y_train, y_test = train_test_split(X_seq, y, test_size=test_size, random_state=42,
                                                    stratify=y)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 4272, Num. of test: 1831


In [688]:
embedding_matrix = np.zeros((len(char_indices)+1, len(char_indices)))
for word, i in char_indices.items():
    embedding_matrix[i] = np.zeros(len(char_indices))
    embedding_matrix[i][i-1] = 1

In [689]:
embedding_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [690]:
char_embedding_layer = Embedding(len(char_indices)+1,
                                len(char_indices),
                                weights=[embedding_matrix],
                                #input_length=n_words,
                                trainable=0,
                                mask_zero=True
                               )

In [691]:
sequence_input = Input(shape=(None,), dtype='int32')
features_input = Input(shape=(train_features.shape[1],))
embedded_sequences = char_embedding_layer(sequence_input)

In [711]:
alpha = 1e-8
z = Dense(100, activation=activations.tanh,
         kernel_regularizer=regularizers.l2(alpha),
         use_bias=1
         )(features_input)
z = Dropout(0.3)(z)
# x = Dense(50, activation=activations.tanh,
#          kernel_regularizer=regularizers.l2(alpha))(x)
# x = Dropout(0.25)(x)
features_output = Dense(1, activation="sigmoid",
               use_bias=1,
               kernel_regularizer=regularizers.l2(alpha)
              )(z)

In [713]:
#x = Flatten()(embedded_sequences)
x = LSTM(100, kernel_regularizer=regularizers.l2(1e-8))(embedded_sequences)
x = Dropout(0.3)(x)
rnn_output = Dense(1, activation="sigmoid")(x)

In [714]:
ensemble = concatenate([features_output, rnn_output])
ensemble_output = Dense(1, activation="sigmoid", use_bias=0)(ensemble)

In [715]:
model = Model(inputs=[sequence_input, features_input], outputs=ensemble_output)
model.compile(loss='binary_crossentropy', 
              optimizer='rmsprop', 
              metrics=[f1, "acc"])
print(model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_32 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
input_33 (InputLayer)           (None, 4003)         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 202)    41006       input_32[0][0]                   
__________________________________________________________________________________________________
dense_61 (Dense)                (None, 100)          400400      input_33[0][0]                   
__________________________________________________________________________________________________
lstm_36 (L

In [719]:
model.fit([X_train, train_features], y_train, 
          validation_data=([X_test, test_features], y_test), 
          epochs=1, 
          batch_size=64, 
          class_weight=weights,
          shuffle=True
         )

Train on 4272 samples, validate on 1831 samples
Epoch 1/1


<keras.callbacks.History at 0x7fe2461836a0>

In [703]:
probas = model.predict([X_test, test_features])

In [704]:
y_pred = np.zeros_like(probas)
y_pred[probas>=0.5] = 1
accuracy_score(y_pred, y_test)
f1_score(y_pred=y_pred, y_true=y_test)
precision_score(y_pred, y_test)
recall_score(y_pred, y_test)

0.9814309120699072

0.9530386740331492

0.9478021978021978

0.9583333333333334

#### OHE 

In [670]:
chars = sorted(list(set("".join(X.values))))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

total chars: 202


In [671]:
maxlen = 80
print(f"Max len: {maxlen}")
print('Vectorization...')
X_ohe = np.zeros((len(X), maxlen, len(chars)), dtype=np.bool)
for i, text in enumerate(X.values):
    #for t, char in enumerate(sentence):
    idx = [(i, t, char_indices[c]) for t,c in enumerate(text) if t < maxlen]
    #X[i, t, char_indices[char]] = 1
    X_ohe[tuple(zip(*idx))] = 1

Max len: 80
Vectorization...


In [672]:
X_ohe.shape

(6103, 80, 202)

In [673]:
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X_ohe, y, test_size=test_size, random_state=42,
                                                    stratify=y)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 4272, Num. of test: 1831


In [674]:
features_input = Input(shape=(train_features.shape[1],))

In [675]:
alpha = 1e-8
z = Dense(100, activation=activations.tanh,
         kernel_regularizer=regularizers.l2(alpha),
         use_bias=1
         )(features_input)
z = Dropout(0.3)(z)
# x = Dense(50, activation=activations.tanh,
#          kernel_regularizer=regularizers.l2(alpha))(x)
# x = Dropout(0.25)(x)
features_output = Dense(1, activation="sigmoid",
               use_bias=1,
               kernel_regularizer=regularizers.l2(alpha)
              )(z)

In [676]:
input_text = Input(shape=(maxlen, len(chars),))
x = LSTM(100, kernel_regularizer=regularizers.l2(1e-8))(input_text)
x = Dropout(0.3)(x)
rnn_output = Dense(1, activation='sigmoid')(x)

In [677]:
ensemble = concatenate([features_output, rnn_output])
ensemble_output = Dense(1, activation="sigmoid")(ensemble)

In [678]:
model = Model(inputs=[input_text, features_input], outputs=ensemble_output)
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=[f1, "acc"])
print(model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_30 (InputLayer)           (None, 4003)         0                                            
__________________________________________________________________________________________________
input_31 (InputLayer)           (None, 80, 202)      0                                            
__________________________________________________________________________________________________
dense_48 (Dense)                (None, 100)          400400      input_30[0][0]                   
__________________________________________________________________________________________________
lstm_31 (LSTM)                  (None, 100)          121200      input_31[0][0]                   
__________________________________________________________________________________________________
dropout_34

In [680]:
model.fit([X_train, train_features], 
          y_train, 
          validation_data=([X_test, test_features], y_test), 
          epochs=20, 
          batch_size=64, 
          class_weight=weights
         )

Train on 4272 samples, validate on 1831 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

KeyboardInterrupt: 

In [643]:
probas = model.predict([X_test, test_features])

In [644]:
y_pred = np.zeros_like(probas)
y_pred[probas>=0.5] = 1
accuracy_score(y_pred, y_test)
f1_score(y_pred=y_pred, y_true=y_test)
precision_score(y_pred, y_test)
recall_score(y_pred, y_test)

0.9770617149098854

0.9411764705882353

0.9230769230769231

0.96