In [1]:
import numpy as np
import pandas as pd

from string import punctuation

from keras.layers import Dense, Input, LSTM, Embedding, Dropout
from keras.models import Model

Using TensorFlow backend.


In [54]:
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers

class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        #self.init = initializations.get('glorot_uniform')
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1], 1),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1], 1),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        super(Attention, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        # eij = K.dot(x, self.W) TF backend doesn't support it

        # features_dim = self.W.shape[0]
        # step_dim = x._keras_shape[1]

        logits = K.dot(x, self.W) + self.b
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            mask = K.cast(mask, K.floatx())
            a *= mask

        # in some cases especially in the early stages of training the sum may be almost zero
        att_weights = ai / K.sum(ai, axis=1, keepdims=True)
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        #if self.return_attention:
            #return [result, att_weights]
        return result

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]
        #return input_shape[0],  self.features_dim

In [3]:
#Glove Vectors
embeddings_index = {}
with open('glove.6B.300d.txt') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [4]:
MAX_SEQUENCE_LENGTH = 150
MAX_NB_WORDS = 100000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1

num_lstm = 300
num_dense = 256
rate_drop_lstm = 0.25
rate_drop_dense = 0.25

In [5]:
## process texts in datasets
import re

#Regex to remove all Non-Alpha Numeric and space
special_character_removal = re.compile(r'[^a-z\d ]',re.IGNORECASE)

#regex to replace all numerics
replace_numbers = re.compile(r'\d+',re.IGNORECASE)

def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)
    
    #Remove Special Characters
    text = special_character_removal.sub('',text)
    
    #Replace Numbers
    text = replace_numbers.sub('n',text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [6]:
train_df = pd.read_csv('datasets/train.csv')
list_sentences_train = train_df["comment_text"].fillna("NA").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train_df[list_classes].values


In [7]:
print(type(list_sentences_train))

<class 'numpy.ndarray'>


In [8]:
list_sentences_train.shape

(159571,)

In [9]:
y.shape

(159571, 6)

In [10]:
test_df = pd.read_csv('datasets/test.csv')
list_sentences_test = test_df["comment_text"].fillna("NA").values

In [11]:
comments = []
for text in list_sentences_train:
    comments.append(text_to_wordlist(text))

In [13]:
comments[:5]

['explanation why the edits made under my username hardcore metallica fan were reverted they werent vandalisms just closure on some gas after i voted at new york dolls fac and please dont remove the template from the talk page since im retired nown',
 'daww he matches this background colour im seemingly stuck with thanks talk n january n n utc',
 'hey man im really not trying to edit war its just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info',
 ' more i cant make any real suggestions on improvement  i wondered if the section statistics should be later on or a subsection of types of accidents i think the references may need tidying so that they are all in the exact same format ie date format etc i can do that later on if noone else does first  if you have any preferences for formatting style on references or want to do it yourself please let me know there a

In [14]:
test_comments=[]
for text in list_sentences_test:
    test_comments.append(text_to_wordlist(text))

In [16]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(comments + test_comments)

In [17]:
sequences = tokenizer.texts_to_sequences(comments)
test_sequences = tokenizer.texts_to_sequences(test_comments)

In [19]:
print(sequences[:5])

[[675, 79, 1, 137, 129, 178, 29, 666, 4398, 9812, 1291, 84, 348, 52, 1951, 13200, 49, 6346, 16, 62, 2501, 145, 7, 2654, 33, 115, 1155, 15630, 2534, 4, 51, 53, 242, 1, 424, 31, 1, 60, 30, 139, 68, 3863, 12750], [54, 2736, 14, 1402, 3672, 68, 4561, 2508, 22, 96, 60, 12, 947, 12, 12, 211], [446, 389, 68, 122, 15, 253, 2, 82, 324, 43, 49, 9, 14, 568, 8, 2280, 492, 472, 105, 4, 561, 2, 37, 310, 137, 357, 3, 29, 60, 30, 54, 184, 2, 436, 61, 35, 1, 2273, 94, 1, 677, 475], [61, 7, 191, 98, 57, 317, 1331, 16, 1981, 7, 5334, 23, 1, 114, 2258, 59, 17, 483, 16, 27, 5, 3161, 3, 1256, 3, 9891, 7, 66, 1, 281, 87, 118, 12628, 36, 9, 52, 19, 42, 10, 1, 1410, 136, 1210, 698, 431, 1210, 313, 7, 39, 34, 9, 483, 16, 23, 3330, 308, 101, 112, 23, 6, 20, 57, 4302, 13, 2273, 478, 16, 281, 27, 107, 2, 34, 11, 220, 51, 263, 37, 72, 41, 515, 2, 17, 5, 5607, 16, 80, 13, 373, 36, 7, 598, 41, 87, 17, 5, 2045, 363, 5, 2844, 2833, 76, 43, 412, 10, 1, 472, 596, 887], [6, 1666, 19, 29, 3509, 57, 1011, 6, 545, 38, 30, 16

In [27]:
word_index = tokenizer.word_index
# {'the': 1,
#  'to': 2,
#  'of': 3,
#  'and': 4,
#  'a': 5,
#  'you': 6,
#  'i': 7,
#  'is': 8, ...}

In [22]:
len(word_index)

392183

In [24]:
from keras.preprocessing.sequence import pad_sequences
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
data.shape

(159571, 150)

In [25]:
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data.shape

(153164, 150)

In [26]:
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [32]:
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Null word embeddings: 30606


In [33]:
embedding_matrix.shape

(100000, 300)

In [35]:
## sample train/validation data

# np.random.permutation(10) -- array([4, 2, 3, 8, 7, 5, 9, 6, 1, 0])
perm = np.random.permutation(len(data)) # 随机打乱顺序
idx_train = perm[:int(len(data)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(data)*(1-VALIDATION_SPLIT)):]

In [36]:
data_train = data[idx_train]
labels_train = y[idx_train]
print(data_train.shape, labels_train.shape)

(143613, 150) (143613, 6)


In [37]:
data_val = data[idx_val]
labels_val = y[idx_val]

In [38]:
embedding_layer = Embedding(nb_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm,return_sequences=True)

In [55]:
from keras import initializers, regularizers, constraints
from keras.layers import BatchNormalization

comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences= embedding_layer(comment_input)
x = lstm_layer(embedded_sequences)
x = Dropout(rate_drop_dense)(x)
merged = Attention(MAX_SEQUENCE_LENGTH)(x)
merged = Dense(num_dense, activation='relu')(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)
preds = Dense(6, activation='sigmoid')(merged)

model = Model(inputs=[comment_input], outputs=preds)
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 300)          30000000  
_________________________________________________________________
lstm_1 (LSTM)                (None, 150, 300)          721200    
_________________________________________________________________
dropout_11 (Dropout)         (None, 150, 300)          0         
_________________________________________________________________
attention_6 (Attention)      (None, 300)               450       
_________________________________________________________________
dense_8 (Dense)              (None, 256)               77056     
_________________________________________________________________
dropout_12 (Dropout)         (None, 256)               0         
__________

In [44]:
# from keras.models import Sequential
# lstm_att = Sequential()
# lstm_att.add(embedding_layer)
# lstm_att.add(lstm_layer)
# lstm_att.add(Dropout(rate_drop_dense))
# lstm_att.add(Attention(MAX_SEQUENCE_LENGTH))
# lstm_att.add(Dense(num_dense, activation='relu'))
# lstm_att.add(Dropout(rate_drop_dense))
# lstm_att.add(BatchNormalization())
# lstm_att.add(Dense(6, activation='sigmoid'))
# lstm_att.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
# print(lstm_att.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 300)          30000000  
_________________________________________________________________
lstm_1 (LSTM)                (None, 150, 300)          721200    
_________________________________________________________________
dropout_7 (Dropout)          (None, 150, 300)          0         
_________________________________________________________________
attention_4 (Attention)      (None, 300)               450       
_________________________________________________________________
dense_4 (Dense)              (None, 256)               77056     
_________________________________________________________________
dropout_8 (Dropout)          (None, 256)               0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 256)               1024      
__________

In [56]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

early_stopping = EarlyStopping(monitor='val_loss', patience=5)
bst_model_path = 'lstm_att.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

In [58]:
hist = model.fit(data_train, labels_train, 
                 epochs=50, 
                 batch_size=256,  
                 shuffle=True,
                 callbacks=[early_stopping, model_checkpoint], 
                 validation_data=(data_val, labels_val))

Train on 143613 samples, validate on 15958 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50


In [59]:
## make the submission

print('Start making the submission before fine-tuning')

y_test = model.predict([test_data], batch_size=1024, verbose=1)

sample_submission = pd.read_csv("datasets/sample_submission.csv")
sample_submission[list_classes] = y_test

sample_submission.head()


Start making the submission before fine-tuning


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999313,0.4550171,0.987431,0.0426175,0.959817,0.2599539
1,0000247867823ef7,3.9e-05,4.555678e-07,1e-06,1.537656e-08,8e-06,1.371404e-06
2,00013b17ad220c46,0.001096,3.824639e-05,0.000359,1.019876e-06,0.000141,4.238672e-05
3,00017563c3f7919a,0.000253,3.751143e-06,5.4e-05,4.417281e-06,4e-05,4.238053e-07
4,00017695ad8997eb,0.000455,4.17763e-06,0.000108,4.474023e-07,1.5e-05,7.391547e-07


In [60]:
sample_submission.to_csv('sample_submission_lstm_att.csv', index=False) #0.9785