In [0]:
# Install Keras into the notebook environment via the shell (no version is pinned here).
! pip install keras

In [0]:
# GloVe 840B.300d vectors (disabled; uncomment both lines to download and unpack)
# !wget http://nlp.stanford.edu/data/wordvecs/glove.840B.300d.zip -c
# !unzip glove.840B.300d.zip -d glove.840B.300d/

# GloVe Twitter 27B vectors (disabled; uncomment both lines to download and unpack)
# !wget http://nlp.stanford.edu/data/glove.twitter.27B.zip -c
# !unzip glove.twitter.27B.zip -d glove.twitter.27B.200d/

# fastText crawl-300d-2M vectors: the archive is unpacked into crawl-300d-2M.vec/,
# which is the directory the EMBEDDING_FILE setting of this notebook points into.
!wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/crawl-300d-2M.vec.zip
!unzip crawl-300d-2M.vec.zip -d crawl-300d-2M.vec/

In [0]:
import numpy as np
# Seed NumPy's global random state for the rest of the notebook.
np.random.seed(32)
import pandas as pd

! pip install pydrive
# PyDrive classes used below to make requests against the Google Drive API
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive 
from google.colab import auth 
from oauth2client.client import GoogleCredentials

# 1. Authenticate the Colab user and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# 2. Fetch the training CSV from Drive by its file id and load it with pandas.
# https://drive.google.com/open?id=1Rk9F9YUMuG9JtuhCaUck1Lxid3OISc1-
file_id = '1Rk9F9YUMuG9JtuhCaUck1Lxid3OISc1-'
downloaded = drive.CreateFile({'id': file_id})
# GetContentFile writes the Drive file to a local path on the notebook VM
# (per the original note, that copy is only temporary).

downloaded.GetContentFile('train.csv')
train = pd.read_csv('train.csv')

# 3. Same steps for the test CSV; `file_id` and `downloaded` are rebound here.
# https://drive.google.com/open?id=1CH6MLJYHK6rtC-p_4kK7Ms17CIisowGZ
file_id = '1CH6MLJYHK6rtC-p_4kK7Ms17CIisowGZ'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('test.csv')
test = pd.read_csv('test.csv')

In [0]:
# Width of each word vector; used as the second dimension of the embedding matrix.
embed_size = 300
# Vocabulary cap: passed to the Keras Tokenizer as num_words and used to drop
# word indices at or above this value.
max_features = 150000 
# Number of word slots kept per sentence.
max_text_len = 150
# Number of sentence slots kept per comment.
max_sent = 10

# Alternative embedding files (disabled).
# NOTE(review): the Twitter file name says 200d, which presumably requires
# embed_size = 200 — confirm before enabling it.
# EMBEDDING_FILE = "glove.840B.300d/glove.840B.300d.txt"
# EMBEDDING_FILE = "glove.twitter.27B.200d/glove.twitter.27B.200d.txt"
# Path of the word-vector text file that is actually loaded.
EMBEDDING_FILE = "crawl-300d-2M.vec/crawl-300d-2M.vec"

In [0]:
import sys, os, re, csv, codecs, gc
os.environ["OMP_NUM_THREADS"] = "4"
from sklearn.model_selection import train_test_split
import nltk
from nltk import tokenize, word_tokenize
nltk.download("punkt")

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GRU, LSTM, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, Add, Flatten, CuDNNGRU, CuDNNLSTM, TimeDistributed
from keras.optimizers import Adam, RMSprop, SGD, Nadam

from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras.engine import InputSpec, Layer

import logging
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback

class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Args:
        validation_data: two-item sequence ``(X_val, y_val)``; ``X_val`` is fed
            to ``model.predict`` and ``y_val`` is passed to ``roc_auc_score``
            as the ground truth.
        interval: the score is computed whenever ``epoch % interval == 0``
            (``epoch`` is the 0-based index Keras passes to ``on_epoch_end``).

    Raises:
        ValueError: if ``validation_data`` cannot be unpacked into two items
            (this includes the empty default).
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class (not Callback) in super() so that Callback.__init__
        # itself runs; super(Callback, self) skipped it.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the ROC-AUC score."""
        # `logs` is accepted for Keras API compatibility but not used; None
        # replaces the shared mutable `{}` default.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch + 1, score))

In [0]:
class AttentionWeightedAverage(Layer):
    """
    Computes a weighted average of the different channels across timesteps.
    Uses 1 parameter pr. channel to compute the attention value for a single timestep.

    Args:
        return_attention: when True, ``call`` returns ``[result, att_weights]``
            instead of only ``result``.
    """

    def __init__(self, return_attention=False, **kwargs):
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the single attention weight vector of shape (channels, 1)."""
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        # add_weight registers W with the layer as a trainable weight, so no
        # manual `trainable_weights` assignment is needed afterwards.
        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1.

        When ``return_attention`` is set, the per-timestep weights are
        returned as a second output.
        """
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        # epsilon guards against division by zero when every weight is zero
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        # Alias that simply forwards to compute_output_shape.
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        """Output is (batch, channels); plus (batch, timesteps) for the weights."""
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        # The timestep axis is reduced away, so no mask is propagated.
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None

    def get_config(self):
        """Include ``return_attention`` so a saved model reloads with the same setting."""
        config = super(AttentionWeightedAverage, self).get_config()
        config['return_attention'] = self.return_attention
        return config


def pair_loss(y_true, y_pred):
    """Mean sigmoid of (negative score - positive score) over all pairs.

    ``y_true`` is cast to int32 and used to split ``y_pred`` into the
    label-0 and label-1 groups; the two groups are then broadcast against
    each other to form every negative/positive difference.

    NOTE(review): `tf` is not imported in the visible cells of this notebook;
    a TensorFlow import is needed before this loss can be called — confirm.
    """
    labels = tf.cast(y_true, tf.int32)
    # Partition 0 holds the negatives, partition 1 the positives.
    negatives, positives = tf.dynamic_partition(y_pred, labels, 2)
    positives_row = tf.expand_dims(positives, 0)
    negatives_col = tf.expand_dims(negatives, -1)
    return K.mean(K.sigmoid(negatives_col - positives_row))

In [0]:
def clean_corpus(comment):
    """Normalise one comment string and return it re-joined from NLTK tokens.

    Steps, in order: lower-case; spell out ``&`` and the digits 0-9 as words;
    expand a fixed set of English contractions; drop backslashes; tokenize
    with ``nltk.word_tokenize`` and join the tokens with single spaces.

    Args:
        comment: the raw comment text (a ``str``).

    Returns:
        The cleaned text as a single space-separated string.
    """
    # Applied sequentially, so the order below is the order of substitution.
    replacements = (
        ('&', ' and '),
        ('0', ' zero '),
        ('1', ' one '),
        ('2', ' two '),
        ('3', ' three '),
        ('4', ' four '),
        ('5', ' five '),
        ('6', ' six '),
        ('7', ' seven '),
        ('8', ' eight '),
        ('9', ' nine '),
        ('\'ve', ' have '),
        ('\'d', ' would '),
        ('\'m', ' am '),
        ('n\'t', ' not '),
        ('\'s', ' is '),
        # Match the whole "'re" suffix: replacing only "'r" left a stray "e"
        # behind ("you're" -> "you are e").
        ('\'re', ' are '),
    )

    comment = comment.lower()
    for old, new in replacements:
        comment = comment.replace(old, new)
    # Remove literal backslashes.
    comment = comment.replace('\\', '')
    return " ".join(nltk.word_tokenize(comment))

In [0]:
# The six label columns predicted by the model.
category = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Fill missing comments and clean the text. The result is assigned back to the
# column explicitly: calling fillna(inplace=True) on a column selected from the
# frame is a chained in-place update, which does not write through to the frame
# under pandas Copy-on-Write.
train["comment_text"] = train["comment_text"].fillna("no comment").apply(clean_corpus)
test["comment_text"] = test["comment_text"].fillna("no comment").apply(clean_corpus)

# Split every cleaned comment into a list of sentences.
train["sentences"] = train["comment_text"].apply(tokenize.sent_tokenize)
test["sentences"] = test["comment_text"].apply(tokenize.sent_tokenize)

In [0]:
# Hold out 10% of the rows for validation. The features stay as DataFrame rows
# at this point; the labels are the `category` columns as a NumPy array.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    train,
    train[category].values,
    test_size=0.1,
)

In [0]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

# Fit the word index on the training split only; the validation and test text
# is not shown to the tokenizer.
raw_text = X_train["comment_text"]
# num_words caps the vocabulary at max_features; lower=True lower-cases the text.
tk = Tokenizer(num_words = max_features, lower = True)
tk.fit_on_texts(raw_text)

def sentenize(data):
    """Encode the ``"sentences"`` column of ``data`` as a 3-D array of word ids.

    Args:
        data: frame-like object whose ``"sentences"`` entry holds, per
            comment, a list of sentence strings.

    Returns:
        int32 array of shape ``(n_comments, max_sent, max_text_len)``.
        Sentences beyond ``max_sent`` and words beyond ``max_text_len`` are
        dropped; unused slots stay 0.
    """
    comments = data["sentences"]
    vocab = tk.word_index
    sent_matrix = np.zeros((comments.shape[0], max_sent, max_text_len), dtype = "int32")
    for i, sentences in enumerate(comments):
        for j, sent in enumerate(sentences[:max_sent]):
            k = 0
            for word in text_to_word_sequence(sent):
                if k >= max_text_len:
                    # Sentence is full; the remaining words cannot be stored.
                    break
                idx = vocab.get(word)
                if idx is None:
                    # Word unseen by the tokenizer: it keeps its position as
                    # a 0 entry. An explicit lookup replaces the former bare
                    # `except:`, which hid every other error as well.
                    k += 1
                elif idx < max_features:
                    sent_matrix[i, j, k] = idx
                    k += 1
                # Known words ranked at or beyond max_features are skipped
                # without consuming a slot.
    return sent_matrix
  
# Encode each split as an int32 array of shape (n_comments, max_sent, max_text_len).
# NOTE(review): X_train and X_valid are rebound from frames to arrays here, so
# re-running this cell without re-running the split presumably fails — confirm.
X_train = sentenize(X_train)
X_valid = sentenize(X_valid)
X_test = sentenize(test)

In [0]:
def get_coefs(word, *arr):
    """Return ``(word, vector)`` with the remaining fields parsed as float32."""
    return word, np.asarray(arr, dtype = "float32")


def _read_embeddings(path, dim):
    """Read a space-separated word-vector text file into a ``{word: vector}`` dict.

    Lines whose vector does not have exactly ``dim`` entries are skipped; this
    drops the "<count> <dim>" header line that fastText ``.vec`` files start
    with, which would otherwise be stored as a one-element vector.
    """
    vectors = {}
    # The context manager closes the file once the whole file has been read.
    with open(path) as handle:
        for line in handle:
            word, coefs = get_coefs(*line.strip().split(" "))
            if coefs.shape[0] == dim:
                vectors[word] = coefs
    return vectors


embeddings_index = _read_embeddings(EMBEDDING_FILE, embed_size)

In [0]:
word_index = tk.word_index
# Tokenizer indices start at 1 (0 is the padding slot), so the matrix needs
# len(word_index) + 1 rows to hold the highest index when the vocabulary is
# smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)
# embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
# Rows without a pretrained vector (including row 0) stay all-zero.
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [0]:
def build_model(rnn_units = 0, de_units = 0, lr = 0.0):
    """Build and compile the two-level (words -> sentences -> comment) model.

    A sentence encoder (frozen pretrained embedding, bidirectional CuDNNGRU,
    per-timestep Dense, attention pooling) is applied to each of the
    ``max_sent`` sentences; the sentence vectors then go through the same
    GRU / Dense / attention stack and a 6-unit sigmoid output layer.

    Args:
        rnn_units: units per direction in both bidirectional GRU layers.
        de_units: units of the per-timestep Dense layers.
        lr: learning rate for the Nadam optimizer. A falsy value (such as the
            0.0 default) keeps the optimizer's own default rate.

    Returns:
        The compiled Keras ``Model`` taking input of shape
        ``(max_sent, max_text_len)`` per sample.
    """
    # Sentence-level encoder: one sentence of word ids -> one vector.
    word_inp = Input(shape = (max_text_len,), dtype = "int32")
    word_seq = Embedding(nb_words, embed_size, weights = [embedding_matrix],
                         input_length = max_text_len, trainable = False)(word_inp)
    word_seq = Bidirectional(CuDNNGRU(rnn_units, return_sequences = True))(word_seq)
    word_seq = TimeDistributed(Dense(de_units, activation = "relu"))(word_seq)
    sentence_vec = AttentionWeightedAverage()(word_seq)
    sentence_encoder = Model(word_inp, sentence_vec)

    # Comment-level model: run the encoder over every sentence slot.
    comment_inp = Input(shape = (max_sent, max_text_len), dtype = "int32")
    sent_seq = TimeDistributed(sentence_encoder)(comment_inp)
    sent_seq = Bidirectional(CuDNNGRU(rnn_units, return_sequences = True))(sent_seq)
    sent_seq = TimeDistributed(Dense(de_units, activation = "relu"))(sent_seq)
    comment_vec = AttentionWeightedAverage()(sent_seq)

    out = Dense(6, activation = "sigmoid")(comment_vec)
    model = Model(comment_inp, out)

    # Honour the requested learning rate; it used to be silently ignored.
    optimizer = Nadam(lr = lr) if lr else Nadam()
    model.compile(loss = "binary_crossentropy", optimizer = optimizer, metrics = ["accuracy"])
    return model

In [0]:
model = build_model(rnn_units=128, de_units=64, lr=1e-3)

# Callbacks: per-epoch ROC-AUC report, best-only checkpoint on val_loss, and
# early stopping after 5 epochs without val_loss improvement.
file_path = "best_model.hdf5"
ra_val = RocAucEvaluation(validation_data=(X_valid, Y_valid), interval=1)
check_point = ModelCheckpoint(
    file_path,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=5)

history = model.fit(
    X_train,
    Y_train,
    batch_size=256,
    epochs=10,
    validation_data=(X_valid, Y_valid),
    verbose=2,
    callbacks=[ra_val, early_stop, check_point],
)

# Reload the checkpoint written by ModelCheckpoint; the custom attention layer
# has to be named explicitly for deserialisation.
model = load_model(
    file_path,
    custom_objects={"AttentionWeightedAverage": AttentionWeightedAverage},
)
pred = model.predict(X_test, batch_size=1024, verbose=2)

In [0]:
# Build the submission frame: an "id" column followed by one column per label.
submission = pd.DataFrame()
# Create the (still empty) columns up front, presumably so that the
# multi-column assignment below has existing targets.
submission = submission.reindex(columns = ["id"] + category)
submission["id"]= test[["id"]]
# pred has one column per label; category[0:6] selects all six names in `category`.
submission[category[0:6]] = pred
# Write the CSV without the frame's index column.
submission.to_csv("submission.csv", index = False)

In [0]:
# Display the first rows of the submission frame as a quick sanity check.
submission.head()