In [1]:
import numpy as np
import pandas as pd

import gc, pickle, random, os, operator
from tqdm import tqdm

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence

# Opens embeddings from gensim
from gensim.models import KeyedVectors

Using TensorFlow backend.


In [2]:
# Pickled embedding files; one embedding matrix is built per file and the
# matrices are concatenated along the feature axis further down.
EMBEDDING_FILES = [
    # Gensim-format alternatives, kept for reference (the loader in this
    # notebook unpickles its input, so these would need a different loader):
    #     '../input/gensim-embeddings-dataset/crawl-300d-2M.gensim',
    #     '../input/gensim-embeddings-dataset/glove.840B.300d.gensim',
    '../input/pickled-crawl300d2m-for-kernel-competitions/crawl-300d-2M.pkl',
    '../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl'
]
SEED = 7321                           # passed to set_seed()
NUM_MODELS = 2                        # models trained from scratch in the training loop
BATCH_SIZE = 512                      # training batch size
LSTM_UNITS = 128                      # units per direction in each bidirectional LSTM
# Width of the residual dense blocks. It has to equal the width of the pooled
# features they are added to (max-pool + avg-pool over the Bi-LSTM output).
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 4                            # epochs per model (fitted one at a time)
MAX_LEN = 220                         # every token sequence is padded/truncated to this length

# Identity annotation columns used when computing per-sample weights.
IDENTITY_COLUMNS = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness'
]
# Columns predicted by the auxiliary output head.
AUX_COLUMNS = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
TEXT_COLUMN = 'comment_text'
TARGET_COLUMN = 'target'

# Characters stripped by the tokenizer (passed as its `filters` argument).
# The backslash before '×' is written as '\\' so the literal contains no
# invalid escape sequence; the resulting string is unchanged.
CHARS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—'

In [3]:
def load_embeddings(path):
    """Unpickle and return the object stored in the file at ``path``.

    The file is opened in binary mode and passed to ``pickle.load``; whatever
    object was pickled is returned unchanged.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading, so
    this should only be pointed at embedding files from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

def set_seed(seed):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    Also stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.
    NOTE(review): the interpreter presumably reads PYTHONHASHSEED only at
    start-up, so the assignment here would affect child processes rather than
    the running kernel — confirm if hash determinism matters.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
# Apply the configured SEED to the Python and NumPy RNGs at notebook start.
set_seed(SEED)

In [4]:
def build_vocab(sentences, verbose=True):
    """Count how often each token occurs across ``sentences``.

    :param sentences: iterable of token iterables (e.g. a list of word lists).
    :param verbose: when True a tqdm progress bar is shown while counting.
    :return: dict mapping each token to its number of occurrences, with keys
        in order of first appearance.
    """
    counts = {}
    for tokens in tqdm(sentences, disable=not verbose):
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

In [21]:
def build_matrix(word_index, path, return_oov=False, n_words=None, embedding_dim=300):
    """Build an embedding matrix for ``word_index`` from a pickled embedding file.

    Each word is looked up as-is, then lower-cased, then title-cased; the first
    variant present in the embedding index supplies the row. Words with no
    matching variant keep an all-zero row and are collected as out-of-vocabulary.

    :param word_index: dict mapping word -> integer row index. Row 0 is never
        assigned by this function as long as indices start at 1 (as with a
        Keras tokenizer's ``word_index`` — confirm for other sources).
    :param path: path handed to ``load_embeddings``.
    :param return_oov: when True, also return the list of words without a vector.
    :param n_words: denominator for the printed coverage percentage. When
        omitted, the notebook-level ``N_WORDS`` is used if it has been defined
        (the original behaviour); otherwise ``len(word_index)`` is used.
    :param embedding_dim: width of the embedding vectors (300 by default).
    :return: ``embedding_matrix`` of shape ``(len(word_index) + 1, embedding_dim)``,
        or ``(embedding_matrix, oov)`` when ``return_oov`` is True.
    """
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    oov = []

    for word, i in word_index.items():
        for candidate in (word, word.lower(), word.title()):
            if candidate in embedding_index:
                embedding_matrix[i] = embedding_index[candidate]
                break
        else:
            # No spelling variant matched: the row stays zero.
            oov.append(word)

    tkz_len = len(word_index)
    emb_len = tkz_len - len(oov)
    if n_words is None:
        # Keep the original report when N_WORDS exists, but do not crash with a
        # NameError when this function is used before that cell has run.
        n_words = globals().get('N_WORDS', tkz_len)
    coverage = emb_len / n_words if n_words else 0.0
    print('{} found embeddings. {:.2%} of total.'.format(emb_len, coverage))
    if return_oov:
        # Previously returned the undefined name `unknown_words`.
        return embedding_matrix, oov
    return embedding_matrix

In [6]:
def build_model(embedding_matrix, num_aux_targets):
    """Assemble and compile the two-headed bidirectional LSTM classifier.

    The network consists of a frozen embedding layer initialised from
    ``embedding_matrix``, spatial dropout (0.2), two stacked bidirectional
    CuDNNLSTM layers, concatenated global max/average pooling, and two
    residual dense blocks, followed by two sigmoid heads.

    :param embedding_matrix: 2-D array whose shape supplies the embedding
        layer's input/output dimensions and whose values are its weights.
    :param num_aux_targets: number of units in the auxiliary output head.
    :return: compiled Keras ``Model`` with outputs ``[result, aux_result]``,
        using binary cross-entropy loss and the Adam optimizer.
    """
    words = Input(shape=(None,))

    # Pre-trained vectors are kept fixed during training (trainable=False).
    embedding_layer = Embedding(
        *embedding_matrix.shape,
        weights=[embedding_matrix],
        trainable=False,
    )
    sequence_features = SpatialDropout1D(0.2)(embedding_layer(words))

    # Two stacked Bi-LSTM layers, both returning the full sequence.
    for _ in range(2):
        recurrent = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))
        sequence_features = recurrent(sequence_features)

    # Collapse the time axis with max and average pooling, side by side.
    max_pooled = GlobalMaxPooling1D()(sequence_features)
    avg_pooled = GlobalAveragePooling1D()(sequence_features)
    hidden = concatenate([max_pooled, avg_pooled])

    # Two residual blocks: hidden <- hidden + relu(Dense(hidden)).
    for _ in range(2):
        dense_out = Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)
        hidden = add([hidden, dense_out])

    result = Dense(1, activation='sigmoid')(hidden)
    aux_result = Dense(num_aux_targets, activation='sigmoid')(hidden)

    model = Model(inputs=words, outputs=[result, aux_result])
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

In [12]:
# Competition data directory. File names are appended directly to this prefix,
# so the trailing slash is required.
path = '../input/jigsaw-unintended-bias-in-toxicity-classification/'
# Only the first 2**15 rows of each CSV are read.
# NOTE(review): this truncates test.csv as well, so the submission written at
# the end of the notebook covers only these rows — presumably a
# quick-experiment setting; confirm before a full run.
nrows = 2**15
train, test = (
    pd.read_csv(path + file_name, nrows=nrows)
    for file_name in ('train.csv', 'test.csv')
)

In [13]:
# Count the distinct whitespace-separated tokens in the training comments.
# The column is cast to str first (as is done before tokenisation) so that a
# non-string entry such as NaN does not raise AttributeError on .split().
vocab = build_vocab(train[TEXT_COLUMN].astype(str).apply(str.split).tolist())
N_WORDS = len(vocab)
# Only the vocabulary size is needed afterwards; release the dict right away.
del vocab
gc.collect()

100%|██████████| 32768/32768 [00:00<00:00, 57348.67it/s]


11

In [14]:
# Raw comment text (cast to str) for both splits.
X_train = train[TEXT_COLUMN].astype(str)
X_test = test[TEXT_COLUMN].astype(str)
# Main target and auxiliary targets as plain NumPy arrays.
y_train = train[TARGET_COLUMN].values
y_aux_train = train[AUX_COLUMNS].values
# Keep only the columns still needed. `.copy()` makes `train` an independent
# frame, so the column assignments in the next cell do not raise
# SettingWithCopyWarning.
train = train[IDENTITY_COLUMNS + [TARGET_COLUMN]].copy()
test = test[['id']]

In [15]:
# Binarise the identity and target columns: a value becomes True when it is at
# least 0.5 and False otherwise.
for column in IDENTITY_COLUMNS + [TARGET_COLUMN]:
    train[column] = (train[column] >= 0.5).values

In [16]:
# Case-preserving tokenizer that strips the characters in CHARS_TO_REMOVE.
tokenizer = text.Tokenizer(lower=False, filters=CHARS_TO_REMOVE)
# The vocabulary is fitted on the train and test comments together.
tokenizer.fit_on_texts([*X_train, *X_test])

In [17]:
# Convert each comment to a sequence of word indices, then pad/truncate every
# sequence to MAX_LEN. Note that X_train / X_test are rebound from text to
# integer arrays here, so this cell is not meant to be re-run on its own.
X_train = sequence.pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_LEN)
X_test = sequence.pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_LEN)

In [18]:
# Per-sample weights for the main output. Every row starts at weight 1.
# NOTE(review): the training cell reads `sample_weights.values`, so this is
# expected to end up as a pandas Series after the in-place additions below —
# confirm with the installed numpy/pandas versions.
sample_weights = np.ones(len(X_train), dtype=np.float32)
# + number of identity columns that are True for the row.
sample_weights += train[IDENTITY_COLUMNS].sum(axis=1)
# + for rows whose target is True: number of identity columns that are False.
sample_weights += train[TARGET_COLUMN] * (~train[IDENTITY_COLUMNS]).sum(axis=1)
# + for rows whose target is False: 5 x number of identity columns that are True.
sample_weights += (~train[TARGET_COLUMN]) * train[IDENTITY_COLUMNS].sum(axis=1) * 5
# Rescale so the weights average to 1.
sample_weights /= sample_weights.mean()

In [24]:
# Report the whitespace-token vocabulary size, then how large the tokenizer's
# vocabulary is relative to it.
print(f'{N_WORDS} unique words.')
tkz_len = len(tokenizer.word_index)
tokenizer_share = tkz_len / N_WORDS
print('{} words in tokenizer. {:.2%} of total.'.format(tkz_len, tokenizer_share))

120173 unique words.
78860 words in tokenizer. 65.62% of total.


In [25]:
# Build one embedding matrix per file and join them along the last (feature)
# axis, so each word row holds the vectors from every file side by side.
embedding_matrix = np.concatenate(
    tuple(
        build_matrix(tokenizer.word_index, embedding_path, return_oov=False)
        for embedding_path in EMBEDDING_FILES
    ),
    axis=-1,
)
# Accumulators filled by the training loop: one test-set prediction vector per
# epoch (`oof`) and the matching blending weight (`weights`).
oof, weights = [], []

71431 founded embeddings. 59.44% of total.
71235 founded embeddings. 59.28% of total.


In [None]:
# Start from empty accumulators so that re-running this cell does not stack a
# second set of predictions/weights on top of those from a previous run.
oof = []
weights = []

for model_n in range(NUM_MODELS):
    # A fresh model is built for every run.
    model = build_model(embedding_matrix, y_aux_train.shape[-1])
    for global_epoch in range(EPOCHS):
        # Fit one epoch at a time so a test-set prediction can be taken after
        # each epoch.
        model.fit(
            X_train, [y_train, y_aux_train],
            batch_size=BATCH_SIZE,
            epochs=1,
            verbose=2,
            # Main output uses the computed per-sample weights; the auxiliary
            # output is weighted uniformly.
            sample_weight = [sample_weights.values, np.ones_like(sample_weights)]
        )
        # Keep only the main output ([0]) of the two-headed prediction.
        oof.append(model.predict(X_test, batch_size=2048)[0].flatten())
        # Later epochs get exponentially larger weight in the final blend.
        weights.append(2 ** global_epoch)

In [None]:
# --------------------------------
# No embedding-coverage improvements; embeddings concatenated (300+300); static padding to 220 on the whole dataset
# 120173 unique words.
# 78860 words in tokenizer. 65.62% of total.
# 71431 founded embeddings. 59.44% of total.
# 71235 founded embeddings. 59.28% of total.
# - 14s - loss: 0.4859 - dense_7_loss: 0.3882 - dense_8_loss: 0.0977
# --------------------------------
# 

In [None]:
# Blend the collected test-set predictions, weighting each one by its entry in
# `weights`.
preds = np.average(oof, axis=0, weights=weights)
# Two-column submission frame: test ids and blended predictions.
sub = pd.DataFrame({'id': test['id'], 'prediction': preds})
sub.to_csv('sub.csv', index=False)
sub.tail()