In [7]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import LSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D #,CuDNNLSTM
from keras.preprocessing import text, sequence
from gensim.models import KeyedVectors
from tqdm import tqdm
from keras.optimizers import Adam
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler
import tensorflow as tf
import pickle

In [2]:
import tensorflow as tf
# Sanity check: print how many GPU devices TensorFlow detects.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


# Text-format word-embedding files (300-dimensional, per their names); one
# embedding matrix is built from each and the results are concatenated.
EMBEDDING_FILES = [
    'input/gensim-embeddings-dataset/crawl-300d-2M.vec',
    'input/gensim-embeddings-dataset/glove.840B.300d.txt'
]
NUM_MODELS = 2  # NOTE(review): only referenced by commented-out code in the visible cells
BATCH_SIZE = 16
LSTM_UNITS = 128
# Width of the residual dense layers in build_model. It has to match the width
# of the concatenated max/average pooled bidirectional-LSTM features
# (2 poolings * 2 directions * LSTM_UNITS) for the `add` layers to line up.
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 4
MAX_LEN = 220  # `maxlen` passed to pad_sequences for both train and test
# Identity columns that are thresholded at 0.5 and used to compute sample weights.
IDENTITY_COLUMNS = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness'
]
# Label columns fed to the auxiliary output head of the model.
AUX_COLUMNS = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
TEXT_COLUMN = 'comment_text'
TARGET_COLUMN = 'target'
# Characters passed to the tokenizer as `filters`. The backslash in front of
# '×' is spelled `\\` so the string keeps the same two characters without
# relying on the invalid escape sequence `\×`.
CHARS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—'

Num GPUs Available:  1


In [3]:
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    ``arr`` collects the remaining fields of one embedding-file line (the
    vector components); they are converted to a float32 NumPy array.

    Returns:
        ``(word, vector)`` tuple.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector


def load_embeddings(path):
    """Read a text-format word-embedding file into a ``{word: vector}`` dict.

    Each data line looks like ``apple 0.3 0.4 0.5 0.6 ...``: a word followed
    by its space-separated float components.

    Lines with fewer than three space-separated fields are skipped. That
    drops blank lines and the ``<vocab_size> <dim>`` header that fastText
    ``.vec`` files start with, which would otherwise be stored as a bogus
    one-element "vector" keyed by the vocabulary size.

    Args:
        path: Path of the embedding text file.

    Returns:
        dict mapping each word to the float32 array built by ``get_coefs``.
    """
    embedding_index = {}
    # Read as UTF-8 explicitly instead of depending on the platform's default
    # encoding (fastText/GloVe text files are distributed as UTF-8).
    with open(path, encoding='utf-8') as f:
        for line in tqdm(f):
            fields = line.strip().split(' ')
            if len(fields) < 3:
                # Header or blank line: not a word followed by a real vector.
                continue
            word, vector = get_coefs(*fields)
            embedding_index[word] = vector
    return embedding_index

def build_matrix(word_index, path, embedding_dim=300):
    """Build the embedding matrix for a tokenizer vocabulary.

    Args:
        word_index: Mapping of word -> integer row index. The matrix is given
            ``len(word_index) + 1`` rows (presumably because Keras tokenizer
            indices start at 1 and row 0 is left for padding -- confirm).
        path: Embedding file, loaded with ``load_embeddings``.
        embedding_dim: Width of each embedding vector. Defaults to 300, the
            value that used to be hard-coded.

    Returns:
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Row ``i``
        holds the vector of the word whose index is ``i``; rows for words
        without a usable vector stay all-zero.
    """
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        # Prefer the exact-case vector, then fall back to the lowercased word.
        for candidate in (word, word.lower()):
            vector = embedding_index.get(candidate)
            if vector is None:
                continue
            # Ignore entries of the wrong width (e.g. a parsed file header):
            # assigning them would either raise or silently broadcast a
            # single value across the whole row.
            if np.shape(vector) != (embedding_dim,):
                continue
            embedding_matrix[i] = vector
            break
    return embedding_matrix
    

def build_model(embedding_matrix, num_aux_targets):
    """Assemble and compile the two-headed bidirectional-LSTM classifier.

    The network embeds token ids with the frozen ``embedding_matrix``, applies
    spatial dropout and two stacked bidirectional LSTMs, pools the sequence
    with both global max and global average pooling, and passes the pooled
    features through two residual dense blocks. Two sigmoid heads read the
    result: a single-unit main output and a ``num_aux_targets``-unit
    auxiliary output.

    Args:
        embedding_matrix: 2-D array whose shape gives the Embedding layer's
            input and output dimensions and whose values are its weights.
        num_aux_targets: Number of units in the auxiliary output head.

    Returns:
        The Keras ``Model`` compiled with binary cross-entropy loss and the
        Adam optimizer; outputs are ``[main, auxiliary]``.
    """
    vocab_size, embedding_dim = embedding_matrix.shape
    token_ids = Input(shape=(None,))

    embedded = Embedding(
        vocab_size,
        embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
    )(token_ids)
    sequence_features = SpatialDropout1D(0.2)(embedded)

    # Two stacked bidirectional LSTMs, both returning the full sequence.
    for _ in range(2):
        recurrent = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))
        sequence_features = recurrent(sequence_features)

    max_pooled = GlobalMaxPooling1D()(sequence_features)
    avg_pooled = GlobalAveragePooling1D()(sequence_features)
    pooled = concatenate([max_pooled, avg_pooled])

    # Two residual blocks: features + relu(Dense(features)).
    for _ in range(2):
        projected = Dense(DENSE_HIDDEN_UNITS, activation='relu')(pooled)
        pooled = add([pooled, projected])

    main_output = Dense(1, activation='sigmoid')(pooled)
    aux_output = Dense(num_aux_targets, activation='sigmoid')(pooled)

    classifier = Model(inputs=token_ids, outputs=[main_output, aux_output])
    classifier.compile(loss='binary_crossentropy', optimizer='adam')
    return classifier

In [4]:
# Load the raw train/test CSVs.
train_df = pd.read_csv('input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test_df = pd.read_csv('input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')

# Pull out texts and labels BEFORE the identity/target columns are thresholded
# below, so y_train / y_aux_train keep the original (un-thresholded) values.
train_texts = train_df[TEXT_COLUMN].astype(str)
test_texts = test_df[TEXT_COLUMN].astype(str)
y_train = train_df[TARGET_COLUMN].values
y_aux_train = train_df[AUX_COLUMNS].values

# Threshold identity and target columns at 0.5; the boolean versions are only
# used for the sample weights. (Comparisons with NaN evaluate to False.)
for column in IDENTITY_COLUMNS + [TARGET_COLUMN]:
    train_df[column] = train_df[column] >= 0.5

# Case-sensitive tokenizer whose vocabulary covers both train and test text.
tokenizer = text.Tokenizer(filters=CHARS_TO_REMOVE, lower=False)
tokenizer.fit_on_texts(list(train_texts) + list(test_texts))

x_train = sequence.pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=MAX_LEN)
x_test = sequence.pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=MAX_LEN)

# Per-example weights for the main output. Built explicitly as a Series
# (the training cell reads `sample_weights.values`) instead of relying on
# `ndarray += Series` implicitly rebinding the name to a Series.
identity_count = train_df[IDENTITY_COLUMNS].sum(axis=1)
non_identity_count = (~train_df[IDENTITY_COLUMNS]).sum(axis=1)
is_toxic = train_df[TARGET_COLUMN]

sample_weights = pd.Series(1.0, index=train_df.index)       # base weight
sample_weights += identity_count                            # + identities flagged
sample_weights += is_toxic * non_identity_count             # toxic: + identities not flagged
sample_weights += (~is_toxic) * identity_count * 5          # non-toxic: + 5x identities flagged
sample_weights /= sample_weights.mean()                     # normalise to mean 1
sample_weights = sample_weights.astype(np.float32)

# One embedding matrix per file, concatenated along the feature axis.
embedding_matrix = np.concatenate(
    [build_matrix(tokenizer.word_index, f) for f in EMBEDDING_FILES], axis=-1)

checkpoint_predictions = []
weights = []


1999996it [01:41, 19769.61it/s]
2196017it [01:45, 20867.61it/s]


In [5]:
# Build and compile the network; the auxiliary head gets one unit per column of y_aux_train.
model = build_model(embedding_matrix, y_aux_train.shape[-1])


In [6]:

# Start from empty accumulators so that re-running this cell cannot mix in
# checkpoint predictions left over from an earlier run.
checkpoint_predictions = []
weights = []

# Main-output weights as a plain array (works whether sample_weights is a
# Series or an ndarray); the auxiliary output is weighted uniformly.
main_sample_weights = np.asarray(sample_weights)
aux_sample_weights = np.ones_like(main_sample_weights)

for global_epoch in range(EPOCHS):
    print("starting epoch", global_epoch)
    model.fit(
        x_train,
        [y_train, y_aux_train],
        batch_size=BATCH_SIZE,
        epochs=1,
        verbose=2,
        sample_weight=[main_sample_weights, aux_sample_weights],
    )
    print("finishing epoch", global_epoch)
    # Keep only the first (main) output of the two-headed model.
    checkpoint_predictions.append(
        model.predict(x_test, batch_size=BATCH_SIZE)[0].flatten())
    # Later checkpoints get exponentially larger weight in the final average.
    weights.append(2 ** global_epoch)

# Weighted average of the per-epoch test predictions.
predictions = np.average(checkpoint_predictions, weights=weights, axis=0)

submission = pd.DataFrame.from_dict({
    'id': test_df.id,
    'prediction': predictions
})
submission.to_csv('submission.csv', index=False)

starting epoch 0
112805/112805 - 4765s - loss: 0.5212 - dense_2_loss: 0.4163 - dense_3_loss: 0.1049
finishing epoch 0
starting epoch 1
112805/112805 - 4772s - loss: 0.5118 - dense_2_loss: 0.4088 - dense_3_loss: 0.1030
finishing epoch 1
starting epoch 2
112805/112805 - 4641s - loss: 0.5100 - dense_2_loss: 0.4072 - dense_3_loss: 0.1028
finishing epoch 2
starting epoch 3
112805/112805 - 4742s - loss: 0.5091 - dense_2_loss: 0.4064 - dense_3_loss: 0.1027
finishing epoch 3


TypeError: cannot pickle 'weakref' object