In [47]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from keras.models import Model, load_model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, Dropout, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, LSTM
from keras.optimizers import Adam
from tensorflow.keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler

In [2]:
# Paths to the two text embedding files (.w2v) used by this notebook.
# Judging by the file names, index 0 is the original ("biased") embedding and
# index 1 the hard-debiased variant. Later cells index this list positionally
# (EMBEDDING_FILES[0] / EMBEDDING_FILES[1]), so the order matters.
EMBEDDING_FILES = [
    'DebiasMulticlassWordEmbedding-master/Debiasing/output/intersection_evalset/poly/reddit_US_txt_tok_clean_cleanedforw2v_0_inter_biasedEmbeddingsOut.w2v',
    'DebiasMulticlassWordEmbedding-master/Debiasing/output/intersection_evalset/poly/reddit_US_txt_tok_clean_cleanedforw2v_0_inter_hardDebiasedEmbeddingsOut.w2v'
]

In [3]:
# ---- Run configuration ------------------------------------------------------

# Number of models (not referenced by any of the cells visible in this notebook).
NUM_MODELS = 1

# Maximum number of different words to keep from the original texts; passed to
# the tokenizer as `num_words`. 40_000 is a normal value, 100_000 seems good too.
MAX_FEATURES = 100_000

BATCH_SIZE = 512

# `units` argument of keras.layers.LSTM / CuDNNLSTM, i.e. the dimension of the
# output vector of each LSTM cell.
LSTM_UNITS = 128

# Width of the dense layers that follow the pooled LSTM output.
DENSE_HIDDEN_UNITS = LSTM_UNITS * 4

EPOCHS = 1

# Each comment_text is converted to a list of word indices; every such list is
# cut or padded to exactly MAX_LEN entries so all inputs have the same length.
MAX_LEN = 220


In [4]:
def get_coefs(word, *arr):
    """Pair an embedding token with its coefficients as a float32 vector.

    Args:
        word: the token (first field of an embedding-file line).
        *arr: the remaining fields; each must be convertible to float.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 array.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector


def load_embeddings(path):
    """Read a text embedding file into a ``{word: float32 vector}`` dict.

    Each data line looks like ``apple 0.3 0.4 0.5 0.6 ...``: a word followed by
    its space-separated coefficients. Progress is reported through ``tqdm``.

    Lines with fewer than three fields are skipped. That covers blank lines and
    a word2vec-style ``<vocab_size> <dim>`` header, which would otherwise be
    stored as a bogus one-element "embedding". (As a consequence, files with
    one-dimensional embeddings are not supported.)

    Args:
        path: path of the embedding file; it is decoded as UTF-8.

    Returns:
        dict mapping each word to the vector built by ``get_coefs``. If a word
        occurs more than once, the last occurrence wins.
    """
    embeddings = {}
    # Explicit UTF-8: relying on the platform default (e.g. cp1252 on Windows)
    # mis-decodes non-ASCII tokens, so they would never match tokenizer words.
    with open(path, encoding='utf-8') as f:
        for line in tqdm(f):
            # Split on a literal space rather than on any whitespace, so tokens
            # containing other Unicode whitespace are kept in one piece.
            fields = line.strip().split(" ")
            if len(fields) < 3:
                # Blank line or "<vocab_size> <dim>" header: not an embedding.
                continue
            word, vector = get_coefs(*fields)
            embeddings[word] = vector
    return embeddings

def build_matrix(word_index, path, embedding_dim=50):
    """Build the embedding matrix for ``word_index`` from pre-trained vectors.

    Args:
        word_index: dict of the form ``{'apple': 123, 'banana': 349, ...}``,
            i.e. ``word_index[word]`` is the integer index of ``word``.
        path: path of the text embedding file, read with ``load_embeddings``.
        embedding_dim: length of each embedding vector. Defaults to 50, the
            value that was previously hard-coded.

    Returns:
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Row ``i``
        holds the pre-trained vector of the word whose index is ``i``. Rows of
        words missing from the file stay all-zero, as does any row index that
        ``word_index`` never uses (presumably index 0, reserved for padding).
    """
    # Local import keeps this block self-contained.
    import warnings

    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

    mismatched = 0
    for word, i in word_index.items():
        vector = embedding_index.get(word)
        if vector is None:
            # Word has no pre-trained vector: leave its row at zero.
            continue
        if np.shape(vector) != (embedding_dim,):
            # A wrong-length vector (e.g. a one-element entry parsed from a
            # "<vocab_size> <dim>" header line) must not be assigned: numpy
            # would silently broadcast a single value across the whole row.
            mismatched += 1
            continue
        embedding_matrix[i] = vector

    if mismatched:
        warnings.warn(
            f"{mismatched} embedding vector(s) in {path!r} did not have "
            f"length {embedding_dim} and were skipped"
        )
    return embedding_matrix

In [5]:
def build_model(embedding_matrix, num_aux_targets):
    """Assemble and compile the two-output bidirectional-LSTM classifier.

    A simpler text-classification example can be found at
    https://www.tensorflow.org/tutorials/keras/basic_text_classification

    Args:
        embedding_matrix: 2-D pre-trained embedding matrix; it initialises a
            frozen (non-trainable) Embedding layer.
        num_aux_targets: number of units of the auxiliary sigmoid output
            (6 in this notebook, since y_aux_train has 6 columns).

    Returns:
        A compiled Keras ``Model`` taking MAX_LEN word indices per sample and
        producing ``[main_output, aux_ouput]``.
        Call ``model.summary()`` for a good view of the structure.
    """
    # input_dim / output_dim of the Embedding layer come straight from the matrix.
    input_dim, output_dim = embedding_matrix.shape

    # Each sample is a vector of MAX_LEN word indices.
    words = Input(shape=(MAX_LEN,))

    # Pre-trained embedding lookup (https://keras.io/layers/embeddings/).
    # The weights are NOT trained here, unlike the TensorFlow tutorial which
    # learns the embedding from scratch.
    embedding_layer = Embedding(
        input_dim,
        output_dim,
        weights=[embedding_matrix],
        trainable=False,
    )
    x = embedding_layer(words)

    # https://stackoverflow.com/questions/50393666/how-to-understand-spatialdropout1d-and-when-to-use-it
    x = SpatialDropout1D(0.25)(x)

    # Two stacked bidirectional LSTMs that return the full sequence.
    # (CuDNNLSTM(LSTM_UNITS, return_sequences=True) was the GPU-only alternative.)
    for _ in range(2):
        x = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(x)

    # Summarise the sequence by max- and average-pooling over time.
    max_pooled = GlobalMaxPooling1D()(x)
    avg_pooled = GlobalAveragePooling1D()(x)
    hidden = concatenate([max_pooled, avg_pooled])

    # Two residual dense blocks: hidden + Dense(hidden), first tanh then relu.
    for activation in ('tanh', 'relu'):
        dense_out = Dense(DENSE_HIDDEN_UNITS, activation=activation)(hidden)
        hidden = add([hidden, dense_out])

    # Main binary output and the auxiliary multi-target output.
    result = Dense(1, activation='sigmoid', name='main_output')(hidden)
    aux_result = Dense(num_aux_targets, activation='sigmoid', name='aux_ouput')(hidden)

    model = Model(inputs=words, outputs=[result, aux_result])
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(clipnorm=0.1),
        metrics=['accuracy'],
    )
    return model

In [6]:
# Raw competition data.
train = pd.read_csv('jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test = pd.read_csv('jigsaw-unintended-bias-in-toxicity-classification/test.csv')

# Model inputs: the 'comment_text' column of each frame as an array, with any
# NaN replaced by the empty string '' (a precaution; noted as redundant).
x_train = train['comment_text'].fillna('').values
x_test = test['comment_text'].fillna('').values

# Main label: 1 where target >= 0.5, otherwise 0.
y_train = np.where(train['target'].ge(0.5), 1, 0)

# Auxiliary targets: six columns of train, kept as a DataFrame.
y_aux_train = train[[
    'target',
    'severe_toxicity',
    'obscene',
    'identity_attack',
    'insult',
    'threat',
]]


In [10]:
# Sanity check: shapes of the raw frames and of the arrays derived from them,
# one per line.
print(
    *(obj.shape for obj in (train, test, x_train, y_train, y_aux_train, x_test)),
    sep='\n',
)

(1804874, 45)
(97320, 2)
(1804874,)
(1804874,)
(1804874, 6)
(97320,)


In [15]:
# Keras text tokenizer (https://keras.io/preprocessing/text/), limited to
# MAX_FEATURES words.
tokenizer = text.Tokenizer(num_words=MAX_FEATURES)

# Fit the vocabulary on every comment_text from both train.csv and test.csv.
# https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py#L139
# https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py#L210
tokenizer.fit_on_texts([*x_train, *x_test])

# After fit_on_texts the tokenizer exposes, for example:
#   tokenizer.word_counts      -> an OrderedDict
#   tokenizer.document_count   -> an int
#   tokenizer.word_index       -> dict of words with their corresponding indices
#   tokenizer.index_word       -> the inverse of tokenizer.word_index
# An earlier run found len(tokenizer.word_index) == 410_046 different words.

# Convert each word of a comment_text to its index, so every comment becomes a
# list of numbers.
# https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py#L267
# NOTE: x_train / x_test are overwritten here (raw texts -> index lists), so
# this cell cannot be re-run without first re-running the data-loading cell.
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)


In [25]:
# Number of training samples, then the first train and test sample.
# NOTE: the recorded output shows a zero-padded array, i.e. this cell was last
# executed after the padding cell further down (execution count 25 vs 21).
print(len(x_train), x_train[0], x_test[0], sep='\n')

1804874
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0 

In [21]:
# Bring every index list to exactly MAX_LEN entries.
# https://keras.io/preprocessing/sequence/
# https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/sequence.py
#   - longer lists are cut (truncating='pre': entries are dropped at the beginning)
#   - shorter lists are padded with 0's at the beginning (padding='pre')
# Both values are the library defaults, spelled out here for clarity.
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN, padding='pre', truncating='pre')
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN, padding='pre', truncating='pre')

In [24]:
# Shapes after padding: (number of samples, MAX_LEN).
print(x_train.shape, x_test.shape, sep='\n')

(1804874, 220)
(97320, 220)


In [26]:
# Build one embedding matrix per embedding file. Each matrix has
# len(tokenizer.word_index) + 1 rows and 50 columns.
biased_embedding_matrix = build_matrix(
    word_index=tokenizer.word_index,
    path=EMBEDDING_FILES[0],
)
debiased_embedding_matrix = build_matrix(
    word_index=tokenizer.word_index,
    path=EMBEDDING_FILES[1],
)

print(biased_embedding_matrix.shape)
# == (?, 50)

# embedding_matrix[i] is the 50-d vector representation of the word whose index is i,
# e.g. embedding_matrix[10] belongs to tokenizer.index_word[10] == 'you'

44895it [00:00, 49505.38it/s]
44895it [00:01, 33844.99it/s]


(409328, 50)


In [27]:
# Build the classifier on top of the biased embedding; the auxiliary head gets
# one unit per column of y_aux_train.
model = build_model(
    embedding_matrix=biased_embedding_matrix,
    num_aux_targets=y_aux_train.shape[1],
)

In [28]:
# Print the layer-by-layer structure and parameter counts of the model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 220)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 220, 50)      20466400    ['input_1[0][0]']                
                                                                                                  
 spatial_dropout1d (SpatialDrop  (None, 220, 50)     0           ['embedding[0][0]']              
 out1D)                                                                                           
                                                                                                  
 bidirectional (Bidirectional)  (None, 220, 256)     183296      ['spatial_dropout1d[0][0]']  

In [29]:
# Shapes of the training inputs and of both target sets, one per line.
print(x_train.shape, y_train.shape, y_aux_train.shape, sep='\n')

(1804874, 220)
(1804874,)
(1804874, 6)


In [33]:
# Quick fit on the first 5000 training samples only (not the full data set).
# The targets are listed in the same order as the model's two outputs
# (main output first, auxiliary output second).
#
# Disabled learning-rate schedule, kept for reference:
#     callbacks=[
#         LearningRateScheduler(lambda epochs: 1e-3 * (0.6 ** global_epoch), verbose = 1)
#     ]
# NOTE(review): `global_epoch` is not defined in any cell shown here, so this
# would raise NameError if re-enabled as-is.
model.fit(
    x=x_train[:5000],
    y=[y_train[:5000], y_aux_train[:5000]],
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
)



<keras.callbacks.History at 0x25fd8c71420>

In [38]:
# Score the first 5000 test rows. The model was built with
# outputs=[result, aux_result], so predict() returns one array per output:
# [0] keeps the main-output predictions and .flatten() makes them 1-D.
predictions = model.predict(x_test[:5000], batch_size=2048)[0].flatten()



In [40]:
# Pair the ids of the first 5000 test rows with their predictions and write
# them to CSV without the index column.
submission = pd.DataFrame(
    {
        'id': test['id'][:5000],
        'prediction': predictions,
    }
)
submission.to_csv('submission1.csv', index=False)

In [41]:
# Persist the trained model to 'model_1.h5'.
# NOTE(review): the recorded output under this cell mentions 'model_1.pth\assets',
# so it appears to come from an earlier run that saved under a different name;
# re-run the cell to refresh it.
model.save('model_1.h5')



INFO:tensorflow:Assets written to: model_1.pth\assets


INFO:tensorflow:Assets written to: model_1.pth\assets


In [48]:
# Reload the file written by model.save(); it can be used to reconstruct the
# model identically.
# `load_model` is already imported from keras.models in the notebook's import
# cell, so the duplicate in-cell import was dropped.
reconstructed_model = load_model("model_1.h5")