# Loading required libraries and the cleaned data set

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import optimizers

from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential #, load_model
from keras.layers import Embedding, GlobalMaxPooling1D, Bidirectional, LSTM, Dense, Dropout, Conv1D
from keras.callbacks import EarlyStopping, Callback
from keras_self_attention import SeqSelfAttention
from keras import metrics

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# Read the cleaned training data; the path is relative to the notebook's working directory.
data = pd.read_csv('train.csv')
# Cast the lemmatized comments to str so every entry downstream is a string.
data['lem_comments'] = data['lem_comments'].astype(str)
# Displayed as the cell output: (rows, columns) of the loaded frame.
data.shape

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
data.head()

# Word embedding
## friendly help from: https://www.kaggle.com/vsmolyakov/keras-cnn-with-fasttext-embeddings

In [None]:
# Load the pre-trained fastText vectors into a {word: float32 vector} lookup.
print('loading word embeddings...')
embeddings_index = {}
# `with` guarantees the file handle is released even if parsing fails midway.
with open('modeling/embeddings/crawl-300d-2M-subword.vec', encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.rstrip().rsplit(' ')
        # fastText .vec files start with a "<vocabulary size> <dimension>"
        # header line. It is not a word vector, so it must not end up in the
        # lookup (it would be stored as a word with a 1-element vector).
        if len(values) == 2:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('found %s word vectors' % len(embeddings_index))

## First look at the data set and the distribution of comment lengths

In [None]:
# Number of space-separated tokens in every raw comment.
# NOTE(review): lengths are measured on `comment_text`, while the model is fed
# `lem_comments` -- confirm this is intended.
data['doc_len'] = data['comment_text'].map(lambda text: len(text.split(" ")))

# Sequence length cap: mean length plus one standard deviation, rounded.
doc_len_mean = data['doc_len'].mean()
doc_len_std = data['doc_len'].std()
max_seq_len = np.round(doc_len_mean + doc_len_std).astype(int)

# Distribution of comment lengths with the chosen cap drawn as a dashed line.
sns.distplot(data['doc_len'], hist=True, kde=True, color='b', label='doc len')
plt.axvline(x=max_seq_len, color='k', linestyle='--', label='max len')
plt.title('comment length')
plt.legend()
plt.show()

## Train test split for validation data

In [None]:
# Hold-out configuration: fraction of rows kept for testing and the split seed.
testsize = 0.3
randomstate = 42

# Target columns; each comment can carry several of these labels at once.
label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Features are the lemmatized comments, targets the label columns as an array.
x = data['lem_comments']
y = data[label_names].values

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=testsize,
    random_state=randomstate,
)

In [None]:
# Number of output units the classifier needs (one per label column).
num_classes = len(label_names)
# Plain Python lists of documents, as consumed by the tokenizer below.
raw_docs_train, raw_docs_test = x_train.tolist(), x_test.tolist()
# Displayed as the cell output: number of training documents.
len(raw_docs_train)

## Tokenizing and padding the sequences

In [None]:
print("tokenizing input data...")
# Word-level, lower-casing tokenizer without a vocabulary size limit.
tokenizer = Tokenizer(num_words=None, lower=True, char_level=False)
# NOTE(review): the vocabulary is built from BOTH splits, so words that only
# occur in the hold-out set also receive an index -- confirm this is intended.
tokenizer.fit_on_texts(raw_docs_train)
tokenizer.fit_on_texts(raw_docs_test)
word_index = tokenizer.word_index
# Convert each document into its list of vocabulary indices.
word_seq_train, word_seq_test = map(
    tokenizer.texts_to_sequences, (raw_docs_train, raw_docs_test)
)
print("dictionary size: ", len(word_index))

In [None]:
# Bring every index sequence to exactly `max_seq_len` entries so the batches
# form rectangular arrays (pad_sequences is used with its default
# padding/truncating modes) --> CRUCIAL
word_seq_train = pad_sequences(word_seq_train, maxlen=max_seq_len)
word_seq_test = pad_sequences(word_seq_test, maxlen=max_seq_len)

## Embedding data

In [None]:
embed_dim = 300
# Build the weight matrix for the Embedding layer: one row per tokenizer
# index, filled with the pre-trained vector of the corresponding word.
print('preparing embedding matrix...')
words_not_found = []
# One extra row because the tokenizer indices in `word_index` are used
# directly as row numbers; rows without a pre-trained vector stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, embed_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # Accept only vectors of the expected width. A shorter vector (e.g. the
    # header line of a .vec file parsed as a word) would otherwise be
    # silently broadcast across the whole row.
    if embedding_vector is not None and len(embedding_vector) == embed_dim:
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
# Count rows that are entirely zero (a row sum of 0 is only a proxy for that).
print('number of null word embeddings: %d' % np.sum(~embedding_matrix.any(axis=1)))
# np.random.choice raises on an empty population, so guard the sample.
if words_not_found:
    print("sample words not found: ", np.random.choice(words_not_found, 10))
else:
    print("sample words not found: ", [])

## Predicting classes 

In [None]:
def pred_class(word_seq_test, y_test, threshold = 0.5):
    """Predict 0/1 labels for the given sequences and print evaluation metrics.

    Uses the notebook-level `model` (not passed in) to score the input, turns
    every score into a hard label and forwards the result to `metric_values`.

    Parameters
    ----------
    word_seq_test : padded index sequences to score with `model.predict`.
    y_test : true label matrix, passed on to `metric_values`.
    threshold : scores greater than or equal to this value become 1, all
        others 0.

    Returns
    -------
    Integer array of 0/1 predictions with the same shape as the model output.
    """
    print("Predicting lables")
    y_prob = np.asarray(model.predict(word_seq_test))
    # Vectorised thresholding. The previous element-wise loop sized its inner
    # range with `len(y_pred[1])`, which raises IndexError when only a single
    # sample is predicted.
    y_pred = (y_prob >= threshold).astype(int)
    metric_values(y_test, y_pred)
    return y_pred

In [None]:
def metric_values(y_test, y_pred):
    """Print a classification report and macro/weighted ROC AUC scores.

    Parameters
    ----------
    y_test : true label matrix.
    y_pred : predictions to evaluate against `y_test`; in this notebook the
        function is called with thresholded 0/1 predictions.

    Nothing is returned; all results are written to standard output.
    """
    print(classification_report(y_test, y_pred))

    # Same score computed under both averaging schemes, macro first.
    auc_by_average = {
        averaging: roc_auc_score(y_test, y_pred, multi_class="ovr", average=averaging)
        for averaging in ("macro", "weighted")
    }

    summary = (
        "One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
        "(weighted by prevalence)"
    ).format(auc_by_average["macro"], auc_by_average["weighted"])
    print(summary)

# Modelling

In [None]:
def get_bi_lstm_model():
    """Build and compile the multi-label comment classifier.

    Architecture: pre-trained embedding -> bidirectional LSTM -> self-attention
    -> dropout -> 1D convolution -> global max pooling -> sigmoid output layer
    with one unit per label.

    Reads the notebook-level names `word_index`, `embed_dim`,
    `embedding_matrix`, `max_seq_len` and `num_classes`.

    Returns
    -------
    The compiled (untrained) Sequential model.
    """
    # This function only builds the network; training happens in train_model.
    print("building biLSTM model ...")
    model = Sequential()
    # Embedding layer initialised with the pre-trained vectors and fine-tuned
    # during training (trainable=True).
    model.add(Embedding(len(word_index) + 1, embed_dim,
              weights=[embedding_matrix], input_length=max_seq_len, trainable=True))
    model.add(Bidirectional(LSTM(20, return_sequences=True), merge_mode='concat')) # biLSTM-Layer
    model.add(SeqSelfAttention(attention_activation='sigmoid')) # Basic attention layer
    # Alternative attention configurations that were tried:
    # model.add(SeqSelfAttention( # local attention layer
    #    attention_width=15,
    #    attention_activation='sigmoid',
    #    name='Attention',
    #))
    #model.add(SeqSelfAttention(attention_type=SeqSelfAttention.ATTENTION_TYPE_MUL, # Mulitiplicative attention
    #                        kernel_regularizer=regularizers.l2(lr), # with regularizer
    #                        bias_regularizer=regularizers.l1(lr),
    #                        attention_regularizer_weight=lr,
    #                        name='Attention'))

    model.add(Dropout(rate=0.8))
    model.add(Conv1D(64, 7, activation='relu', padding='same'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(num_classes, activation='sigmoid'))  #multi-label (k-hot encoding)

    # `learning_rate` replaces the deprecated `lr` alias; the former `decay=0.0`
    # argument was the default value and is rejected by newer Keras optimizers,
    # so it is omitted.
    adam = optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    # `metrics` is documented as a list of metrics.
    model.compile(loss="binary_crossentropy", optimizer=adam, metrics=[metrics.AUC()])
    return model


In [None]:
# Callback that saves the model at the end of training epochs
class CustomSaver(Callback):
    """Keras callback that writes the current model to disk after epochs."""

    def on_epoch_end(self, epoch, logs=None, how_often=1):
        """Save the model when `epoch` is a multiple of `how_often`.

        Parameters
        ----------
        epoch : index of the epoch that just finished.
        logs : metrics dict supplied by Keras; unused here. Defaults to None
            instead of a shared mutable `{}`.
        how_often : save every `how_often`-th epoch; with the default of 1 the
            model is saved after every epoch.
        """
        if epoch % how_often == 0:  # or save after some epoch, each k-th epoch etc.
            # NOTE(review): the pattern yields names such as "model_10.hd5" for
            # epoch 0, while the loading cell refers to "model_3.hd5" --
            # confirm the intended file name.
            self.model.save("modeling/models/model_1{}.hd5".format(epoch))

In [None]:
def train_model(model, word_seq_train, y_train, num_epochs=2):
    """Fit `model` on the training sequences and return the fit result.

    Parameters
    ----------
    model : compiled Keras model to train.
    word_seq_train : padded index sequences used as model input.
    y_train : label matrix for the training sequences.
    num_epochs : maximum number of epochs to train.

    Returns
    -------
    The object returned by `model.fit`.
    """
    # Stop when val_loss has not improved by at least 0.01 for 11 epochs, and
    # save the model through CustomSaver as training progresses.
    stopper = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=11, verbose=1)
    checkpointer = CustomSaver()

    # 10% of the training data is held back by Keras for validation.
    return model.fit(
        word_seq_train,
        y_train,
        batch_size=256,
        epochs=num_epochs,
        callbacks=[stopper, checkpointer],
        validation_split=0.1,
        shuffle=True,
        verbose=4,
    )


# Main program

In [None]:
# Build, inspect and train the model, then evaluate it on the hold-out split.
model = get_bi_lstm_model()
model.summary()
results = train_model(model, word_seq_train, y_train, num_epochs=2)
# pred_class already prints the evaluation report via metric_values, so no
# second metric_values call is needed here (it only duplicated the output).
y_pred = pred_class(word_seq_test, y_test, threshold = 0.5)


In [None]:
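# Cell for loading an existing model (requires `load_model` from keras.models,
# which is commented out in the import cell; the saved model contains a
# SeqSelfAttention layer, so loading presumably needs `custom_objects` -- verify)
# model = load_model('modeling/models/model_3.hd5')
# y_pred = pred_class(word_seq_test, y_test, threshold = 0.5)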

In [None]:
# Plotting confusion matrices for each category on a 2x3 grid
f, axes = plt.subplots(2, 3, figsize=(25, 15))
axes = axes.ravel()
# Iterate over the label list instead of a hard-coded count so the loop stays
# in sync with `label_names`.
for i in range(len(label_names)):
    # labels=[0, 1] pins the matrix to 2x2 even when only one class occurs in
    # both the true and predicted column; otherwise it would not match the two
    # display labels.
    disp = ConfusionMatrixDisplay(confusion_matrix(y_test[:, i],
                                                   y_pred[:, i],
                                                   labels=[0, 1]),
                                  display_labels=[f'non {label_names[i]}', label_names[i]])
    disp.plot(ax=axes[i], values_format='.4g')
    disp.ax_.set_title(f'toxicity label:\n {label_names[i]}', fontsize=20)
    # Keep x labels only on the bottom row and y labels only in the first column.
    if i<3:
        disp.ax_.set_xlabel('')
    if i%3!=0:
        disp.ax_.set_ylabel('')
    # Drop the per-panel colorbar; a single shared one is added below.
    disp.im_.colorbar.remove()

plt.subplots_adjust(wspace=0.8, hspace=0.01)
# Shared colorbar, taken from the image of the last panel drawn.
f.colorbar(disp.im_, ax=axes)
plt.show()