# Modell test: BiLSTM-CNN 

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import matplotlib.pyplot as plt
from tqdm import tqdm
from keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import optimizers
from keras import backend as K
from keras import regularizers
from keras import metrics
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Flatten, Input, Convolution2D, MaxPooling2D, Concatenate
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Bidirectional, LSTM
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import tensorflow_addons as tfa

import warnings
warnings.filterwarnings("ignore")

In [4]:
def load_data(location_train, location_test):
    df_train = pd.read_csv(location_train)
    df_test = pd.read_csv(location_test)
    df_train.lem_comments = df_train.lem_comments.astype(str)
    df_test.lem_comments = df_test.lem_comments.astype(str)
    return df_train, df_test

In [5]:
def get_x_y_train(df_train):
    x_train = df_train['lem_comments']
    label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    y_train = df_train[label_names].values
    return x_train, y_train, label_names

In [6]:
def get_x_test(df_train):
    x_test = df_train['lem_comments']
    return x_test

In [7]:
def get_y_test(location_y_test, label_names):
    y_test = pd.read_csv(location_y_test)
    col_names =['ids']
    col_names = col_names + label_names
    y_test.columns = col_names
    return y_test

In [8]:
def tokenize(x_train, x_test):
    raw_docs_train = x_train.tolist()
    raw_docs_test = x_test.tolist()
    tokenizer = Tokenizer(num_words=None, lower=True, char_level=False)
    tokenizer.fit_on_texts(raw_docs_train)
    tokenizer.fit_on_texts(raw_docs_test)
    word_seq_train = tokenizer.texts_to_sequences(raw_docs_train)
    word_seq_test = tokenizer.texts_to_sequences(raw_docs_test)
    word_index = tokenizer.word_index
    print("dictionary size: ", len(word_index))
    return word_index, word_seq_train, word_seq_test

In [9]:
def padding(word_seq_train, word_seq_test):
    max_seq_len = 168
    word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)
    word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_seq_len)
    return word_seq_train, word_seq_test

In [10]:
def make_pipeline(location_train, location_test, location_y_test):
    print('loading_data...')
    df_train, df_test = load_data(location_train, location_test)
    print('getting all the xs and ys..')
    x_train, y_train, label_names = get_x_y_train(df_train)
    x_test = get_x_test(df_train)
    y_test = get_y_test(location_y_test, label_names)
    print("tokenizing input data...")
    word_index, word_seq_train, word_seq_test = tokenize(x_train, x_test)
    print('padding...')
    word_seq_train, word_seq_test = padding(word_seq_train, word_seq_test)
    return word_seq_train, y_train, word_seq_test, y_test, word_index


In [11]:
def get_embeddings(location_embedding):
    print('extracting word vextors...')
    embeddings_index = {}
    f = open(location_embedding, encoding='utf-8')
    for line in tqdm(f):
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('preparing embedding matrix...')
    words_not_found = []
    embed_dim = 300
    embedding_matrix = np.zeros((len(word_index) + 1, embed_dim))
    for word, i in word_index.items():
        if i >= (len(word_index) + 1):
            continue
        embedding_vector = embeddings_index.get(word)
        if (embedding_vector is not None) and len(embedding_vector) > 0:
            embedding_matrix[i] = embedding_vector
        else:
            words_not_found.append(word)
    print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    return embedding_matrix, words_not_found


In [12]:
#data['doc_len'] = data['comment_text'].apply(lambda words: len(words.split(" ")))
#max_seq_len = np.round(data['doc_len'].mean() + data['doc_len'].std()).astype(int)

max_seq_len = 168

In [12]:
# call functions to get started

location_train = # add your path here
location_test = # add your path here
location_y_test = # add your path here
location_embedding = # add your path here

word_seq_train, y_train, word_seq_test, y_test, word_index = make_pipeline(location_train, location_test, location_y_test)
embedding_matrix, words_not_found = get_embeddings(location_embedding)

loading_data...
getting all the xs and ys..
tokenizing input data...
dictionary size:  202005
padding...
extracting word vextors...


2000001it [00:59, 33552.93it/s]


preparing embedding matrix...
number of null word embeddings: 114000


In [13]:
MAX_NB_WORDS = 100000

#training params
batch_size = 256 
num_epochs = 10 

#model parameters
num_filters = 64 
embed_dim = 300 
weight_decay = 1e-4

input_dim = len(word_index) + 1
#vocab_size = nb_words

In [17]:
print("training BiLSTM-CNN ...")

embedding_layer = Embedding(len(word_index) + 1,
                            300,
                            weights=[embedding_matrix],
                            input_length=max_seq_len,
                            trainable=False)


# inspiration: https://stackoverflow.com/questions/43150635/combining-the-outputs-of-multiple-models-into-one-model
#parallel ip for different sections of image
inp = Input(shape=(max_seq_len, ))
emb = embedding_layer(inp)

bilstm = Bidirectional(LSTM(100, return_sequences=True), merge_mode='concat')(emb)
# paralle conv and pool layer which process the input with different kernels
conv1 = Conv1D(200, 7, activation='relu')(bilstm)
conv2 = Conv1D(200, 9, activation='relu')(bilstm)
conv3 = Conv1D(200, 11, activation='relu')(bilstm)

maxp1 = MaxPooling1D(3)(conv1)
maxp2 = MaxPooling1D(3)(conv2)
maxp3 = MaxPooling1D(3)(conv3)

flt1 = Flatten()(maxp1)
flt2 = Flatten()(maxp2)
flt3 = Flatten()(maxp3)

mrg = Concatenate()([flt1,flt2,flt3])

dense = Dense(256, activation='relu')(mrg)

op = Dense(6, activation='sigmoid')(dense)

model = Model(inputs=inp, outputs=op)
model.compile(optimizer='adam',
              loss= 'binary_crossentropy',
              metrics= metrics.AUC(),
            )
model.summary()              


training BiLSTM-CNN ...
Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 168)]        0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 168, 300)     60601800    ['input_4[0][0]']                
                                                                                                  
 bidirectional_3 (Bidirectional  (None, 168, 200)    320800      ['embedding_3[0][0]']            
 )                                                                                                
                                                                                                  
 conv1d_9 (Conv1D)              (None, 162, 200)     280200      ['b

In [18]:
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=21, verbose=1)
callbacks_list = [early_stopping]

In [19]:
yay = model.fit(word_seq_train, y_train, epochs = 2, batch_size=256)

2022-02-15 09:27:56.017218: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/2
Epoch 2/2


In [20]:
y_pred = model.predict(word_seq_test)

In [21]:
for j in range(len(y_pred)):    
    for i in range(len(y_pred[1])):
        if y_pred[j][i] >= 0.5:
            y_pred[j][i] = 1
        else: 
            y_pred[j][i] = 0

In [22]:
y_pred.astype(int)

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [None]:
print(classification_report(y_test, y_pred))

In [106]:
macro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="macro")
weighted_roc_auc_ovr = roc_auc_score(
    y_test, y_pred, multi_class="ovr", average="weighted"
)
print(
    "One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
    "(weighted by prevalence)".format(macro_roc_auc_ovr, weighted_roc_auc_ovr)
)

One-vs-Rest ROC AUC scores:
0.772453 (macro),
0.843408 (weighted by prevalence)


In [None]:
# plotting confusion matrix

f, axes = plt.subplots(2, 3, figsize=(25, 15))
axes = axes.ravel()
for i in range(6):
    disp = ConfusionMatrixDisplay(confusion_matrix(y_test[:, i],
                                                   y_pred[:, i]),
                                  display_labels=[f'non {label_names[i]}', label_names[i]])#[0, i])
    disp.plot(ax=axes[i], values_format='.4g')
    disp.ax_.set_title(f'toxicity label:\n {label_names[i]}', fontsize=20)
    if i<3:
        disp.ax_.set_xlabel('')
    if i%3!=0:
        disp.ax_.set_ylabel('')
    disp.im_.colorbar.remove()

plt.subplots_adjust(wspace=0.8, hspace=0.01)
f.colorbar(disp.im_, ax=axes)
plt.show()