# Hate Span Prediction (HSP): Multi-class Classification

The HSP module takes a non-normalised sentence and predicts the hateful spans within that sentence. These spans are initially manually annotated using the BIO notation.
For evaluation we report the classic P/R/F1, but through the lens of a POS-tagged sequence evaluation setup.

In [None]:
## The data folder is set up to be read from Google Drive. If running elsewhere, only the following lines need to be commented out.

# Colab-only helper used to attach Google Drive to the runtime.
from google.colab import drive

# Mount Drive under /content/drive; BASE_FOLDER (defined further down) points
# at a directory below this mount point.
drive.mount('/content/drive')

## IMP NOTE: the trainable elmo crf module works only in tf 1. Tf-hub does not effectively support training of elmo in tf2.

## Install these inside colab

In [None]:
# !pip install tensorflow==1.13.1
# !pip install tensorflow-gpu==1.13.1
# !pip install tensorflow-hub==0.7.0
# !pip install keras==2.2.4
# !pip install git+https://www.github.com/keras-team/keras-contrib.git
# !pip install seqeval

from tensorflow.python.client import device_lib

# Print the devices TensorFlow reports for this runtime, as a quick check of
# what hardware the notebook is running on.
print(device_lib.list_local_devices())

In [None]:
import random
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K
from tensorflow.python.framework.ops import disable_eager_execution

# Turn eager execution off before any session/graph objects are created below.
disable_eager_execution()

# Asks the Keras TF backend for its GPU list; the return value is discarded.
K.tensorflow_backend._get_available_gpus()
# NOTE(review): device_count={'GPU': 0} presumably caps this session at zero
# GPU devices (i.e. CPU only) — confirm that this is intended.
config = tf.ConfigProto(device_count={'GPU': 0})
sess = tf.Session(config=config)
# Hand the session to Keras so that Keras uses it as its backend session.
K.set_session(sess)
# NOTE(review): both initialisers run here, before the ELMo module and the
# Keras model are created later in the notebook — verify that variables and
# tables created afterwards still get initialised.
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

import pickle
import pandas as pd
import numpy as np
import json

import keras
from keras.utils import to_categorical, plot_model
from keras.models import Model, Input
from keras.layers.merge import add
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_viterbi_accuracy

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical, plot_model
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2

In [None]:
# --- Paths -----------------------------------------------------------------
# BASE_FOLDER + INPUT_FILE is the pickled dataset opened in the dataset-prep cell.
BASE_FOLDER = "/content/drive/MyDrive/hate_norm_kdd22/"
INPUT_FILE = "crf_datafinalkdd22.pkl"
# Concatenated with BASE_FOLDER (without separators) when the weights are saved.
OUTPUT_FOLDER = "hate_span_elmo_crf_weight"
OUTPUT_FILE = "elmo_weights"

# --- Embedding / data shape -------------------------------------------------
# TF-Hub handle passed to hub.Module.
ELMO_MODEL = "https://tfhub.dev/google/elmo/2"
# Every sentence is padded to this many tokens.
MAX_LEN = 60
# Passed as test_size to train_test_split.
TEST_SIZE = 448

# --- Model hyper-parameters -------------------------------------------------
# Number of CRF output classes; asserted to equal the number of distinct tags.
N_CRF_TAGS = 3
LSTM_UNITS = 512
DENSE_UNITS = 50
# Used for both dropout and recurrent_dropout of the LSTM layers.
LSTM_DROPOUT = 0.2
# NOTE(review): DENSE_DROPOUT is not referenced anywhere in the visible notebook.
DENSE_DROPOUT = 0.2
EPOCHS = 5
BATCH_SIZE = 32
# Seed used for the random generators and for the train/test split.
SEED = 42

def random_seed(SEED):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Args:
        SEED: Integer seed applied to every generator.
    """
    # Imported locally: the notebook's import cells never import `os`.
    import os

    random.seed(SEED)
    # Only affects interpreters started after this point (e.g. subprocesses);
    # string hashing of the already-running process is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(SEED)
    np.random.seed(SEED)
    # tf.random.set_seed is the TF 2.x name; TF 1.x (which this notebook
    # targets) only provides tf.set_random_seed, so fall back to it.
    set_tf_seed = getattr(tf.random, "set_seed", None)
    if set_tf_seed is None:
        set_tf_seed = tf.set_random_seed
    set_tf_seed(SEED)


random_seed(SEED)

### Base ELMO Model

In [None]:
# Trainable TF-Hub ELMo module, shared by every call to ElmoEmbedding.
elmo_model = hub.Module(ELMO_MODEL, trainable=True)


def ElmoEmbedding(x):
    """Embed a batch of pre-tokenised, padded sentences with ELMo.

    Args:
        x: Tensor of tokens; it is cast to tf.string and squeezed before being
            fed to the module. Assumed to hold BATCH_SIZE rows of MAX_LEN
            tokens each — TODO confirm against the model input.

    Returns:
        The "elmo" entry of the module's output dict for the "tokens"
        signature.
    """
    return elmo_model(
        inputs={
            "tokens": tf.squeeze(tf.cast(x, tf.string)),
            # The length vector is a constant with BATCH_SIZE entries of
            # MAX_LEN, so every batch fed through this layer must contain
            # exactly BATCH_SIZE sentences.
            "sequence_len": tf.constant(BATCH_SIZE * [MAX_LEN]),
        },
        signature="tokens",
        as_dict=True,
    )["elmo"]

### Model design

In [None]:
# CRF output layer with one state per BIO tag.
crf = CRF(N_CRF_TAGS)
# Raw string tokens go straight into the graph; ELMo does the embedding.
input_text = keras.Input(shape=(None, ), dtype='string')
# output_shape uses MAX_LEN instead of a hard-coded 60 so the layer follows the
# padding length used in dataset prep.
embedding = keras.layers.Lambda(ElmoEmbedding,
                                output_shape=(MAX_LEN, 1024))(input_text)
x = keras.layers.Bidirectional(
    keras.layers.LSTM(units=LSTM_UNITS,
                      return_sequences=True,
                      recurrent_dropout=LSTM_DROPOUT,
                      dropout=LSTM_DROPOUT))(embedding)
# NOTE(review): this second BiLSTM also reads `embedding` (not `x`), so the
# add() below sums two parallel BiLSTMs rather than forming a residual around
# a stacked layer as the trailing comment suggests. Left unchanged to keep the
# trained architecture — confirm the intent.
x_rnn = keras.layers.Bidirectional(
    keras.layers.LSTM(units=LSTM_UNITS,
                      return_sequences=True,
                      recurrent_dropout=LSTM_DROPOUT,
                      dropout=LSTM_DROPOUT))(embedding)
x = keras.layers.add([x, x_rnn])  # residual connection to the first biLSTM
# Per-token projection applied before the CRF.
base_model = keras.layers.TimeDistributed(
    keras.layers.Dense(DENSE_UNITS, activation="relu"))(x)
out = crf(base_model)  # CRF layer
model = keras.models.Model(input_text, out)
# Loss and accuracy both come from the CRF layer instance itself.
model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()

### Dataset Prep

In [None]:
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# a dataset file you trust.
with open(BASE_FOLDER + INPUT_FILE, "rb") as f:
    input_data = pickle.load(f)

# The unique values are sorted (by their string form) so that the index
# assigned to each tag / POS tag is the same in every session; iterating a
# plain set of strings gives an order that can change between interpreter runs.
words = sorted(set(input_data["Word"].values), key=str)
words.append("__PAD__")  # padding token appended to short sentences
n_words = len(words)
tags = sorted(set(input_data["Tag"].values), key=str)
n_tags = len(tags)
# The CRF layer is built with N_CRF_TAGS states, so the data must agree.
assert n_tags == N_CRF_TAGS, (
    "dataset has {} distinct tags, expected {}".format(n_tags, N_CRF_TAGS))
postags = sorted(set(input_data['POS'].values), key=str)
postags.append("XX")  # POS label used for padded positions
n_postags = len(postags)

class SentenceGetter(object):
    """Group a token-level DataFrame into sentences of (word, POS, tag) tuples."""

    def __init__(self, data):
        """Group `data` by its "Sentence #" column.

        Args:
            data: DataFrame with "Sentence #", "Word", "POS" and "Tag" columns,
                one row per token.
        """
        # 1-based number of the sentence that get_next() returns next.
        self.n_sent = 1
        self.data = data
        self.empty = False
        # Series indexed by the "Sentence #" value; each entry is that
        # sentence's list of (word, POS, tag) tuples.
        self.grouped = self.data.groupby("Sentence #").apply(self._to_triples)
        # All sentences, in the order the groupby yields them.
        self.sentences = list(self.grouped)

    @staticmethod
    def _to_triples(group):
        """Return the rows of one sentence group as (word, POS, tag) tuples."""
        return list(
            zip(group["Word"].values.tolist(),
                group["POS"].values.tolist(),
                group["Tag"].values.tolist()))

    def get_next(self):
        """Return the next sentence, or None when there is no such sentence.

        Sentences are looked up under the key "Sentence: <n>" with n starting
        at 1; the counter only advances when the lookup succeeds.
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # Only a missing key means "no more sentences"; any other error
            # is a real problem and is allowed to propagate.
            return None
        self.n_sent += 1
        return s


# Build the sentence list from the DataFrame loaded above (`input_data`); the
# name `data` that was passed here before is not defined anywhere.
getter = SentenceGetter(input_data)
sent = getter.get_next()
sentences = getter.sentences

# Index maps for the BIO tags and the POS tags.
tag2idx = {t: i for i, t in enumerate(tags)}
postag2idx = {t: i for i, t in enumerate(postags)}

# Tokens stay as strings (ELMo embeds raw tokens). Every sentence is cut to its
# first MAX_LEN tokens and right-padded with "__PAD__" up to MAX_LEN.
X = [[w[0] for w in s] for s in sentences]
X = [seq[:MAX_LEN] + ["__PAD__"] * (MAX_LEN - len(seq)) for seq in X]

# truncating="post" keeps the FIRST MAX_LEN labels, matching how X is cut
# above; pad_sequences' default ("pre") would drop labels from the front and
# misalign tokens and tags for sentences longer than MAX_LEN.
y = [[tag2idx[w[2]] for w in s] for s in sentences]
y = pad_sequences(maxlen=MAX_LEN,
                  sequences=y,
                  padding="post",
                  truncating="post",
                  value=tag2idx["O"])
y = [to_categorical(i, num_classes=N_CRF_TAGS) for i in y]

# POS ids are padded/truncated the same way; padded positions get "XX".
ypos = [[postag2idx[w[1]] for w in s] for s in sentences]
ypos = pad_sequences(maxlen=MAX_LEN,
                     sequences=ypos,
                     padding="post",
                     truncating="post",
                     value=postag2idx["XX"])

# Hold out TEST_SIZE sentences for testing; POS ids are split alongside so they
# stay aligned with the test sentences.
X_tr, X_te, y_tr, y_te, ypos_tr, ypos_te = train_test_split(
    X, y, ypos, test_size=TEST_SIZE, random_state=SEED)
print(len(X_tr), len(X_te), len(ypos_te))

# First 72 batches train, last 8 batches validate. Every split has to be a
# whole number of BATCH_SIZE batches (see the assert below).
# NOTE(review): the 72/8 batch counts are tied to the dataset size — with fewer
# than 80 * BATCH_SIZE training sentences the two slices would overlap.
X_tr, X_val = X_tr[:72 * BATCH_SIZE], X_tr[-8 * BATCH_SIZE:]
y_tr, y_val = y_tr[:72 * BATCH_SIZE], y_tr[-8 * BATCH_SIZE:]
X_tr = np.array(X_tr)
X_te = np.array(X_te)
y_tr = np.array(y_tr)
y_te = np.array(y_te)
X_val = np.asarray(X_val)
y_val = np.asarray(y_val)

train_steps = len(y_tr) // BATCH_SIZE
# NOTE(review): despite its name this is computed from the TEST labels, as in
# the original; it is not used elsewhere in the visible notebook.
val_steps = len(y_te) // BATCH_SIZE

print(X_tr.shape, y_tr.shape, train_steps)
print(X_tr.shape, X_val.shape, X_te.shape)

assert (X_tr.shape[0] % BATCH_SIZE == 0
        and X_val.shape[0] % BATCH_SIZE == 0
        and X_te.shape[0] % BATCH_SIZE == 0)
print(X_tr.shape[0] / BATCH_SIZE, X_val.shape[0] / BATCH_SIZE,
      X_te.shape[0] / BATCH_SIZE)

### Train and Evaluate

In [None]:
# Train with whole batches of BATCH_SIZE sentences; validation uses the
# held-out slice of the training data.
model.fit(
    X_tr,
    y_tr,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)

# Pass the batch size explicitly instead of relying on Keras' default, so that
# prediction keeps using the same batch size as training if BATCH_SIZE changes.
y_pred = model.predict(X_te, batch_size=BATCH_SIZE)

## Here we convert the plain BIO output into the BIO-POSTAG form,
## which is the format consumed by the seqeval classification report.
# Inverse of tag2idx: class index -> tag string.
idx2tag = {index: tag for tag, index in tag2idx.items()}


def pred2label(pred):
    """Turn per-token class scores into tag strings.

    Args:
        pred: Iterable of sentences, each an iterable of per-token score
            vectors; the argmax of each vector is looked up in idx2tag.

    Returns:
        A list (one entry per sentence) of lists of tag strings, with any
        "__PAD__" text in a tag replaced by "O".
    """
    return [[
        idx2tag[np.argmax(scores)].replace("__PAD__", "O")
        for scores in sentence
    ] for sentence in pred]


def test2label(pred):
    """Turn per-token class indices into tag strings.

    Args:
        pred: Iterable of sentences, each an iterable of class indices that
            are keys of idx2tag.

    Returns:
        A list (one entry per sentence) of lists of tag strings, with any
        "__PAD__" text in a tag replaced by "O".
    """
    return [[idx2tag[tag_id].replace("__PAD__", "O") for tag_id in sentence]
            for sentence in pred]


# Decode model scores and one-hot gold labels into tag strings.
pred_labels = pred2label(y_pred)
test_labels = test2label(np.argmax(y_te, -1))

# Inverse of postag2idx: POS index -> POS tag string.
posidx2tag = {index: name for name, index in postag2idx.items()}

# Append "-<POS>" to every gold and predicted tag, position by position.
true_labels_pos = []
pred_labels_pos = []
for true, pred, pos_ids in zip(test_labels, pred_labels, ypos_te):
    pos_names = [posidx2tag[pos_id] for pos_id in pos_ids]
    true_labels_pos.append(
        [true[i] + "-" + name for i, name in enumerate(pos_names)])
    pred_labels_pos.append(
        [pred[i] + "-" + name for i, name in enumerate(pos_names)])

## IMP NOTE: This classification report is from the seqeval library and NOT SKLEARN

# `digits` is the number of decimals shown in the report. The constant was
# misspelled as N_CRF_ATGS (a NameError); N_CRF_TAGS is 3, i.e. 3 decimals.
# NOTE(review): seqeval appears to honour `scheme` only when mode='strict' is
# also passed — confirm whether strict IOB2 evaluation is intended here.
print(
    classification_report(true_labels_pos,
                          pred_labels_pos,
                          digits=N_CRF_TAGS,
                          scheme=IOB2))

In [None]:
# NOTE(review): the three parts are concatenated without a path separator, so
# the weights land in BASE_FOLDER under the name OUTPUT_FOLDER + OUTPUT_FILE
# rather than inside an OUTPUT_FOLDER directory. Kept as-is so existing loaders
# keep finding the file — confirm whether a sub-folder was intended.
model.save_weights(BASE_FOLDER + OUTPUT_FOLDER + OUTPUT_FILE)

## Save the tag-id maps
# json.dump needs the open file as its second argument. JSON object keys are
# strings, so the integer indices come back as strings when these are loaded.
with open(BASE_FOLDER + "posidx2tag_elmo", "w") as f:
    json.dump(posidx2tag, f)
with open(BASE_FOLDER + "idx2tag_elmo", "w") as f:
    json.dump(idx2tag, f)