# Hate Span Prediction (HSP): Multi-class Classification

The HSP module takes a non-normalised sentence and predicts the hateful spans within it. These spans were initially manually annotated using the BIO notation.
For evaluation we report the classic P/R/F1, but through the lens of a POS-tagged sequence evaluation setup.

## IMP NOTE: This is the TF-2 version of HSP, using GloVe as the base embedding.

In [None]:
# Pinned environment for this notebook; uncomment on a fresh runtime.
# !pip install numpy==1.19.5
# !pip uninstall tensorflow
# !pip install tensorflow==2.2.0
# !pip install tensorflow-addons==0.10.0
# !pip install scikit-learn scipy  # the PyPI package is `scikit-learn`; the `sklearn` alias is deprecated
# !pip install seqeval

import numpy
# Fail fast if the runtime's numpy differs from the version pinned above.
assert numpy.__version__=="1.19.5"

import tensorflow as tf

# Should print 2.2.0, the tensorflow version pinned above.
print(tf.__version__)

2.2.0


In [None]:
## The data folder is set up on Google Drive. If you run this notebook
## outside Colab, only the following lines need to be commented out.

from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import random
import pickle
import json
import sys
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding
import os
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

In [None]:
# --- Paths -----------------------------------------------------------------
# Folder on the mounted Drive that holds the input pickle and crf.py.
BASE_FOLDER = "/content/drive/MyDrive/hate_norm_kdd22/"
# Pickled token-level table read in the data-prep cell (columns used there:
# "Sentence #", "Word", "POS", "Tag").
INPUT_FILE = "crf_datafinalkdd22.p"
# The next four names are only referenced by the (commented-out) saving cell.
OUTPUT_FOLDER = "hate_span_glove_crf_tf2_weight"
OUTPUT_FILE = "hate_span_glove_crf_tf2"
GLOVE_EMB_TRAINED = "glove_emb_crf_weights.pkl"
GLOVE_VECTORIZER_TRAINED = "glove_crf_tv_layer.pkl"

## Note: we are using an existing CRF module taken from GitHub; its original source is currently untraceable.
CRF_FILE = "crf.py"
GLOVE_EMB = 200  # GloVe vector size; also selects the glove.twitter.27B.<dim>d.txt file
MAX_LEN = 60  # every sentence is padded / cut to this many tokens
TEST_SIZE = 448  # passed to train_test_split as test_size (an int, i.e. a sample count)

# --- Model / training hyper-parameters ---------------------------------------
N_CRF_TAGS = 3  # number of distinct tags; asserted against the data when it is loaded
LSTM_UNITS = 512
DENSE_UNITS = 50
LSTM_DROPOUT = 0.2  # used for both dropout and recurrent_dropout of the LSTMs
DENSE_DROPOUT = 0.2  # NOTE(review): not referenced in the visible model code
EPOCHS = 2 #(Default 5, check!)
BATCH_SIZE = 32
SEED = 42  # seeds random / numpy / tensorflow and the train/test split

def random_seed(SEED):
    """Seed every random number generator this notebook relies on.

    Exports ``SEED`` as ``PYTHONHASHSEED`` and seeds Python's ``random``
    module, NumPy and TensorFlow with it.

    NOTE: hash randomisation of the already-running interpreter is fixed at
    start-up, so the ``PYTHONHASHSEED`` assignment only reaches processes
    launched afterwards.
    """
    os.environ['PYTHONHASHSEED'] = str(SEED)
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(SEED)


random_seed(SEED)

## Note we are using an existing CRF module, source is currently untraceable, but it was taken from Github.
sys.path.append(BASE_FOLDER + CRF_FILE)
!cp /content/drive/MyDrive/hate_norm_kdd22/crf.py . # Done for colab
from crf import CRF as CRF_lib

### Data Prep

In [None]:
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
with open(BASE_FOLDER + INPUT_FILE, "rb") as f:
    input_data = pickle.load(f)

# Unique values are sorted so that the vocabularies -- and the tag / POS index
# maps built from them -- come out in the same order on every run.  Iterating
# a bare set of strings yields an order that changes between interpreter
# sessions, which would silently change which index means which tag.
words = sorted(set(input_data["Word"].values))
words.append("__PAD__")  # padding token used to fill short sentences
n_words = len(words)
tags = sorted(set(input_data["Tag"].values))
n_tags = len(tags)
# The model's output size is hard-wired to N_CRF_TAGS, so the data must agree.
assert n_tags == N_CRF_TAGS
postags = sorted(set(input_data['POS'].values))
postags.append("XX")  # POS placeholder assigned to padded positions
n_postags = len(postags)


class SentenceGetter:
    """Group token rows into sentences of ``(word, pos, tag)`` tuples.

    Parameters
    ----------
    data :
        Table supporting ``groupby(...).apply(...)`` (a pandas DataFrame in
        practice) with the columns ``"Sentence #"``, ``"Word"``, ``"POS"``
        and ``"Tag"``, one row per token.

    Attributes
    ----------
    grouped :
        Result of grouping ``data`` by ``"Sentence #"``; one list of
        ``(word, pos, tag)`` tuples per sentence label.
    sentences :
        The values of ``grouped`` as a plain list, in group order.
    n_sent :
        1-based number of the sentence the next ``get_next()`` call returns.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False  # never read in this class; kept for compatibility
        self.grouped = self.data.groupby("Sentence #").apply(self._to_triples)
        self.sentences = list(self.grouped)

    @staticmethod
    def _to_triples(rows):
        """Return the token rows of one sentence as (word, pos, tag) tuples."""
        return list(zip(rows["Word"].values.tolist(),
                        rows["POS"].values.tolist(),
                        rows["Tag"].values.tolist()))

    def get_next(self):
        """Return the sentence labelled ``"Sentence: <n_sent>"`` and advance.

        Returns ``None`` (without advancing) when no sentence carries that
        label.  Only the failed lookup is caught; any other error propagates
        instead of being silently turned into ``None``.
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            return None
        self.n_sent += 1
        return s


getter = SentenceGetter(input_data)
sent = getter.get_next()
sentences = getter.sentences

# Index maps for the tag and POS vocabularies.
tag2idx = {t: i for i, t in enumerate(tags)}
postag2idx = {t: i for i, t in enumerate(postags)}

# Keep only the word of every (word, pos, tag) triple and force each sentence
# to exactly MAX_LEN tokens: longer sentences are cut at the end, shorter ones
# are right-padded with "__PAD__".  Slicing does this without a catch-all
# try/except around every index, which could hide unrelated errors.
X = [[w[0] for w in s][:MAX_LEN] for s in sentences]
X = [seq + ["__PAD__"] * (MAX_LEN - len(seq)) for seq in X]

# Tag ids per sentence, right-padded with the id of the "O" tag.
# truncating="post" makes over-long sentences keep their FIRST MAX_LEN labels,
# matching the word sequences in X (which keep the first MAX_LEN words).
# pad_sequences defaults to truncating="pre", which would drop labels from the
# start of the sentence and leave words and labels misaligned.
y = [[tag2idx[w[2]] for w in s] for s in sentences]
y = keras.preprocessing.sequence.pad_sequences(maxlen=MAX_LEN,
                                               sequences=y,
                                               padding="post",
                                               truncating="post",
                                               value=tag2idx["O"])
# One-hot encode every position: one (MAX_LEN, N_CRF_TAGS) array per sentence.
y = [keras.utils.to_categorical(i, num_classes=N_CRF_TAGS) for i in y]

# POS ids per sentence, padded / truncated the same way; padded positions get
# the placeholder POS tag "XX".
ypos = [[postag2idx[w[1]] for w in s] for s in sentences]
ypos = keras.preprocessing.sequence.pad_sequences(maxlen=MAX_LEN,
                                                  sequences=ypos,
                                                  padding="post",
                                                  truncating="post",
                                                  value=postag2idx["XX"])

# Hold out TEST_SIZE sentences.  Words, tag targets and POS ids go through the
# same split call so the three stay aligned.
X_tr, X_te, y_tr, y_te, ypos_tr, ypos_te = train_test_split(
    X, y, ypos, test_size=TEST_SIZE, random_state=SEED)
for train_part, test_part in ((X_tr, X_te), (y_tr, y_te), (ypos_tr, ypos_te)):
    print(len(train_part), len(test_part))

# TextVectorization consumes strings, so re-join every token list with spaces.
X_tr_vectorize = [" ".join(tokens) for tokens in X_tr]
X_te_vectorize = [" ".join(tokens) for tokens in X_te]
print(len(X_tr_vectorize), len(X_te_vectorize))
y_tr, y_te = np.asarray(y_tr), np.asarray(y_te)
print(y_tr.shape, y_te.shape)

# No standardisation and no vocabulary cap; the vocabulary is learned from
# the training sentences only.
vectorizer = TextVectorization(standardize=None,
                               max_tokens=None,
                               output_sequence_length=MAX_LEN)
text_ds = tf.data.Dataset.from_tensor_slices(X_tr_vectorize)
vectorizer.adapt(text_ds.batch(128))

# Vocabulary as plain strings, plus a token -> position lookup.
voc = vectorizer.get_vocabulary()
voc_bs = list(map(tf.compat.as_str_any, voc))
word_index = {token: position for position, token in enumerate(voc_bs)}
print("Len Vocab", len(voc))

# Each sentence is wrapped in its own one-element row before vectorising.
train_column = np.array([[s] for s in X_tr_vectorize])
test_column = np.array([[s] for s in X_te_vectorize])
X_train = vectorizer(train_column).numpy()
X_test = vectorizer(test_column).numpy()

2579 448
2579 448
2579 448
2579 448
(2579, 60, 3) (448, 60, 3)
Len Vocab 9754


### Set up glove embeddings

In [None]:
## Download and unzip the embeddings if you are running this for the first time.
## You can also save the files under your own path and load them from there later.

!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip
!unzip -q glove.twitter.27B.zip

--2022-08-03 07:08:40--  http://nlp.stanford.edu/data/glove.twitter.27B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.twitter.27B.zip [following]
--2022-08-03 07:08:40--  https://nlp.stanford.edu/data/glove.twitter.27B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip [following]
--2022-08-03 07:08:41--  https://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1520408563 (1.4G) [ap

In [None]:
path_to_glove_file = "glove.twitter.27B." + str(GLOVE_EMB) + "d.txt"
embeddings_index = {}
# The GloVe text files are UTF-8; stating it explicitly keeps parsing
# independent of the platform's default encoding.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<token> <v1> <v2> ... <vN>".
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

# Two rows more than the vocabulary size.
# NOTE(review): presumably reserved for the padding / OOV ids.  This relies on
# the positions in word_index matching the ids the vectorizer emits -- confirm
# for the TensorFlow version in use.
num_tokens = len(voc) + 2
hits = 0
misses = 0

# Prepare embedding matrix: one row per id in word_index; rows that are never
# visited stay all-zero.
embedding_matrix = np.zeros((num_tokens, GLOVE_EMB))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        hits += 1
    else:
        # Words not found in the embedding index get a random embedding:
        # magnitudes drawn uniformly from [0, 1), each with a random sign.
        random_num1 = np.random.rand(GLOVE_EMB)
        random_num2 = np.random.rand(GLOVE_EMB)
        embedding_vector = np.where(random_num2 < 0.5, random_num1,
                                    -1 * random_num1)
        misses += 1
    embedding_matrix[i] = embedding_vector
print("Converted %d words (%d misses)" % (hits, misses))

# NOTE(review): the model cell builds its own Embedding with the same
# arguments; this instance is not wired into that model.
embedding_layer = Embedding(
    num_tokens,
    GLOVE_EMB,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=True,
)

Found 1193514 word vectors.
Converted 6046 words (3708 misses)


## Model prep

In [None]:
# CRF output layer over N_CRF_TAGS tags.
# NOTE(review): the training targets are one-hot encoded (to_categorical)
# while sparse_target=True is passed here -- confirm against crf.py that this
# combination is what the layer expects.
crf = CRF_lib(N_CRF_TAGS, sparse_target=True)

# Input: one row of MAX_LEN token ids per sentence.
# NOTE(review): declared as float32 although it carries integer ids -- confirm
# that this is intentional.
input_text = keras.Input(shape=(MAX_LEN, ), dtype=tf.float32)
# Trainable embedding initialised from the GloVe-based embedding_matrix.
embedding = Embedding(
    num_tokens,
    GLOVE_EMB,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=True,
)(input_text)
# First BiLSTM over the embeddings.
x = keras.layers.Bidirectional(
    keras.layers.LSTM(units=LSTM_UNITS,
                      return_sequences=True,
                      recurrent_dropout=LSTM_DROPOUT,
                      dropout=LSTM_DROPOUT,
                      name='lstm1'))(embedding)
# Second BiLSTM; note that it also reads `embedding`, not the output `x` of
# the first one, so the two BiLSTMs run in parallel.
x_rnn = keras.layers.Bidirectional(
    keras.layers.LSTM(units=LSTM_UNITS,
                      return_sequences=True,
                      recurrent_dropout=LSTM_DROPOUT,
                      dropout=LSTM_DROPOUT))(embedding)
# Element-wise sum of the two parallel BiLSTM outputs.
# NOTE(review): this was labelled a "residual connection to the first biLSTM",
# but x_rnn does not consume x -- confirm which architecture is intended.
x = keras.layers.add([x, x_rnn])
# Per-timestep dense projection, then one score per tag for the CRF.
base_model = keras.layers.TimeDistributed(
    keras.layers.Dense(DENSE_UNITS, activation="relu"))(x)
base_model = keras.layers.Dense(N_CRF_TAGS)(base_model)
out = crf(base_model)
model = keras.models.Model(input_text, out)
# Loss and accuracy both come from the CRF layer itself.
model.compile('adam', loss=crf.loss, metrics=[crf.accuracy])
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 60)]         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 60, 200)      1951200     input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 60, 1024)     2920448     embedding_2[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 60, 1024)     2920448     embedding_2[0][0]                
______________________________________________________________________________________________

### Train and Eval

In [None]:
# Train on the vectorised training sentences; validation_split=0.1 holds out
# 10% of them for validation.
model.fit(X_train,
          y_tr,
          batch_size=BATCH_SIZE,
          validation_split=0.1,
          epochs=EPOCHS)

# Predictions for the held-out test sentences.
y_pred = model.predict(X_test)

# Inverse of tag2idx: tag index -> tag string.
idx2tag = {i: w for w, i in tag2idx.items()}


def pred2label(pred):
    """Turn per-token score vectors into tag strings.

    For every sentence in ``pred`` and every position in that sentence, the
    index of the largest entry is looked up in the module-level ``idx2tag``
    map; a ``"__PAD__"`` substring in the resulting tag is replaced by ``"O"``.

    Returns a list (one entry per sentence) of lists of tag strings.
    """
    return [
        [idx2tag[np.argmax(scores)].replace("__PAD__", "O")
         for scores in sentence]
        for sentence in pred
    ]


def test2label(pred):
    """Turn sequences of tag indices into tag strings.

    Every index is looked up in the module-level ``idx2tag`` map; a
    ``"__PAD__"`` substring in the resulting tag is replaced by ``"O"``.

    Returns a list (one entry per sentence) of lists of tag strings.
    """
    return [
        [idx2tag[tag_id].replace("__PAD__", "O") for tag_id in sentence]
        for sentence in pred
    ]


pred_labels = pred2label(y_pred)
test_labels = test2label(np.argmax(y_te, -1))

# Inverse of postag2idx: POS index -> POS string.
posidx2tag = {i: w for w, i in postag2idx.items()}

# Suffix every gold / predicted tag with the POS tag of its token
# ("<tag>-<POS>") so that the report below is broken down per POS tag.
true_labels_pos = []
pred_labels_pos = []
for true, pred, pos_ids in zip(test_labels, pred_labels, ypos_te):
    pos_names = [posidx2tag[pt] for pt in pos_ids]
    true_labels_pos.append([t + "-" + pt for t, pt in zip(true, pos_names)])
    pred_labels_pos.append([p + "-" + pt for p, pt in zip(pred, pos_names)])

## IMP NOTE: This classification report is from the seqeval library and NOT SKLEARN

# `digits` is the number of decimals printed in the report.  It is given as a
# literal rather than N_CRF_TAGS so the format does not change with the tag count.
print(
    classification_report(true_labels_pos,
                          pred_labels_pos,
                          digits=3))

Epoch 1/2
Epoch 2/2


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           ,      0.000     0.000     0.000         1
           .      1.000     0.250     0.400         4
          CC      0.583     0.467     0.519        30
          CD      0.667     0.500     0.571         4
          DT      0.600     0.273     0.375        66
          EX      0.000     0.000     0.000         1
          IN      0.559     0.306     0.396        62
          JJ      0.718     0.575     0.639       186
         JJR      1.000     1.000     1.000         6
         JJS      0.000     0.000     0.000         1
          MD      0.000     0.000     0.000        10
          NN      0.705     0.639     0.670       457
         NNP      0.620     0.571     0.595       140
        NNPS      0.000     0.000     0.000         2
         NNS      0.810     0.693     0.747       179
         PDT      1.000     1.000     1.000         1
         PRP      0.833     0.185     0.303        27
        PRP$      0.727    

### Model Saving

In [None]:
# ## Save the trained glove embeddings

# for layer in model.layers:
#     name = layer.get_config().get('name')
#     if "embedding" in name:
#         print(layer.get_weights()[0].shape == (num_tokens, GLOVE_EMB))
#         with open(BASE_FOLDER + GLOVE_EMB_TRAINED, "wb") as f:
#             pickle.dump(layer.get_weights(), f)
#         break

# ## Save the text vectorizers
# with open(BASE_FOLDER + GLOVE_VECTORIZER_TRAINED, "wb") as f:
#     pickle.dump(
#         {
#             'config': vectorizer.get_config(),
#             'weights': vectorizer.get_weights()
#         }, f)

# NOTE(review): OUTPUT_FOLDER has no trailing "/", so the path below is the
# plain concatenation of the folder and file names -- confirm this is intended.
# model.save_weights(BASE_FOLDER + OUTPUT_FOLDER + OUTPUT_FILE)

# ## Save the tag-id maps
# with open(BASE_FOLDER + "posidx2tag_glove", "w") as f:
#     json.dump(posidx2tag, f)
# with open(BASE_FOLDER + "idx2tag_glove", "w") as f:
#     json.dump(idx2tag, f)