<a href="https://colab.research.google.com/github/GiovanniSorice/Hate_Speech_Detection/blob/main/LSTM_grid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bidirectional LSTM Hate Speech Classifier

In [70]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, SpatialDropout1D, LSTM
from tensorflow.keras.layers import Bidirectional # new! 
from tensorflow.keras.callbacks import ModelCheckpoint
import os
from sklearn.metrics import roc_auc_score 
import matplotlib.pyplot as plt
from keras.preprocessing.text import one_hot
from tensorboard.plugins.hparams import api as hp
import numpy as np
from sklearn.metrics import f1_score
import ast 
from sklearn.model_selection import train_test_split

In [98]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset and word-vector paths used
# by the following cells resolve. This will prompt for authorization unless the
# drive is already mounted in this session.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [99]:
# Display full cell contents when DataFrames are shown (no column-width limit).
pd.set_option("display.max_colwidth", None)

# Google Drive folders holding the training CSV and the preprocessed test CSV.
input_dir = "/content/drive/My Drive/HLT/clean_dataset_training/"
input_test_dir = "/content/drive/My Drive/HLT/dataset_test_evalita_preprocessed/"

In [100]:
# Load the training set. The context manager closes the file handle as soon as
# pandas has parsed it, so it is not left open for the rest of the session.
with open(input_dir+"training_dataset.csv") as tsv_file:
  dataset = pd.read_csv(tsv_file,sep=',')

### Vector-space embedding: 

In [101]:
# Fraction of the development data intended for training.
p_train = 0.85

# Embedding / sequence settings.
# NOTE(review): n_dim, n_unique_words, pad_type and trunc_type are not referenced
# by any cell visible in this notebook — confirm before relying on them.
n_dim = 64
n_unique_words = 25000
max_length = 64  # number of word vectors kept per sentence (truncate / pad length)
pad_type = 'pre'
trunc_type = 'pre'

# Mini-batch size passed to model.fit.
batch_size = 64

#### Preprocess data 

In [102]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Collect every token of every training text into one flat list. Iterating the
# "text" column directly avoids materialising a full row Series per tweet, which
# is what DataFrame.iterrows() does, and extend() adds a whole tweet at once.
all_words = []
for text in dataset["text"]:
  all_words.extend(word_tokenize(text))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [103]:
# Distinct tokens across all training texts; the printed number is how many
# different tokens word_tokenize produced.
unique_words = set(all_words)
print(len(unique_words))

24773


In [104]:
def sentence_to_emb2(sentence, w2v, truncate = None, padding = False):
  """Turn a tokenised sentence into a matrix of word vectors.

  Args:
    sentence: iterable of tokens.
    w2v: word-vector lookup supporting ``w2v[word]``. Its vocabulary is read
      from ``key_to_index`` when that attribute exists and from ``vocab``
      otherwise.
    truncate: maximum number of vectors to keep; ``None`` keeps all of them.
    padding: when true and ``truncate`` is given, all-zero rows are appended
      until the result has exactly ``truncate`` rows. Ignored when
      ``truncate`` is ``None`` because there is no target length to pad to.

  Returns:
    ``np.ndarray`` with one row per in-vocabulary token (out-of-vocabulary
    tokens are skipped), followed by any padding rows.
  """
  # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases use `vocab`.
  vocab = getattr(w2v, "key_to_index", None)
  if vocab is None:
    vocab = w2v.vocab
  s_emb = [ w2v[word] for word in sentence if word in vocab]
  if truncate is not None:
    s_emb = s_emb[:truncate] #truncate
    if padding:
      # Pad rows take the model's own vector size; 128 is the fallback for
      # lookups that do not advertise one.
      pad_token = [0]*getattr(w2v, "vector_size", 128)
      s_emb += [pad_token] * (truncate - len(s_emb))
  return np.array(s_emb)

def get_data_to_emb2(data, w2v, truncate = None, padding = False):
  """Embed a collection of stringified token lists.

  Each element of ``data`` is parsed with ``ast.literal_eval`` and handed to
  ``sentence_to_emb2`` together with ``w2v``, ``truncate`` and ``padding``.
  The number of sentences and, when there is one, the first embedded sentence
  are printed as a quick sanity check.

  Returns:
    ``np.array`` built from the per-sentence embeddings.
  """
  # NOTE(review): np.array(X) presumably expects every sentence to have the
  # same shape (truncate + padding) — confirm callers always pad.
  X = [sentence_to_emb2(ast.literal_eval(sentence), w2v, truncate, padding) for sentence in data]
  print(len(X))
  if X:
    # Preview only when there is something to show; X[0] on empty input raises IndexError.
    print(X[0])
  return np.array(X)

In [105]:
from gensim.models import KeyedVectors
from gensim.test.utils import datapath  # no longer used in this cell; kept in case other cells rely on it
# Pre-trained Twitter word2vec vectors stored on Drive (presumably 128-dimensional,
# going by the file name).
w2v_felice_path = "/content/drive/My Drive/HLT/w2v/twitter128.bin"
# The absolute path is passed straight to the loader: datapath() is gensim's
# helper for files inside its own test-data folder and is not needed here.
w2v = KeyedVectors.load_word2vec_format(w2v_felice_path, binary=True)

In [157]:
# Embed the training token lists; every sentence is cut or zero-padded to
# max_length word vectors.
X_dev = get_data_to_emb2(
    dataset["tokens"],
    w2v,
    truncate=max_length,
    padding=True,
)

6837
[[ 0.97042489  0.79645807  0.10190873 ...  1.01973236  1.16674519
   0.17082037]
 [-2.10587931  1.7696439  -1.04741096 ... -1.11571276 -0.25399542
  -0.97522277]
 [ 0.89639139  1.24942708  0.72824973 ...  0.68920714  0.98506999
  -0.36202168]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


In [158]:
# Auxiliary features later fed to the model's "other" input: every column of
# the training frame except the ones dropped here.
dataset_other = dataset.drop(['text', 'id', 'hs', 'stereotype','tokens', 'hashtags'], axis=1)
dataset_other

Unnamed: 0,text_length,#C-L words,#?!,"#.,",#bad_words,%bad_words
0,120,10,0,5,0,0
1,101,0,0,0,1,6
2,86,8,0,1,3,25
3,118,0,0,2,0,0
4,138,0,1,1,1,4
...,...,...,...,...,...,...
6832,285,2,0,4,0,0
6833,277,0,2,3,0,0
6834,233,0,0,4,0,0
6835,206,2,0,2,0,0


In [159]:
# Hold out 15% of the development data for validation. The embedded text, the
# auxiliary features and the 'hs' labels are split in one call so rows stay aligned.
# NOTE(review): the fraction is hard-coded here rather than derived from p_train.
(x_train, x_valid,
 x_train_extra, x_valid_extra,
 y_train, y_valid) = train_test_split(
    X_dev,
    dataset_other.values,
    dataset[['hs']],
    test_size=0.15,
    random_state=128,
)

In [160]:
# Model inputs keyed by the names of the two Keras Input layers ("text", "other").
input_train = dict(text=x_train, other=x_train_extra)
input_val = dict(text=x_valid, other=x_valid_extra)

In [161]:
# Running maximum of tokens per comment; updated in place by comment_length().
max_sent = 0 

In [162]:

def comment_length(text):
    """Raise the global ``max_sent`` to this comment's token count if it is larger.

    ``text`` is a string holding a Python list literal; it is parsed with
    ``ast.literal_eval`` and its length is compared with ``max_sent``.
    Nothing is returned.
    """
    global max_sent
    n_tokens = len(ast.literal_eval(text))
    max_sent = max(max_sent, n_tokens)

In [163]:
# Visit every token list once so max_sent ends up holding the longest comment length.
for _tokens in dataset['tokens']:
  comment_length(_tokens)
print(max_sent)

119


In [164]:
# Sanity check: (rows, columns) of the auxiliary training features.
x_train_extra.shape

(5811, 6)

### Design grid search parameters

In [165]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

# Clear any logs from previous runs (deletes ./logs/ relative to the working directory)
!rm -rf ./logs/

# NOTE(review): both imports below repeat imports already made in the first cell.
import tensorflow as tf
from tensorboard.plugins.hparams import api as hp

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [177]:
# Hyperparameter axes explored by the grid search.
HP_NUM_UNITS = hp.HParam('num_units', hp.Discrete([64, 128, 256, 512]))
HP_DROPOUT = hp.HParam('dropout', hp.RealInterval(0.0, 0.7))
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['nadam']))

# Tag of the scalar logged for each trial.
METRIC_ACCURACY = 'accuracy'

# Register the search space and the tracked metric under logs/hparam_tuning.
search_space = [HP_NUM_UNITS, HP_DROPOUT, HP_OPTIMIZER]
tracked_metrics = [hp.Metric(METRIC_ACCURACY, display_name='Accuracy')]
with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
  hp.hparams_config(hparams=search_space, metrics=tracked_metrics)

In [167]:
class FCallback(tf.keras.callbacks.Callback):
    """Keras callback that computes macro-averaged F1 on a held-out set after every epoch.

    Args:
        validation: ``(inputs, labels)`` pair; ``inputs`` is passed to
            ``model.predict`` and ``labels`` to ``f1_score``.
        verbose: when greater than 0, the score is printed after each epoch.

    The score is stored in ``logs['val_f1']`` (when Keras supplies a logs dict)
    and appended to ``self.val_f1``.
    """

    def __init__(self, validation = (), verbose = 0):
        # Let the Keras base class set up its own state before adding ours.
        super().__init__()
        self.validation = validation
        self.verbose = verbose
        self.f1 = []
        self.val_f1 = []

    def on_train_begin(self, logs=None):
        # Start each fit() with an empty history. `f1` is reset alongside
        # `val_f1`, but nothing in this class fills it.
        self.f1 = []
        self.val_f1 = []

    def on_epoch_end(self, epoch, logs=None):
        y_t =  self.validation[1]
        # Threshold the model outputs at 0.5 to obtain hard 0/1 predictions.
        y_p =  np.where(self.model.predict(self.validation[0]) > 0.5, 1, 0)
        val_f1 = f1_score(y_t, y_p, average='macro')
        self.val_f1.append(val_f1)
        # `logs` defaults to None, so only write into it when a dict was passed.
        if logs is not None:
            logs['val_f1'] = val_f1
        if self.verbose >0:
          print("— val_f1: {}".format(val_f1))

In [168]:
def train_test_model(hparams, test_data=None):
  """Build, train and score one bidirectional-LSTM model for a hyperparameter set.

  The model has two inputs: "text" (``max_length`` x 128 word vectors, fed to a
  bidirectional LSTM) and "other" (6 auxiliary features). They are concatenated
  and passed through two L2-regularised dense layers to a sigmoid output.
  Training data (``input_train``, ``y_train``), validation data (``input_val``,
  ``y_valid``), ``max_length`` and ``batch_size`` are read from the notebook
  namespace.

  Args:
    hparams: mapping indexed by HP_NUM_UNITS, HP_DROPOUT and HP_OPTIMIZER.
    test_data: optional ``(inputs, labels)`` pair scored with macro F1 after
      training. When omitted, the notebook-level ``input_test`` / ``y_test``
      are used if they have already been defined; otherwise the test scoring
      step is skipped instead of failing with NameError after training.

  Returns:
    Validation accuracy reported by ``model.evaluate``.
  """
  lstml1_in = tf.keras.layers.Input(name="text", shape =(max_length,128,))
  lstml1_bd1 = tf.keras.layers.Bidirectional(LSTM(hparams[HP_NUM_UNITS], dropout = hparams[HP_DROPOUT]))(lstml1_in)

  other_in = tf.keras.layers.Input(name="other", shape =(6,))

  lconcat = tf.keras.layers.Concatenate(axis=1)([lstml1_bd1, other_in])
  dense1_layer = Dense(256, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.007))(lconcat)
  dense2_layer = Dense(64, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.007))(dense1_layer)

  lstml1_out = tf.keras.layers.Dense(1, activation='sigmoid')(dense2_layer)


  model = tf.keras.Model(inputs = [lstml1_in, other_in], outputs = lstml1_out)

  model.summary()

  model.compile(
      optimizer=hparams[HP_OPTIMIZER],
      loss='binary_crossentropy',
      metrics=['accuracy'],
  )

  f1_callback = FCallback(validation = (input_val, y_valid), verbose=True)

  # Train for 10 epochs; the callback reports validation macro-F1 after each one.
  model.fit(input_train, y_train, batch_size=batch_size, validation_data=(input_val, y_valid), epochs=10, callbacks=[f1_callback])
  _, accuracy = model.evaluate(input_val, y_valid)

  # The test set is loaded by cells further down the notebook, so it may not
  # exist yet when this function runs on a fresh kernel.
  if test_data is None:
    notebook_vars = globals()
    if "input_test" in notebook_vars and "y_test" in notebook_vars:
      test_data = (notebook_vars["input_test"], notebook_vars["y_test"])

  if test_data is None:
    print("test set not available: skipping test F1")
  else:
    # NOTE(review): scoring the test set inside the hyperparameter search lets
    # test performance influence model selection — consider moving it out.
    test_inputs, test_labels = test_data
    y_test_pred = np.where(model.predict(test_inputs) > 0.5, 1, 0)

    print(y_test_pred)
    print(len(y_test_pred))

    print("f1_score test: {}".format(f1_score(test_labels, y_test_pred,average="macro")))
  return accuracy

In [169]:
def run(run_dir, hparams):
  """Execute one trial and write its hyperparameters and accuracy under ``run_dir``."""
  trial_writer = tf.summary.create_file_writer(run_dir)
  with trial_writer.as_default():
    # Log the hyperparameter values first, then the metric the trial produced.
    hp.hparams(hparams)
    val_accuracy = train_test_model(hparams)
    tf.summary.scalar(METRIC_ACCURACY, val_accuracy, step=1)

In [178]:
# Every combination to try: each unit count x the two ends of the dropout
# interval x each optimizer.
trial_grid = [
    (units, rate, opt)
    for units in HP_NUM_UNITS.domain.values
    for rate in (HP_DROPOUT.domain.min_value, HP_DROPOUT.domain.max_value)
    for opt in HP_OPTIMIZER.domain.values
]

session_num = 0
for num_units, dropout_rate, optimizer in trial_grid:
  hparams = {
      HP_NUM_UNITS: num_units,
      HP_DROPOUT: dropout_rate,
      HP_OPTIMIZER: optimizer,
  }
  # Each trial logs to its own sub-directory so runs can be compared side by side.
  run_name = "run-%d" % session_num
  print('--- Starting trial: %s' % run_name)
  print({h.name: hparams[h] for h in hparams})
  run('logs/hparam_tuning/' + run_name, hparams)
  session_num += 1


--- Starting trial: run-0
{'num_units': 64, 'dropout': 0.0, 'optimizer': 'nadam'}
Model: "model_34"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None, 64, 128)]    0                                            
__________________________________________________________________________________________________
bidirectional_34 (Bidirectional (None, 128)          98816       text[0][0]                       
__________________________________________________________________________________________________
other (InputLayer)              [(None, 6)]          0                                            
__________________________________________________________________________________________________
concatenate_34 (Concatenate)    (None, 134)          0           bidirectional_34[0][0]           
         

### Test phase

In [171]:
# Load the test set. The context manager closes the file handle as soon as
# pandas has parsed it, so it is not left open for the rest of the session.
with open(input_test_dir+"test_dataset_tweet.csv") as csv_test_file:
  testset = pd.read_csv(csv_test_file,sep=',')

In [172]:
# Embed the test token lists with the same truncate / pad length as the training data.
X_test = get_data_to_emb2(
    testset["tokens"],
    w2v,
    truncate=max_length,
    padding=True,
)

1263
[[-0.40181148  1.30801952 -0.19409141 ...  0.49553871 -0.02620392
   1.63770258]
 [ 1.86031258  0.98840606 -2.10821915 ...  1.14880133  0.14479998
  -0.10640591]
 [-1.32805312  0.75008422  0.24781393 ... -0.08247134 -0.89805609
  -0.75278544]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


In [173]:
# Auxiliary test features for the model's "other" input: every column of the
# test frame except the ones dropped here.
testset_other = testset.drop(['text', 'id', 'hs', 'stereotype','tokens', 'hashtags'], axis=1)
testset_other

Unnamed: 0,text_length,#C-L words,#?!,"#.,",#bad_words,%bad_words
0,180,4,1,4,0,0
1,227,5,4,5,0,0
2,259,2,2,4,1,2
3,99,7,0,2,0,0
4,257,87,2,0,0,0
...,...,...,...,...,...,...
1258,216,0,0,5,0,0
1259,159,81,3,1,1,3
1260,278,32,4,7,0,0
1261,128,0,1,3,0,0


In [174]:
# Test labels ('hs' column) and model inputs keyed by the Input layer names.
y_test = testset[['hs']]
input_test = {
    "text": X_test,
    "other": testset_other.values,
}
print(y_test)

      hs
0      1
1      1
2      1
3      1
4      1
...   ..
1258   1
1259   1
1260   1
1261   1
1262   1

[1263 rows x 1 columns]


In [175]:
# NOTE(review): `model` is only ever bound inside train_test_model(), so this
# cell raises NameError unless a trained model is exposed at notebook level first.
# The whole predict() output is thresholded, as in train_test_model(); indexing
# it with [0] first would keep only its first element (presumably a leftover
# from a multi-output model — confirm).
y_test_pred = np.where(model.predict(input_test) > 0.5, 1, 0)

NameError: ignored