<a href="https://colab.research.google.com/github/GiovanniSorice/Hate_Speech_Detection/blob/main/LSTM_grid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bidirectional LSTM Hate Speech Classifier

In [1]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, SpatialDropout1D, LSTM
from tensorflow.keras.layers import Bidirectional # new! 
from tensorflow.keras.callbacks import ModelCheckpoint
import os
from sklearn.metrics import roc_auc_score 
import matplotlib.pyplot as plt
from keras.preprocessing.text import one_hot
from tensorboard.plugins.hparams import api as hp

In [2]:
# Mount Google Drive at /content/drive so files stored on Drive can be read
# through ordinary filesystem paths.
from google.colab import drive
# This will prompt for authorization.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Directory on the mounted Drive that contains the training CSV.
input_dir = '/content/drive/My Drive/HLT/clean_dataset_training/' 

# Display option: never truncate column contents when pandas renders a frame.
pd.set_option("display.max_colwidth", None)

In [4]:
# Load the training set. The handle is managed by a `with` block so it is
# closed as soon as pandas has parsed the file instead of staying open for the
# lifetime of the kernel.
with open(input_dir+"training_dataset.csv") as tsv_file:
    dataset = pd.read_csv(tsv_file, sep=',')

### Vector-space embedding: 

In [5]:
p_train=0.85 # fraction (0-1) of the rows used for training; the remainder is used for validation

n_dim = 64  # passed to the Embedding layer as its output dimension
n_unique_words = 20000  # size of the integer index space for text encoding; also the Embedding input dimension
max_length = 100 # every encoded text is padded/truncated to this many indices
pad_type = trunc_type = 'pre' # both padding and truncation are applied at the start of a sequence

# training 
batch_size = 64

#### Preprocess data 

In [6]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Flatten the tokens of every text into a single list. Iterating the "text"
# column directly avoids DataFrame.iterrows(), which materialises a Series for
# every row only to read one field from it.
all_words = [
    word
    for text in dataset["text"]
    for word in word_tokenize(text)
]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
# Number of distinct tokens found by word_tokenize across all texts.
unique_words = set(all_words)
print(len(unique_words))

24773


In [8]:
# Encode every text as a list of integer word indices below n_unique_words.
# one_hot() hashes words with Python's built-in hash(), which is salted per
# interpreter process for strings, so the word -> index mapping would change on
# every kernel restart (making runs non-reproducible and saved models unusable
# in a new session). hashing_trick() applies the same filtering, lower-casing
# and splitting defaults but, with hash_function='md5', yields a stable mapping.
embedded_sentences = [
    tf.keras.preprocessing.text.hashing_trick(text, n_unique_words, hash_function='md5')
    for text in dataset["text"]
]
print(len(embedded_sentences))

6837


In [9]:
def word_count(sentence):
  """Return how many tokens word_tokenize produces for `sentence`."""
  return len(word_tokenize(sentence))

# The text with the highest token count, and that count.
longest_sentence = max(dataset.get("text"), key=word_count)
length_long_sentence = word_count(longest_sentence)

In [10]:
# Positional (unshuffled) split: the first p_train share of the rows is used
# for training, everything after it for validation.
n_train = int(dataset.shape[0]*p_train)
labels = dataset[['hs']]

x_train, x_valid = embedded_sentences[:n_train], embedded_sentences[n_train:]
y_train, y_valid = labels[:n_train], labels[n_train:]

In [11]:
# Bring both splits to a fixed width of max_length indices, filling with 0.
pad_options = dict(maxlen=max_length, padding=pad_type, truncating=trunc_type, value=0)
x_train = pad_sequences(x_train, **pad_options)
x_valid = pad_sequences(x_valid, **pad_options)

In [12]:
# Running maximum of token-list lengths; updated as a global by comment_length().
max_sent = 0 

In [13]:
import ast 

def comment_length(text):
    """Update the global ``max_sent`` with the length of one parsed literal.

    ``text`` is a string holding a Python literal (for example a list of
    tokens). It is parsed with ``ast.literal_eval`` and, if the parsed value
    is longer than the current ``max_sent``, ``max_sent`` is raised to that
    length. The function returns nothing.
    """
    global max_sent
    n_items = len(ast.literal_eval(text))
    max_sent = max(max_sent, n_items)

In [14]:
# Run comment_length over every entry of the 'tokens' column (each entry must
# be a string that ast.literal_eval can parse) and report the longest length.
dataset['tokens'].apply(comment_length)
print(max_sent)

119


In [21]:
# Sanity check: (number of training rows, max_length) after padding.
x_train.shape

(5811, 100)

### Design grid search parameters

In [15]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

# Clear any logs from previous runs
# NOTE: this permanently deletes ./logs/, including results of earlier sweeps.
!rm -rf ./logs/

# Both names are already imported in the first cell; these lines are redundant.
import tensorflow as tf
from tensorboard.plugins.hparams import api as hp

In [17]:
# Search space. Each HParam object doubles as the dictionary key under which a
# trial's value is stored.
HP_NUM_UNITS = hp.HParam('num_units', hp.Discrete([64, 128, 256, 512]))
# Declared as a continuous interval; the sweep loop only tries its two endpoints.
HP_DROPOUT = hp.HParam('dropout', hp.RealInterval(0.2, 0.7))
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['nadam']))

# Tag under which each trial's accuracy is written as a summary scalar.
METRIC_ACCURACY = 'accuracy'

# Register the hyperparameters and the metric once at the root log directory.
with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
  hp.hparams_config(
    hparams=[HP_NUM_UNITS, HP_DROPOUT, HP_OPTIMIZER],
    metrics=[hp.Metric(METRIC_ACCURACY, display_name='Accuracy')],
  )

In [32]:
def train_test_model(hparams, epochs=10):
  """Build, train and evaluate one stacked bidirectional-LSTM classifier.

  The network is: Input -> Embedding -> SpatialDropout1D -> BiLSTM (returning
  sequences) -> BiLSTM -> Dense(1, sigmoid). It is trained on the notebook-level
  x_train / y_train and validated on x_valid / y_valid.

  Args:
    hparams: mapping keyed by the HP_* HParam objects; HP_NUM_UNITS,
      HP_DROPOUT and HP_OPTIMIZER are read.
    epochs: number of training epochs. Defaults to 10; pass a smaller value
      for a quick smoke run.

  Returns:
    The accuracy reported by model.evaluate on (x_valid, y_valid).
  """
  num_units = hparams[HP_NUM_UNITS]
  dropout_rate = hparams[HP_DROPOUT]  # shared by the spatial dropout and both LSTMs

  lstml1_in = tf.keras.layers.Input(name="text", shape =(max_length,))
  lstml1_embedded = tf.keras.layers.Embedding(n_unique_words, n_dim, input_length=max_length)(lstml1_in)
  lstml1_sd = tf.keras.layers.SpatialDropout1D(dropout_rate)(lstml1_embedded)
  # The first recurrent layer returns the full sequence so a second one can be stacked on it.
  lstml1_bd1 = tf.keras.layers.Bidirectional(
      tf.keras.layers.LSTM(num_units, dropout=dropout_rate, return_sequences=True))(lstml1_sd)
  lstml1_bd2 = tf.keras.layers.Bidirectional(
      tf.keras.layers.LSTM(num_units, dropout=dropout_rate))(lstml1_bd1)
  lstml1_out = tf.keras.layers.Dense(1, activation='sigmoid')(lstml1_bd2)

  model = tf.keras.Model(lstml1_in, lstml1_out)

  model.compile(
      optimizer=hparams[HP_OPTIMIZER],
      loss='binary_crossentropy',
      metrics=['accuracy'],
  )
  model.fit(x_train, y_train, batch_size=batch_size, validation_data=(x_valid, y_valid), epochs=epochs)
  # evaluate() returns [loss, accuracy] because a single metric was compiled in.
  _, accuracy = model.evaluate(x_valid, y_valid)
  return accuracy

In [33]:
def run(run_dir, hparams):
  """Execute one trial and log it to TensorBoard under `run_dir`.

  The hyperparameter values are recorded first, then the model is trained and
  its validation accuracy is written as a scalar under METRIC_ACCURACY.
  """
  writer = tf.summary.create_file_writer(run_dir)
  with writer.as_default():
    # Log the configuration before training starts.
    hp.hparams(hparams)
    trial_accuracy = train_test_model(hparams)
    tf.summary.scalar(METRIC_ACCURACY, trial_accuracy, step=1)

In [34]:
# Exhaustive sweep: every unit count x both ends of the dropout interval x
# every optimizer. Each combination is trained once and logged to its own
# run directory below logs/hparam_tuning/.
session_num = 0
dropout_endpoints = (HP_DROPOUT.domain.min_value, HP_DROPOUT.domain.max_value)

for num_units in HP_NUM_UNITS.domain.values:
  for dropout_rate in dropout_endpoints:
    for optimizer in HP_OPTIMIZER.domain.values:
      hparams = dict(zip(
          (HP_NUM_UNITS, HP_DROPOUT, HP_OPTIMIZER),
          (num_units, dropout_rate, optimizer),
      ))
      run_name = "run-%d" % session_num
      print('--- Starting trial: %s' % run_name)
      print({param.name: value for param, value in hparams.items()})
      run('logs/hparam_tuning/' + run_name, hparams)
      session_num += 1


--- Starting trial: run-0
{'num_units': 64, 'dropout': 0.2, 'optimizer': 'nadam'}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
--- Starting trial: run-1
{'num_units': 64, 'dropout': 0.7, 'optimizer': 'nadam'}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
--- Starting trial: run-2
{'num_units': 128, 'dropout': 0.2, 'optimizer': 'nadam'}
Epoch 1/10


KeyboardInterrupt: ignored