<a href="https://colab.research.google.com/github/GiovanniSorice/Hate_Speech_Detection/blob/main/notebooks/Con1D_grid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1D convolution Hate Speech Classifier

In [1]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, SpatialDropout1D, LSTM
from tensorflow.keras.layers import Bidirectional # new! 
from tensorflow.keras.callbacks import ModelCheckpoint
import os
from sklearn.metrics import roc_auc_score 
import matplotlib.pyplot as plt
from keras.preprocessing.text import one_hot
from tensorboard.plugins.hparams import api as hp
import numpy as np
from sklearn.metrics import f1_score
import ast 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and embedding paths used below live under it.
# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Google Drive directories holding the training CSV and the preprocessed test CSVs.
input_dir = '/content/drive/My Drive/HLT/clean_dataset_training/' 
input_test_dir = "/content/drive/My Drive/HLT/dataset_test_evalita_preprocessed/"
# Do not truncate long cell contents when pandas displays a DataFrame.
pd.set_option("display.max_colwidth", None)

In [5]:
# Load the training set. The context manager closes the file handle as soon as
# pandas has finished reading, instead of leaving it open for the kernel's lifetime.
with open(input_dir+"training_dataset.csv") as tsv_file:
  dataset = pd.read_csv(tsv_file,sep=',')

### Vector-space embedding: 

In [6]:
p_val=0.15 # fraction of the development data held out for validation (passed as test_size)

# NOTE(review): n_dim, n_unique_words, pad_type and trunc_type are not referenced
# anywhere else in the visible notebook — presumably leftovers; confirm before removing.
n_dim = 64 
n_unique_words = 25000 
max_length = 64 # number of tokens kept per text: longer texts are truncated, shorter ones zero-padded
pad_type = trunc_type = 'pre'

# training 
batch_size = 64

#### Preprocess data 

In [7]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Tokenise every training text and collect all tokens (repetitions included) in corpus order.
# Iterating the "text" column directly avoids building a Series per row as iterrows() does.
all_words = [word for text in dataset["text"] for word in word_tokenize(text)]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
# Number of distinct tokens found in the training texts.
unique_words = set(all_words)
print(len(unique_words))

22525


In [9]:
# Lower-cased tokens for which the embedding model has no vector.
# Filled as a side effect of sentence_to_emb2.
parole_non_ric = set()
def sentence_to_emb2(sentence, w2v, truncate = None, padding = False):
  """Turn a tokenised sentence into a matrix with one word vector per row.

  Tokens are lower-cased before lookup. Tokens without a vector are skipped and
  recorded in the module-level set `parole_non_ric`.

  Args:
    sentence: iterable of token strings.
    w2v: word-vector model supporting `w2v[token]` and exposing its vocabulary
      as `vocab` (gensim < 4) or `key_to_index` (gensim >= 4).
    truncate: if not None, keep at most this many vectors (from the start).
    padding: if True, append all-zero rows until the matrix has `truncate` rows.

  Returns:
    np.ndarray built from the list of vectors (one row per kept token, plus
    padding rows when requested).

  Raises:
    ValueError: if `padding` is True but `truncate` is None, because the target
      length is then undefined.
  """
  global parole_non_ric
  if padding and truncate is None:
    raise ValueError("padding=True requires `truncate` to be set to the target length")

  # gensim 4 removed KeyedVectors.vocab in favour of key_to_index; accept either.
  vocab = getattr(w2v, "vocab", None)
  if vocab is None:
    vocab = w2v.key_to_index

  s_emb = []
  for word in sentence:
    token = word.lower()
    if token in vocab:
      s_emb.append(w2v[token])
    else:
      parole_non_ric.add(token)

  if truncate is not None:
    s_emb = s_emb[:truncate] #truncate
  if padding:
    # Pad rows must be as wide as the model's vectors; 128 is the width of the
    # embeddings used in this notebook and serves as the fallback.
    pad_token = [0] * getattr(w2v, "vector_size", 128)
    s_emb += [pad_token] * (truncate - len(s_emb))
  return np.array(s_emb)

def get_data_to_emb2(data, w2v, truncate = None, padding = False):
  """Embed a collection of serialised token lists.

  Args:
    data: iterable of strings, each a Python-literal token list; every string
      is parsed with `ast.literal_eval`.
    w2v: word-vector model, forwarded to `sentence_to_emb2`.
    truncate: maximum number of tokens per sentence, forwarded unchanged.
    padding: whether to zero-pad every sentence to `truncate` rows, forwarded unchanged.

  Returns:
    np.ndarray built from the list of per-sentence matrices.

  Side effects:
    Prints the number of sentences and, when there is at least one, the first
    embedded sentence as a quick visual check.
  """
  X = [sentence_to_emb2(ast.literal_eval(sentence), w2v, truncate, padding) for sentence in data]
  print(len(X))
  if X:  # an empty input has no first element to show
    print(X[0])
  return np.array(X)

In [10]:
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
w2v_felice_path = "/content/drive/My Drive/HLT/w2v/twitter128.bin"
# Load the pre-trained binary word2vec vectors straight from their path.
# gensim.test.utils.datapath resolves names inside gensim's bundled test-data
# folder, so it is not meant for user files; it is no longer used here.
w2v = KeyedVectors.load_word2vec_format(w2v_felice_path, binary=True)

In [11]:
# Embed every training token list: at most max_length word vectors per text, zero-padded to max_length rows.
X_dev = get_data_to_emb2(dataset["tokens"], w2v, max_length , True)

6837
[[ 1.47564483  0.12307259  1.0753547  ...  1.06197035  1.90046942
  -0.19663759]
 [-2.10587931  1.7696439  -1.04741096 ... -1.11571276 -0.25399542
  -0.97522277]
 [ 0.89639139  1.24942708  0.72824973 ...  0.68920714  0.98506999
  -0.36202168]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


In [12]:
# How many distinct lower-cased tokens had no vector in the embedding model so far.
len(parole_non_ric)

2159

In [13]:
# Keep only the remaining (non-text, non-label) columns; they feed the model's "other" input.
# The result is a new DataFrame: `dataset` itself is left unchanged.
dataset_other = dataset.drop(columns=['text', 'id', 'hs', 'stereotype','tokens', 'hashtags'])
dataset_other

Unnamed: 0,text_length,#C-L words,#?!,"#.,",#bad_words,%bad_words
0,120,10,0,5,0,0
1,101,0,0,0,1,6
2,86,8,0,1,3,25
3,118,0,0,2,0,0
4,138,0,1,1,1,4
...,...,...,...,...,...,...
6832,285,2,0,4,0,0
6833,277,0,2,3,0,0
6834,233,0,0,4,0,0
6835,206,2,0,2,0,0


In [14]:
# Split the embedded texts, the extra features and the 'hs' labels with one call so the
# three stay row-aligned; p_val is the validation fraction and the seed fixes the split.
(x_train, x_valid,
 x_train_extra, x_valid_extra,
 y_train, y_valid) = train_test_split(
    X_dev,
    dataset_other.values,
    dataset[['hs']],
    test_size=p_val,
    random_state=128,
)

In [15]:
# The dictionary keys must match the names of the model's Input layers ("text" and "other").
input_train = {"text": x_train, "other": x_train_extra}
input_val   = {"text": x_valid, "other": x_valid_extra}

In [16]:
# Running maximum of the token count per text; updated in place by comment_length().
max_sent = 0 

In [17]:

def comment_length(text):
    """Raise the global `max_sent` to this text's token count if it is larger.

    `text` is the string form of a Python list of tokens and is parsed with
    `ast.literal_eval`. Nothing is returned; the result is the side effect on
    the module-level `max_sent`.
    """
    global max_sent
    n_tokens = len(ast.literal_eval(text))
    max_sent = max(max_sent, n_tokens)

In [18]:
# Scan every token list to find the longest text; comparing the printed value with
# max_length shows how much the longest text is cut by truncation.
dataset['tokens'].apply(comment_length)
print(max_sent)

121


In [19]:
# Sanity check: the second dimension must equal the size of the model's "other" input.
x_train_extra.shape

(5811, 6)

### Design grid search parameters

In [20]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

# Clear any logs from previous runs
!rm -rf ./logs/

import tensorflow as tf
from tensorboard.plugins.hparams import api as hp

In [21]:
# Hyper-parameter search space. The string given to hp.HParam is the label under
# which the value is logged and shown in the TensorBoard HParams dashboard.
HP_NUM_FILTERS = hp.HParam('num_filters', hp.Discrete([1024]))
# L2 coefficient applied to the convolution kernels. It is labelled 'L2_conv'
# because it is a regularisation strength, not a dropout rate.
HP_L2_CONVOLUTION = hp.HParam('L2_conv', hp.RealInterval(0.0, 0.0008))
# L2 coefficient applied to the dense layers.
HP_L2_DENSE = hp.HParam('L2_reg', hp.RealInterval(0.0, 0.0008))
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['nadam']))

# Tag of the scalar summary written once per trial.
METRIC_ACCURACY = 'accuracy'

# Register the search space and the metric so the dashboard knows the columns to show.
with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
  hp.hparams_config(
    hparams=[HP_NUM_FILTERS, HP_L2_CONVOLUTION, HP_L2_DENSE, HP_OPTIMIZER],
    metrics=[hp.Metric(METRIC_ACCURACY, display_name='Accuracy')],
  )

In [22]:
class FCallback(tf.keras.callbacks.Callback):
    """Keras callback that computes the macro-averaged F1 score after every epoch.

    Predictions on the supplied validation inputs are thresholded at 0.5 and
    compared with the validation labels. The score is stored in `logs['val_f1']`
    (when Keras provides a logs dict) and appended to `self.val_f1`.

    Args:
      validation: pair `(inputs, labels)`; `inputs` is whatever `model.predict`
        accepts and `labels` the matching ground truth.
      verbose: when greater than 0, print the score after each epoch.
    """

    def __init__(self, validation = (), verbose = 0):
        # Let the Keras base class set up its own attributes.
        super().__init__()
        self.validation = validation
        self.verbose = verbose
        self.f1 = []
        self.val_f1 = []

    def on_train_begin(self, logs=None):
        # Start every training run with empty histories.
        # `f1` is kept for compatibility; nothing in this class fills it.
        self.f1 = []
        self.val_f1 = []

    def on_epoch_end(self, epoch, logs=None):
        y_t =  self.validation[1]
        # Sigmoid outputs above 0.5 count as the positive class.
        y_p =  np.where(self.model.predict(self.validation[0], verbose=0) > 0.5, 1, 0)
        val_f1 = f1_score(y_t, y_p, average='macro')
        self.val_f1.append(val_f1)
        # Keras normally passes a dict; guard against a direct call with logs=None.
        if logs is not None:
            logs['val_f1'] = val_f1
        if self.verbose >0:
          print("— val_f1: {}".format(val_f1))

In [23]:
def get_model(hparams, kernel_sizes=(3, 4, 5), embedding_dim=128, n_extra_features=6):
  """Build the uncompiled two-input CNN classifier.

  The "text" input is a (max_length, embedding_dim) matrix of word vectors. For
  every entry k of `kernel_sizes`, a Conv2D whose kernel spans k consecutive
  tokens and the full embedding width is applied, followed by a max-pool over
  all positions so that one value per filter remains. The pooled features are
  flattened, passed through dropout, concatenated with the "other" input and fed
  to three ReLU dense layers and a single sigmoid output unit.

  Reads the notebook-level global `max_length`.

  Args:
    hparams: mapping indexed by HP_NUM_FILTERS (filters per convolution),
      HP_L2_CONVOLUTION (L2 coefficient of the convolution kernels) and
      HP_L2_DENSE (L2 coefficient of the dense layers).
    kernel_sizes: number of consecutive tokens covered by each convolution branch.
    embedding_dim: width of the word vectors in the "text" input.
    n_extra_features: size of the "other" input.

  Returns:
    tf.keras.Model with inputs named "text" and "other" and one sigmoid output.
  """
  num_filters = hparams[HP_NUM_FILTERS]
  l2_conv = hparams[HP_L2_CONVOLUTION]
  l2_dense = hparams[HP_L2_DENSE]

  conv1D_in = tf.keras.layers.Input(name="text", shape=(max_length, embedding_dim))
  # Conv2D expects a channel axis, so add a trailing axis of size 1.
  reshaped = tf.keras.layers.Reshape((max_length, embedding_dim, 1))(conv1D_in)

  conv_outputs = [
      tf.keras.layers.Conv2D(
          num_filters,
          kernel_size=(k, embedding_dim),
          padding='valid',
          kernel_initializer='normal',
          activation='relu',
          kernel_regularizer=tf.keras.regularizers.l2(l2_conv),
      )(reshaped)
      for k in kernel_sizes
  ]
  # With 'valid' padding a size-k kernel yields max_length - k + 1 positions;
  # pooling over all of them keeps the strongest response of each filter.
  pooled = [
      tf.keras.layers.MaxPool2D(pool_size=(max_length - k + 1, 1), strides=(1, 1), padding='valid')(conv)
      for k, conv in zip(kernel_sizes, conv_outputs)
  ]
  # Concatenate needs at least two tensors; a single branch is used as is.
  merged = pooled[0] if len(pooled) == 1 else tf.keras.layers.Concatenate(axis=1)(pooled)
  flattened = tf.keras.layers.Flatten()(merged)
  dropped = tf.keras.layers.Dropout(0.5)(flattened)

  # Second input: the extra per-text features, joined with the CNN features.
  other_in = tf.keras.layers.Input(name="other", shape=(n_extra_features,))
  hidden = tf.keras.layers.Concatenate(axis=1)([dropped, other_in])

  for units in (256, 128, 32):
    hidden = tf.keras.layers.Dense(units, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(l2_dense))(hidden)

  output = tf.keras.layers.Dense(units=1, activation='sigmoid')(hidden)

  return tf.keras.Model(inputs = [conv1D_in, other_in], outputs = output)

In [24]:
def train_test_model(hparams, epochs=20):
  """Train one model on the fixed train/validation split and report its scores.

  Reads these notebook-level globals: `input_train`, `y_train`, `input_val`,
  `y_valid`, `batch_size`, and the test-set objects `input_test_tweets`,
  `y_test_tweets`, `input_test_news`, `y_test_news`. The test-set objects are
  created in the "Test phase" cells, which must have been run before this
  function is called.

  NOTE(review): the test F1 scores are printed for every trial; they should not
  be used to choose hyper-parameters, otherwise the test sets stop being held out.

  Args:
    hparams: mapping indexed by the HP_* objects; HP_OPTIMIZER selects the
      optimizer, the others are consumed by `get_model`.
    epochs: number of training epochs.

  Returns:
    Accuracy on the validation split, as reported by `model.evaluate`.
  """
  model = get_model(hparams)
  model.compile(
      optimizer=hparams[HP_OPTIMIZER],
      loss='binary_crossentropy',
      metrics=['accuracy'],
  )

  # Adds the macro F1 on the validation split to the per-epoch report.
  f1_callback = FCallback(validation = (input_val, y_valid), verbose=True)

  model.fit(input_train, y_train, batch_size=batch_size, validation_data=(input_val, y_valid), epochs=epochs, callbacks=[f1_callback])
  # evaluate returns [loss, accuracy] because 'accuracy' is the only compiled metric.
  _, accuracy = model.evaluate(input_val, y_valid)

  # Threshold the sigmoid outputs at 0.5 to get hard labels for the two test sets.
  y_test_pred_tweets = np.where(model.predict(input_test_tweets) > 0.5, 1, 0)
  y_test_pred_news = np.where(model.predict(input_test_news) > 0.5, 1, 0)

  print("f1_score test tweets: {}".format(f1_score(y_test_tweets, y_test_pred_tweets,average="macro")))
  print("f1_score test news: {}".format(f1_score(y_test_news, y_test_pred_news,average="macro")))
  return accuracy

In [25]:
def run(run_dir, hparams):
  """Run one hyper-parameter trial and log it under `run_dir`.

  The trial's hyper-parameter values and the validation accuracy returned by
  `train_test_model` are written as summaries to `run_dir`.
  """
  trial_writer = tf.summary.create_file_writer(run_dir)
  with trial_writer.as_default():
    # Record which values this trial used.
    hp.hparams(hparams)
    val_accuracy = train_test_model(hparams)
    tf.summary.scalar(METRIC_ACCURACY, val_accuracy, step=1)

In [41]:
def _l2_grid(domain, step=0.0002):
  """Evenly spaced values covering [domain.min_value, domain.max_value) in `step` increments.

  np.arange with a non-integer step can return one element more or less than
  expected because of floating-point rounding, so the number of points is
  computed explicitly and np.linspace is used instead. Values are returned as
  plain Python floats so they log and print uniformly.
  """
  n_points = int(round((domain.max_value - domain.min_value) / step))
  return [float(v) for v in np.linspace(domain.min_value, domain.max_value, num=n_points, endpoint=False)]

session_num = 0

# Exhaustive grid search: one training run per combination of the four hyper-parameters.
for num_filters in HP_NUM_FILTERS.domain.values:
  for L2_rate_conv in _l2_grid(HP_L2_CONVOLUTION.domain):
      for L2_rate_dense in _l2_grid(HP_L2_DENSE.domain):
        for optimizer in HP_OPTIMIZER.domain.values:
          hparams = {
              HP_NUM_FILTERS: num_filters,
              HP_L2_CONVOLUTION: L2_rate_conv,
              HP_OPTIMIZER: optimizer,
              HP_L2_DENSE: L2_rate_dense,
          }
          run_name = "run-%d" % session_num
          print('--- Starting trial: %s' % run_name)
          print({h.name: hparams[h] for h in hparams})
          # Each trial logs to its own sub-directory so the dashboard shows it as a separate run.
          run('logs/hparam_tuning/' + run_name, hparams)
          session_num += 1


--- Starting trial: run-0
{'num_filters': 1024, 'dropout': 0.0, 'optimizer': 'nadam', 'L2_reg': 0.0}
Epoch 1/20
— val_f1: 0.7098416289592759
Epoch 2/20
— val_f1: 0.7112859495081245
Epoch 3/20
— val_f1: 0.7302031866909802
Epoch 4/20
— val_f1: 0.7036935704514364
Epoch 5/20
— val_f1: 0.7477488868618083
Epoch 6/20
— val_f1: 0.5535178236397749
Epoch 7/20
— val_f1: 0.7484197907585004
Epoch 8/20
— val_f1: 0.7325678974671956
Epoch 9/20
— val_f1: 0.7593835044324091
Epoch 10/20
— val_f1: 0.724880291131967
Epoch 11/20
— val_f1: 0.7595185126330843
Epoch 12/20
— val_f1: 0.7513665978070873
Epoch 13/20
— val_f1: 0.7567019126107914
Epoch 14/20
— val_f1: 0.7398510689406679
Epoch 15/20
— val_f1: 0.7449560925126446
Epoch 16/20
— val_f1: 0.7500488678545808
Epoch 17/20
— val_f1: 0.7127606382225455
Epoch 18/20
— val_f1: 0.7568699040714768
Epoch 19/20
— val_f1: 0.7527790271798498
Epoch 20/20
— val_f1: 0.7412229742860332
f1_score test tweets: 0.740507009121188
f1_score test news: 0.6267232941254541
--- Starti

### Test phase

In [26]:
# Load the two test sets (tweets and news). Each file is opened in a context
# manager so its handle is closed as soon as pandas has finished reading.
with open(input_test_dir+"test_dataset_tweets.csv") as csv_test_tweets_file:
  testset_tweets = pd.read_csv(csv_test_tweets_file,sep=',')

with open(input_test_dir+"test_dataset_news.csv") as csv_test_news_file:
  testset_news = pd.read_csv(csv_test_news_file,sep=',')

In [27]:
# Embed the test token lists with the same settings as the training data (max_length rows, zero-padded).
X_test_news = get_data_to_emb2(testset_news["tokens"], w2v, max_length , True)
X_test_tweets = get_data_to_emb2(testset_tweets["tokens"], w2v, max_length , True)

500
[[ 0.4473033  -1.85372221  1.80903184 ... -0.81329495 -0.37441084
   0.67255157]
 [ 0.65353721  2.91942644  0.83319777 ...  0.10994434  0.79807943
  -0.64684689]
 [ 0.27626377  0.63812095  1.54855597 ... -0.60746866  1.20815647
   0.78223377]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]
1263
[[ 0.85021353  0.94971675  0.95175791 ...  0.66373271  0.95456082
   0.80749106]
 [ 1.86031258  0.98840606 -2.10821915 ...  1.14880133  0.14479998
  -0.10640591]
 [-1.32805312  0.75008422  0.24781393 ... -0.08247134 -0.89805609
  -0.75278544]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


In [28]:
# Keep only the remaining (non-text, non-label) columns of the tweets test set for the "other" input.
# The result is a new DataFrame: `testset_tweets` itself is left unchanged.
testset_tweets_other = testset_tweets.drop(columns=['text', 'id', 'hs', 'stereotype','tokens', 'hashtags'])
testset_tweets_other

Unnamed: 0,text_length,#C-L words,#?!,"#.,",#bad_words,%bad_words
0,180,4,1,4,0,0
1,227,5,4,5,0,0
2,259,2,2,4,1,2
3,99,7,0,2,0,0
4,257,87,2,0,0,0
...,...,...,...,...,...,...
1258,216,0,0,5,0,0
1259,159,81,3,1,1,3
1260,278,32,4,7,1,2
1261,128,0,1,3,0,0


In [29]:
# Keep only the remaining (non-text, non-label) columns of the news test set for the "other" input.
# The result is a new DataFrame: `testset_news` itself is left unchanged.
testset_news_other = testset_news.drop(columns=['text', 'id', 'hs', 'stereotype','tokens', 'hashtags'])
testset_news_other

Unnamed: 0,text_length,#C-L words,#?!,"#.,",#bad_words,%bad_words
0,102,0,1,5,0,0
1,108,0,0,1,0,0
2,48,0,0,0,0,0
3,112,0,0,5,0,0
4,117,0,0,6,0,0
...,...,...,...,...,...,...
495,80,0,0,2,0,0
496,60,0,0,0,0,0
497,86,0,0,2,0,0
498,92,0,0,0,0,0


In [30]:
# Model inputs (keyed by Input layer name) and 'hs' labels for the tweets test set.
input_test_tweets   = {"text": X_test_tweets, "other": testset_tweets_other.values}
y_test_tweets = testset_tweets[['hs']]


In [31]:
# Model inputs (keyed by Input layer name) and 'hs' labels for the news test set.
input_test_news   = {"text": X_test_news, "other": testset_news_other.values}
y_test_news = testset_news[['hs']]


In [32]:
# Full development data (no hold-out) read by the k-fold training function.
# x_other_kfold and y_kfold are pandas DataFrames; x_kfold is the embedded-text array.
x_kfold = X_dev
x_other_kfold = dataset_other
y_kfold = dataset[['hs']]

In [37]:
def train_test_model_with_kfold(hparams, number_of_splits=5, epochs=20):
  """Train one model per fold of a stratified k-fold split and return them all.

  Reads the notebook-level globals `x_kfold` (embedded texts), `x_other_kfold`
  (DataFrame of extra features), `y_kfold` (DataFrame with the labels) and
  `batch_size`.

  Args:
    hparams: mapping indexed by the HP_* objects; HP_OPTIMIZER selects the
      optimizer, the others are consumed by `get_model`.
    number_of_splits: number of folds, hence of trained models.
    epochs: number of training epochs per fold.

  Returns:
    List with the trained model of every fold, in fold order.
  """
  cv_kfold = StratifiedKFold(n_splits=number_of_splits, shuffle=True, random_state=100)
  models = []
  for train_index, validation_index in cv_kfold.split(x_kfold, y_kfold):
    model = get_model(hparams)
    model.compile(
        optimizer=hparams[HP_OPTIMIZER],
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    # split() yields positional row numbers, so the DataFrames are indexed with
    # .iloc (label-based .loc is only equivalent while the index is 0..n-1).
    # .values hands Keras plain arrays, as the fixed-split inputs do.
    input_train_kfold = {"text": x_kfold[train_index], "other": x_other_kfold.iloc[train_index].values}
    input_val_kfold   = {"text": x_kfold[validation_index], "other": x_other_kfold.iloc[validation_index].values}
    y_train_kfold = y_kfold.iloc[train_index].values
    y_valid_kfold = y_kfold.iloc[validation_index].values

    # Adds the macro F1 on this fold's validation part to the per-epoch report.
    f1_callback = FCallback(validation = (input_val_kfold, y_valid_kfold), verbose=True)

    model.fit(input_train_kfold, y_train_kfold, batch_size=batch_size, validation_data=(input_val_kfold, y_valid_kfold), epochs=epochs, callbacks=[f1_callback])
    # Report the final loss/accuracy on this fold's validation part.
    model.evaluate(input_val_kfold, y_valid_kfold)

    models.append(model)

  return models

In [34]:
def predict_with_ensemble(models, test_input):
  """Combine binary classifiers by majority vote.

  Every model's output is thresholded at 0.5 to obtain a 0/1 vote per sample.
  A sample is labelled 1 when strictly more than half of the models vote 1;
  ties therefore resolve to 0.

  Args:
    models: non-empty sequence of objects with a `predict(test_input)` method
      returning one score per sample.
    test_input: input forwarded unchanged to every model's `predict`.

  Returns:
    List of 0/1 integers, one per sample.

  Raises:
    ValueError: if `models` is empty.
  """
  if len(models) == 0:
    raise ValueError("predict_with_ensemble needs at least one model")

  # One row of votes per model. reshape(-1) keeps a 1-D row even when there is
  # a single sample, so the stacked array is always (n_models, n_samples).
  votes = np.stack([
      (np.asarray(model.predict(test_input)) > 0.5).astype(int).reshape(-1)
      for model in models
  ])
  positive_votes = votes.sum(axis=0)
  return (2 * positive_votes > len(models)).astype(int).tolist()

In [35]:
def run_with_kfold(run_dir, hparams):
  """Train the k-fold ensemble, print its test scores and return the models.

  The hyper-parameter values are logged under `run_dir`. The fold models come
  from `train_test_model_with_kfold`; their majority-vote predictions on the
  notebook-level tweets and news test sets are scored with macro-averaged F1.
  """
  summary_writer = tf.summary.create_file_writer(run_dir)
  with summary_writer.as_default():
    # Record which values this trial used.
    hp.hparams(hparams)
    models = train_test_model_with_kfold(hparams)

    y_test_pred_tweets = predict_with_ensemble(models, input_test_tweets)
    y_test_pred_news = predict_with_ensemble(models, input_test_news)

    f1_tweets = f1_score(y_test_tweets, y_test_pred_tweets,average="macro")
    print("f1_score test tweets: {}".format(f1_tweets))
    f1_news = f1_score(y_test_news, y_test_pred_news,average="macro")
    print("f1_score test news: {}".format(f1_news))
    return models


In [40]:
# Re-score an already trained ensemble on both test sets.
# NOTE(review): `models` is only assigned by the run_with_kfold cell further down, so in
# top-to-bottom execution this cell raises NameError (as its recorded output shows).
# run_with_kfold prints the same two scores; move this cell after that one or drop it.
y_test_pred_tweets = predict_with_ensemble(models, input_test_tweets)
y_test_pred_news = predict_with_ensemble(models, input_test_news)

print("f1_score test tweets: {}".format(f1_score(y_test_tweets, y_test_pred_tweets,average="macro")))
print("f1_score test news: {}".format(f1_score(y_test_news, y_test_pred_news,average="macro")))


NameError: ignored

In [45]:
# A single hyper-parameter configuration (not a grid) used for the k-fold ensemble run.
hparams = {
  HP_NUM_FILTERS: 1024,
  HP_L2_CONVOLUTION: 0.0,
  HP_OPTIMIZER: 'nadam',
  HP_L2_DENSE: 0.0002,
}
run_name = "run-test" 
print('--- Starting trial: %s' % run_name)
print({h.name: hparams[h] for h in hparams})
# Trains one model per fold; the returned list is kept in `models` for later cells.
models = run_with_kfold('logs/hparam_tuning/' + run_name, hparams)


--- Starting trial: run-test
{'num_filters': 1024, 'dropout': 0.0, 'optimizer': 'nadam', 'L2_reg': 0.0002}
Epoch 1/20
— val_f1: 0.676961410414791
Epoch 2/20
— val_f1: 0.6966100290981186
Epoch 3/20
— val_f1: 0.49448594032837756
Epoch 4/20
— val_f1: 0.7426004460473083
Epoch 5/20
— val_f1: 0.7537436596260125
Epoch 6/20
— val_f1: 0.7457920525785597
Epoch 7/20
— val_f1: 0.756670911128867
Epoch 8/20
— val_f1: 0.7573797115933752
Epoch 9/20
— val_f1: 0.7427280467085267
Epoch 10/20
— val_f1: 0.45002831389946535
Epoch 11/20
— val_f1: 0.7359576857989958
Epoch 12/20
— val_f1: 0.7516454317374149
Epoch 13/20
— val_f1: 0.7469229346170347
Epoch 14/20
— val_f1: 0.6841813648536337
Epoch 15/20
— val_f1: 0.7463018967570287
Epoch 16/20
— val_f1: 0.6844551456004104
Epoch 17/20
— val_f1: 0.725928533708854
Epoch 18/20
— val_f1: 0.755602904040404
Epoch 19/20
— val_f1: 0.7393436789339305
Epoch 20/20
— val_f1: 0.6215349690648599
Epoch 1/20
— val_f1: 0.6107003069949759
Epoch 2/20
— val_f1: 0.7059989607190285
Epoc

In [46]:
# Save every fold model in HDF5 format; the folder name encodes the hyper-parameter
# values and the file name carries the fold number.
for fold_idx, fold_model in enumerate(models):
  fold_model.save(input_dir+"model_output/kimCNN/{0}_{1}_{2}_{3}/model_{4}.h5".format(hparams[HP_NUM_FILTERS],hparams[HP_L2_CONVOLUTION],hparams[HP_OPTIMIZER],hparams[HP_L2_DENSE],fold_idx))

In [None]:
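# Scores noted down from an earlier run (presumably the k-fold ensemble — TODO confirm which run):
# f1_score test tweets: 0.7603277015648642
# f1_score test news: 0.6831647909673162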