In [1]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
import keras
from tqdm.notebook import tqdm
import pickle
from keras.models import Model
import keras.backend as K
from sklearn.metrics import confusion_matrix,f1_score,classification_report
# import matplotlib.pyplot as plt
from keras.callbacks import ModelCheckpoint
import itertools
from keras.models import load_model
from sklearn.utils import shuffle
# from transformers import * # this may screw things up later
from transformers import BertTokenizer, TFBertModel, BertConfig
# import emoji
# import torch

import spacy
nlp = spacy.load('en_core_web_sm')      

# Set CPU usage
tf.config.threading.set_intra_op_parallelism_threads(2)

In [2]:
def preprocess_text(text):
    if text == '':
        return ''
    else:
        text = text.lower()
        text_cleaned = re.sub(r'@[A-Za-z0-9_]+', '', text)
        text_cleaned = re.sub(r'#[A-Za-z0-9_]+', '', text_cleaned)
        text_cleaned = re.sub(r'https?:\/\/\S*', '', text_cleaned)
        text_cleaned = text_cleaned.replace(',', '')
        
        tokenized = nlp(text_cleaned)
        output_list = []
        for token in tokenized:
            if not token.is_stop:
                output_list.append(token.lemma_)
        
        output = ' '.join([x for x in output_list if x != ''])
        return output

In [3]:
def create_bert_tokenizer_model(num_classes):
    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)
    return bert_tokenizer, bert_model

In [4]:
def create_sentence_embeddings(sentences, bert_tokenizer):
    input_ids=[]
    attention_masks=[]

    for sent in sentences:
        bert_inp = bert_tokenizer.encode_plus(sent, add_special_tokens=True, max_length=64, pad_to_max_length=True,
                                            return_attention_mask = True)
        input_ids.append(bert_inp['input_ids'])
        attention_masks.append(bert_inp['attention_mask'])
        
    input_ids=np.asarray(input_ids)
    attention_masks=np.array(attention_masks)
    return input_ids, attention_masks

In [5]:
callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath='./output',save_weights_only=True,
                                                monitor='val_loss',mode='min',save_best_only=True)]

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)

2022-03-07 10:51:23.430335: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-03-07 10:51:23.430982: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-07 10:51:23.432435: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 19. Tune using inter_op_parallelism_threads for best performance.


In [6]:
def compile_fit_bert_model(bert_model, input_ids, attention_masks, labels, epochs):
    bert_model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
    bert_model.fit([input_ids, attention_masks], labels, batch_size=32,
                       epochs=epochs, callbacks=callbacks)
    return bert_model

### Civility Data

In [27]:
df_train = pd.read_csv('./civility_data/civility_data/train.tsv', sep='\t', encoding='utf-8')
df_train['preprocess_text'] = df_train['text'].apply(preprocess_text)
df_train['label_bin'] = df_train['label'].apply(lambda x: 0 if x=='OFF' else 1)
df_train.head()

Unnamed: 0,text,label,category,preprocess_text,label_bin
0,@USER @USER You are an embarrassing citizen!!,OFF,TIN,embarrassing citizen ! !,0
1,@USER Seems hard to believe that you stood nex...,OFF,TIN,hard believe stand guy wear short masturbate...,0
2,@USER @USER @USER Wow !!! no wonder the Libera...,OFF,TIN,wow ! ! ! wonder liberal get bad party bul...,0
3,@USER @USER And not all idiots grandstands lik...,OFF,TIN,idiot grandstand like,0
4,@USER Bring on the hypocrite gungrabber. MAGA,OFF,TIN,bring hypocrite gungrabber . maga,0


In [28]:
num_classes=len(df_train['label_bin'].unique())
civ_bert_tokenizer, civ_bert_model = create_bert_tokenizer_model(num_classes)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
sentences = df_train['preprocess_text']
civ_labels = df_train['label_bin']
civ_labels = np.array(civ_labels)
len(sentences), len(civ_labels)

(10592, 10592)

In [30]:
civ_input_ids, civ_attention_masks = create_sentence_embeddings(sentences, civ_bert_tokenizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [44]:
civ_bert_model = compile_fit_bert_model(civ_bert_model, civ_input_ids, civ_attention_masks, civ_labels, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [32]:
# Get civ dev data
df_dev = pd.read_csv('./civility_data/civility_data/dev.tsv', sep='\t', encoding='utf-8')
df_dev['preprocess_text'] = df_dev['text'].apply(preprocess_text)
df_dev['label_bin'] = df_dev['label'].apply(lambda x: 0 if x=='OFF' else 1)
df_dev.head()

Unnamed: 0,text,label,category,perspective_score,preprocess_text,label_bin
0,@USER She should ask a few native Americans wh...,OFF,UNT,0.311852,ask native americans .,0
1,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,0.566334,home drunk ! ! ! 👊 🇺 🇸 👊 url,0
2,Amazon is investigating Chinese employees who ...,NOT,,0.110361,amazon investigate chinese employee sell inter...,1
3,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,0.927032,"should'vetaken "" piece shit volcano . 😂 """,0
4,@USER @USER Obama wanted liberals &amp; illega...,NOT,,0.319764,obama want liberal & amp ; illegal red state,1


In [33]:
sentences_dev = df_dev['preprocess_text']
labels_dev = np.array(df_dev['label_bin'])

dev_input_ids, dev_attention_masks = create_sentence_embeddings(sentences_dev, civ_bert_tokenizer)
preds = civ_bert_model.predict([dev_input_ids, dev_attention_masks], batch_size=32)
pred_labels = preds['logits'].argmax(axis=1)
df_dev['pred'] = pred_labels
df_dev_classification = classification_report(df_dev['label_bin'].tolist(), df_dev['pred'].tolist(), output_dict=True)
pd.DataFrame(df_dev_classification)



Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.671642,0.815618,0.771903,0.74363,0.767771
recall,0.613636,0.850679,0.771903,0.732158,0.771903
f1-score,0.64133,0.83278,0.771903,0.737055,0.769156
support,440.0,884.0,0.771903,1324.0,1324.0


In [None]:
# civ test data
df_test = pd.read_csv('./civility_data/civility_data/test.tsv', sep='\t', encoding='utf-8')
df_test['preprocess_text'] = df_test['text'].apply(preprocess_text)
sentences_test = df_test['preprocess_text']

test_input_ids, test_attention_masks = create_sentence_embeddings(sentences_test, civ_bert_tokenizer)

test_preds = bert_model.predict([input_ids_test, attention_masks_test],batch_size=32)
test_pred_labels = test_preds['logits'].argmax(axis=1)
df_test['label'] = test_pred_labels
df_test.to_csv('./civility_data/civility_data/lynnette_ng_test_final.tsv', sep='\t', index=False)

## Gab data 

In [34]:
df_gab_test = pd.read_csv('/storage2/mamille3/data/hate_speech/gab_hate_corpus//gab_test.tsv', sep='\t')
df_gab_test['label_bin'] = df_gab_test['vo'].apply(lambda x: 1 if x==1 else 0)
df_gab_test['preprocess_text'] = df_gab_test['text'].apply(preprocess_text)
df_gab_test.head()

Unnamed: 0,text_id,text,purity,harm,im,cv,ex,degradation,fairness,hd,...,rel,sxo,rae,nat,pol,authority,vo,idl,label_bin,preprocess_text
0,29574085,People think bones are made of calcium They a...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,people think bone calcium bone high calciu...
1,37547580,Why hasn't this disgusting illegal Obama polic...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,disgusting illegal obama policy completely rem...
2,21542473,"Persecution for righteousness' sake, part 2: ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,persecution righteousness ' sake 2 :
3,26888094,Blasphemy will get you Everywhere.......,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,blasphemy .......
4,23041627,"I don't see protecting our borders, as ALL oth...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,protect border nation ! have resemble martial ...


In [35]:
sentences_gab = df_gab_test['preprocess_text']
input_ids_gab, attention_masks_gab = create_sentence_embeddings(sentences_gab, civ_bert_tokenizer)



In [37]:
# Predict with SemEval classifier 
gab_preds = civ_bert_model.predict([input_ids_gab, attention_masks_gab],batch_size=32)
gab_pred_labels = gab_preds['logits'].argmax(axis=1)
df_gab_test['label_pred'] = gab_pred_labels
df_dev_classification = classification_report(df_gab_test['label_bin'].tolist(), df_gab_test['label_pred'].tolist(), output_dict=True)
pd.DataFrame(df_dev_classification)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.794562,0.015104,0.21495,0.404833,0.744752
recall,0.217625,0.175758,0.21495,0.196691,0.21495
f1-score,0.341669,0.027818,0.21495,0.184744,0.321613
support,2417.0,165.0,0.21495,2582.0,2582.0


In [38]:
# Gab's own classifier
df_gab_train = pd.read_csv('./gab_data/gab_data/gab_train.tsv', sep='\t')
df_gab_train['label_bin'] = df_gab_train['vo'].apply(lambda x: 1 if x==1 else 0)
df_gab_train['preprocess_text'] = df_gab_train['text'].apply(preprocess_text)

num_gab_classes=len(df_gab_train['label_bin'].unique())
gab_bert_tokenizer, gab_bert_model = create_bert_tokenizer_model(num_classes)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
sentences_gab_train = df_gab_train['preprocess_text']
labels_gab_train = np.array(df_gab_train['label_bin'])

input_ids_train_gab, attention_masks_train_gab = create_sentence_embeddings(sentences_gab_train, gab_bert_tokenizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
gab_model = compile_fit_bert_model(gab_bert_model, input_ids_train_gab, attention_masks_train_gab, labels_gab_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5

In [None]:
gab_new_preds = gab_model.predict([input_ids_gab, attention_masks_gab],batch_size=32)
gab_new_pred_labels = gab_new_preds['logits'].argmax(axis=1)
df_gab_test['label_pred_new'] = gab_new_pred_labels
df_dev_classification = classification_report(df_gab_test['label_bin'].tolist(), df_gab_test['label_pred_new'].tolist(), output_dict=True)
pd.DataFrame(df_dev_classification)

## Contextual Abuse Dataset

In [7]:
# df_contextual_test = pd.read_csv('contextual_abuse_dataset/data/data/cad_v1_1_test.tsv', sep='\t')
df_contextual_test = pd.read_csv('/storage2/mamille3/data/hate_speech/contextual_abuse_dataset/cad_v1_1_test.tsv', sep='\t')
def cad_off_or_not(label):
    if label == 'Neutral':
        return 0
    else:
        return 1

df_contextual_test = df_contextual_test.dropna(subset=['text'])
df_contextual_test['label_bin'] = df_contextual_test['labels'].apply(cad_off_or_not)
df_contextual_test['preprocess_text'] = df_contextual_test['text'].apply(preprocess_text)

In [None]:
# Prediction using SemEval model
sentences_cad = df_contextual_test['preprocess_text']
input_ids_cad, attention_masks_cad = create_sentence_embeddings(sentences_cad, civ_bert_tokenizer)

In [None]:
cad_preds = civ_bert_model.predict([input_ids_cad, attention_masks_cad],batch_size=32)
cad_pred_labels = cad_preds['logits'].argmax(axis=1)
df_contextual_test['label_pred'] = cad_pred_labels
df_dev_classification = classification_report(df_contextual_test['label_bin'].tolist(), df_contextual_test['label_pred'].tolist(), output_dict=True)
pd.DataFrame(df_dev_classification)

In [8]:
# CAD own model
# df_contextual_train = pd.read_csv('contextual_abuse_dataset/data/data/cad_v1_1_train.tsv', sep='\t')
df_contextual_train = pd.read_csv('/storage2/mamille3/data/hate_speech/contextual_abuse_dataset/cad_v1_1_train.tsv', sep='\t')

df_contextual_train = df_contextual_train.dropna(subset=['text'])
df_contextual_train['label_bin'] = df_contextual_train['labels'].apply(cad_off_or_not) # did assign it to df_contextual_test (bug?)
df_contextual_train['preprocess_text'] = df_contextual_train['text'].apply(preprocess_text)

In [21]:
num_classes=len(df_contextual_train['label_bin'].unique()) # originally df_cad_train
cad_bert_tokenizer, cad_bert_model = create_bert_tokenizer_model(num_classes)

sentences_cad_train = df_contextual_train['preprocess_text']
labels_cad_train = df_contextual_train['label_bin']

input_ids_train_cad, attention_masks_train_cad = create_sentence_embeddings(sentences_cad_train, cad_bert_tokenizer)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [25]:
cad_model = compile_fit_bert_model(cad_bert_model, input_ids_train_cad, attention_masks_train_cad, labels_cad_train, epochs=5)

Epoch 1/5
  6/425 [..............................] - ETA: 20:52 - loss: 0.5809 - accuracy: 0.6553

KeyboardInterrupt: 

In [None]:
cad_new_preds = cad_model.predict([input_ids_cad, attention_masks_cad],batch_size=32)
cad_new_pred_labels = cad_new_preds['logits'].argmax(axis=1)
df_contextual_test['label_pred_new'] = cad_new_pred_labels
df_dev_classification = classification_report(df_contextual_test['label_bin'].tolist(), df_contextual_test['label_pred_new'].tolist(), output_dict=True)
pd.DataFrame(df_dev_classification)