# Connect to drive, load data and requirements

Install requirements (restart your runtime after installation)

In [None]:
 !pip install simpletransformers

Mount google drive with data


In [None]:
from google.colab import drive

# Mount Google Drive in the Colab runtime; force_remount=True is passed so an
# already-mounted drive is mounted again.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT, force_remount=True)


In [None]:
# Switch into the assignment folder on the mounted drive; the relative paths used
# in later cells (e.g. 'data/', 'outputs_in/') are resolved from this directory.
%cd /content/drive/MyDrive/Colab Notebooks/SM_Assignment3

Load requirements

In [None]:
import pandas as pd
from simpletransformers.classification import ClassificationModel
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
import torch
import seaborn as sns
import matplotlib.pyplot as plt


Load OLID data

In [None]:
# Read the OLID train and test splits from the data folder.
data_dir = 'data/'
olid_train = pd.read_csv(data_dir + 'olid-train-small.csv', sep=',')
olid_test = pd.read_csv(data_dir + 'olid-test.csv', sep=',')

# Only the last expression of a cell is rendered automatically, so the training
# preview needs an explicit display() call; the test preview is the cell result.
display(olid_train.head(5))
olid_test.head(5)

Load Hasoc data

In [None]:
# Read the HASOC training split from the same data folder and preview it.
hasoc_path = f"{data_dir}hasoc-train.csv"
hasoc_train = pd.read_csv(hasoc_path, sep=',')
hasoc_train.head(5)

# Create and train models


Load pre-trained models
List of models [here](https://huggingface.co/models)

In [None]:
# Build the three classifiers used for the in-domain experiment.
# The same option dict is handed to every model.
# NOTE(review): these two settings presumably switch off intermediate
# checkpointing in simpletransformers — confirm against the library docs.
model_args = {
    "save_steps": -1,
    "save_model_every_epoch": False,
}

# Checkpoint pre-trained on English.
model_BERT = ClassificationModel('bert', 'bert-base-cased', args=model_args)

# Checkpoints pre-trained on hate.
model_hateBERT = ClassificationModel('bert', 'GroNLP/hateBERT', args=model_args)
model_fBERT = ClassificationModel('bert', 'diptanu/fBERT', args=model_args)



Training and saving the in-domain and cross-domain models

## In-domain

In [None]:
# Fine-tune each in-domain model on the OLID training split.
# NOTE(review): all three runs share one output_dir with overwrite enabled, so
# whatever the library writes there is presumably replaced by each later run — confirm.
train_args = dict(output_dir='outputs_in/', overwrite_output_dir=True)
model_BERT.train_model(olid_train, args=train_args)
model_fBERT.train_model(olid_train, args=train_args)
model_hateBERT.train_model(olid_train, args=train_args)


In [None]:
# Save the trained in-domain models so training does not have to be run again.
for file_name, trained_model in (
    ("model_BERT", model_BERT),
    ("model_fBERT", model_fBERT),
    ("model_hateBERT", model_hateBERT),
):
    torch.save(trained_model, "outputs_in/" + file_name)

## cross-domain

In [None]:
# Fresh, untrained copies of the three classifiers for the cross-domain experiment.
# The option dict is rebuilt here with the same two settings as the in-domain cell.
model_args = dict(save_steps=-1, save_model_every_epoch=False)

# Checkpoint pre-trained on English.
model_BERT_cross = ClassificationModel(
    'bert', 'bert-base-cased', args=model_args
)

# Checkpoints pre-trained on hate.
model_hateBERT_cross = ClassificationModel(
    'bert', 'GroNLP/hateBERT', args=model_args
)
model_fBERT_cross = ClassificationModel(
    'bert', 'diptanu/fBERT', args=model_args
)


In [None]:
# Fine-tune each cross-domain model on the HASOC training split.
# The three runs share the same output directory and overwrite setting.
train_args_cross = dict(output_dir='outputs_cross/', overwrite_output_dir=True)
model_BERT_cross.train_model(hasoc_train, args=train_args_cross)
model_fBERT_cross.train_model(hasoc_train, args=train_args_cross)
model_hateBERT_cross.train_model(hasoc_train, args=train_args_cross)

In [None]:
# Save the trained cross-domain models so they do not have to be trained again.
# All three files belong in outputs_cross/; writing the hateBERT model into
# outputs_in/ would mix it with the in-domain models.
torch.save(model_BERT_cross, "outputs_cross/model_BERT_cross")
torch.save(model_fBERT_cross, "outputs_cross/model_fBERT_cross")
torch.save(model_hateBERT_cross, "outputs_cross/model_hateBERT_cross")

# Evaluation of models

## In-domain

Load models

In [None]:
# Restore the three in-domain models from the files written with torch.save.
# NOTE(review): torch.load unpickles whole Python objects here — only load files
# you created yourself.
model_BERT, model_fBERT, model_hateBERT = (
    torch.load("outputs_in/" + file_name)
    for file_name in ("model_BERT", "model_fBERT", "model_hateBERT")
)


Predictions

In [None]:
# Helper for visualising and saving a confusion matrix.
def makeConfusionMatrix(name, cm, labels=('0', '1')):
  """Draw `cm` as an annotated heatmap, save it as a PNG and show it.

  Args:
    name: Text used in the plot title and in the output file name
      (``ConfusionMatrix{name}.png``, written relative to the working directory).
    cm: Confusion matrix to plot, e.g. the array returned by
      ``sklearn.metrics.confusion_matrix``.
    labels: Tick labels for both axes, in the same order as the rows/columns
      of `cm`. Defaults to the binary labels '0' and '1'.
  """
  # Use an explicit figure/axes pair so the heatmap never lands on whatever
  # axes happen to be current in pyplot's global state.
  fig, ax = plt.subplots()
  sns.heatmap(cm, annot=True, cmap='Blues', cbar=False, fmt='g',
              annot_kws={"fontsize": 16}, ax=ax)

  ax.set_title(f'Confusion Matrix of {name}\n\n')
  ax.set_xlabel('\nPredicted Values')
  ax.set_ylabel('Actual Values ')

  # Tick labels must follow the row/column order of the matrix.
  tick_labels = list(labels)
  ax.xaxis.set_ticklabels(tick_labels)
  ax.yaxis.set_ticklabels(tick_labels)

  # Save before showing so the file is written from the fully drawn figure.
  fig.savefig(f"ConfusionMatrix{name}.png", bbox_inches='tight')

  # Display the figure, then release it so repeated calls do not pile up open figures.
  plt.show()
  plt.close(fig)

In [None]:
# Make predictions and print the confusion matrix and classification report
# of every in-domain model on the OLID test set.
olid_test_texts = olid_test['text'].tolist()  # built once and shared by all models
olid_test_labels = olid_test['labels']

predictions_BERT, raw_outputs_BERT = model_BERT.predict(olid_test_texts)
predictions_fBERT, raw_outputs_fBERT = model_fBERT.predict(olid_test_texts)
predictions_hateBERT, raw_outputs_hateBERT = model_hateBERT.predict(olid_test_texts)

cm_BERT = confusion_matrix(olid_test_labels, predictions_BERT)
cm_fBERT = confusion_matrix(olid_test_labels, predictions_fBERT)
cm_hateBERT = confusion_matrix(olid_test_labels, predictions_hateBERT)

# Report every model the same way: raw matrix, heatmap, classification report.
in_domain_results = [
    ("BERT", cm_BERT, predictions_BERT),
    ("fBERT", cm_fBERT, predictions_fBERT),
    ("hateBERT", cm_hateBERT, predictions_hateBERT),
]
for position, (model_name, model_cm, model_predictions) in enumerate(in_domain_results):
    if position:
        print("\n")  # blank separator between consecutive model reports
    print(f"{model_name}\n")
    print(model_cm)
    makeConfusionMatrix(f"{model_name} In-domain", model_cm)
    print("\n")
    print(classification_report(olid_test_labels, model_predictions))

## Cross-domain

In [None]:
# Make predictions and print the confusion matrix and classification report
# of every cross-domain model (trained on HASOC) on the OLID test set.
olid_test_texts = olid_test['text'].tolist()  # built once and shared by all models
olid_test_labels = olid_test['labels']

predictions_BERT_cross, raw_outputs_BERT_cross = model_BERT_cross.predict(olid_test_texts)
predictions_fBERT_cross, raw_outputs_fBERT_cross = model_fBERT_cross.predict(olid_test_texts)
predictions_hateBERT_cross, raw_outputs_hateBERT_cross = model_hateBERT_cross.predict(olid_test_texts)

cm_BERT_cross = confusion_matrix(olid_test_labels, predictions_BERT_cross)
cm_fBERT_cross = confusion_matrix(olid_test_labels, predictions_fBERT_cross)
cm_hateBERT_cross = confusion_matrix(olid_test_labels, predictions_hateBERT_cross)

# Report every model the same way: raw matrix, heatmap, classification report.
cross_domain_results = [
    ("BERT", cm_BERT_cross, predictions_BERT_cross),
    ("fBERT", cm_fBERT_cross, predictions_fBERT_cross),
    ("hateBERT", cm_hateBERT_cross, predictions_hateBERT_cross),
]
for position, (model_name, model_cm, model_predictions) in enumerate(cross_domain_results):
    if position:
        print("\n")  # blank separator between consecutive model reports
    print(f"{model_name}\n")
    print(model_cm)
    makeConfusionMatrix(f"{model_name} Cross-domain", model_cm)
    print("\n")
    print(classification_report(olid_test_labels, model_predictions))

## LSTM

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

In [None]:
# Keep only the text and label columns for the LSTM experiment.
# Training data comes from HASOC; to train on OLID instead, build
# filtered_train from olid_train[['text','labels']].
filtered_test = olid_test[['text', 'labels']].copy()
filtered_train = hasoc_train[['text', 'labels']].copy()
x_train, y_train = filtered_train['text'], filtered_train['labels']

# Size constant shared by the following LSTM cells.
max_dimension = 200

In [None]:
# Tokenise the training texts and build the GloVe embedding matrix for their vocabulary.
token = Tokenizer()
token.fit_on_texts(x_train)
seq = token.texts_to_sequences(x_train)
pad_seq = pad_sequences(seq, maxlen=max_dimension)
# One extra row beyond the number of known words, so the largest word index is a valid row.
vocab_size = len(token.word_index) + 1

# Map every word in the GloVe file to its coefficient vector.
embedding_vector = {}
# Twitter embeddings (alternative): 'glove_word_embeddings/glove.twitter.27B.200d.txt'
# Wiki embeddings (used here):
glove_path = 'glove_word_embeddings/glove.6B.200d.txt'
# The context manager closes the file even if parsing fails; the explicit
# encoding keeps the read independent of the platform default.
with open(glove_path, encoding='utf-8') as f:
    for line in tqdm(f):
        value = line.split(' ')
        word = value[0]
        coef = np.array(value[1:], dtype='float32')
        embedding_vector[word] = coef

# One row per vocabulary index; words without a GloVe vector keep an all-zero row.
# NOTE: max_dimension is used both as the padded sequence length above and as the
# row width here, so it has to equal the vector size of the embeddings file.
embedding_matrix = np.zeros((vocab_size, max_dimension))
for word, i in tqdm(token.word_index.items()):
    embedding_value = embedding_vector.get(word)
    if embedding_value is not None:
        embedding_matrix[i] = embedding_value

In [None]:
# Model setup: embedding layer initialised from embedding_matrix and not trainable,
# followed by an LSTM, dropout and a single sigmoid output unit.
embedding_layer = keras.layers.Embedding(
    vocab_size,
    max_dimension,
    weights=[embedding_matrix],
    input_length=max_dimension,
    trainable=False,
)
lstm_layers = (
    embedding_layer,
    keras.layers.LSTM(300),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1, activation='sigmoid'),
)

model = keras.Sequential()
for layer in lstm_layers:
    model.add(layer)

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train the LSTM on the padded training sequences and keep the object returned by fit().
history = model.fit(
    pad_seq,
    y_train,
    batch_size=16,
    epochs=10,
)

In [None]:
# Test the LSTM on the OLID test set and create its confusion matrix.
x_test = token.texts_to_sequences(filtered_test['text'])
testing_seq = pad_sequences(x_test, maxlen=max_dimension)

# Threshold the model outputs at 0.5 to obtain 0/1 labels.
predictions = (model.predict(testing_seq) > 0.5).astype("int32")

# Copy of the test frame whose labels column holds the predicted labels.
predicted_test = filtered_test.copy()
# Flatten first so the column is assigned a 1-D array.
predicted_test['labels'] = predictions.ravel()
# Not the last expression of the cell, so it has to be displayed explicitly.
display(predicted_test.head(10))

predictions_lstm = predicted_test['labels']
cm_lstm = confusion_matrix(olid_test['labels'], predictions_lstm)
print("LSTM\n")
print(cm_lstm)
# NOTE(review): the name ends with an unmatched ')' that also ends up in the saved
# PNG file name; left unchanged so existing references to that file keep working.
makeConfusionMatrix("LSTM Cross-domain + In-domain Embeddings)", cm_lstm)
print("\n")
print(classification_report(olid_test['labels'], predictions_lstm))