In [None]:
!nvidia-smi

Sat Jul  3 19:57:01 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    31W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Load Dataset

In [None]:
import time
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.utils import shuffle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Training data: drop incomplete rows, then hold out 20% of it for validation.
df = pd.read_csv("/content/drive/MyDrive/HateSpeech/FINAL_DATASET/Final_dataset_balanced.csv").dropna()
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

# Separate test file, loaded as-is (no dropna is applied to it).
df_test = pd.read_csv("/content/drive/MyDrive/HateSpeech/hateXplain.csv")
print(df_train.shape, df_val.shape, df_test.shape)

# Keep only rows whose text is shorter than 1000 characters, in every split.
df_train, df_val, df_test = (
    split[split['text'].map(len) < 1000]
    for split in (df_train, df_val, df_test)
)
print(df_train.shape, df_val.shape, df_test.shape)

(123424, 3) (30856, 3) (15351, 3)
(119868, 3) (29931, 3) (15351, 3)


In [None]:
# Separate each split into its text column (inputs) and class column (labels).
X_train, y_train = df_train['text'], df_train['class']
X_val, y_val = df_val['text'], df_val['class']
X_test, y_test = df_test['text'], df_test['class']

# Modeling

## GloVe + BiLSTM

In [None]:
from keras.models import Sequential
from keras.initializers import Constant
from keras.layers import *

def build_model(num_words,embedding_matrix):
  """Build and compile the embedding + stacked bidirectional-LSTM classifier.

  Args:
    num_words: Number of rows of the embedding layer (vocabulary size).
    embedding_matrix: Initial embedding weights, wrapped in ``Constant``.

  Returns:
    A compiled ``Sequential`` model ending in one sigmoid unit, using
    binary cross-entropy loss, the Adam optimizer and accuracy as metric.

  Note:
    ``embedding_dim`` and ``sequence_length`` are not parameters; they are
    looked up as module-level globals when this function is called.
  """
  stack = [
      # Embedding weights start from the supplied matrix and stay trainable.
      Embedding(num_words,
                embedding_dim,
                embeddings_initializer=Constant(embedding_matrix),
                input_length=sequence_length,
                trainable=True),
      SpatialDropout1D(0.2),
      # First recurrent layer returns the full sequence to feed the second one.
      Bidirectional(LSTM(64, return_sequences=True)),
      Bidirectional(LSTM(32)),
      Dropout(0.25),
      Dense(units=1, activation='sigmoid'),
  ]
  model = Sequential(stack)
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [None]:
import numpy as np
# Parse the GloVe text file into a {word: float32 vector} lookup.
# Each line is "<word> <v1> <v2> ...", separated by whitespace.
embeddings_index = {}
# `with` closes the handle even if a line fails to parse; the explicit
# encoding avoids relying on the platform default when reading the tokens.
with open("/content/drive/MyDrive/HateSpeech/glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [None]:
# Re-derive raw text / label columns from the frames, since X_* is overwritten
# with padded index sequences further down in this cell.
X_train, X_val, X_test = df_train['text'], df_val['text'], df_test['text']
y_train, y_val, y_test = df_train['class'], df_val['class'], df_test['class']

# Empty results table: create a single all-NaN placeholder row carrying the
# metric column names, then slice it away so only the (float) columns remain.
eval = pd.DataFrame(
    [[np.nan] * 11],
    columns=['Model',
             'Train_Score(ACC)','Train_Score(ROC_AUC)','Train_Score(F1)',
             'Val_Score(ACC)','Val_Score(ROC_AUC)','Val_Score(F1)',
             'Test_Score(ACC)','Test_Score(ROC_AUC)','Test_Score(F1)','Inference_Time'],
).iloc[1:]

# Initialization
max_features = 100000             # vocabulary cap handed to the tokenizer
sequence_length = 235             # every text is padded/truncated to this many tokens
embedding_dim = 100               # matches the 100-d GloVe file loaded above
num_words = max_features + 1      # embedding rows: one more than the vocabulary cap

# Tokenizing
data_start = time.time()
tokenizer = Tokenizer(num_words=max_features, split=' ', oov_token='<unw>', filters=' ')
# The vocabulary is fitted on the training texts only.
tokenizer.fit_on_texts(X_train)

# Turn every split into fixed-length integer sequences with that vocabulary.
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), sequence_length)
X_val = pad_sequences(tokenizer.texts_to_sequences(X_val), sequence_length)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), sequence_length)

# Embedding matrix: row i holds the vector for the word with tokenizer index i.
# Words found in GloVe get their pretrained vector; the rest get a random
# standard-normal vector. Rows that are never assigned stay all-zero.
word_index = tokenizer.word_index
embedding_matrix_train = np.zeros((num_words, embedding_dim))

for word, i in word_index.items():
    if i <= max_features:
        embedding_vector = embeddings_index.get(word)
        embedding_matrix_train[i] = (
            np.random.randn(embedding_dim) if embedding_vector is None else embedding_vector
        )

# Fit
LR = build_model(num_words, embedding_matrix_train)
filepath = "/content/drive/MyDrive/HateSpeech/Weight/GloVe_BiLSTM"

# Checkpoint only the weights, and only when validation loss improves.
best_weight = tf.keras.callbacks.ModelCheckpoint(
    filepath,
    monitor="val_loss",
    verbose=0,
    save_best_only=True,
    save_weights_only=True,
)
# Stop after a single epoch without validation-loss improvement.
early = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=1,
    verbose=3,
    mode='auto',
    baseline=None,
    restore_best_weights=False,
)
LR.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=1000,
    batch_size=256,
    callbacks=[early, best_weight],
)
# Early stopping does not restore weights here, so the best checkpoint is
# loaded back explicitly before inference.
LR.load_weights(filepath)

# Inference
# `Sequential.predict_classes` / `predict_proba` were removed in TensorFlow 2.6.
# For this single-unit sigmoid model, `predict` returns the probabilities and
# thresholding them at 0.5 gives the same labels `predict_classes` produced.
# One forward pass per split now yields both the probabilities and the labels.
print("TRAIN SET")
fitted_proba = LR.predict(X_train)
fitted = (fitted_proba > 0.5).astype("int32")

print("VAL SET")
val_pred_proba = LR.predict(X_val)
val_pred = (val_pred_proba > 0.5).astype("int32")

print("TEST SET")
# Only the test-set prediction (forward pass + thresholding) is timed.
start = time.time()
test_pred_proba = LR.predict(X_test)
test_pred = (test_pred_proba > 0.5).astype("int32")
inference_time = time.time()-start
print(f"Inferenced : {inference_time}s",end='\t')

# Evaluate
# Accuracy and F1 use the hard labels; ROC AUC uses the predicted probabilities.
train_acc = accuracy_score(y_train,fitted)
train_auc = roc_auc_score(y_train,fitted_proba)
train_f1 = f1_score(y_train,fitted)

val_acc = accuracy_score(y_val,val_pred)
val_auc = roc_auc_score(y_val,val_pred_proba)
val_f1 = f1_score(y_val,val_pred)

test_acc = accuracy_score(y_test,test_pred)
test_auc = roc_auc_score(y_test,test_pred_proba)
test_f1 = f1_score(y_test,test_pred)
print(f"train ACC : {train_acc} train F1 : {train_f1} test ACC : {test_acc} test ROCAUC : {test_auc} test F1 : {test_f1}")

# One result row, in the same order as the columns of `eval`.
LR_list = [
    'BiLSTM+GloVe(10)',
    train_acc, train_auc, train_f1,
    val_acc, val_auc, val_f1,
    test_acc, test_auc, test_f1,
    inference_time,
]

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` gives the same
# result (the row keeps its index 0) and works on older pandas as well.
eval = pd.concat([eval, pd.DataFrame([LR_list],columns=eval.columns)])
eval.to_csv(f"/content/drive/MyDrive/HateSpeech/PERFORMANCE2/TEST_GloVe_BiLSTM.csv")
print(f"SAVED!!! {time.time()-data_start}")

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 00004: early stopping


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fa3ec4c4910>

## BERT Family (TensorFlow)

### Load Tokenizer & Model

In [None]:
pip install transformers

In [None]:
from transformers import ElectraTokenizer, TFElectraForSequenceClassification
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
from transformers import MobileBertTokenizer, TFMobileBertForSequenceClassification
from transformers import AlbertTokenizerFast, TFAlbertForSequenceClassification
from transformers import AutoTokenizer, TFMobileBertForSequenceClassification, TFAutoModelForSequenceClassification, TFAutoModel

# Registry of candidate models. Each entry is
#   (tokenizer, model, display name, pretrained checkpoint id, framework tag).
# The training cell uses the display name for the checkpoint directory and the
# result CSV, so every entry needs a unique name or runs overwrite each other.
Model_list = []

# Loaded with the sequence-classification class (not the bare encoder) because
# the training cell reads `.logits` from the predictions. Name and checkpoint
# id now describe the checkpoint that is actually loaded.
tokenizer = AutoTokenizer.from_pretrained("microsoft/xtremedistil-l12-h384-uncased")
model = TFAutoModelForSequenceClassification.from_pretrained("microsoft/xtremedistil-l12-h384-uncased")
Model_list.append((tokenizer,model,'xtremedistil','microsoft/xtremedistil-l12-h384-uncased','TF'))

tokenizer_electra = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model_electra = TFElectraForSequenceClassification.from_pretrained('google/electra-small-discriminator')
Model_list.append((tokenizer_electra,model_electra,'ELECTRA','google/electra-small-discriminator','TF'))

tokenizer_distilbert = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model_distilbert = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
Model_list.append((tokenizer_distilbert,model_distilbert,'DistilBERT','distilbert-base-uncased','TF'))

tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')
model_roberta = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
Model_list.append((tokenizer_roberta,model_roberta,'RoBERTa','roberta-base','TF'))

# MobileBERT is registered exactly once; registering the same checkpoint again
# would train it a second time and overwrite the first run's weights and CSV.
tokenizer_mobilebert = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
model_mobilebert = TFMobileBertForSequenceClassification.from_pretrained("google/mobilebert-uncased")
Model_list.append((tokenizer_mobilebert,model_mobilebert,'MobileBERT','google/mobilebert-uncased','TF'))

tokenizer_twitter = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-hate")
model_twitter = TFAutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-hate")
Model_list.append((tokenizer_twitter,model_twitter,'twitter-roberta',"cardiffnlp/twitter-roberta-base-hate",'TF'))

tokenizer_albert = AlbertTokenizerFast.from_pretrained('albert-base-v2')
model_albert = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2')
Model_list.append((tokenizer_albert,model_albert,'ALBERT','albert-base-v2','TF'))

# Summary of what was registered: class names are extracted from the class repr.
for i, entry in enumerate(Model_list):
  print(f"{i+1} - Tokenizer[0] : {str(entry[0].__class__).split('.')[-1][:-2]}",end='\t\t')
  print(f"Model[1] : {str(entry[1].__class__).split('.')[-1][:-2]:38s}",end='\t')
  print(f"Name[2] : {entry[2]:10s}",end='\t')
  print(f"Pretrained[3] : {entry[3]}")

All model checkpoint layers were used when initializing TFMobileBertForSequenceClassification.

Some layers of TFMobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-hate.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


1 - Tokenizer[0] : MobileBertTokenizerFast		Model[1] : TFMobileBertForSequenceClassification 	Name[2] : MobileBERT	Pretrained[3] : google/mobilebert-uncased
2 - Tokenizer[0] : RobertaTokenizerFast		Model[1] : TFRobertaForSequenceClassification    	Name[2] : twitter-roberta	Pretrained[3] : cardiffnlp/twitter-roberta-base-hate


### Train & Evaluation

In [None]:
# Pull the text and label columns out of each frame as plain Python lists.
X_train = df_train['text'].to_list()
X_val = df_val['text'].to_list()
X_test = df_test['text'].to_list()

y_train = df_train['class'].to_list()
y_val = df_val['class'].to_list()
y_test = df_test['class'].to_list()
 
for tokenizer, model_seq, model_name, pretrained, plat in Model_list:
  # Fine-tune one registered model, score it on train/val/test and save a
  # one-row CSV of metrics named after the model.
  data_start = time.time()
  print(f"Tokenizer[0] : {str(tokenizer.__class__).split('.')[-1][:-2]}",end='\t\t')
  print(f"Model[1] : {str(model_seq.__class__).split('.')[-1][:-2]:38s}",end='\t')
  print(f"Name[2] : {model_name:10s}",end='\t')
  print(f"Pretrained[3] : {pretrained}")

  # Initialization
  LR = model_seq
  optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
  # The predictions are consumed below as two raw logits per example
  # (softmax over axis 1, column 1 = positive class) and the labels are class
  # ids, so the matching loss is sparse categorical cross-entropy on logits.
  # 'binary_crossentropy' treats the logits as probabilities; the recorded run
  # with it ended at ROC_AUC 0.5 on every split.
  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  LR.compile(optimizer=optimizer, loss=loss)

  # Tokenizing
  # An explicit max_length is required for truncation to take effect when the
  # tokenizer has no predefined maximum (otherwise it silently does not truncate).
  # NOTE(review): 512 is assumed to be within every listed model's position limit.
  max_length = 512
  encoding_time = time.time()
  train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
  val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=max_length)
  test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=max_length)
  print("encoding time : ", time.time()-encoding_time)

  # Build Dataset
  dataset_time = time.time()
  train_dataset = tf.data.Dataset.from_tensor_slices((
      dict(train_encodings),
      y_train
  ))
  val_dataset = tf.data.Dataset.from_tensor_slices((
      dict(val_encodings),
      y_val
  ))
  test_dataset = tf.data.Dataset.from_tensor_slices((
      dict(test_encodings),
      y_test
  ))
  print("building dataset : ", time.time()-dataset_time)

  # Fit
  filepath = f"/content/drive/MyDrive/HateSpeech/Weight/{model_name}"
  best_weight = tf.keras.callbacks.ModelCheckpoint(filepath, monitor="val_loss", verbose=0, save_best_only=True)
  # With patience=3 training ends several epochs after the best one, so restore
  # the best weights; otherwise the metrics below describe the last epoch rather
  # than the checkpointed best model.
  early = tf.keras.callbacks.EarlyStopping( monitor='val_loss', min_delta=0, patience=3, verbose=3, mode='auto', baseline=None, restore_best_weights=True )
  LR.fit(train_dataset.shuffle(1000).batch(16),
         validation_data=val_dataset.shuffle(1000).batch(16),
         epochs=1000,
         callbacks=[early,best_weight])

  # Inference
  # Softmax turns the logits into class probabilities; column 1 is the score
  # used for ROC AUC and the row-wise argmax is the predicted label.
  fitted_scores = tf.nn.softmax(LR.predict(train_dataset.batch(16)).logits, axis=1).numpy()
  fitted_proba = fitted_scores[:,1]
  fitted = fitted_scores.argmax(axis=1)

  val_scores = tf.nn.softmax(LR.predict(val_dataset.batch(16)).logits, axis=1).numpy()
  val_pred_proba = val_scores[:,1]
  val_pred = val_scores.argmax(axis=1)

  # Only the test-set forward pass and softmax are timed.
  start = time.time()
  test_scores = tf.nn.softmax(LR.predict(test_dataset.batch(16)).logits, axis=1).numpy()
  inference_time = time.time()-start
  test_pred_proba = test_scores[:,1]
  test_pred = test_scores.argmax(axis=1)
  print(f"Inferenced : {inference_time}s",end='\t')

  # Evaluate
  train_acc = accuracy_score(y_train,fitted)
  train_auc = roc_auc_score(y_train,fitted_proba)
  train_f1 = f1_score(y_train,fitted)

  val_acc = accuracy_score(y_val,val_pred)
  val_auc = roc_auc_score(y_val,val_pred_proba)
  val_f1 = f1_score(y_val,val_pred)

  test_acc = accuracy_score(y_test,test_pred)
  test_auc = roc_auc_score(y_test,test_pred_proba)
  test_f1 = f1_score(y_test,test_pred)

  print(f"TRAIN ROC_AUC : {train_auc} VAL ROC_AUC : {val_auc} TEST ROC_AUC : {test_auc}")

  # One result row; values are listed in the same order as the column names.
  LR_list = [
      model_name,
      train_acc, train_auc, train_f1,
      val_acc, val_auc, val_f1,
      test_acc, test_auc, test_f1,
      inference_time,
  ]

  # Build the one-row frame directly: `DataFrame.append` was removed in
  # pandas 2.0, and the frame it was appended to was always empty.
  eval = pd.DataFrame([LR_list],
                      columns=['Model',
                               'Train_Score(ACC)','Train_Score(ROC_AUC)','Train_Score(F1)',
                               'Val_Score(ACC)','Val_Score(ROC_AUC)','Val_Score(F1)',
                               'Test_Score(ACC)','Test_Score(ROC_AUC)','Test_Score(F1)','Inference_Time'])
  eval.to_csv(f"/content/drive/MyDrive/HateSpeech/PERFORMANCE2/{model_name}.csv")
  print(f"SAVED!!! {time.time()-data_start}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Tokenizer[0] : MobileBertTokenizerFast		Model[1] : TFMobileBertForSequenceClassification 	Name[2] : MobileBERT	Pretrained[3] : google/mobilebert-uncased
encoding time :  19.308887004852295
building dataset :  480.92417907714844
Epoch 1/1000
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU an



INFO:tensorflow:Assets written to: /content/drive/MyDrive/HateSpeech/Weight/MobileBERT/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/HateSpeech/Weight/MobileBERT/assets


Epoch 2/1000
























































INFO:tensorflow:Assets written to: /content/drive/MyDrive/HateSpeech/Weight/MobileBERT/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/HateSpeech/Weight/MobileBERT/assets


Epoch 3/1000
Epoch 4/1000
























































INFO:tensorflow:Assets written to: /content/drive/MyDrive/HateSpeech/Weight/MobileBERT/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/HateSpeech/Weight/MobileBERT/assets


Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 00007: early stopping
























Inferenced : 128.82943558692932s	TRAIN ROC_AUC : 0.5 VAL ROC_AUC : 0.5 TEST ROC_AUC : 0.5


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


SAVED!!! 22519.817007541656
Tokenizer[0] : RobertaTokenizerFast		Model[1] : TFRobertaForSequenceClassification    	Name[2] : twitter-roberta	Pretrained[3] : cardiffnlp/twitter-roberta-base-hate
encoding time :  27.990967273712158
building dataset :  358.93810081481934
Epoch 1/1000
















ResourceExhaustedError: ignored

In [None]:
"https://zzsza.github.io/mlops/2021/04/18/bentoml-basic/"

## BERT Family (PyTorch)

In [None]:
from transformers import AutoTokenizer, AutoModel,AutoModelForSequenceClassification
# Registry of PyTorch candidates. Each entry is
#   (tokenizer, model, display name, pretrained checkpoint id, framework tag).
# Display name and checkpoint id must describe the checkpoint actually loaded;
# several entries previously reused the squeezebert labels by copy-paste.
# NOTE(review): the framework tag is left as 'TF' although these are PyTorch
# models, because how the tag is consumed is not visible here; confirm and adjust.
Model_list = []

# Bare encoders (`AutoModel`): these two are loaded without a classification head.
tokenizer_xtremedistil = AutoTokenizer.from_pretrained("microsoft/xtremedistil-l12-h384-uncased")
model_xtremedistil = AutoModel.from_pretrained("microsoft/xtremedistil-l12-h384-uncased")
Model_list.append((tokenizer_xtremedistil,model_xtremedistil,'xtremedistil','microsoft/xtremedistil-l12-h384-uncased','TF'))

tokenizer_squeeze = AutoTokenizer.from_pretrained("squeezebert/squeezebert-uncased")
model_squeeze = AutoModel.from_pretrained("squeezebert/squeezebert-uncased")
Model_list.append((tokenizer_squeeze,model_squeeze,'squeezebert',"squeezebert/squeezebert-uncased",'TF'))

# Checkpoints loaded with their sequence-classification head.
tokenizer_dehatebert = AutoTokenizer.from_pretrained("Hate-speech-CNERG/dehatebert-mono-english")
model_dehatebert = AutoModelForSequenceClassification.from_pretrained("Hate-speech-CNERG/dehatebert-mono-english")
Model_list.append((tokenizer_dehatebert,model_dehatebert,'dehatebert',"Hate-speech-CNERG/dehatebert-mono-english",'TF'))

tokenizer_tweetroberta = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-offensive")
model_tweetroberta = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-offensive")
Model_list.append((tokenizer_tweetroberta,model_tweetroberta,'twitter-roberta-offensive',"cardiffnlp/twitter-roberta-base-offensive",'TF'))

# Summary of what was registered: class names are extracted from the class repr.
for i, entry in enumerate(Model_list):
  print(f"{i+1} - Tokenizer[0] : {str(entry[0].__class__).split('.')[-1][:-2]}",end='\t\t')
  print(f"Model[1] : {str(entry[1].__class__).split('.')[-1][:-2]:38s}",end='\t')
  print(f"Name[2] : {entry[2]:10s}",end='\t')
  print(f"Pretrained[3] : {entry[3]}")

Some weights of the model checkpoint at squeezebert/squeezebert-uncased were not used when initializing SqueezeBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing SqueezeBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SqueezeBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1 - Tokenizer[0] : BertTokenizerFast		Model[1] : BertModel                             	Name[2] : xtremedistil	Pretrained[3] : xtremedistil-l12-h384-uncased
2 - Tokenizer[0] : SqueezeBertTokenizerFast		Model[1] : SqueezeBertModel                      	Name[2] : squeezebert	Pretrained[3] : squeezebert/squeezebert-uncased
3 - Tokenizer[0] : BertTokenizerFast		Model[1] : BertForSequenceClassification         	Name[2] : squeezebert	Pretrained[3] : squeezebert/squeezebert-uncased
4 - Tokenizer[0] : RobertaTokenizerFast		Model[1] : RobertaForSequenceClassification      	Name[2] : squeezebert	Pretrained[3] : squeezebert/squeezebert-uncased
