In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID";
os.environ["CUDA_VISIBLE_DEVICES"]="0";   
import re
import ktrain
from ktrain import text
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import MultiLabelBinarizer
import tensorflow as tf
from tensorflow.keras import activations
from utils import bert_utils, plots
root_dir = os.path.abspath(os.curdir)
base_dir = os.path.dirname(root_dir)+"/Data/"
training_dataset = base_dir+'trainingDataset-toxicComments.csv'
#Create directory to save models
mod_dir = os.path.dirname(root_dir)+"/Models/"
os.makedirs(mod_dir, exist_ok=True)

## Load the Data Into Arrays

In [2]:
def focal_loss(gamma=2., alpha=4., from_logits=False):
    gamma = float(gamma)
    alpha = float(alpha)

    def focal_loss_fixed(y_true, y_pred):
        """Focal loss for multi-classification
        FL(p_t)=-alpha(1-p_t)^{gamma}ln(p_t)
        Notice: y_pred is probability after softmax if from_logits is False.
        gradient is d(Fl)/d(p_t) not d(Fl)/d(x) as described in paper
        d(Fl)/d(p_t) * [p_t(1-p_t)] = d(Fl)/d(x)
        Focal Loss for Dense Object Detection
        https://arxiv.org/abs/1708.02002

        Arguments:
            y_true {tensor} -- ground truth labels, shape of [batch_size, num_cls]
            y_pred {tensor} -- model's output, shape of [batch_size, num_cls]

        Keyword Arguments:
            gamma {float} -- (default: {2.0})
            alpha {float} -- (default: {4.0})

        Returns:
            [tensor] -- loss.
        """
        epsilon = 1.e-9
        y_true = tf.cast(y_true, dtype=tf.float32)
        y_pred = tf.cast(y_pred, dtype=tf.float32)
        if from_logits:
            y_pred = activations.softmax(y_pred)

        model_out = tf.add(y_pred, epsilon)
        ce = tf.multiply(y_true, -tf.math.log(model_out))
        weight = tf.multiply(y_true, tf.pow(tf.subtract(1., model_out), gamma))
        fl = tf.multiply(alpha, tf.multiply(weight, ce))
        reduced_fl = tf.reduce_max(fl, axis=1)
        return tf.reduce_mean(reduced_fl)
    return focal_loss_fixed

In [3]:
seed_value=13
categories = ["highly_toxic","slightly_toxic","non_toxic"]

def get_dataset_partitions_pd(df, train_split=0.8, val_split=0.1, test_split=0.1, target_variable=None):
    assert (train_split + test_split + val_split) == 1
    
    # Only allows for equal validation and test splits
    assert val_split == test_split 

    # Shuffle
    df_sample = df.sample(frac=1, random_state=seed_value)
    # Specify seed to always have the same split distribution between runs
    # If target variable is provided, generate stratified sets
    if target_variable is not None:
      grouped_df = df_sample.groupby(target_variable)
      arr_list = [np.split(g, [int(train_split * len(g)), int((1 - val_split) * len(g))]) for i, g in grouped_df]

      train_ds = pd.concat([t[0] for t in arr_list])
      val_ds = pd.concat([t[1] for t in arr_list])
      test_ds = pd.concat([v[2] for v in arr_list])

    else:
      indices_or_sections = [int(train_split * len(df)), int((1 - val_split) * len(df))]
      train_ds, val_ds, test_ds = np.split(df_sample, indices_or_sections)
    
    return train_ds, val_ds, test_ds


def generate_class_weights(class_series, multi_class=True, one_hot_encoded=False):
  if multi_class:
    # If class is one hot encoded, transform to categorical labels to use compute_class_weight   
    if one_hot_encoded:
      class_series = np.argmax(class_series, axis=1)
  
    # Compute class weights with sklearn method
    class_labels = np.unique(class_series)
    class_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=class_series)
    return dict(zip(class_labels, class_weights))
  else:
    # It is neccessary that the multi-label values are one-hot encoded
    mlb = None
    if not one_hot_encoded:
      mlb = MultiLabelBinarizer()
      class_series = mlb.fit_transform(class_series)

    n_samples = len(class_series)
    n_classes = len(class_series[0])

    # Count each class frequency
    class_count = [0] * n_classes
    for classes in class_series:
        for index in range(n_classes):
            if classes[index] != 0:
                class_count[index] += 1
    
    # Compute class weights using balanced method
    class_weights = [n_samples / (n_classes * freq) if freq > 0 else 1 for freq in class_count]
    class_labels = range(len(class_weights)) if mlb is None else mlb.classes_
    return dict(zip(class_labels, class_weights))

def clean(text, newline=True, quote=True, bullet_point=True, 
          link=True, strikethrough=True, spoiler=True,
          code=True, superscript=True, table=True, heading=True):
    """
    Cleans text (string).
    Removes common Reddit special characters/symbols:
      * \n (newlines)
      * &gt; (> quotes)
      * * or &amp;#x200B; (bullet points)
      * []() (links)
      * etc (see below)
    Specific removals can be turned off, but everything is on by default.
    Standard punctuation etc is deliberately not removed, can be done in a
    second round manually, or may be preserved in any case.
    """
    # Newlines (replaced with space to preserve cases like word1\nword2)
    if newline:
        text = re.sub(r'\n+', ' ', text)

        # Remove resulting ' '
        text = text.strip()
        text = re.sub(r'\s\s+', ' ', text)

    # > Quotes
    if quote:
        text = re.sub(r'\"?\\?&?gt;?', '', text)

    # Bullet points/asterisk (bold/italic)
    if bullet_point:
        text = re.sub(r'\*', '', text)
        text = re.sub('&amp;#x200B;', '', text)

    # []() Link (Also removes the hyperlink)
    if link:
        text = re.sub(r'\[.*?\]\(.*?\)', '', text)

    # Strikethrough
    if strikethrough:
        text = re.sub('~', '', text)

    # Spoiler, which is used with < less-than (Preserves the text)
    if spoiler:
        text = re.sub('&lt;', '', text)
        text = re.sub(r'!(.*?)!', r'\1', text)

    # Code, inline and block
    if code:
        text = re.sub('`', '', text)

    # Superscript (Preserves the text)
    if superscript:
        text = re.sub(r'\^\((.*?)\)', r'\1', text)

    # Table
    if table:
        text = re.sub(r'\|', ' ', text)
        text = re.sub(':-', '', text)

    # Heading
    if heading:
        text = re.sub('#', '', text)
    return text
             
    
def get_data():
    dataset = pd.read_csv(training_dataset)
    dataset = dataset.sample(frac=1,random_state=seed_value)
    dataset['comment_text'] = dataset['comment_text'].apply(lambda x: clean(x))
    # labels = dataset[categories].values.astype(np.float32)
    # lab_sentences = dataset["comment_text"].fillna("fillna").str.lower()
    # X = dataset.iloc[:,:2]
    myLabels = dataset[categories]
    
    dataset['category'] = myLabels.idxmax(axis=1)
    dataset['label']= dataset['category'].map({'highly_toxic':0, 'slightly_toxic':1,'non_toxic':2})
    dataset['label'] = dataset['label'].astype('int')
    y = dataset[['label']]
    htoxic, stoxic, neither = np.bincount(dataset['label'])
    total = neither + stoxic  + htoxic
    print('Examples:\n    Total: {}\n    highly toxic: {} ({:.2f}% of total)\n'.format(
        total, htoxic, 100 * htoxic / total))
    print('Examples:\n    Total: {}\n    slightly toxic: {} ({:.2f}% of total)\n'.format(
        total, stoxic, 100 * stoxic / total))
    print('Examples:\n    Total: {}\n    Neither: {} ({:.2f}% of total)\n'.format(
        total, neither, 100 * neither / total))
    print()
    # eqs = calculate_bias(htoxic,stoxic,neither)
    # print(eqs)
    weights = generate_class_weights(dataset['label'].values, multi_class=True, one_hot_encoded=False)
    print(weights)
    print()
    print(f'Distribution in original set:  \n{y["label"].value_counts().sort_index() / len(y)}')
    train_ds, val_ds, test_ds = get_dataset_partitions_pd(dataset, target_variable="label")
    print(f'Distribution in training set: \n{train_ds["label"].value_counts().sort_index() / len(train_ds)}\n\n'+
      f'Distribution in validation set: \n{val_ds["label"].value_counts().sort_index() / len(val_ds)}\n\n'+
      f'Distribution in testing set: \n{test_ds["label"].value_counts().sort_index() / len(test_ds)}')
    # X_train_, test_sentences, y_train_, y_test = train_test_split(
    # lab_sentences,
    # dataset['label'].values,
    # test_size=0.10,
    # random_state=seed_value,
    # stratify=dataset['label'].values)
    # train_sentences, val_sentences, y_train,  y_val = train_test_split(
    # X_train_,
    # y_train_,
    # test_size=0.10,
    # random_state=seed_value,
    # stratify=y_train_)
    print()
    print(train_ds.shape, val_ds.shape, test_ds.shape)
    train_sentences = train_ds['comment_text'].fillna("fillna").str.lower()
    y_train = train_ds['label'].values.astype("int")
    val_sentences = val_ds['comment_text'].fillna("fillna").str.lower()
    y_val = val_ds['label'].values.astype("int")
    test_sentences = test_ds['comment_text'].fillna("fillna").str.lower()
    y_test = test_ds['label'].values.astype("int")
    return train_sentences, y_train, val_sentences, y_val, test_sentences, y_test,weights

## STEP 1: Preprocess Data

In [4]:
x_train, y_train, x_test,y_test,x_val, y_val,weights= get_data()

Examples:
    Total: 10083
    highly toxic: 667 (6.62% of total)

Examples:
    Total: 10083
    slightly toxic: 1191 (11.81% of total)

Examples:
    Total: 10083
    Neither: 8225 (81.57% of total)


{0: 5.0389805097451275, 1: 2.821998320738875, 2: 0.4086322188449848}

Distribution in original set:  
0    0.066151
1    0.118120
2    0.815729
Name: label, dtype: float64
Distribution in training set: 
0    0.066088
1    0.118041
2    0.815871
Name: label, dtype: float64

Distribution in validation set: 
0    0.066468
1    0.118056
2    0.815476
Name: label, dtype: float64

Distribution in testing set: 
0    0.066337
1    0.118812
2    0.814851
Name: label, dtype: float64

(8065, 7) (1008, 7) (1010, 7)


In [5]:
MODEL_NAME = 'bert-base-uncased'
t = text.Transformer(MODEL_NAME, maxlen=512, class_names=categories)
trn = t.preprocess_train(x_train.to_numpy(), y_train)
val = t.preprocess_test(x_val.to_numpy(), y_val)
model = t.get_classifier()
model.compile(loss=focal_loss(alpha=7.0,from_logits=True),
              optimizer='adam',
              metrics=['accuracy'])

#Run these two lines to find the best learning rate
# learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
# learner.lr_find(show_plot=True, max_epochs=2)

preprocessing train...
language: en
train sequence lengths:
	mean : 71
	95percentile : 189
	99percentile : 367


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 71
	95percentile : 175
	99percentile : 351


In [6]:
#Run this line to train the model
# learner.fit_onecycle(3e-5,4,class_weight=weights,checkpoint_folder=mod_dir)

In [7]:
#Load best
model.load_weights(mod_dir+'weights-01.hdf5')
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)

In [8]:
learner.validate(class_names=t.get_classes())

                precision    recall  f1-score   support

  highly_toxic       0.74      0.69      0.71        67
slightly_toxic       0.60      0.72      0.66       120
     non_toxic       0.98      0.96      0.97       823

      accuracy                           0.91      1010
     macro avg       0.77      0.79      0.78      1010
  weighted avg       0.92      0.91      0.91      1010



array([[ 46,  21,   0],
       [ 16,  87,  17],
       [  0,  36, 787]], dtype=int64)

## Predict on New Data

In [9]:
p = ktrain.get_predictor(learner.model, preproc=t)

In [10]:
p.predict("Okay-- Take care sweetie.")

'non_toxic'

In [11]:
p.predict("If you don't stop immediately, I will kill you.")

'slightly_toxic'

In [12]:
p.predict("You fucking asshole! WTF is wrong with you?!")

'highly_toxic'

In [13]:
test = t.preprocess_test(x_test.to_numpy(),y_test)

preprocessing test...
language: en
test sequence lengths:
	mean : 70
	95percentile : 191
	99percentile : 417


In [14]:
learner.validate(val_data=(test),class_names=t.get_classes())

                precision    recall  f1-score   support

  highly_toxic       0.78      0.70      0.74        67
slightly_toxic       0.62      0.76      0.68       119
     non_toxic       0.98      0.95      0.96       822

      accuracy                           0.91      1008
     macro avg       0.79      0.80      0.80      1008
  weighted avg       0.92      0.91      0.92      1008



array([[ 47,  18,   2],
       [ 11,  90,  18],
       [  2,  37, 783]], dtype=int64)

In [15]:
predictions = learner.predict(val_data=(val))
print(classification_report(y_val, np.argmax(predictions, axis=-1),target_names=categories))
print()
print(confusion_matrix(y_val, np.argmax(predictions, axis=-1)))
print()
cv_ac = accuracy_score(y_val, np.argmax(predictions, axis=-1))
cv_pre = precision_score(y_val, np.argmax(predictions, axis=-1), average='macro')
cv_rec = recall_score(y_val, np.argmax(predictions, axis=-1), average='macro')
cv_f1s = f1_score(y_val, np.argmax(predictions, axis=-1), average='macro')
cv_score = roc_auc_score(y_val, predictions ,multi_class='ovr')
print('roc-auc score: ',cv_score)
print('f1: ',cv_f1s)
print('precision: ',cv_pre)
print('recall: ',cv_rec)
print('accuracy: ',cv_ac)
print()

                precision    recall  f1-score   support

  highly_toxic       0.74      0.69      0.71        67
slightly_toxic       0.60      0.72      0.66       120
     non_toxic       0.98      0.96      0.97       823

      accuracy                           0.91      1010
     macro avg       0.77      0.79      0.78      1010
  weighted avg       0.92      0.91      0.91      1010


[[ 46  21   0]
 [ 16  87  17]
 [  0  36 787]]

roc-auc score:  0.9668324365893058
f1:  0.7798979705720605
precision:  0.7749859573102231
recall:  0.7892749194489278
accuracy:  0.9108910891089109



In [16]:
predictions = learner.predict(val_data=(test))
print(classification_report(y_test, np.argmax(predictions, axis=-1),target_names=categories))
print()
print(confusion_matrix(y_test, np.argmax(predictions, axis=-1)))
print()
cv_ac = accuracy_score(y_test, np.argmax(predictions, axis=-1))
cv_pre = precision_score(y_test, np.argmax(predictions, axis=-1), average='macro')
cv_rec = recall_score(y_test, np.argmax(predictions, axis=-1), average='macro')
cv_f1s = f1_score(y_test, np.argmax(predictions, axis=-1), average='macro')
cv_score = roc_auc_score(y_test, predictions ,multi_class='ovr')
print('roc-auc score: ',cv_score)
print('f1: ',cv_f1s)
print('precision: ',cv_pre)
print('recall: ',cv_rec)
print('accuracy: ',cv_ac)
print()

                precision    recall  f1-score   support

  highly_toxic       0.78      0.70      0.74        67
slightly_toxic       0.62      0.76      0.68       119
     non_toxic       0.98      0.95      0.96       822

      accuracy                           0.91      1008
     macro avg       0.79      0.80      0.80      1008
  weighted avg       0.92      0.91      0.92      1008


[[ 47  18   2]
 [ 11  90  18]
 [  2  37 783]]

roc-auc score:  0.9629350532027287
f1:  0.7952226566084835
precision:  0.7930387960855604
recall:  0.8034499342824613
accuracy:  0.9126984126984127



In [17]:
p.save(mod_dir+'pytorchmodel')

# Convert model to ONNX

In [18]:
# Quantization Using PyTorch

# load the predictor, model, and tokenizer
from transformers import *
import ktrain
predictor = ktrain.load_predictor(mod_dir+'pytorchmodel')
model_pt = AutoModelForSequenceClassification.from_pretrained(mod_dir+'pytorchmodel', from_tf=True)
tokenizer = predictor.preproc.get_tokenizer() # or use AutoTokenizer.from_pretrained(predictor.preproc.model_name)
maxlen = predictor.preproc.maxlen
device = 'cpu'
class_names = predictor.preproc.get_classes()

# quantize model (INT8 quantization)
import torch
model_pt_quantized = torch.quantization.quantize_dynamic(
    model_pt.to(device), {torch.nn.Linear}, dtype=torch.qint8)

# make quantized predictions (x_test is a list of strings representing documents)
preds = []
for doc in x_test:
    model_inputs = tokenizer(doc, return_tensors="pt", max_length=maxlen, truncation=True)
    model_inputs_on_device = { arg_name: tensor.to(device) 
                              for arg_name, tensor in model_inputs.items()}
    pred = model_pt_quantized(**model_inputs_on_device)
    preds.append(class_names[ np.argmax( np.squeeze( pred[0].cpu().detach().numpy() ) ) ]) 

In [21]:
# Converting to ONNX (from PyTorch-converted model)


# set maxlen, class_names, and tokenizer (use settings employed when training the model - see above)
model_name = 'bert-base-uncased'
maxlen = 512
class_names = ["highly_toxic","slightly_toxic","non_toxic"]
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)


# imports
import numpy as np
from transformers.convert_graph_to_onnx import convert, optimize, quantize
from transformers import AutoModelForSequenceClassification
from pathlib import Path

# paths
predictor_path = mod_dir+'pytorchmodel'
pt_path = mod_dir+'pytorchmodel'
pt_onnx_path = pt_path +'_onnx/model.onnx'

# convert to ONNX
AutoModelForSequenceClassification.from_pretrained(predictor_path, 
                                                   from_tf=True).save_pretrained(pt_path)
convert(framework='pt', model=pt_path,output=Path(pt_onnx_path), opset=11, 
        tokenizer=model_name, pipeline_name='text-classification')
pt_onnx_quantized_path = quantize(optimize(Path(pt_onnx_path)))

# create ONNX session
def create_onnx_session(onnx_model_path, provider='CPUExecutionProvider'):
    """
    Creates ONNX inference session from provided onnx_model_path
    """

    from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    # Few properties that might have an impact on performances (provided by MS)
    options = SessionOptions()
    options.intra_op_num_threads = 0
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

    # Load the model as a graph and prepare the CPU backend 
    session = InferenceSession(onnx_model_path, options, providers=[provider])
    session.disable_fallback()
    return session
sess = create_onnx_session(pt_onnx_quantized_path.as_posix())

ONNX opset version set to: 11
Loading pipeline (model: C:\Users\Hind\Desktop\GitHub code/Models/pytorchmodel, tokenizer: bert-base-uncased)
Creating folder C:\Users\Hind\Desktop\GitHub code\Models\pytorchmodel_onnx
Using framework PyTorch: 1.8.0a0+37c1f4a
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input token_type_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch'}
Ensuring inputs are in correct order
position_ids is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask', 'token_type_ids']
Optimized model has been written at C:\Users\Hind\Desktop\GitHub code\Models\pytorchmodel_onnx\model-optimized.onnx: ✔
/!\ Optimized model contains hardware specific operators which might not be portable. /!\


         Please use quantize_static for static quantization, quantize_dynamic for dynamic quantization.


As of onnxruntime 1.4.0, models larger than 2GB will fail to quantize due to protobuf constraint.
This limitation will be removed in the next release of onnxruntime.
Quantized model has been written at C:\Users\Hind\Desktop\GitHub code\Models\pytorchmodel_onnx\model-optimized-quantized.onnx: ✔
