
## BERT Model 

#### A Fine-Tuned BERT Classifier

In [1]:
import preprocess
import fine_tunned_bert

import re 
import pandas as pd
import tensorflow as tf 

from time import time
%matplotlib inline

[nltk_data] Downloading package punkt to
[nltk_data]     /home/slide/amieurn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/slide/amieurn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# model evaluation 
from statistics import mean

from sklearn.model_selection import KFold
!pip install tensorflow-addons



In [4]:
# Load the advertising dataset.
# Each ad has a binary label: 0 for commercial and 1 for informational ads.
df = preprocess.get_data('/data/task1_dataset.csv', columns_to_drop=False)
# Drop the first column by position (presumably a saved index column -- TODO confirm).
df = df.iloc[:, 1:]

# Cleaning patterns, compiled once instead of on every row.
# The URL pattern is a raw string so that `\(` and `\)` reach the regex engine
# without triggering Python's invalid-escape-sequence warning; the pattern value
# itself is unchanged. No re.MULTILINE flag is needed: it only affects `^`/`$`,
# which this pattern does not use.
URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
DIGITS_PATTERN = re.compile(r'[0-9]+')
SPACES_PATTERN = re.compile(' +')

# Strip links from the raw text before it is preprocessed.
df["text"] = df["text"].apply(lambda text: URL_PATTERN.sub('', text))

# NOTE(review): preprocess_data is defined in the local `preprocess` module; the
# cell output suggests it lower-cases and removes stop words/punctuation -- confirm.
clean_data = preprocess.preprocess_data(df['text'])

# Remove numbers, then collapse the runs of spaces that removal leaves behind.
df['clean_data'] = clean_data
df['clean_data'] = df['clean_data'].apply(
    lambda x: SPACES_PATTERN.sub(' ', DIGITS_PATTERN.sub('', x))
)

df.head()

Unnamed: 0,label,text,clean_data
0,0,"""My Thistle subscription has changed my work f...",thistle subscription changed work home life sa...
1,0,"After I bought this toy for my child, he staye...",bought toy child stayed away ipadtablet tv pro...
2,0,A Black Friday purchase you actually won't reg...,black friday purchase actually wont regret sto...
3,0,"The Nutcracker, accompany you and your family ...",nutcracker accompany family warm holiday
4,0,ohora’s NEW COLLECTION - shop right away ⠀ It...,ohora’s new collection shop right away ⠀ getti...


In [5]:
# Identifier of the pre-trained encoder to fine-tune: a small BERT variant,
# intended for environments with restricted computational resources.
# (The L-4 / H-512 / A-8 suffix presumably denotes 4 layers, hidden size 512 and
# 8 attention heads -- confirm against the model card.)
bert_model_name = "small_bert/bert_en_uncased_L-4_H-512_A-8"

In [6]:
# K-fold cross-validation of the fine-tuned BERT classifier.
# A fresh model is built and trained for every fold, then evaluated on the
# held-out fold; the per-fold test metrics are collected in the lists below.
folds = 5
cv_seed = 42  # fixed seed so the shuffled fold assignment is identical on every run

acc_per_fold = []
loss_per_fold = []
prec_per_fold = []
recall_per_fold = []
auc_per_fold = []

# Without random_state, shuffle=True yields different folds (and therefore
# different reported metrics) each time the cell is executed.
kfold = KFold(n_splits=folds, shuffle=True, random_state=cv_seed)

for train, test in kfold.split(df['clean_data'], df['label']):
    # Reset Keras' global state so models from earlier folds do not accumulate.
    tf.keras.backend.clear_session()

    # `train` / `test` are positional row indices for this fold.
    train_data = tf.data.Dataset.from_tensor_slices(
        (df['clean_data'].iloc[train], df['label'].iloc[train])
    ).batch(32)
    test_data = tf.data.Dataset.from_tensor_slices(
        (df['clean_data'].iloc[test], df['label'].iloc[test])
    ).batch(32)

    # Build a new fine-tuned model for this fold.
    bert_model = fine_tunned_bert.create_fine_tunned_bert(train_data, bert_model_name=bert_model_name)

    # Train; the held-out fold is passed as validation data, so it is only
    # scored after each epoch (no callback in this call acts on it).
    history = bert_model.fit(x=train_data, epochs=5, validation_data=test_data, verbose=True)

    # Evaluate on the held-out fold.
    # NOTE(review): the index order below assumes the model was compiled with
    # metrics [accuracy, precision, recall, AUC] -- confirm in fine_tunned_bert.
    scores = bert_model.evaluate(test_data)
    loss_per_fold.append(scores[0])
    acc_per_fold.append(scores[1])
    prec_per_fold.append(scores[2])
    recall_per_fold.append(scores[3])
    auc_per_fold.append(scores[4])

2022-03-21 12:54:23.102396: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-21 12:54:23.823933: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10413 MB memory:  -> device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:04:00.0, compute capability: 6.1


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [8]:
# Manually aggregate accuracy, precision, recall, F1 score and AUC over the
# cross-validation folds (each value is the mean of the per-fold test metric).
mean_precision = mean(prec_per_fold)
mean_recall = mean(recall_per_fold)
precision_recall_sum = mean_precision + mean_recall

bert = {}
bert['test_accuracy'] = mean(acc_per_fold)
bert['test_precision'] = mean_precision
bert['test_recall'] = mean_recall
# F1 is derived from the *averaged* precision and recall (not the average of
# per-fold F1 scores; the two differ in general). When both averages are 0 the
# harmonic mean is undefined, so report 0.0 instead of raising ZeroDivisionError.
bert['test_f1_score'] = (
    (2 * mean_precision * mean_recall) / precision_recall_sum
    if precision_recall_sum
    else 0.0
)
bert['test_roc_auc_score'] = mean(auc_per_fold)

# Create a dataframe with the model's performance metric scores.
models_scores_table = pd.DataFrame(
    {
        'Bert': [
            bert['test_accuracy'],
            bert['test_precision'],
            bert['test_recall'],
            bert['test_f1_score'],
            bert['test_roc_auc_score'],
        ]
    },
    index=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC'],
)

models_scores_table

Unnamed: 0,Bert
Accuracy,0.913364
Precision,0.914904
Recall,0.912308
F1 Score,0.913604
AUC,0.965111
