In [2]:
import torch
import math
import rich
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from torch   import nn
from termcolor import colored

from transformers import AlbertTokenizer, TFAlbertModel
from tensorflow.keras.layers import Input, Dense, Dropout, Layer
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.callbacks  import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

  from .autonotebook import tqdm as notebook_tqdm
2023-11-18 13:25:09.307099: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-18 13:25:09.330966: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-18 13:25:09.380753: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-18 13:25:09.380793: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-18 13:25:09.380817: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Una

In [3]:
# define the device to use
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
rich.print(f"Device: [red]{DEVICE}")

In [4]:
df = pd.read_csv('clean_data.csv')

# split the data
x = df['headline'].values
y = df['is_sarcastic'].values

x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size = 0.20,
                                                    # shuffle = True,
                                                    random_state = 2456)

print(colored("\nDIVIDED SUCCESFULLY...", "green"))

[32m
DIVIDED SUCCESFULLY...[0m


In [5]:
unique, counts = np.unique(y_train, return_counts=True)
print("Training set distribution:")
print(dict(zip(unique, counts)))

unique, counts = np.unique(y_test, return_counts=True)
print("\nTest set distribution:")
print(dict(zip(unique, counts)))

Training set distribution:
{0: 10173, 1: 9359}

Test set distribution:
{0: 2559, 1: 2324}


In [6]:
x_train_list = x_train.astype(str).tolist()
x_test_list  = x_test.astype(str).tolist()

tokenizer  = AlbertTokenizer.from_pretrained('albert-base-v2')
max_length = 512

x_train_encoded = tokenizer(x_train_list, add_special_tokens=True,
                            truncation=True,
                            padding='max_length',
                            max_length=max_length,
                            return_tensors='tf')
x_test_encoded = tokenizer(x_test_list, add_special_tokens=True, 
                           truncation=True,
                           padding='max_length', 
                           max_length=max_length,
                           return_tensors='tf')

# extract input IDs (token IDs) and attention masks
x_train_input_ids = x_train_encoded['input_ids']
x_test_input_ids  = x_test_encoded['input_ids']
x_train_attention_masks = x_train_encoded['attention_mask']
x_test_attention_masks  = x_test_encoded['attention_mask']

print(x_train_input_ids.shape, x_test_input_ids.shape)

(19532, 512) (4883, 512)


In [7]:
early_stopping = EarlyStopping(min_delta = 0.001, mode = "auto", 
                               verbose = 1, monitor = "val_acc",
                               patience = 3)
callbacks = [early_stopping]

In [8]:
# load the pre-trained BERT model
albert_model = TFAlbertModel.from_pretrained('albert-base-v2')

# constructing the model
input_ids = Input(shape=(max_length,), dtype='int32', 
                  name='input_ids')
attention_mask = Input(shape=(max_length,), dtype='int32', 
                       name='attention_mask')

# Use the ALBERT model as a layer
albert_output    = albert_model(input_ids, attention_mask=attention_mask)[0]
cls_token_output = albert_output[:, 0, :] 

# adding other layers on top of ALBERT
x = Dense(25, activation='relu')(cls_token_output)
x = Dropout(0.5)(x)
output = Dense(1, activation='sigmoid')(x)

# the final model
model = Model(inputs=[input_ids, attention_mask], outputs=output)

model.compile(loss = "binary_crossentropy", 
              optimizer = Adam(learning_rate = 0.004),
              metrics = ["accuracy"])
model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFAlbertModel: ['predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.bias', 'predictions.dense.bias', 'predictions.decoder.bias']
- This IS expected if you are initializing TFAlbertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFAlbertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFAlbertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 512)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 512)]                0         []                            
 )                                                                                                
                                                                                                  
 tf_albert_model (TFAlbertM  TFBaseModelOutputWithPooli   1168358   ['input_ids[0][0]',           
 odel)                       ng(last_hidden_state=(None   4          'attention_mask[0][0]']      
                             , 512, 768),                                                     

In [9]:
history = model.fit(
    [x_train_input_ids, x_train_attention_masks], y_train, 
    epochs=5, 
    validation_data=([x_test_input_ids, x_test_attention_masks], y_test), 
    batch_size=16, 
    verbose=1, 
    callbacks=callbacks
)

Epoch 1/5


2023-11-18 13:25:49.910776: W tensorflow/tsl/framework/bfc_allocator.cc:485] Allocator (mklcpu) ran out of memory trying to allocate 96.00MiB (rounded to 100663296)requested by op model/tf_albert_model/albert/encoder/albert_layer_groups_._0/albert_layers_._0/Gelu_10/mul
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-11-18 13:25:49.910818: I tensorflow/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for mklcpu
2023-11-18 13:25:49.910827: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2023-11-18 13:25:49.910834: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2023-11-18

Unexpected exception formatting exception. Falling back to standard exception


2023-11-18 13:26:09.915069: I tensorflow/tsl/framework/bfc_allocator.cc:1075] Next region of size 2183445248
2023-11-18 13:26:09.915085: I tensorflow/tsl/framework/bfc_allocator.cc:1095] InUse at 7fd4df8cb040 of size 25165824 next 177
2023-11-18 13:26:09.915090: I tensorflow/tsl/framework/bfc_allocator.cc:1095] InUse at 7fd4e10cb040 of size 100663296 next 178
2023-11-18 13:26:09.915095: I tensorflow/tsl/framework/bfc_allocator.cc:1095] InUse at 7fd4e70cb040 of size 100663296 next 179
2023-11-18 13:26:09.915100: I tensorflow/tsl/framework/bfc_allocator.cc:1095] InUse at 7fd4ed0cb040 of size 100663296 next 180
2023-11-18 13:26:09.915105: I tensorflow/tsl/framework/bfc_allocator.cc:1095] InUse at 7fd4f30cb040 of size 100663296 next 181
2023-11-18 13:26:09.915110: I tensorflow/tsl/framework/bfc_allocator.cc:1095] InUse at 7fd4f90cb040 of size 100663296 next 182
2023-11-18 13:26:09.915115: I tensorflow/tsl/framework/bfc_allocator.cc:1095] InUse at 7fd4ff0cb040 of size 25165824 next 183
2023

In [None]:
plt.figure(figsize=(10, 6), dpi=100)
epoch_num = range(1, len(history.history["loss"]) + 1)
plt.plot(epoch_num, history.history["loss"], "r--")
plt.plot(epoch_num, history.history["val_loss"], "b-")
plt.legend(["Training loss", "Validation loss"])
plt.xlabel("Epoch numbers")
plt.ylabel("Loss")
plt.title('ALBERT model: Training and validation loss')
plt.savefig('ALBERT_loss_plot.png')
plt.show()

In [None]:
plt.figure(figsize=(10, 6), dpi=100)
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training accuracy')
plt.plot(epochs, val_acc, 'r', label='Validation accuracy')
plt.xlabel("Epoch numbers")
plt.ylabel("Accuracy")
plt.title('ALBERT model: Training and validation accuracy')
plt.legend()
plt.savefig('ALBERT_acc_plot.png')
plt.show()