[Libs for RandomSearch] (environment used for the hyperparameter search)
TensorFlow version: 2.11.0
Transformers version: 4.17.0
Keras version: 2.11.0
[Libs for BERT] (environment used for the BERT fine-tuning cells)
TensorFlow version: 2.17.0
Transformers version: 4.45.2
Keras version: 3.6.0

In [2]:
import tensorflow as tf
import transformers
import keras

# Report which library versions this kernel actually loaded; the notebook lists
# two different environments, so the active one is worth confirming.
for _lib_label, _lib_module in (
    ("TensorFlow", tf),
    ("Transformers", transformers),
    ("Keras", keras),
):
    print(f"{_lib_label} version: {_lib_module.__version__}")

TensorFlow version: 2.17.0
Transformers version: 4.45.2
Keras version: 3.6.0


In [3]:
from transformers import BertTokenizer, TFBertForSequenceClassification, AdamWeightDecay, TFBertModel



In [31]:
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalAveragePooling1D

from tensorflow.keras.utils import custom_object_scope

from tensorflow.keras.models import Model
# from keras_tuner import HyperParameters, RandomSearch
from kerastuner.tuners import RandomSearch

import os
import pandas as pd
import numpy as np
import pickle

# Enable memory growth on every visible GPU. When no GPU is present the loop
# body never runs, so no separate emptiness check is needed.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as growth_error:
    # A RuntimeError from TensorFlow is reported but does not stop the notebook.
    print(growth_error)

# Directory that holds the cleaned train/test CSV files (and, later in the
# notebook, the saved model and hyperparameters).
INPUT_DIR = "../Output/proto_models_rev2_1000"

# Read the two splits.
train_csv_path = os.path.join(INPUT_DIR, 'train_cleaned.csv')
test_csv_path = os.path.join(INPUT_DIR, 'test_cleaned.csv')
df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

# Features come from the 'review' column, labels from the 'polarity' column.
X_train = df_train['review']
X_test = df_test['review']
y_train = df_train['polarity']
y_test = df_test['polarity']

In [32]:
# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def _encode_reviews(texts, max_length=100):
    """Tokenize an iterable of review strings with the shared BERT settings.

    Both splits go through this single helper so their encoding options cannot
    drift apart.

    Args:
        texts: Iterable of review strings; it is materialised with ``list``.
        max_length: Length every sequence is padded or truncated to. The
            default of 100 matches the ``Input(shape=(100,))`` layers used by
            the model builders in this notebook.

    Returns:
        The tokenizer output holding TensorFlow tensors for ``input_ids`` and
        ``attention_mask`` (token type ids are not requested).
    """
    return tokenizer(
        text=list(texts),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True
    )


# Tokenizing the datasets
X_train_tokens = _encode_reviews(X_train)
X_test_tokens = _encode_reviews(X_test)

In [7]:
# Baseline: fine-tune the pretrained sequence-classification model with two labels.
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Hugging Face's AdamWeightDecay optimizer, plus a loss that consumes raw
# logits (from_logits=True).
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
loss_fn = SparseCategoricalCrossentropy(from_logits=True)
bert_model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])
bert_model.summary()

# The model is fed the token ids and the attention mask of each split.
train_features = {key: X_train_tokens[key] for key in ('input_ids', 'attention_mask')}
test_features = {key: X_test_tokens[key] for key in ('input_ids', 'attention_mask')}

# Train for three epochs, reporting metrics on the test split after each one.
hist = bert_model.fit(
    train_features,
    y_train,
    validation_data=(test_features, y_test),
    epochs=3,
    batch_size=32
)

# Predicted class = index of the largest logit for each test review.
y_pred_logits = bert_model.predict(test_features).logits
y_pred = np.argmax(y_pred_logits, axis=-1)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [8]:
# Score the predicted labels against the test labels.
accuracy = accuracy_score(y_test, y_pred)
accuracy_pct = accuracy * 100
print(f"Model Accuracy: {accuracy_pct:.2f}%")
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)

Model Accuracy: 95.50%
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.96       102
           1       0.97      0.94      0.95        98

    accuracy                           0.95       200
   macro avg       0.96      0.95      0.95       200
weighted avg       0.96      0.95      0.95       200



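Use TensorFlow version 2.11 for the RandomSearch hyperparameter tuning below (see the library versions listed at the top of the notebook).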

In [33]:
def build_bert_model(hp):
    """Build and compile the BERT classifier for one hyperparameter-search trial.

    The network feeds ``input_ids`` and ``attention_mask`` (both length 100)
    through a pretrained ``TFBertModel``, averages the per-token outputs,
    applies a tunable ReLU dense layer and ends in a 2-way softmax.

    Tuned hyperparameters:
        units: integer from 32 to 128 in steps of 16 (dense layer width).
        learning_rate: one of 1e-5, 2e-5, 3e-5 (Adam learning rate).

    Args:
        hp: Keras Tuner hyperparameter container supplying ``Int``/``Choice``.

    Returns:
        The compiled functional ``Model``.

    NOTE(review): calling ``TFBertModel`` on symbolic Keras inputs raised a
    ValueError in this notebook under Keras 3.6 / TF 2.17; the notebook's own
    note says to run this under TensorFlow 2.11.
    """
    input_ids = Input(shape=(100,), dtype='int32', name="input_ids")
    attention_mask = Input(shape=(100,), dtype='int32', name="attention_mask")

    bert_model = TFBertModel.from_pretrained("bert-base-uncased")
    # Element 0 of the encoder output is the per-token hidden-state sequence.
    bert_output = bert_model(input_ids, attention_mask=attention_mask)[0]
    # NOTE(review): no mask is passed to the pooling layer, so padded positions
    # presumably contribute to the average -- confirm whether that is intended.
    pooled_output = GlobalAveragePooling1D()(bert_output)

    dense = Dense(units=hp.Int("units", min_value=32, max_value=128, step=16), activation='relu')(pooled_output)
    output = Dense(2, activation='softmax')(dense)

    model = Model(inputs=[input_ids, attention_mask], outputs=output)

    learning_rate = hp.Choice("learning_rate", values=[1e-5, 2e-5, 3e-5])
    optimizer = Adam(learning_rate=learning_rate)
    # No explicit model.build() call: a functional Model created from Input
    # tensors is already built, so the former zero-argument call was redundant.
    # The loss uses its default (probability) mode because the head is softmax.
    model.compile(optimizer=optimizer, loss=SparseCategoricalCrossentropy(), metrics=["accuracy"])

    return model

# Random search over build_bert_model's hyperparameters, maximising validation
# accuracy. Results live under bert_tuning/bert_sentiment_analysis; the recorded
# output of this cell shows the tuner reloading from that directory.
tuner = RandomSearch(
    build_bert_model,
    objective="val_accuracy",
    max_trials=20,
    executions_per_trial=1,
    directory="bert_tuning",
    project_name="bert_sentiment_analysis"
)

# (features, labels) pairs for training and validation.
# NOTE(review): the validation pair is the test split, so hyperparameters are
# selected on the same data that is later used for the final evaluation.
train_data = (
    {key: X_train_tokens[key] for key in ("input_ids", "attention_mask")},
    y_train,
)
val_data = (
    {key: X_test_tokens[key] for key in ("input_ids", "attention_mask")},
    y_test,
)

train_x, train_y = train_data
tuner.search(
    x=train_x,
    y=train_y,
    validation_data=val_data,
    epochs=3,
    batch_size=32
)



Reloading Tuner from bert_tuning/bert_sentiment_analysis/tuner0.json


In [34]:
# Print the tuner's summary of its best trials for the val_accuracy objective.
tuner.results_summary()

Results summary
Results in bert_tuning/bert_sentiment_analysis
Showing 10 best trials
Objective(name="val_accuracy", direction="max")

Trial 0 summary
Hyperparameters:
units: 64
learning_rate: 2e-05
Score: 0.8694999814033508

Trial 4 summary
Hyperparameters:
units: 128
learning_rate: 2e-05
Score: 0.8684999942779541

Trial 3 summary
Hyperparameters:
units: 96
learning_rate: 2e-05
Score: 0.8675000071525574

Trial 05 summary
Hyperparameters:
units: 128
learning_rate: 3e-05
Score: 0.8675000071525574

Trial 07 summary
Hyperparameters:
units: 32
learning_rate: 1e-05
Score: 0.8615000247955322

Trial 09 summary
Hyperparameters:
units: 128
learning_rate: 1e-05
Score: 0.8600000143051147

Trial 06 summary
Hyperparameters:
units: 32
learning_rate: 2e-05
Score: 0.8585000038146973

Trial 10 summary
Hyperparameters:
units: 96
learning_rate: 1e-05
Score: 0.8585000038146973

Trial 08 summary
Hyperparameters:
units: 64
learning_rate: 1e-05
Score: 0.8550000190734863

Trial 2 summary
Hyperparameters:
unit

In [35]:
# Retrieve the best model and hyperparameters found by the search.
best_model = tuner.get_best_models(num_models=1)[0]
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

# Report every hyperparameter the search actually tunes ('units' and
# 'learning_rate'); previously only the learning rate was shown.
print("Best Hyperparameters:")
print(f"Units: {best_hyperparameters.get('units')}")
print(f"Learning Rate: {best_hyperparameters.get('learning_rate')}")

# Evaluate the best model on the test set; as the last expression of the cell
# its return value is displayed.
best_model.evaluate(
    {'input_ids': X_test_tokens['input_ids'], 'attention_mask': X_test_tokens['attention_mask']},
    y_test
)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

ValueError: Exception encountered when calling layer 'tf_bert_model' (type TFBertModel).

Data of type <class 'keras.src.backend.common.keras_tensor.KerasTensor'> is not allowed only (<class 'tensorflow.python.framework.tensor.Tensor'>, <class 'bool'>, <class 'int'>, <class 'transformers.utils.generic.ModelOutput'>, <class 'tuple'>, <class 'list'>, <class 'dict'>, <class 'numpy.ndarray'>) is accepted for attention_mask.

Call arguments received by layer 'tf_bert_model' (type TFBertModel):
  • input_ids=<KerasTensor shape=(None, 100), dtype=int32, sparse=False, name=input_ids>
  • attention_mask=<KerasTensor shape=(None, 100), dtype=int32, sparse=False, name=attention_mask>
  • token_type_ids=None
  • position_ids=None
  • head_mask=None
  • inputs_embeds=None
  • encoder_hidden_states=None
  • encoder_attention_mask=None
  • past_key_values=None
  • use_cache=None
  • output_attentions=None
  • output_hidden_states=None
  • return_dict=None
  • training=False

In [6]:
# Predict with the best model on the test split.
# NOTE(review): if best_model is the softmax-headed network built in this
# notebook, these values are class probabilities, not logits; the argmax
# below is the same either way.
y_pred_logits = best_model.predict(
    {key: X_test_tokens[key] for key in ('input_ids', 'attention_mask')}
)
y_pred = np.argmax(y_pred_logits, axis=-1)

# Score the predicted labels against the test labels.
accuracy = accuracy_score(y_test, y_pred)
accuracy_pct = accuracy * 100
print(f"Model Accuracy: {accuracy_pct:.2f}%")
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)

Model Accuracy: 86.95%
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.83      0.85       893
           1       0.87      0.90      0.88      1107

    accuracy                           0.87      2000
   macro avg       0.87      0.87      0.87      2000
weighted avg       0.87      0.87      0.87      2000



In [7]:
# Output locations for the tuned model and its hyperparameters.
model_filename = os.path.join(INPUT_DIR, 'bert_best_model.h5')
params_filename = os.path.join(INPUT_DIR,'bert_best_params.pkl')

from tensorflow.keras.models import save_model

def save_model_and_params(best_model, best_params, model_filename=model_filename, params_filename=params_filename):
    """Write a model and its hyperparameters to disk.

    Args:
        best_model: Object exposing ``save(path)``; it is saved to
            ``model_filename``.
        best_params: Hyperparameter object to pickle into ``params_filename``.
        model_filename: Destination of the model file. The default is the
            module-level value at the time this function is defined.
        params_filename: Destination of the pickle file. Same note on the default.
    """
    best_model.save(model_filename)

    with open(params_filename, 'wb') as pickle_out:
        pickle.dump(best_params, pickle_out)

# Persist the tuning results.
save_model_and_params(best_model, best_hyperparameters)

In [11]:
# Locations of the previously saved model and hyperparameters.
model_filename = os.path.join(INPUT_DIR, 'bert_best_model.h5')
params_filename = os.path.join(INPUT_DIR,'bert_best_params.pkl')

from tensorflow.keras.models import load_model

def load_model_and_params(model_filename=model_filename, params_filename=params_filename):
    """Read back a saved model and its pickled hyperparameters.

    Args:
        model_filename: Path of the saved model. The default is the
            module-level value at the time this function is defined.
        params_filename: Path of the pickled hyperparameters. Same note on
            the default.

    Returns:
        Tuple ``(best_model, best_params)``.
    """
    # Unpickling can execute arbitrary code: only load files you wrote yourself.
    with open(params_filename, 'rb') as pickle_in:
        best_params = pickle.load(pickle_in)

    # TFBertModel is registered as a custom object while the model is loaded.
    with custom_object_scope({'TFBertModel': TFBertModel}):
        best_model = load_model(model_filename)

    return best_model, best_params

# Restore the saved model and hyperparameters.
best_model, best_params = load_model_and_params()


In [9]:

# Build a new BERT model with the loaded hyperparameters
def build_bert_model_with_params(best_params):
    """Build and compile a fresh BERT classifier from chosen hyperparameters.

    The architecture mirrors the one used during the search: a pretrained
    ``TFBertModel`` over length-100 ``input_ids``/``attention_mask``, average
    pooling, a ReLU dense layer and a 2-way softmax. The returned model has
    pretrained encoder weights but has not been fit.

    Args:
        best_params: Object with a ``get(name)`` method (the loaded
            hyperparameters, or a dict). ``'learning_rate'`` is required.
            ``'units'`` sets the dense layer width; if it is absent the former
            hard-coded width of 68 is used so older parameter sets still work.

    Returns:
        The compiled functional ``Model``.
    """
    input_ids = Input(shape=(100,), dtype='int32', name="input_ids")
    attention_mask = Input(shape=(100,), dtype='int32', name="attention_mask")

    bert_model = TFBertModel.from_pretrained("bert-base-uncased")
    bert_output = bert_model(input_ids, attention_mask=attention_mask)[0]
    pooled_output = GlobalAveragePooling1D()(bert_output)

    # Use the tuned dense width instead of a fixed number. The lookup tolerates
    # both a dict (returns None when missing) and a hyperparameter container
    # that raises when the name is unknown or inactive.
    try:
        units = best_params.get('units')
    except (KeyError, ValueError):
        units = None
    if units is None:
        units = 68  # legacy width, kept only as a fallback

    dense = Dense(units=units, activation='relu')(pooled_output)
    output = Dense(2, activation='softmax')(dense)

    model = Model(inputs=[input_ids, attention_mask], outputs=output)

    # No default is applied here: the learning rate must be present.
    learning_rate = best_params.get('learning_rate')

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=SparseCategoricalCrossentropy(), metrics=['accuracy'])

    return model

# Build a new model with the best hyperparameters
best_model = build_bert_model_with_params(best_params)

# Print the model summary
best_model.summary()

model_filename = os.path.join(INPUT_DIR, 'bert_best_model.h5')

# The rebuilt model has been compiled but not fit yet. Saving it to
# bert_best_model.h5 would overwrite the tuned model stored there after the
# search, so it goes to its own file instead.
rebuilt_model_filename = os.path.join(INPUT_DIR, 'bert_rebuilt_untrained_model.h5')
best_model.save(rebuilt_model_filename)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 100)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 100)]        0           []                               
                                                                                                  
 tf_bert_model_2 (TFBertModel)  TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 100,                                         

In [12]:
# Fit best_model for two epochs on the training split, reporting metrics on the
# test split after each epoch.
history = best_model.fit(
    x={key: X_train_tokens[key] for key in ('input_ids', 'attention_mask')},
    y=y_train,
    validation_data=(
        {key: X_test_tokens[key] for key in ('input_ids', 'attention_mask')},
        y_test,
    ),
    batch_size=32,
    epochs=2,
)

Epoch 1/2
Epoch 2/2


In [13]:
# Predict with best_model on the test split and take the highest-scoring class.
test_outputs = best_model.predict(
    {key: X_test_tokens[key] for key in ('input_ids', 'attention_mask')}
)
y_pred_logits = test_outputs
y_pred = np.argmax(y_pred_logits, axis=-1)

# Score the predicted labels against the test labels.
accuracy = accuracy_score(y_test, y_pred)
accuracy_pct = accuracy * 100
print(f"Model Accuracy: {accuracy_pct:.2f}%")
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)

Model Accuracy: 86.05%
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.87      0.85       893
           1       0.89      0.85      0.87      1107

    accuracy                           0.86      2000
   macro avg       0.86      0.86      0.86      2000
weighted avg       0.86      0.86      0.86      2000



In [None]:
import matplotlib.pyplot as plt

# Define a function to plot training history
def plot_training_history(history, title="Model Training History"):
    """Draw accuracy and loss curves for one training run, side by side.

    Args:
        history: Object whose ``history`` attribute maps 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' to per-epoch sequences.
        title: Prefix for both panel titles.

    Raises:
        KeyError: If any of the four series is missing; this happens before
            any figure is created.
    """
    record = history.history
    # Read all four series up front so a missing key fails before plotting.
    train_acc = record['accuracy']
    valid_acc = record['val_accuracy']
    train_loss = record['loss']
    valid_loss = record['val_loss']

    # Epochs are numbered from 1 on the x axis.
    epoch_axis = range(1, len(train_acc) + 1)

    plt.figure(figsize=(14, 5))

    # (subplot position, training series, validation series, metric label)
    panels = (
        (1, train_acc, valid_acc, 'Accuracy'),
        (2, train_loss, valid_loss, 'Loss'),
    )
    for position, train_series, valid_series, metric in panels:
        plt.subplot(1, 2, position)
        plt.plot(epoch_axis, train_series, 'b', label=f'Training {metric}')
        plt.plot(epoch_axis, valid_series, 'r', label=f'Validation {metric}')
        plt.title(f"{title} - {metric}")
        plt.xlabel('Epochs')
        plt.ylabel(metric)
        plt.legend()

    plt.tight_layout()
    plt.show()

# Curves for `hist`, the three-epoch fit of the TFBertForSequenceClassification model.
plot_training_history(hist, title="Initial Model Training History")

# Curves for `history`, the later two-epoch fit of `best_model`.
plot_training_history(history, title="Searched Model Training History")