In [10]:
# Imports: NumPy/pandas, TensorFlow/Keras, scikit-learn utilities, and the Hugging Face BERT classes

import numpy as np
import pandas as pd

import transformers
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

from transformers import BertTokenizer, TFBertForSequenceClassification

In [11]:
# List the devices TensorFlow can see, to check whether a GPU is available
from tensorflow.python.client import device_lib
local_devices = device_lib.list_local_devices()
print(local_devices)


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8014373433050757738
xla_global_id: -1
]


In [12]:
# Report the TensorFlow build in use and how many GPUs it detected
visible_gpus = tf.config.list_physical_devices('GPU')
print("TensorFlow Version:", tf.__version__)
print("Num GPUs Available: ", len(visible_gpus))

TensorFlow Version: 2.16.2
Num GPUs Available:  0


In [3]:

# Load the review data from the CSV file
REVIEWS_CSV = 'df_review_clean.csv'
df_review = pd.read_csv(REVIEWS_CSV)

# Initialize the BERT tokenizer from the pretrained checkpoint
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

# Function to tokenize texts
def tokenize_texts(texts, tokenizer, max_length=128):
    """Run `tokenizer` over `texts` and return whatever it produces.

    Args:
        texts: The texts to encode; passed to the tokenizer as its single
            positional argument.
        tokenizer: A callable tokenizer accepting the keyword options below.
        max_length: Value forwarded as the tokenizer's `max_length` option.

    Returns:
        The tokenizer's result, requested with padding and truncation
        enabled and `return_tensors='tf'`.
    """
    encode_options = {
        'max_length': max_length,
        'padding': True,
        'truncation': True,
        'return_tensors': 'tf',
    }
    encoded = tokenizer(texts, **encode_options)
    return encoded

# Encode the target column as integer class ids, then expand to one-hot rows
label_encoder = LabelEncoder()
y = tf.keras.utils.to_categorical(
    label_encoder.fit_transform(df_review['target']),
    num_classes=2,
)

# Tokenize every review text in a single call
X_text = df_review['review_text'].tolist()
tokenized_texts = tokenize_texts(X_text, tokenizer)

# Pull the token ids and attention masks out of the tokenizer output as NumPy arrays
X_input_ids, X_attention_masks = (
    tokenized_texts[field].numpy() for field in ('input_ids', 'attention_mask')
)

# Split token ids, attention masks and labels in ONE call so that every row
# keeps its own attention mask. train_test_split shuffles the rows, so slicing
# the masks positionally (masks[:len(X_train)]) would pair each sequence with
# the mask of a different review.
(
    X_train, X_test,
    attention_masks_train, attention_masks_test,
    y_train, y_test,
) = train_test_split(
    X_input_ids, X_attention_masks, y, test_size=0.2, random_state=42
)

# Sanity check: each split's masks must have the same shape as its token ids
assert attention_masks_train.shape == X_train.shape
assert attention_masks_test.shape == X_test.shape


TensorFlow Version: 2.16.2
Num GPUs Available:  0


In [4]:
# Function to train the BERT model
def train_bert(X_train, y_train, X_test, y_test, attention_masks_train, attention_masks_test,
               epochs=5, batch_size=16):
    """Fine-tune `bert-base-uncased` as a 2-label classifier and evaluate it.

    Args:
        X_train, X_test: Arrays fed to the model as `input_ids`.
        y_train, y_test: One-hot encoded labels; they are consumed by
            `CategoricalCrossentropy` and reduced with `argmax(axis=1)`
            for the evaluation.
        attention_masks_train, attention_masks_test: Arrays fed to the model
            as `attention_mask`; must be row-aligned with `X_train` / `X_test`.
        epochs: Number of training epochs (default 5, the original value).
        batch_size: Training batch size (default 16, the original value).

    Returns:
        A tuple `(classification_report_str, accuracy)` computed on the
        test split.

    Side effects:
        Prints the report and accuracy, and saves the trained model and the
        notebook-level `tokenizer` to the `bert_model` directory. Also reads
        the notebook-level `label_encoder` for the class names.
    """
    # Create the BERT model
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # The model outputs raw logits, hence from_logits=True.
    loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    # `compile` takes no `learning_rate` keyword; the rate belongs to the
    # optimizer. Keras documents 0.001 as Adagrad's default learning rate, so
    # the 'adagrad' string already yields the intended value.
    model.compile(optimizer='adagrad', loss=loss, metrics=['accuracy'])

    train_inputs = {'input_ids': X_train, 'attention_mask': attention_masks_train}
    test_inputs = {'input_ids': X_test, 'attention_mask': attention_masks_test}

    # Train the model (the test split doubles as validation data here)
    model.fit(
        train_inputs,
        y_train,
        validation_data=(test_inputs, y_test),
        epochs=epochs,
        batch_size=batch_size,
    )

    # Evaluate the model: highest logit -> predicted class id
    y_pred = np.argmax(model.predict(test_inputs).logits, axis=1)
    y_test_labels = np.argmax(y_test, axis=1)
    # classification_report measures the length of each target name, so
    # non-string classes (e.g. integer targets) must be converted first.
    target_names = [str(class_name) for class_name in label_encoder.classes_]
    classification_report_str = classification_report(
        y_test_labels, y_pred, target_names=target_names, zero_division=0
    )
    accuracy = accuracy_score(y_test_labels, y_pred)

    print("Classification Report:\n", classification_report_str)
    print("Accuracy:", accuracy)

    model.save_pretrained('bert_model')
    tokenizer.save_pretrained('bert_model')

    return classification_report_str, accuracy

In [30]:
# Fine-tune BERT on the training split and keep the evaluation results
report_bert, accuracy_bert = train_bert(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    attention_masks_train=attention_masks_train,
    attention_masks_test=attention_masks_test,
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5


 29/365 [=>............................] - ETA: 21:36 - loss: 0.7400 - accuracy: 0.5927

KeyboardInterrupt: 

In [None]:
# Show the evaluation results returned by train_bert. Both names are only
# defined once the training cell has run to completion.
print(report_bert)
print('Accuracy:', accuracy_bert)
