In [51]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
import numpy as np

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Load the pre-trained BERT model for sequence classification
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf

class CustomTFBertForSequenceClassification(TFBertForSequenceClassification):
    def train_step(self, data):
        # Unpack the data manually
        x, y, sample_weight = self.unpack_data(data)

        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)  # Forward pass
            loss = self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.losses)

        # Backward pass and optimization
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
        
        return {m.name: m.result() for m in self.metrics}

    def test_step(self, data):
        # Unpack the data manually (same as train_step)
        x, y, sample_weight = self.unpack_data(data)

        # Forward pass
        y_pred = self(x, training=False)
        # Compute loss
        loss = self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.losses)

        # Update the metrics
        self.compiled_metrics.update_state(y, y_pred, sample_weight)

        return {m.name: m.result() for m in self.metrics}

    def unpack_data(self, data):
        if len(data) == 2:
            return data[0], data[1], None  # inputs, labels, sample_weights
        elif len(data) == 3:
            return data  # inputs, labels, sample_weights
        else:
            raise ValueError("Unexpected number of elements in `data`")

In [54]:
# Load the custom BERT model for sequence classification
model = CustomTFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

All PyTorch model weights were used when initializing CustomTFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model CustomTFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [58]:
import os
import tensorflow as tf

# Function to load texts and labels from subdirectories
def load_texts_and_labels_from_directories(base_directory):
    texts = []
    labels = []
    label_map = {}  # Dictionary to map string labels to integers
    
    label_index = 0
    
    # Loop through each subdirectory (each subdirectory name is a label)
    for label in os.listdir(base_directory):
        label_dir = os.path.join(base_directory, label)
        
        if os.path.isdir(label_dir):  # Ensure it's a directory
            # Map the label to an integer if it's not already in the map
            if label not in label_map:
                label_map[label] = label_index
                label_index += 1

            for filename in os.listdir(label_dir):
                filepath = os.path.join(label_dir, filename)
                
                if os.path.isfile(filepath) and filename.endswith('.txt'):
                    with open(filepath, 'r', encoding='utf-8') as file:
                        texts.append(file.read())  # Read the content of the file
                        labels.append(label_map[label])  # Use the mapped integer label
    
    return texts, labels, label_map

# Function to tokenize the input texts
def tokenize_texts(texts, tokenizer, max_len=128):
    input_ids = []
    attention_masks = []
    
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='tf'
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    return tf.concat(input_ids, axis=0), tf.concat(attention_masks, axis=0)

# Sample usage
base_directory = 'data_v1'  # Replace with the actual path to your text files (each subdirectory is a label)
texts, labels, label_map = load_texts_and_labels_from_directories(base_directory)

# Assuming you already have your tokenizer initialized
# tokenizer = ... (initialize your tokenizer here)

# Tokenize the texts loaded from files
input_ids, attention_masks = tokenize_texts(texts, tokenizer)

# Convert labels into tensors
labels = tf.convert_to_tensor(labels)

# Optionally, print the label map to see the mapping from string labels to integers
print("Label mapping:", label_map)


Label mapping: {'eraser': 0, 'neutral': 1, 'keys': 2}


In [59]:
# Tokenize the input texts
def tokenize_texts(texts, tokenizer, max_len=128):
    input_ids = []
    attention_masks = []
    
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='tf'
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    return tf.concat(input_ids, axis=0), tf.concat(attention_masks, axis=0)

# Convert texts and labels into tensors
input_ids, attention_masks = tokenize_texts(texts, tokenizer)
labels = tf.convert_to_tensor(labels)

In [60]:
# Split data into train and validation sets (you should use real data for training)
train_size = int(0.8 * len(texts))
train_inputs, validation_inputs = input_ids[:train_size], input_ids[train_size:]
train_labels, validation_labels = labels[:train_size], labels[train_size:]
train_masks, validation_masks = attention_masks[:train_size], attention_masks[train_size:]

In [61]:
# Define the optimizer, loss function, and metrics
optimizer = Adam(learning_rate=2e-5, epsilon=1e-8)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# Compile the model
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])



In [62]:
# Train the model
history = model.fit(
    [train_inputs, train_masks], 
    train_labels,
    validation_data=([validation_inputs, validation_masks], validation_labels),
    epochs=4,
    batch_size=1,
)

# Evaluate the model on the validation set
loss, accuracy = model.evaluate([validation_inputs, validation_masks], validation_labels)
print(f"Validation Accuracy: {accuracy}")

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Validation Accuracy: 1.0
