In [1]:
import os
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class CustomTFBertForSequenceClassification(TFBertForSequenceClassification):
    """BERT sequence classifier with explicitly written Keras train/test steps.

    Overrides ``train_step`` and ``test_step`` of the parent class so that each
    batch is unpacked by ``unpack_data`` and scored with the loss and metrics
    that were passed to ``compile``.
    """

    def unpack_data(self, data):
        """Normalise a batch into an ``(x, y, sample_weight)`` triple.

        Args:
            data: A sequence of length 2 (``x, y``) or 3
                (``x, y, sample_weight``).

        Returns:
            ``(x, y, None)`` for a two-element batch, or ``data`` itself,
            unchanged, for a three-element batch.

        Raises:
            ValueError: If ``data`` has any other length.
        """
        n_parts = len(data)
        if n_parts == 3:
            return data
        if n_parts == 2:
            features, targets = data[0], data[1]
            return features, targets, None
        raise ValueError("Unexpected number of elements in `data`")

    def train_step(self, data):
        """Run one optimisation step on a single batch.

        The forward pass and loss evaluation are recorded on a gradient tape,
        gradients are taken with respect to ``self.trainable_variables`` and
        applied through ``self.optimizer``, and the compiled metrics are then
        updated with the same predictions.

        Returns:
            Dict mapping each metric's name to its current result.
        """
        features, targets, weights = self.unpack_data(data)

        with tf.GradientTape() as tape:
            outputs = self(features, training=True)
            # self.losses is forwarded so that any losses registered on the
            # model are added to the compiled loss.
            batch_loss = self.compiled_loss(
                targets, outputs, weights, regularization_losses=self.losses
            )

        variables = self.trainable_variables
        grads = tape.gradient(batch_loss, variables)
        self.optimizer.apply_gradients(zip(grads, variables))

        self.compiled_metrics.update_state(targets, outputs, weights)
        return {metric.name: metric.result() for metric in self.metrics}

    def test_step(self, data):
        """Evaluate a single batch without updating any weights.

        Returns:
            Dict mapping each metric's name to its current result.
        """
        features, targets, weights = self.unpack_data(data)
        outputs = self(features, training=False)

        # The returned loss value is not used here; presumably the call is kept
        # for its effect on the tracked loss state -- TODO confirm.
        self.compiled_loss(
            targets, outputs, weights, regularization_losses=self.losses
        )
        self.compiled_metrics.update_state(targets, outputs, weights)
        return {metric.name: metric.result() for metric in self.metrics}

In [3]:
# Tokenizer and classifier are both initialised from the same pretrained checkpoint.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = CustomTFBertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=3,  # must equal the number of categories loaded further down
)

All PyTorch model weights were used when initializing CustomTFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model CustomTFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def load_texts_and_labels(csv_files, categories):
    """Read one text sample per line from each file and label it by category.

    ``csv_files`` and ``categories`` are paired by position with ``zip``, so
    surplus entries in the longer of the two are ignored. Every category gets
    an integer id equal to its position in ``categories``.

    Files are decoded as UTF-8 (a leading byte-order mark is dropped) and are
    split only on real line endings, so characters such as U+2028 or form feed
    inside a line do not break a sample in two. Lines that are empty or contain
    only whitespace are skipped; kept lines are returned unstripped.

    Args:
        csv_files: Paths of the input files, one per category.
        categories: Category names, in the same order as ``csv_files``.

    Returns:
        A ``(texts, labels, label_map)`` tuple where ``texts`` is a list of
        str, ``labels`` is a list of int ids aligned with ``texts``, and
        ``label_map`` maps each category name to its id.

    Note:
        A file that cannot be read is reported with ``print`` and skipped; it
        contributes no samples and no exception is raised.
    """
    texts = []
    labels = []
    label_map = {category: idx for idx, category in enumerate(categories)}

    for csv_file, category in zip(csv_files, categories):
        try:
            # 'utf-8-sig' makes decoding independent of the platform default
            # and keeps a BOM from being glued to the first sample.
            with open(csv_file, 'r', encoding='utf-8-sig') as file:
                # Iterating the file splits on newlines only, unlike
                # str.splitlines(), which also splits on U+2028, \x0c, \x85, ...
                raw_lines = [line.rstrip('\n') for line in file]
        except Exception as e:
            # Best effort: report the problem and carry on with the other files.
            print(f"Error processing {csv_file}: {e}")
            continue

        text_data = [line for line in raw_lines if line.strip()]
        texts.extend(text_data)
        labels.extend([label_map[category]] * len(text_data))

    return texts, labels, label_map

# Input files and their class names, paired by position.
csv_files = ['all_csv/eraser.csv', 'all_csv/keys.csv', 'all_csv/neutral_v6.csv']
categories = ['eraser', 'keys', 'neutral']

texts, labels, label_map = load_texts_and_labels(csv_files, categories)

# load_texts_and_labels only prints a message when a file cannot be read, so a
# missing or unreadable file would otherwise go unnoticed and training would
# silently run without that class. Fail here instead.
loaded_label_ids = set(labels)
missing_categories = [
    name for name, idx in label_map.items() if idx not in loaded_label_ids
]
if missing_categories:
    raise ValueError(f"No samples were loaded for categories: {missing_categories}")

In [8]:
def tokenize_texts(texts, tokenizer, max_len=128):
    """Encode every text with ``tokenizer`` and stack the results.

    Each text is encoded on its own via ``tokenizer.encode_plus`` with special
    tokens added, padding to ``max_len`` and truncation enabled, requesting
    TensorFlow tensors. The per-text results are then concatenated along
    axis 0.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` in this
            notebook).
        max_len: Value passed as ``max_length`` to the tokenizer.

    Returns:
        ``(input_ids, attention_masks)`` -- two tensors, presumably of shape
        ``(len(texts), max_len)`` each (TODO confirm).
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
        return_tensors='tf',
    )
    encodings = [tokenizer.encode_plus(text, **encode_options) for text in texts]

    ids_per_text = [encoding['input_ids'] for encoding in encodings]
    masks_per_text = [encoding['attention_mask'] for encoding in encodings]
    return tf.concat(ids_per_text, axis=0), tf.concat(masks_per_text, axis=0)

# Encode the full corpus with the default maximum length.
input_ids, attention_masks = tokenize_texts(texts=texts, tokenizer=tokenizer)

# From here on `labels` holds a tensor instead of the Python list built by the loader.
labels = tf.convert_to_tensor(value=labels)

In [9]:
# scikit-learn indexes numpy arrays, so leave tensor land for the split.
input_ids_np = input_ids.numpy()
attention_masks_np = attention_masks.numpy()
labels_np = labels.numpy()

# Stage 1: stratified 70 % train / 30 % hold-out.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
# n_splits=1, so the first (and only) pair of index arrays is all we need.
train_index, temp_index = next(sss.split(input_ids_np, labels_np))

train_inputs = input_ids_np[train_index]
train_masks = attention_masks_np[train_index]
train_labels = labels_np[train_index]

temp_inputs = input_ids_np[temp_index]
temp_masks = attention_masks_np[temp_index]
temp_labels = labels_np[temp_index]

# Stage 2: halve the hold-out into validation and test (15 % of the data each).
sss_val_test = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_index, test_index = next(sss_val_test.split(temp_inputs, temp_labels))

validation_inputs = temp_inputs[val_index]
validation_masks = temp_masks[val_index]
validation_labels = temp_labels[val_index]

test_inputs = temp_inputs[test_index]
test_masks = temp_masks[test_index]
test_labels = temp_labels[test_index]

In [10]:
# Turn the numpy splits back into tensors before handing them to Keras.
train_inputs = tf.convert_to_tensor(train_inputs)
validation_inputs = tf.convert_to_tensor(validation_inputs)
test_inputs = tf.convert_to_tensor(test_inputs)

train_masks = tf.convert_to_tensor(train_masks)
validation_masks = tf.convert_to_tensor(validation_masks)
test_masks = tf.convert_to_tensor(test_masks)

train_labels = tf.convert_to_tensor(train_labels)
validation_labels = tf.convert_to_tensor(validation_labels)
test_labels = tf.convert_to_tensor(test_labels)

# Training setup: the loss is told to expect raw logits (from_logits=True).
optimizer = Adam(learning_rate=2e-5, epsilon=1e-8)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Inputs are passed as [token ids, attention masks], in that order.
train_features = [train_inputs, train_masks]
validation_set = ([validation_inputs, validation_masks], validation_labels)

history = model.fit(
    train_features,
    train_labels,
    validation_data=validation_set,
    epochs=4,
    batch_size=4,
)



Epoch 1/4


2024-11-07 16:35:03.993545: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/4
Epoch 3/4
Epoch 4/4


In [12]:
# The fine-tuned model and its tokenizer are written to the same directory.
model_save_path = "models/trained_v2"

for artifact in (model, tokenizer):
    artifact.save_pretrained(model_save_path)

print(f"Model and tokenizer saved at {model_save_path}")

Model and tokenizer saved at models/trained_v2


In [None]:
# Loss and accuracy on the held-out test split.
test_loss, test_accuracy = model.evaluate([test_inputs, test_masks], test_labels)
print(f"Test Accuracy: {test_accuracy}")

# The predicted class is the index of the largest logit.
predictions = model.predict([test_inputs, test_masks])
predicted_labels = tf.argmax(predictions.logits, axis=-1)

# Pin the row/column order to the known class ids. Without `labels=`,
# confusion_matrix only covers classes that occur in the true or predicted
# labels, so a class missing from both would shrink the matrix and its rows
# would no longer line up with the mapping printed below.
class_indices = sorted(label_map.values())
conf_matrix = confusion_matrix(test_labels, predicted_labels, labels=class_indices)

print("Confusion Matrix:")
for row in conf_matrix:
    print(' '.join(map(str, row)))

print("\nLabel mapping (index -> label name):")
for label_name, index in label_map.items():
    print(f"{index}: {label_name}")

Test Accuracy: 0.9733333587646484
Confusion Matrix:
42 3 0
1 44 0
0 0 60

Label mapping (index -> label name):
0: eraser
1: keys
2: neutral
