In [1]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Checkpoint identifier for the Legal-BERT weights; the tokenizer and the
# sequence-classification model are both loaded from this same name.
model_name = "nlpaueb/legal-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
import tensorflow as tf

In [3]:
import pandas as pd

In [4]:
# Training split; later cells read its 'question', 'answer', 'expl' and 'label' columns.
df = pd.read_csv('train31.csv')

In [5]:
# Replace missing cells with a single space so the text columns can be
# concatenated as strings further down.
df = df.fillna(' ')

In [6]:
# One input string per row: question, answer and explanation joined back to
# back (no separator is inserted between the three fields).
text = [
    question + answer + expl
    for question, answer, expl in zip(df['question'], df['answer'], df['expl'])
]
# Per-row targets, kept as the column's own scalar values.
labels = list(df['label'].to_numpy())

In [7]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
# 'balanced' weighting over the distinct label values found in `labels`;
# the result is ordered like np.unique(labels).
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(labels),
    y=labels,
)

In [8]:
# Show the raw balanced weights before they are adjusted.
print(class_weights)

# Class id -> loss weight. Class 1 is boosted by a further 10% on top of its
# balanced weight; class 0 keeps its balanced weight unchanged.
class_weight_dict = {
    0: class_weights[0],
    1: 1.1 * class_weights[1],
}

[0.65940594 2.06832298]


In [9]:
# Encode every training string in one call: pad within the batch, truncate
# over-long sequences, and return TensorFlow tensors.
tokenized_input = tokenizer(
    text,
    padding=True,
    truncation=True,
    return_tensors="tf",
)

In [10]:
import numpy as np
# Pull the two model inputs out of the tokenizer output.
input_ids, attention_mask = (
    tokenized_input['input_ids'],
    tokenized_input['attention_mask'],
)
# Targets as a NumPy array so they can be sliced per batch like the tensors.
labels = np.array(labels)

In [11]:
class F1Score(tf.keras.metrics.Metric):
    """Streaming F1 score for binary predictions.

    The metric keeps running counts (true positives, predicted positives and
    actual positives) across calls to ``update_state`` and derives precision,
    recall and F1 from those totals in ``result``. Accumulating counts rather
    than per-batch F1 values keeps the reported score a proper F1 over all
    examples seen so far instead of an ever-growing sum of batch scores.

    ``y_pred`` is rounded to the nearest integer before counting, so it is
    expected to hold scores for the positive class in the [0, 1] range.
    NOTE(review): a two-unit logits head would need to be converted to a
    positive-class probability before being passed here - confirm at the
    call site.

    Args:
        name: Metric name reported by Keras.
        **kwargs: Forwarded to ``tf.keras.metrics.Metric``.
    """

    def __init__(self, name='f1_score', **kwargs):
        super(F1Score, self).__init__(name=name, **kwargs)
        # Running totals; all start at zero.
        self.true_positives = self.add_weight(name='true_positives', initializer='zeros')
        self.predicted_positives = self.add_weight(name='predicted_positives', initializer='zeros')
        self.possible_positives = self.add_weight(name='possible_positives', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Add one batch of labels/predictions to the running counts.

        Args:
            y_true: Ground-truth labels (0 or 1).
            y_pred: Predicted scores; rounded to 0/1 before counting.
            sample_weight: Accepted for Keras API compatibility; not used.
        """
        # Flatten both tensors so that e.g. (batch,) labels and (batch, 1)
        # predictions are multiplied element-wise instead of broadcasting.
        y_true = tf.reshape(tf.cast(y_true, tf.float32), [-1])
        y_pred = tf.reshape(tf.cast(tf.round(y_pred), tf.float32), [-1])

        self.true_positives.assign_add(tf.reduce_sum(y_true * y_pred))
        self.predicted_positives.assign_add(tf.reduce_sum(y_pred))
        self.possible_positives.assign_add(tf.reduce_sum(y_true))

    def result(self):
        """Return the F1 score computed from the accumulated counts."""
        # epsilon guards the divisions when no positives have been seen yet.
        eps = tf.keras.backend.epsilon()
        precision = self.true_positives / (self.predicted_positives + eps)
        recall = self.true_positives / (self.possible_positives + eps)
        return 2 * (precision * recall) / (precision + recall + eps)



In [12]:
# Training-loop settings: one pass over the data, two examples per step.
epochs = 1
batch_size = 2

# Sparse integer labels against raw logits, optimised with Adam at 2e-5.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(
    optimizer=optimizer,
    loss=loss_fn,
    metrics=['accuracy'],
)

# Mark the whole model as trainable.
model.trainable = True

# optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
# loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# f1_metric = F1Score()
# model.compile(optimizer=optimizer, loss=loss_fn, metrics=[f1_metric])
# epochs = 1
# batch_size = 2
# model.trainable = True




In [13]:
# The class weights do not change between batches, so resolve and report them
# once up front instead of rebuilding and printing them on every step.
batch_class_weights = {0: class_weight_dict[0], 1: class_weight_dict[1]}
print(f'Class Weights: {batch_class_weights}')

num_examples = len(input_ids)
for epoch in range(epochs):
    for i in range(0, num_examples, batch_size):
        batch_number = i // batch_size + 1

        # Slice out the current mini-batch (the last one may be shorter).
        batch_input_ids = input_ids[i:i+batch_size]
        batch_attention_mask = attention_mask[i:i+batch_size]
        batch_labels = labels[i:i+batch_size]

        # One weighted gradient step; with the compiled 'accuracy' metric the
        # return value is a [loss, accuracy] pair.
        weighted_loss = model.train_on_batch(
            [batch_input_ids, batch_attention_mask],
            batch_labels,
            class_weight=batch_class_weights,
        )

        # Inference-mode forward pass on the same batch. Calling the model
        # directly avoids the per-call overhead of model.predict on a batch
        # this small; argmax over logits equals argmax over their softmax.
        logits = model([batch_input_ids, batch_attention_mask], training=False).logits
        predicted_labels = tf.argmax(logits, axis=-1).numpy()

        # Print metrics or log them as needed
        print(f'Epoch: {epoch + 1}, Batch: {batch_number}, Loss: {weighted_loss}')
        print(f'Predictions: {predicted_labels}')


Epoch: 1, Batch: 1, Class Weights: {0: 0.6594059405940594, 1: 2.2751552795031054}
Epoch: 1, Batch: 1, Loss: [0.5299914479255676, 0.5]
Predictions: [0 1]
Epoch: 1, Batch: 2, Class Weights: {0: 0.6594059405940594, 1: 2.2751552795031054}
Epoch: 1, Batch: 2, Loss: [1.4694968461990356, 0.5]
Predictions: [0 0]
Epoch: 1, Batch: 3, Class Weights: {0: 0.6594059405940594, 1: 2.2751552795031054}
Epoch: 1, Batch: 3, Loss: [1.0999786853790283, 0.5]
Predictions: [0 0]
Epoch: 1, Batch: 4, Class Weights: {0: 0.6594059405940594, 1: 2.2751552795031054}
Epoch: 1, Batch: 4, Loss: [0.4904484748840332, 0.0]
Predictions: [0 0]
Epoch: 1, Batch: 5, Class Weights: {0: 0.6594059405940594, 1: 2.2751552795031054}
Epoch: 1, Batch: 5, Loss: [1.2287830114364624, 0.5]
Predictions: [1 1]
Epoch: 1, Batch: 6, Class Weights: {0: 0.6594059405940594, 1: 2.2751552795031054}
Epoch: 1, Batch: 6, Loss: [0.47171688079833984, 0.5]
Predictions: [1 1]
Epoch: 1, Batch: 7, Class Weights: {0: 0.6594059405940594, 1: 2.2751552795031054}

In [None]:
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# model.fit(
#     {'input_ids': input_ids, 'attention_mask': attention_mask},
#     {'labels': labels},  # Assuming your labels are named 'labels'
#     epochs=1,
#     batch_size=2,
#     class_weight=class_weight_dict
# )


In [None]:
# Dev split, with missing cells replaced by a single space so the text
# columns can be concatenated as strings.
df1 = pd.read_csv('dev31.csv').fillna(' ')

In [None]:
# Preview the first rows of the dev frame to sanity-check the loaded columns.
df1.head()

In [None]:
# Build the dev inputs the same way as the training inputs: question, answer
# and explanation joined back to back, one string per row.
test_text = [
    question + answer + expl
    for question, answer, expl in zip(df1['question'], df1['answer'], df1['expl'])
]
# Labels must come from the dev frame itself (df1), not the training frame,
# so that each dev example is scored against its own target.
test_labels = df1['label'].tolist()


In [None]:
# Encode the dev strings with the same tokenizer settings used for training.
tokenized_test = tokenizer(
    test_text,
    padding=True,
    truncation=True,
    return_tensors="tf",
)
test_input_ids, test_attention_mask = (
    tokenized_test['input_ids'],
    tokenized_test['attention_mask'],
)
# Dev targets as a NumPy array for model.evaluate.
test_labels = np.array(test_labels)

In [None]:
# Score the model on the dev tensors. The model was compiled with a single
# 'accuracy' metric, so index 1 of the returned list is read as the accuracy.
eval_result = model.evaluate(
    x=dict(input_ids=test_input_ids, attention_mask=test_attention_mask),
    y=test_labels,
)
print("Test Accuracy:", eval_result[1])


In [None]:
# Run inference over the whole dev set without the progress bar. The result
# is a structured output whose `.logits` field is read in the next step.
predictions = model.predict(
    x=dict(input_ids=test_input_ids, attention_mask=test_attention_mask),
    verbose=0,
)


In [None]:
# Materialise the logits as a NumPy array and report its shape.
predictions_np = np.array(predictions.logits)
print("Shape of predictions array:", predictions_np.shape)

# The predicted class for each row is the index of its largest logit.
predicted_labels = predictions_np.argmax(axis=1)
print("Predicted Labels:", predicted_labels)


In [None]:
# Show the predicted class ids computed from the dev-set logits.
print("Predicted Labels:", predicted_labels)