In [1]:
import random
import json
import pandas as pd
from tqdm import tqdm

from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow_text as text
import tensorflow as tf
import kagglehub

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
json_file_dict = json.load(open('fine-tune-data.json', 'r'))
# case_csv = pd.read_csv('data.csv')
keys = list(json_file_dict.keys())
random.shuffle(keys)

split_index = int(len(keys) * 0.8)
train_keys = keys[:split_index]
test_keys = keys[split_index:]

train_dict = {k: json_file_dict[k] for k in train_keys}
test_dict = {k: json_file_dict[k] for k in test_keys}

In [3]:
train_df = pd.DataFrame(columns=[
    'Query',
    'Case',
    'Labels'
])
for i, key in enumerate(train_dict.keys()):
    query = key
    print(f"Query {i+1}/{len(train_dict.keys())} processed...", end='\r')
    for j, case in enumerate(train_dict[key]):
        case_text = case[0]
        label = case[1]
        train_df.loc[len(train_df)] = [query, case, label]        

Query 3093/3093 processed...

In [None]:
train_df.head()

In [4]:
# tokenizer = BertTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

path = kagglehub.model_download("tensorflow/bert/tensorFlow2/bert-en-uncased-l-12-h-768-a-12")
bert_model = tf.saved_model.load(path)

2024-08-29 17:28:42.797161: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-08-29 17:28:42.797190: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-08-29 17:28:42.797200: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-08-29 17:28:42.797217: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-08-29 17:28:42.797228: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [5]:
def preprocess_text(query, case, seq_length=512):

    combined_text = query + "[SEP]" + case

    # Load the BERT tokenizer
    path = kagglehub.model_download("tensorflow/bert/tensorFlow2/en-uncased-preprocess")
    tokenizer = BertTokenizer.from_pretrained('nlpaueb/legal-bert-base-uncased')
    
    # Tokenize the text
    combined_tokens = tokenizer(combined_text, max_length=seq_length, truncation=True, padding='max_length')
    
    return combined_tokens

class ClassifierModel(tf.keras.Model):
    def __init__(self, bert_model, seq_length=512):
        super(ClassifierModel, self).__init__()
        self.seq_length = seq_length
        self.bert = bert_model
        self.dropout = tf.keras.layers.Dropout(0.1)
        self.classifier = tf.keras.layers.Dense(1, activation=None)
    
    def call(self, inputs):

        # input_ids = inputs['input_ids']
        # attention_mask = inputs['attention_mask']

        # Run the BERT model
        outputs = self.bert(inputs)

        # Use pooled_output for classification
        net = outputs['pooled_output']  # Change 'pooled_output' to 'pooler_output' for Hugging Face models
        net = self.dropout(net)
        net = self.classifier(net)
        
        return net


In [6]:
def create_tfds(dataframe, batch_size=8):
    input_word_ids = []
    input_type_ids = []
    input_mask = []
    labels = []

    for i in tqdm(range(len(dataframe))):
        query = str(dataframe['Query'][i])
        case = str(dataframe['Case'][i])
        label = dataframe['Labels'][i]
        encoder_inputs = preprocess_text(query, case)
        
        input_word_ids.append(encoder_inputs["input_ids"])
        input_type_ids.append(encoder_inputs["token_type_ids"])
        input_mask.append(encoder_inputs["attention_mask"])
        labels.append(label)

    # Convert lists to tensors
    input_word_ids = tf.convert_to_tensor(input_word_ids)
    input_type_ids = tf.convert_to_tensor(input_type_ids)
    input_mask = tf.convert_to_tensor(input_mask)
    labels = tf.convert_to_tensor(labels, dtype=tf.int32)
    
    # Create TensorFlow Dataset
    dataset = tf.data.Dataset.from_tensor_slices(({
        "input_word_ids": input_word_ids,
        "input_type_ids": input_type_ids,
        "input_mask": input_mask,
    }, labels))
    
    dataset = dataset.shuffle(1000).batch(batch_size)

    return dataset

In [10]:
train_ds = create_tfds(train_df.head(1000), batch_size=8)


100%|██████████| 1000/1000 [15:16<00:00,  1.09it/s]


In [11]:
tf.data.experimental.save(train_ds, 'train_ds')

Instructions for updating:
Use `tf.data.Dataset.save(...)` instead.


Instructions for updating:
Use `tf.data.Dataset.save(...)` instead.


In [None]:
model = ClassifierModel(bert_model)

In [None]:
for batch in train_ds.take(1):
    batch_inputs, batch_labels = batch

    # Run the model on this batch
    initial_outputs = model.call(batch_inputs)

    # Print the outputs
    print("Model outputs before training:")
    print(initial_outputs.numpy())

    # Print the corresponding labels for comparison
    print("\nCorresponding labels:")
    print(batch_labels.numpy())

In [None]:
early_stopping = EarlyStopping(
    monitor='loss',
    patience=3,
    restore_best_weights=True
)

In [None]:
model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.01),
            loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
            metrics=[tf.keras.metrics.BinaryAccuracy()])

In [None]:
history = model.fit(
    train_ds,
    epochs=10
)

In [None]:
def tokenize_function(query, case, max_length=512):
    return tokenizer(query, case, padding='max_length', truncation=True, max_length=max_length)

def encode_data(df, max_length=512):
    input_ids = []
    attention_masks = []
    labels = []

    for i in tqdm(range((len(df))), desc="Encoding data..."):
        encoding = tokenize_function(df['Query'][i], df['Case'][i], max_length=max_length)
        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])
        labels.append(df['Labels'][i])

    return input_ids, attention_masks, labels

small_df = train_df.head(10)
input_ids, attention_masks, labels = encode_data(small_df)

In [None]:
def create_tf_dataset(input_ids, attention_masks, labels, batch_size=8):
    # Convert lists to tensors
    input_ids = tf.convert_to_tensor(input_ids)
    attention_masks = tf.convert_to_tensor(attention_masks)
    labels = tf.convert_to_tensor(labels)

    # Create a TensorFlow Dataset
    dataset = tf.data.Dataset.from_tensor_slices(({
        'input_ids': input_ids,
        'attention_mask': attention_masks,
    }, labels))

    # Shuffle and batch the dataset
    dataset = dataset.shuffle(len(labels)).batch(batch_size)
    
    return dataset


In [None]:
# Create the dataset
train_dataset = create_tf_dataset(input_ids, attention_masks, labels)

In [None]:
model = TFBertForSequenceClassification.from_pretrained('nlpaueb/legal-bert-base-uncased', num_labels=2)

model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=3e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
for layer in model.layers:
    print(layer.name, layer.trainable)

In [None]:
for batch in train_dataset.take(1):
    inputs, labels = batch
    logits = model(inputs, training=False).logits  # Perform a forward pass through the model

# Print the logits
print("Initialized logits before training:")
print(logits)

In [None]:
history = model.fit(
    train_dataset, 
    epochs=10, 
    verbose=1, 
)