# AI Juris

In [1]:
import pandas as pd
import numpy as np
import nltk
import shutil
import evaluate
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Remove output folders left over from a previous run so the notebook can be re-run from a clean state.
# Each folder is handled on its own: a missing folder must not stop the remaining ones from being removed.
for folder in ('train_logs', 'train_results', 'saved_model'):
    try:
        shutil.rmtree(folder)
    except FileNotFoundError:
        print(f"Folder '{folder}' does not exist or has already been removed!")
    except OSError as error:
        # Keep the cleanup best-effort, but report the real reason instead of hiding it
        print(f"Could not remove folder '{folder}': {error}")

The folders do not exist or have already been removed!


## Load Data

In [3]:
# Path to the CSV file with the question/answer pairs
filename = 'data/dataset.csv'

# Load the CSV; the loaded rows are read from the 'train' split below
dataset = load_dataset('csv', data_files=filename)

# Split into training and test sets with an 80/20 ratio.
# A fixed seed makes the shuffle, and therefore the split, identical on every run.
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)

# Show dataset format
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 2993
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 749
    })
})

## Tokenizer and Open-Source LLM

https://huggingface.co/google/flan-t5-base

In [4]:
# Checkpoint name whose tokenizer is loaded
tokenizer_checkpoint = 'google/flan-t5-base'
tokenizer = T5Tokenizer.from_pretrained(tokenizer_checkpoint)

# Display the tokenizer
tokenizer

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5Tokenizer(name_or_path='google/flan-t5-base', vocab_size=32000, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '

In [5]:
# Checkpoint name of the pretrained LLM to fine-tune
model_checkpoint = 'google/flan-t5-base'
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

# Display the model architecture
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [6]:
# Data collator used by the trainer to assemble batches; it is configured with the tokenizer and the model
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

# Display the data collator
data_collator

DataCollatorForSeq2Seq(tokenizer=T5Tokenizer(name_or_path='google/flan-t5-base', vocab_size=32000, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<

## Data Preprocessing

In [7]:
# Task prefix that `data_preprocess` prepends to every question before tokenization
prefix = "answer the question: "

In [8]:
# Preprocessing function
def data_preprocess(data):
    """Tokenize a batch of question/answer pairs for seq2seq training.

    Args:
        data: Batch mapping with a 'question' list and an 'answer' list.

    Returns:
        The tokenizer output for the prefixed questions, with the token ids
        of the answers stored under the 'labels' key.
    """
    # Prepend the task prefix to every question in the batch
    prompts = []
    for question in data['question']:
        prompts.append(prefix + question)

    # Encode the prompts, truncating anything longer than 128 tokens
    encoded = tokenizer(prompts, truncation=True, max_length=128)

    # Encode the answers as targets, truncating anything longer than 512 tokens
    targets = tokenizer(text_target=data['answer'], truncation=True, max_length=512)

    # Attach the answer token ids as the labels of the model inputs
    encoded['labels'] = targets['input_ids']
    return encoded

In [9]:
# Applies the preprocessing function to the dataset, generating the tokenized dataset.
# batched=True is required here: `data_preprocess` iterates over data['question'] as a list of questions.
dataset_tokenized = dataset.map(data_preprocess, batched=True)

Map:   0%|          | 0/2993 [00:00<?, ? examples/s]

Map:   0%|          | 0/749 [00:00<?, ? examples/s]

In [10]:
# Show the tokenized dataset (splits, columns and row counts)
dataset_tokenized

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2993
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 749
    })
})

In [11]:
# First training question: select the row first, then the column
dataset_tokenized['train'][0]['question']

"Q: Who receives the judgement or settlement in a wrongful death suit in Michigan?. I may be able to enter into the Camp Lejeune lawsuit on behalf of my late father, but I am also not on speaking terms with my mother, and if she will receive the proceeds from any settlement then I'm not going to bother with it. Also would I need to be executor of his estate to file? "

In [12]:
# First training answer: select the row first, then the column
dataset_tokenized['train'][0]['answer']

'A:Yes, you would need to be the personal representative, and yes, without a will, the spouse would receive a significant portion of not all of it.'

In [13]:
# Token ids of the first (prefixed) training question: select the row first, then the column
dataset_tokenized['train'][0]['input_ids']

[1525,
 8,
 822,
 10,
 1593,
 10,
 2645,
 911,
 7,
 8,
 22555,
 42,
 7025,
 16,
 3,
 9,
 3,
 30721,
 1687,
 3237,
 16,
 5847,
 58,
 5,
 27,
 164,
 36,
 3,
 179,
 12,
 2058,
 139,
 8,
 4594,
 312,
 1924,
 444,
 9953,
 30,
 6089,
 13,
 82,
 1480,
 2353,
 6,
 68,
 27,
 183,
 92,
 59,
 30,
 4461,
 1353,
 28,
 82,
 2039,
 6,
 11,
 3,
 99,
 255,
 56,
 911,
 8,
 14942,
 45,
 136,
 7025,
 258,
 27,
 31,
 51,
 59,
 352,
 12,
 13965,
 28,
 34,
 5,
 1203,
 133,
 27,
 174,
 12,
 36,
 9362,
 127,
 13,
 112,
 2052,
 12,
 1042,
 58,
 1]

## Defining the Evaluation Metric

In [14]:
# `nltk.sent_tokenize` (used in the metric function) needs the Punkt sentence-tokenizer data,
# which splits a text into a list of sentences.
# Older NLTK releases load it from "punkt", newer releases look for "punkt_tab", so both are requested;
# a resource that cannot be downloaded makes `nltk.download` return False instead of raising.
punkt_downloaded = [nltk.download(resource, quiet=True) for resource in ("punkt", "punkt_tab")]

# True when at least one of the resources was fetched (or was already up to date)
any(punkt_downloaded)

True

In [15]:
# Load the ROUGE metric from the `evaluate` library; `calculate_metric` calls `metric.compute` on it
metric = evaluate.load('rouge')


In [18]:
# Metric calculate function
def calculate_metric(eval_pres):
    """Compute ROUGE scores for generated predictions against reference labels.

    Args:
        eval_pres: Pair ``(predictions, labels)`` of token-id arrays, as handed
            to ``compute_metrics`` by the trainer. ``predictions`` may itself be
            a tuple, in which case only its first element is used.

    Returns:
        The result of ``metric.compute`` (the ROUGE scores).
    """
    # Unpack the predictions and labels from the eval_pres argument
    predictions, labels = eval_pres

    # If the model returned extra outputs alongside the generated ids, keep only the ids
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks positions to ignore and is not a valid token id, so it cannot be decoded.
    # Replace every -100 with the padding token ID, in the labels and in the predictions
    # (depending on the Transformers version, generated predictions are padded with -100 as well).
    predictions = np.where(predictions != -100,
                           predictions,
                           tokenizer.pad_token_id)
    labels = np.where(labels != -100,
                      labels,
                      tokenizer.pad_token_id)

    # Decode predictions to text, ignoring special tokens
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Decode labels to text, ignoring special tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Put each sentence of the decoded predictions on its own line, preparing them for ROUGE evaluation
    decoded_predictions = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_predictions]

    # Put each sentence of the decoded labels on its own line, preparing them for ROUGE evaluation
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Calculate the ROUGE metric between decoded predictions and decoded labels, using a stemmer
    result = metric.compute(predictions = decoded_predictions,
                            references = decoded_labels,
                            use_stemmer = True)

    # Return the result of the ROUGE metric
    return result


In [17]:
# Define the training arguments.
# output_dir and logging_dir use the same folder names that the cleanup cell at the top of the
# notebook removes ('train_results', 'train_logs'), so stale checkpoints and logs do not pile up.
training_args = Seq2SeqTrainingArguments(output_dir = "train_results",
                                        evaluation_strategy = "epoch",
                                        learning_rate = 3e-4,
                                        logging_dir = "train_logs",
                                        logging_steps = 1,
                                        per_device_train_batch_size = 4,
                                        per_device_eval_batch_size = 2,
                                        weight_decay = 0.01,
                                        save_total_limit = 3,
                                        num_train_epochs = 3,
                                        predict_with_generate = True,
                                        push_to_hub = False)

In [19]:
# Build the trainer from the model, the training arguments, the tokenized train/test splits,
# the tokenizer, the data collator and the ROUGE metric function
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=calculate_metric,
)

## Training the model

In [20]:
%%time
# Run the fine-tuning loop; the %%time cell magic reports how long the cell took
trainer.train()

  0%|          | 0/2247 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


{'loss': 2.896, 'grad_norm': 2.494243860244751, 'learning_rate': 0.00029986648865153534, 'epoch': 0.0}
{'loss': 3.2524, 'grad_norm': 2.5681724548339844, 'learning_rate': 0.0002997329773030707, 'epoch': 0.0}
{'loss': 3.0126, 'grad_norm': 1.8660629987716675, 'learning_rate': 0.0002995994659546061, 'epoch': 0.0}
{'loss': 3.5187, 'grad_norm': 2.842353105545044, 'learning_rate': 0.0002994659546061415, 'epoch': 0.01}
{'loss': 3.1587, 'grad_norm': 1.6370601654052734, 'learning_rate': 0.0002993324432576769, 'epoch': 0.01}
{'loss': 3.0298, 'grad_norm': 1.2512069940567017, 'learning_rate': 0.00029919893190921226, 'epoch': 0.01}
{'loss': 2.698, 'grad_norm': 1.812462568283081, 'learning_rate': 0.00029906542056074763, 'epoch': 0.01}
{'loss': 3.2376, 'grad_norm': 1.8051193952560425, 'learning_rate': 0.00029893190921228305, 'epoch': 0.01}
{'loss': 2.6015, 'grad_norm': 1.9065868854522705, 'learning_rate': 0.0002987983978638184, 'epoch': 0.01}
{'loss': 2.9194, 'grad_norm': 1.5426851511001587, 'learning

KeyboardInterrupt: 