In [None]:
# Notebook shell escape (valid only inside an IPython/Colab cell): quietly
# install the third-party packages needed for this run.
!pip install -q transformers datasets evaluate scikit-learn accelerate
from datasets import load_dataset

# Fetch the "scotus" configuration of the LexGLUE benchmark.
dataset = load_dataset("lex_glue", "scotus")

# Show which splits exist and what a single raw training example looks like.
print(dataset)
train_split = dataset["train"]
print(train_split[0])

# The class names are read from the dataset's own "label" feature rather
# than being hard-coded here.
label_feature = train_split.features["label"]
label_names = label_feature.names
num_labels = len(label_names)
print("Number of labels:", num_labels)
print("Labels:", label_names)


from transformers import AutoTokenizer

# Use Legal-BERT for legal domain texts
model_name = "nlpaueb/legal-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize a batch of examples, truncating each text to 512 tokens.

    Args:
        batch: Mapping with a "text" entry holding the raw documents of the
            batch, as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch; ``Dataset.map`` adds its fields
        as new columns.
    """
    # Deliberately no padding here: each example keeps its own length and is
    # padded per batch by the DataCollatorWithPadding given to the Trainer.
    # Padding everything to 512 at map time would make that collator a no-op
    # and store/process short documents as full-length sequences.
    return tokenizer(batch["text"], truncation=True, max_length=512)


# Apply the tokenizer to every split in batches.
tokenized = dataset.map(tokenize, batched=True)


from transformers import AutoModelForSequenceClassification

# Index <-> name mappings taken from the dataset's label feature, stored in
# the model config so the saved checkpoint reports real class names instead
# of generic LABEL_<i> placeholders.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# Pre-trained encoder plus a classification head sized for the label set.
# The head weights are not in the checkpoint and are newly initialised.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import transformers

# Record the installed transformers version next to the run output.
print("Transformers version:", transformers.__version__)

def compute_metrics(eval_pred):
    """Score a set of predictions with accuracy and macro-averaged F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            row is the argmax of the logits along axis 1.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"macro_f1"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    results = {}
    results["accuracy"] = accuracy_score(gold, predicted)
    results["macro_f1"] = f1_score(gold, predicted, average="macro")
    return results

import torch

# Hyper-parameters for fine-tuning. Evaluation and checkpointing both happen
# once per epoch so the best epoch (by validation macro F1) can be restored
# when training finishes.
# NOTE: `report_to` is left at its default, so installed logging integrations
# stay enabled (the recorded run shows W&B prompting for an API key); pass
# report_to="none" for unattended runs.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    # Keep at most two checkpoints on disk instead of one per epoch; with
    # load_best_model_at_end the best checkpoint is always retained.
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    # Macro F1 is a score, not a loss: higher is better.
    greater_is_better=True,
    # Mixed precision needs a CUDA device; enable it only when one is
    # present so the script still runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
)

# Collator that pads the examples of each batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


# Wire model, arguments, data splits and metrics together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    # `tokenizer=` is deprecated for Trainer.__init__ in current transformers
    # releases (this run logged 4.57.1); `processing_class=` is the
    # replacement and keeps the tokenizer saved alongside checkpoints.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop configured on the trainer.
trainer.train()


# Score the held-out test split with the trainer's metric function.
metrics = trainer.evaluate(eval_dataset=tokenized["test"])
print("\nFinal Test Metrics:", metrics)


from sklearn.metrics import classification_report

# Run inference on the test split and turn logits into class predictions.
preds_output = trainer.predict(tokenized["test"])
y_true = preds_output.label_ids
y_pred = np.argmax(preds_output.predictions, axis=1)

print("\nDetailed Classification Report:\n")
print(
    classification_report(
        y_true,
        y_pred,
        # List every class explicitly so the report still lines up with
        # target_names if a class is absent from both y_true and y_pred.
        labels=list(range(num_labels)),
        target_names=label_names,
        # Classes that are never predicted score 0 without emitting the
        # repeated undefined-metric warnings; the numbers are unchanged.
        zero_division=0,
    )
)


# Write the fine-tuned weights and the matching tokenizer files into one
# directory so both can be reloaded from the same path.
save_directory = "./legalbert_scotus_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)
print(f"\nModel and tokenizer saved successfully at: {save_directory}")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

scotus/train-00000-of-00001.parquet:   0%|          | 0.00/94.4M [00:00<?, ?B/s]

scotus/test-00000-of-00001.parquet:   0%|          | 0.00/40.0M [00:00<?, ?B/s]

scotus/validation-00000-of-00001.parquet:   0%|          | 0.00/39.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1400 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1400 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1400
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1400
    })
})
{'text': "329 U.S. 29\n67 S.Ct. 1\n91 L.Ed. 22\nCHAMPLIN REFINING COv.UNITED STATES et al.\nNo. 21.\nArgued Oct. 18, 21, 1946.\nDecided Nov. 18, 1946.\nRehearing Denied Dec. 16, 1946.\n\nSee 329 U.S. 831, 67 S.Ct. 363.\nAppeal from the District Court of the United States for the Western District of Oklahoma.\nMessrs.Dan Moody, of Austin, Tex., and Harry O. Glasser, of Enid, Okla., for appellant.\nMr. Edward Dumbauld, of Washington, D.C., for appel-\n[Argument of Counsel from page 30 intentionally omitted]\nlees. Mr. Justice JACKSON delivered the opinion of the Court.\n\n\n1\nThe Interstate Commerce Commission, acting under § 19a of the Interstate Commerce Act,1 ordered the appellant to furnish certain inve

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Transformers version: 4.57.1


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjp-developer1204[0m ([33mjp-developer1204-navrachana-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,1.2596,0.935524,0.703571,0.514794
2,0.7036,0.873533,0.749286,0.638271
3,0.5109,0.891372,0.767857,0.688699
4,0.2688,1.112505,0.754286,0.663707
5,0.1535,1.1388,0.772143,0.697705
6,0.13,1.289748,0.776429,0.694649
7,0.0839,1.377005,0.773571,0.695155
8,0.0374,1.386775,0.766429,0.690086



Final Test Metrics: {'eval_loss': 1.3741693496704102, 'eval_accuracy': 0.7371428571428571, 'eval_macro_f1': 0.6171304008931633, 'eval_runtime': 10.5724, 'eval_samples_per_second': 132.421, 'eval_steps_per_second': 16.553, 'epoch': 8.0}

Detailed Classification Report:

              precision    recall  f1-score   support

           1       0.82      0.93      0.87       372
           2       0.77      0.75      0.76       222
           3       0.80      0.93      0.86        88
           4       0.60      0.57      0.59        51
           5       0.50      0.39      0.44        28
           6       0.76      0.76      0.76        17
           7       0.75      0.75      0.75        24
           8       0.65      0.79      0.71       260
           9       0.66      0.55      0.60       200
          10       0.46      0.14      0.22        83
          11       0.75      0.40      0.52        15
          12       0.94      0.92      0.93        37
          13       0.00   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Model and tokenizer saved successfully at: ./legalbert_scotus_model
