In [1]:
%pip install transformers[torch] pandas scikit-learn matplotlib seaborn accelerate

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install --upgrade transformers accelerate

Collecting accelerate
  Downloading accelerate-1.9.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.9.0-py3-none-any.whl (367 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 1.8.1
    Uninstalling accelerate-1.8.1:
      Successfully uninstalled accelerate-1.8.1
Successfully installed accelerate-1.9.0
Note: you may need to restart the kernel to use updated packages.




In [3]:
%pip install --upgrade pip

Note: you may need to restart the kernel to use updated packages.


In [5]:
# Step 0: Install required packages
# Using -q for a quieter installation
%pip install -q pandas numpy scikit-learn transformers[torch] datasets matplotlib seaborn

# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import json
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# ==============================================================================
# Step 2: Data Loading Function (***MAJOR FIX APPLIED HERE***)
# ==============================================================================

def load_data(file_path):
    """
    Load data from a JSON file. This function is robust and can handle both
    a simple JSON array of objects and a dictionary containing the list of objects.
    """
    print(f"Attempting to load data from: {file_path}")
    if not os.path.exists(file_path):
        print(f"Error: File not found at '{file_path}'")
        print("Creating a dummy 'resources/training_data.json' for demonstration.")
        os.makedirs('resources', exist_ok=True)
        # Creating a nested JSON to match the likely cause of the error
        dummy_data = {
            "training_data": [
                {"text": "The defendant's actions constitute a breach of contract.", "type": "Claim"},
                {"text": "The contract was signed by both parties on May 1st.", "type": "Premise"},
                {"text": "This is stated in section 4a of the agreement.", "type": "Premise"},
                {"text": "The plaintiff is therefore entitled to damages.", "type": "Claim"},
                {"text": "The court session was held on a Tuesday.", "type": "Other"}
            ]
        }
        with open(file_path, 'w') as f:
            json.dump(dummy_data, f, indent=2)

    try:
        # First, load the entire JSON structure into a Python object
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Check if the loaded data is a dictionary (like {"training_data": [...]})
        if isinstance(data, dict):
            # Find the key that holds the list of records
            for key in ['training_data', 'data', 'records']:
                if key in data and isinstance(data[key], list):
                    records = data[key]
                    break
            else:
                 raise ValueError("JSON is a dictionary but does not contain a recognizable key ('training_data', 'data', 'records') with a list of records.")
        # Or if it's already a list (like [{...}, {...}])
        elif isinstance(data, list):
            records = data
        else:
            raise TypeError("JSON file is not in a supported format (list of objects or dict with a list).")

        # Now, safely create the DataFrame from the extracted list of records
        df = pd.DataFrame(records)

        if 'type' in df.columns:
            df.rename(columns={'type': 'label'}, inplace=True)
        if not all(col in df.columns for col in ['text', 'label']):
             raise ValueError("The records in the JSON file must contain 'text' and 'type'/'label' keys.")

        print("Data loaded and parsed successfully!")
        return df

    except Exception as e:
        print(f"Error loading or processing data: {str(e)}")
        return None


# ==============================================================================
# Step 3: Main Execution Block
# ==============================================================================

# Define file path and load data
file_path = 'resources/training_data.json'
df = load_data(file_path)

# Proceed only if data is loaded successfully
if df is not None and not df.empty:
    print("\n--- Data Overview ---")
    df.info()
    print("\nLabel distribution:")
    print(df['label'].value_counts())

    # --- Data Preparation ---
    print("\n--- Preparing Data for Model ---")
    labels = sorted(df['label'].unique().tolist())
    label_to_id = {label: i for i, label in enumerate(labels)}
    id_to_label = {i: label for label, i in label_to_id.items()}
    df['label_id'] = df['label'].map(label_to_id)

    train_texts, test_texts, train_labels, test_labels = train_test_split(
        df['text'].tolist(), df['label_id'].tolist(), test_size=0.2, random_state=42, stratify=df['label_id']
    )

    model_name = "nlpaueb/legal-bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
    test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)

    class LegalDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels
        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item
        def __len__(self):
            return len(self.labels)

    train_dataset = LegalDataset(train_encodings, train_labels)
    test_dataset = LegalDataset(test_encodings, test_labels)

    # --- Model Training ---
    print("\n--- Initializing Model and Trainer ---")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=len(id_to_label), id2label=id_to_label, label2id=label_to_id
    )

    def compute_metrics(pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0)
        acc = accuracy_score(labels, preds)
        return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

    training_args = TrainingArguments(
        output_dir='./results', num_train_epochs=3, per_device_train_batch_size=8,
        per_device_eval_batch_size=8, warmup_steps=100, weight_decay=0.01,
        logging_dir='./logs', logging_steps=10, 
        eval_strategy="epoch",
        save_strategy="epoch", load_best_model_at_end=True, metric_for_best_model="f1",
    )

    trainer = Trainer(
        model=model, args=training_args, train_dataset=train_dataset,
        eval_dataset=test_dataset, compute_metrics=compute_metrics,
    )

    print("\n--- Starting Model Training ---")
    trainer.train()
    print("\n--- Training Finished ---")

    print("\n--- Final Evaluation on Test Set ---")
    eval_results = trainer.evaluate()
    print(eval_results)

    # --- Save Model and Run Inference ---
    output_dir = "./legal_argument_model"
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"\nModel and tokenizer saved to {output_dir}")

    def predict(text, model_path, tokenizer_path):
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        predicted_class_id = torch.argmax(outputs.logits, dim=1).item()
        return model.config.id2label[predicted_class_id]

    example_text = "The defendant's actions were clearly in violation of the contract terms."
    predicted_label = predict(example_text, output_dir, output_dir)
    print(f"\n--- Example Prediction ---")
    print(f"Text: '{example_text}'")
    print(f"Predicted Label: {predicted_label}")

else:
    print("\nExecution halted because data could not be loaded or is empty.")

Note: you may need to restart the kernel to use updated packages.
Attempting to load data from: resources/training_data.json
Data loaded and parsed successfully!

--- Data Overview ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   241 non-null    object
 1   text    241 non-null    object
dtypes: object(2)
memory usage: 3.9+ KB

Label distribution:
label
Premise              117
Claim                 79
Non-Argumentative     45
Name: count, dtype: int64

--- Preparing Data for Model ---

--- Initializing Model and Trainer ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Starting Model Training ---




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.0919,1.026641,0.489796,0.322058,0.2399,0.489796
2,1.034,0.990838,0.489796,0.322058,0.2399,0.489796




SafetensorError: Error while serializing: IoError(Os { code: 112, kind: StorageFull, message: "There is not enough space on the disk." })