In [37]:
import json

file_path = "/content/drive/MyDrive/FYP/cases_2024.json"

# Reset before loading so a failed load can never leave a stale `cases_data`
# from an earlier run of this cell alive in the kernel. Downstream code checks
# the value for truthiness before processing, so None means "nothing loaded".
cases_data = None

try:
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which is not guaranteed to match the file.
    with open(file_path, 'r', encoding='utf-8') as f:
        cases_data = json.load(f)
    print("JSON data loaded successfully.")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from the file at {file_path}")
except Exception as e:
    # Best-effort cell: report anything else (permissions, bad encoding, ...)
    # and leave `cases_data` as None.
    print(f"An unexpected error occurred: {e}")

JSON data loaded successfully.


In [38]:
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import os # Import os for path checking

# Make sure both sentence-tokenizer resources are present locally; each one is
# downloaded only when the lookup fails.
for resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{resource}')
    except LookupError:
        nltk.download(resource)

# Directory holding the fine-tuned classifier. It is set explicitly here so
# this cell does not depend on a variable left over from an earlier cell.
output_dir = "/content/drive/MyDrive/FYP/legal_argument_model_roberta_base"

# Function to make predictions - Redefined here to be available
# Loaded (model, tokenizer) pairs keyed by their paths, so repeated calls to
# `predict` reuse the objects instead of reloading them from disk every time.
_PREDICT_CACHE = {}


def _load_model_and_tokenizer(model_path, tokenizer_path):
    """Return the cached (model, tokenizer) pair for the paths, loading it on first use."""
    key = (model_path, tokenizer_path)
    if key not in _PREDICT_CACHE:
        model = AutoModelForSequenceClassification.from_pretrained(model_path, local_files_only=True)
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, local_files_only=True)
        _PREDICT_CACHE[key] = (model, tokenizer)
    return _PREDICT_CACHE[key]


def predict(text, model_path, tokenizer_path):
    """Classify a single piece of text with a locally saved sequence classifier.

    Args:
        text: The text (here: one sentence) to classify.
        model_path: Local path of the saved model.
        tokenizer_path: Local path of the saved tokenizer.

    Returns:
        The label from ``model.config.id2label`` for the highest-scoring class,
        or ``None`` (after printing an error) if either path does not exist.
    """
    # Ensure the paths exist before trying to load anything.
    if not os.path.exists(model_path):
        print(f"Error: Model path not found at {model_path}")
        return None
    if not os.path.exists(tokenizer_path):
        print(f"Error: Tokenizer path not found at {tokenizer_path}")
        return None

    # Loading is by far the most expensive step; do it once per path pair
    # rather than once per sentence.
    model, tokenizer = _load_model_and_tokenizer(model_path, tokenizer_path)

    # Tokenize input (inputs longer than 256 tokens are truncated).
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=256)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # Map the highest-scoring class id to its label.
    predicted_class_id = torch.argmax(outputs.logits, dim=1).item()
    return model.config.id2label[predicted_class_id]


# The fine-tuned model and its tokenizer were saved to the same directory.
model_path = output_dir
tokenizer_path = output_dir

# One record per processed case: {'case_id': ..., 'predictions': [...]}.
predicted_cases = []

if not ('cases_data' in locals() and cases_data):
    print("No case data available to process.")
else:
    for case in cases_data[1:2]:
        # Assumes 'id' identifies the case and 'text' holds its content.
        record = {'case_id': case.get('id', 'N/A'), 'predictions': []}
        body_text = case.get('text', '')
        if body_text:
            # Classify sentence by sentence; sentences for which `predict`
            # returned None (prediction failed) are left out.
            labelled = (
                (sent, predict(sent, model_path, tokenizer_path))
                for sent in sent_tokenize(body_text)
            )
            record['predictions'] = [
                {'sentence': sent, 'prediction': label}
                for sent, label in labelled
                if label is not None
            ]
        # Cases without text are still recorded, with an empty prediction list.
        predicted_cases.append(record)

    print(f"Processed {len(predicted_cases)} cases.")
    print(predicted_cases)

Processed 1 cases.
[{'case_id': '4aaafdf5-8ac9-4086-b62e-485d250b02bb', 'predictions': [{'sentence': 'CA/HCC 184/2017  \n \n1 | P a g e  \n  IN THE COURT OF APPEAL OF THE DEMOCRATIC SOCIALIST \nREPUBLIC OF SRI LANKA  \nIn the matter of an Appeal made under     \nSection 331 (1) of the Code of Criminal \nProcedure Act No.15 of 1979  read with \nArticle 138 of the Constitution of the \nDemocratic Socialist Republic of Sri \nLanka.', 'prediction': 'Non-Argumentative'}, {'sentence': 'Court of Appeal Case No.', 'prediction': 'Non-Argumentative'}, {'sentence': 'CA/HCC/  0184/2017   Harold Rex Jansen  \nHigh Court of Kalutara  \nCase No.', 'prediction': 'Non-Argumentative'}, {'sentence': 'HC/ 108/2009      \nACCUSED -APPELLANT  \nvs.', 'prediction': 'Non-Argumentative'}, {'sentence': 'The Hon.', 'prediction': 'Non-Argumentative'}, {'sentence': "Attorney General  \n       Attorney General's Department  \n    Colombo -12 \n          \n  COMPLAINANT -RESPONDENT  \n \nBEFORE    : Sampath B. Abaya

In [39]:
# prompt: table data according to class

import pandas as pd

# Flatten the per-case predictions into one row per classified sentence.
all_data = [
    {
        'Case ID': case['case_id'],
        'Sentence': pred_info['sentence'],
        'Predicted Class': pred_info['prediction'],
    }
    for case in predicted_cases
    for pred_info in case['predictions']
]

# Name the columns explicitly so the frame keeps its schema even when there
# are no rows; otherwise an empty result would produce a column-less frame and
# any later column access would fail.
df = pd.DataFrame(all_data, columns=['Case ID', 'Sentence', 'Predicted Class'])

# Display the DataFrame
print("\nPredicted Classes per Sentence:")
df


Predicted Classes per Sentence:


Unnamed: 0,Case ID,Sentence,Predicted Class
0,4aaafdf5-8ac9-4086-b62e-485d250b02bb,CA/HCC 184/2017 \n \n1 | P a g e \n IN THE COURT OF APPEAL OF THE DEMOCRATIC SOCIALIST \nREPUBLIC OF SRI LANKA \nIn the matter of an Appeal made under \nSection 331 (1) of the Code of Criminal \nProcedure Act No.15 of 1979 read with \nArticle 138 of the Constitution of the \nDemocratic Socialist Republic of Sri \nLanka.,Non-Argumentative
1,4aaafdf5-8ac9-4086-b62e-485d250b02bb,Court of Appeal Case No.,Non-Argumentative
2,4aaafdf5-8ac9-4086-b62e-485d250b02bb,CA/HCC/ 0184/2017 Harold Rex Jansen \nHigh Court of Kalutara \nCase No.,Non-Argumentative
3,4aaafdf5-8ac9-4086-b62e-485d250b02bb,HC/ 108/2009 \nACCUSED -APPELLANT \nvs.,Non-Argumentative
4,4aaafdf5-8ac9-4086-b62e-485d250b02bb,The Hon.,Non-Argumentative
...,...,...,...
124,4aaafdf5-8ac9-4086-b62e-485d250b02bb,"CA/HCC 184/2017 \n \n17 | P a g e \n Considering all the circumstances discussed above, this Court could only \ncome to the conclusion that the prosecution has not prove n the case beyond \nreasonable doubt .",Non-Argumentative
125,4aaafdf5-8ac9-4086-b62e-485d250b02bb,"Therefore, the Appellant is acquitted from the charge s. \nThe appeal is allowed and the conviction is set aside.",Claim
126,4aaafdf5-8ac9-4086-b62e-485d250b02bb,The Registrar of this court is directed to send this judgment along with the \noriginal case record to the High Court of Kalutara .,Non-Argumentative
127,4aaafdf5-8ac9-4086-b62e-485d250b02bb,"JUDGE OF THE COURT OF APPEAL \n \nSAMPATH B. ABAYAKOON , J. \nI agree.",Non-Argumentative


In [40]:
# check support vs contradiction
# Step 0: Install and Import Libraries
# ==============================================================================
# Ensure you have the latest compatible libraries. If you face import errors,
# run this cell, restart your runtime, and then run the whole notebook.
%pip install -q --upgrade transformers torch pandas datasets accelerate

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import numpy as np

# ==============================================================================
# Step 1: Load the Pre-trained NLI Model and Tokenizer
# ==============================================================================
# Set up the device (use GPU if available, otherwise CPU)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Name of the Natural Language Inference checkpoint to load
model_name = "roberta-large-mnli"

# Load the tokenizer and model from Hugging Face
# This will download the model the first time you run it.
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    print(f"Successfully loaded NLI model '{model_name}'")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Please ensure you have an active internet connection.")
    # Everything below needs `model` and `tokenizer`. Stop here with the real
    # error instead of carrying on and failing later with an unrelated
    # NameError (or silently using objects left over from an earlier run).
    raise

# The model outputs three labels. Let's see them.
print("\nModel Labels (ID -> Label):")
print(model.config.id2label)

# Define our custom mapping for clearer results
label_mapping = {
    'CONTRADICTION': 'Contradictory',
    'ENTAILMENT': 'Supportive',
    'NEUTRAL': 'Unrelated'
}


# ==============================================================================
# Step 2: Prepare Your Input Data (Claims and Premises)
# ==============================================================================
# Insert here the claim and the premises extracted from ONE case document.
# The values below are only a worked example.

# --- REPLACE WITH YOUR OWN EXTRACTED DATA ---
target_claim = (
    "The defendant is liable for breach of contract due to the delayed delivery."
)

premises_from_case = [
    # Intended as supportive
    "Clause 5.1 of the signed agreement explicitly states the delivery deadline was May 31st, 2024.",
    # Intended as supportive
    "Shipping logs confirm the defendant's package was not dispatched until June 5th, 2024.",
    # Intended as contradictory
    "An email dated May 30th, signed by the plaintiff, granted an official extension for the delivery to June 15th.",
    # Intended as neutral / unrelated
    "The contract was printed on high-quality recycled paper.",
    # Intended as contradictory
    "The plaintiff accepted the delivery on June 5th without raising any formal objection at the time.",
]
# --- END OF DATA TO REPLACE ---

# Echo what is about to be analysed.
print(
    f"\nAnalyzing {len(premises_from_case)} premises against the claim:",
    f"Claim: '{target_claim}'",
    sep="\n",
)


# ==============================================================================
# Step 3: Analyze the Relationship for Each Premise-Claim Pair
# ==============================================================================

results = []

# Loop through each premise and compare it against the target claim
for premise in premises_from_case:
    # NLI models expect a premise and a hypothesis.
    # In our case: premise = the legal premise, hypothesis = the legal claim.
    # truncation=True caps the encoded pair at the tokenizer's maximum length,
    # so an unusually long premise cannot exceed what the model accepts.
    tokenized_input = tokenizer(
        premise, target_claim, return_tensors='pt', truncation=True
    ).to(device)

    with torch.no_grad():  # Inference only: no gradients needed
        outputs = model(**tokenized_input)

    logits = outputs.logits

    # Get the probabilities and the predicted class ID
    probabilities = torch.softmax(logits, dim=1)
    predicted_class_id = torch.argmax(probabilities, dim=1).item()

    # Get the model's label (e.g., 'ENTAILMENT')
    raw_label = model.config.id2label[predicted_class_id]

    # Map to our custom, more readable label (e.g., 'Supportive'). The lookup
    # ignores case and falls back to the raw label, so a checkpoint with
    # differently spelled labels does not abort the whole run with a KeyError.
    relationship = label_mapping.get(str(raw_label).upper(), raw_label)

    # Get the confidence score for the prediction
    confidence = probabilities[0][predicted_class_id].item()

    # Store the result
    results.append({
        "Claim": target_claim,
        "Premise": premise,
        "Relationship": relationship,
        "Confidence": f"{confidence:.2%}"  # Format as percentage
    })

# ==============================================================================
# Step 4: Display the Results in a Clean Table
# ==============================================================================

# Let pandas print long premise text in full instead of cutting it off.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 200)

# One row per analysed premise.
df_results = pd.DataFrame(results)
has_results = not df_results.empty

print("\n\n--- Argument Analysis Results ---")
if has_results:
    # The claim is identical on every row, so the table leaves it out.
    display_df = df_results.loc[:, ['Premise', 'Relationship', 'Confidence']]
    print(display_df.to_string())
else:
    print("No results to display. Please check your input data.")

print("\n\n--- Summary of Findings ---")
if has_results:
    # How many premises fell into each relationship category.
    summary = df_results["Relationship"].value_counts()
    print(f"For the claim: '{target_claim}'")
    print("\nWe found the following relationships:")
    print(summary.to_string())
else:
    print("No analysis was performed.")

Using device: cpu


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Successfully loaded NLI model 'roberta-large-mnli'

Model Labels (ID -> Label):
{0: 'CONTRADICTION', 1: 'NEUTRAL', 2: 'ENTAILMENT'}

Analyzing 5 premises against the claim:
Claim: 'The defendant is liable for breach of contract due to the delayed delivery.'


--- Argument Analysis Results ---
                                                                                                          Premise Relationship Confidence
0                  Clause 5.1 of the signed agreement explicitly states the delivery deadline was May 31st, 2024.    Unrelated     94.59%
1                          Shipping logs confirm the defendant's package was not dispatched until June 5th, 2024.    Unrelated     99.57%
2  An email dated May 30th, signed by the plaintiff, granted an official extension for the delivery to June 15th.    Unrelated     94.59%
3                                                        The contract was printed on high-quality recycled paper.    Unrelated     95.19%
4               