In [1]:
# =============== PHASE 0: BUILD INDEX (IN-Abs, TRAIN = 1500) ===============

import os
import glob
import random
import pandas as pd

# 0. CONFIG ‚Äì set this to your dataset path inside /kaggle/input
# Example if your dataset folder is named "dataset":
#   /kaggle/input/dataset/IN-Abs/...
ROOT = "/kaggle/input/legal-datav2/dataset/IN-Abs"   # <-- CHANGE if your folder name is different

TRAIN_JUDG_DIR = os.path.join(ROOT, "train-data", "judgement")
TRAIN_SUM_DIR  = os.path.join(ROOT, "train-data", "summary")

TEST_JUDG_DIR  = os.path.join(ROOT, "test-data", "judgement")
TEST_SUM_DIR   = os.path.join(ROOT, "test-data", "summary")

MAX_TRAIN_CASES = 1500   # üî• limit training files


# 1. HELPER ‚Äì collect (case_id, judgment_path, summary_path)
def collect_pairs(judg_dir, sum_dir):
    """
    Return list of (case_id, judgment_path, summary_path)
    for all files that exist in BOTH judgement and summary folders.
    """
    judg_files = glob.glob(os.path.join(judg_dir, "*.txt"))
    case_ids = [os.path.splitext(os.path.basename(p))[0] for p in judg_files]

    pairs = []
    for cid in case_ids:
        j_path = os.path.join(judg_dir, f"{cid}.txt")
        s_path = os.path.join(sum_dir,  f"{cid}.txt")
        if os.path.exists(j_path) and os.path.exists(s_path):
            pairs.append((cid, j_path, s_path))
    return pairs


# 2. COLLECT TRAIN + TEST PAIRS
train_pairs = collect_pairs(TRAIN_JUDG_DIR, TRAIN_SUM_DIR)
print("Total IN-Abs TRAIN pairs found:", len(train_pairs))

test_pairs = collect_pairs(TEST_JUDG_DIR, TEST_SUM_DIR)
print("Total IN-Abs TEST pairs found:", len(test_pairs))


# 3. CAP TRAIN TO 1500
random.seed(42)          # reproducible subset
random.shuffle(train_pairs)
train_pairs = train_pairs[:MAX_TRAIN_CASES]
print("Capped TRAIN pairs used:", len(train_pairs))


# 4. BUILD INDEX ROWS
rows = []

# TRAIN rows (only 1500)
for cid, j, s in train_pairs:
    rows.append({
        "case_id": cid,
        "split": "train",
        "corpus": "IN-Abs",
        "judgment_path": j,
        "summary_path": s,
    })

# TEST rows (all)
for cid, j, s in test_pairs:
    rows.append({
        "case_id": cid,
        "split": "test",
        "corpus": "IN-Abs",
        "judgment_path": j,
        "summary_path": s,
    })

df_index = pd.DataFrame(rows)
print("Total rows in index (train + test):", len(df_index))
display(df_index.head())


# 5. SAVE INDEX  ‚úÖ must be in /kaggle/working (writable), NOT /kaggle/input
INDEX_PATH = "/kaggle/working/cases_index_indabs_1500.csv"
df_index.to_csv(INDEX_PATH, index=False)
print("‚úÖ Saved Phase 0 index to:", INDEX_PATH)

Total IN-Abs TRAIN pairs found: 7030
Total IN-Abs TEST pairs found: 100
Capped TRAIN pairs used: 1500
Total rows in index (train + test): 1600


Unnamed: 0,case_id,split,corpus,judgment_path,summary_path
0,2502,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...
1,4523,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...
2,6135,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...
3,3373,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...
4,787,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...


‚úÖ Saved Phase 0 index to: /kaggle/working/cases_index_indabs_1500.csv


In [2]:
# ============================
# PHASE 1 (STEPS 1‚Äì5)
# Embeddings for IN-Abs (train + test)
# ============================

# STEP 1: Load Phase 0 index
import os
import pandas as pd

INDEX_PATH = "/kaggle/working/cases_index_indabs_1500.csv"  # from Phase 0
df_index = pd.read_csv(INDEX_PATH)

df_train = df_index[df_index["split"] == "train"].reset_index(drop=True)
df_test  = df_index[df_index["split"] == "test"].reset_index(drop=True)

print("Train rows:", len(df_train))
print("Test rows:", len(df_test))
df_index.head()

Train rows: 1500
Test rows: 100


Unnamed: 0,case_id,split,corpus,judgment_path,summary_path
0,2502,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...
1,4523,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...
2,6135,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...
3,3373,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...
4,787,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...


In [3]:
# STEP 2: Helper ‚Äì load a representative text snippet for each judgment

from tqdm import tqdm

def load_judgment_snippet(path, max_chars=4000):
    """
    Read judgment file and return a cleaned snippet (first max_chars chars).
    Light cleaning only: remove extra spaces and newlines.
    """
    try:
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
    except Exception as e:
        print(f"[WARN] Could not read {path}: {e}")
        return ""
    
    # basic cleaning
    text = text.replace("\r", " ").replace("\n", " ")
    text = " ".join(text.split())   # collapse multiple spaces
    
    # cut to max_chars
    return text[:max_chars]

# Build lists of texts and case_ids for train and test
train_texts = []
train_case_ids = []

print("Building train snippets...")
for _, row in tqdm(df_train.iterrows(), total=len(df_train)):
    snippet = load_judgment_snippet(row["judgment_path"], max_chars=4000)
    train_texts.append(snippet)
    train_case_ids.append(row["case_id"])

test_texts = []
test_case_ids = []

print("Building test snippets...")
for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
    snippet = load_judgment_snippet(row["judgment_path"], max_chars=4000)
    test_texts.append(snippet)
    test_case_ids.append(row["case_id"])

print("Example train snippet length:", len(train_texts[0]))

Building train snippets...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1500/1500 [00:08<00:00, 177.37it/s]


Building test snippets...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 184.62it/s]

Example train snippet length: 4000





In [4]:
# STEP 3: Load sentence embedding model

!pip install -q sentence-transformers

import torch
from sentence_transformers import SentenceTransformer

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# small + fast model (good for Kaggle)
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = SentenceTransformer(model_name, device=device)

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m363.4/363.4 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m13.8/13.8 MB[0m [31m114.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m24.6/24.6 MB[0m [31m90.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m883.7/883.7 kB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00

2025-12-09 15:58:02.777054: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765295882.957429      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765295883.007892      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Using device: cuda


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# STEP 4: Compute embeddings for TRAIN cases

import numpy as np

print("Encoding TRAIN embeddings...")
train_embeddings = embed_model.encode(
    train_texts,
    batch_size=16,
    convert_to_numpy=True,
    show_progress_bar=True
)

print("Train embeddings shape:", train_embeddings.shape)

# Save to /kaggle/working
np.save("/kaggle/working/phase1_embeddings_train.npy", train_embeddings)

# Also save case_ids so we can align later
import json
with open("/kaggle/working/phase1_train_case_ids.json", "w") as f:
    json.dump(train_case_ids, f)

print("‚úÖ Saved train embeddings and case_ids.")

Encoding TRAIN embeddings...


Batches:   0%|          | 0/94 [00:00<?, ?it/s]

Train embeddings shape: (1500, 384)
‚úÖ Saved train embeddings and case_ids.


In [6]:
# STEP 5: Compute embeddings for TEST cases

print("Encoding TEST embeddings...")
test_embeddings = embed_model.encode(
    test_texts,
    batch_size=16,
    convert_to_numpy=True,
    show_progress_bar=True
)

print("Test embeddings shape:", test_embeddings.shape)

np.save("/kaggle/working/phase1_embeddings_test.npy", test_embeddings)

import json
with open("/kaggle/working/phase1_test_case_ids.json", "w") as f:
    json.dump(test_case_ids, f)

print("‚úÖ Saved test embeddings and case_ids.")

Encoding TEST embeddings...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Test embeddings shape: (100, 384)
‚úÖ Saved test embeddings and case_ids.


In [7]:
# ============================
# PHASE 1 ‚Äì STEPS 6‚Äì10
# Clustering + update index
# ============================

import os
import json
import numpy as np
import pandas as pd

# 1. Load index from Phase 0
INDEX_PATH = "/kaggle/working/cases_index_indabs_1500.csv"
df_index = pd.read_csv(INDEX_PATH)

print("Index shape:", df_index.shape)
display(df_index.head())

# 2. Load embeddings (from Phase 1 steps 1‚Äì5)
train_embeddings = np.load("/kaggle/working/phase1_embeddings_train.npy")
test_embeddings  = np.load("/kaggle/working/phase1_embeddings_test.npy")

print("Train embeddings shape:", train_embeddings.shape)
print("Test embeddings shape:", test_embeddings.shape)

# 3. Load case_id lists
with open("/kaggle/working/phase1_train_case_ids.json", "r") as f:
    train_case_ids = json.load(f)

with open("/kaggle/working/phase1_test_case_ids.json", "r") as f:
    test_case_ids = json.load(f)

print("Train case_ids:", len(train_case_ids))
print("Test case_ids:", len(test_case_ids))

Index shape: (1600, 5)


Unnamed: 0,case_id,split,corpus,judgment_path,summary_path
0,2502,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...
1,4523,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...
2,6135,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...
3,3373,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...
4,787,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...


Train embeddings shape: (1500, 384)
Test embeddings shape: (100, 384)
Train case_ids: 1500
Test case_ids: 100


In [8]:
from sklearn.cluster import KMeans

# STEP 6: choose number of clusters
# You can change this if you want to experiment (e.g., 4, 6, 8...)
K = 6
print("Using K =", K)

# STEP 7: Fit KMeans on TRAIN embeddings
kmeans = KMeans(
    n_clusters=K,
    random_state=42,
    n_init=10,
    max_iter=300,
)

kmeans.fit(train_embeddings)
print("KMeans fitted on train embeddings.")

Using K = 6
KMeans fitted on train embeddings.


In [9]:
# Predict cluster id for train and test
train_cluster_ids = kmeans.labels_                # already computed during fit
test_cluster_ids  = kmeans.predict(test_embeddings)

print("Train cluster_ids shape:", train_cluster_ids.shape)
print("Test cluster_ids shape:", test_cluster_ids.shape)

# Build mapping: case_id -> cluster_id
cluster_map_train = {str(cid): int(cl_id) for cid, cl_id in zip(train_case_ids, train_cluster_ids)}
cluster_map_test  = {str(cid): int(cl_id) for cid, cl_id in zip(test_case_ids,  test_cluster_ids)}

# Merge both into one mapping
cluster_map = {}
cluster_map.update(cluster_map_train)
cluster_map.update(cluster_map_test)

print("Total case_ids in cluster_map:", len(cluster_map))

Train cluster_ids shape: (1500,)
Test cluster_ids shape: (100,)
Total case_ids in cluster_map: 1600


In [10]:
# Ensure case_id in df_index is treated as string for mapping
df_index["case_id"] = df_index["case_id"].astype(str)

# STEP 9: Add cluster_id column from mapping
df_index["cluster_id"] = df_index["case_id"].map(cluster_map)

# Optional simple human-readable label: "CL_0", "CL_1", ...
df_index["cluster_label"] = df_index["cluster_id"].apply(lambda x: f"CL_{x}" if pd.notnull(x) else None)

print("Unique cluster_ids:", df_index["cluster_id"].unique())
display(df_index.head())

# Check distribution
print(df_index["cluster_id"].value_counts())

# STEP 10: Save updated index
OUTPUT_INDEX_PATH = "/kaggle/working/cases_index_indabs_1500_phase1.csv"
df_index.to_csv(OUTPUT_INDEX_PATH, index=False)
print("‚úÖ Saved Phase 1 index to:", OUTPUT_INDEX_PATH)

Unique cluster_ids: [3 0 2 1 5 4]


Unnamed: 0,case_id,split,corpus,judgment_path,summary_path,cluster_id,cluster_label
0,2502,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3
1,4523,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,0,CL_0
2,6135,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,2,CL_2
3,3373,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,1,CL_1
4,787,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3


cluster_id
3    490
1    280
2    259
4    206
0    199
5    166
Name: count, dtype: int64
‚úÖ Saved Phase 1 index to: /kaggle/working/cases_index_indabs_1500_phase1.csv


In [11]:
# ============================
# PHASE 2 ‚Äì STEP 1
# Load Phase 1 index + create cleaned dir
# ============================

import os
import pandas as pd

# Path to Phase 1 index (from previous step)
INDEX_PHASE1_PATH = "/kaggle/working/cases_index_indabs_1500_phase1.csv"

# Load index
df_index = pd.read_csv(INDEX_PHASE1_PATH)
print("Index shape:", df_index.shape)
display(df_index.head())

# Directory where we'll save cleaned judgments
CLEANED_DIR = "/kaggle/working/cleaned"

# Create directory if not exists
os.makedirs(CLEANED_DIR, exist_ok=True)
print("Cleaned files will be stored in:", CLEANED_DIR)

Index shape: (1600, 7)


Unnamed: 0,case_id,split,corpus,judgment_path,summary_path,cluster_id,cluster_label
0,2502,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3
1,4523,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,0,CL_0
2,6135,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,2,CL_2
3,3373,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,1,CL_1
4,787,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3


Cleaned files will be stored in: /kaggle/working/cleaned


In [12]:
# ============================
# PHASE 2 ‚Äì STEP 2
# Define text cleaning pipeline
# ============================

import re

def normalize_newlines_and_spaces(text: str) -> str:
    """Unify newlines and collapse multiple spaces/newlines."""
    # unify CRLF / CR to LF
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    
    # remove trailing spaces on each line
    lines = [line.strip() for line in text.split("\n")]
    
    # drop empty lines that are purely whitespace
    # (we'll join with single newline then later turn to spaces if needed)
    text = "\n".join(line for line in lines if line != "")
    
    # collapse multiple spaces inside lines
    text = re.sub(r"[ \t]+", " ", text)
    
    return text


def normalize_punctuation(text: str) -> str:
    """Normalize fancy quotes and some odd punctuation."""
    # fancy quotes ‚Üí plain
    text = text.replace("‚Äú", '"').replace("‚Äù", '"')
    text = text.replace("‚Äò", "'").replace("‚Äô", "'")
    
    # sometimes there are weird long dashes etc.
    text = text.replace("‚Äì", "-").replace("‚Äî", "-")
    
    # optionally remove other non-printable chars
    text = "".join(ch for ch in text if ch == "\n" or ch.isprintable())
    
    return text


def remove_boilerplate_lines(text: str) -> str:
    """
    Remove very obvious header/footer/garbage lines:
    - Court headings
    - REPORTABLE, NON-REPORTABLE
    - Pure page numbers
    - Separator lines (----, *****, etc.)
    - Judge signature patterns
    """
    header_keywords = [
        "IN THE SUPREME COURT OF INDIA",
        "IN THE HIGH COURT",
        "HIGH COURT OF",
        "SUPREME COURT OF INDIA",
        "REPORTABLE",
        "NON-REPORTABLE",
        "CIVIL APPEAL NO.",
        "CRIMINAL APPEAL NO.",
        "SPECIAL LEAVE PETITION",
    ]
    
    footer_keywords = [
        "................",   # judge signatures like "................J."
        "JUDGE",
        "NEW DELHI",
        "NEW DELHI;",
        "DATED:",
    ]
    
    cleaned_lines = []
    for line in text.split("\n"):
        stripped = line.strip()
        upper = stripped.upper()
        
        # skip empty line
        if stripped == "":
            continue
        
        # drop separator-only lines
        if set(stripped) <= set("-=*_.‚Ä¢*+"):
            continue
        
        # drop pure page numbers (e.g. "1", "2", "Page 3 of 20")
        if stripped.isdigit():
            continue
        if re.match(r"^PAGE\s+\d+(\s+OF\s+\d+)?$", upper):
            continue
        
        # drop header keywords
        if any(kw in upper for kw in header_keywords):
            continue
        
        # drop footer keywords
        if any(kw in upper for kw in footer_keywords):
            continue
        
        cleaned_lines.append(stripped)
    
    return "\n".join(cleaned_lines)


def final_collapse_to_single_paragraph(text: str) -> str:
    """
    Optional: turn multiple newlines into single spaces so we get one
    continuous paragraph (good for later tokenization).
    """
    # first collapse multiple newlines
    text = re.sub(r"\n+", " ", text)
    # then collapse multiple spaces again
    text = re.sub(r"[ \t]+", " ", text)
    return text.strip()


def clean_text(raw_text: str) -> str:
    """
    Master cleaning function that applies all steps in order.
    """
    if not raw_text:
        return ""
    
    text = raw_text
    
    # 1) normalize newlines & spaces
    text = normalize_newlines_and_spaces(text)
    
    # 2) normalize punctuation
    text = normalize_punctuation(text)
    
    # 3) remove obvious boilerplate lines
    text = remove_boilerplate_lines(text)
    
    # 4) final collapse (optional)
    text = final_collapse_to_single_paragraph(text)
    
    # if everything vanished, fall back to original raw
    if not text.strip():
        return raw_text.strip()
    
    return text

In [13]:
# ============================
# PHASE 2 ‚Äì STEP 3
# Quick sanity test on a single judgment
# ============================

# Pick first row (you can change the index)
sample_row = df_index.iloc[0]
sample_path = sample_row["judgment_path"]
sample_case_id = sample_row["case_id"]

print("Sample case_id:", sample_case_id)
print("Raw judgment path:", sample_path)

# Read raw text
with open(sample_path, "r", encoding="utf-8", errors="ignore") as f:
    raw_text = f.read()

cleaned_sample = clean_text(raw_text)

print("\n--- RAW (first 500 chars) ---\n")
print(raw_text[:500])

print("\n--- CLEANED (first 500 chars) ---\n")
print(cleaned_sample[:500])

Sample case_id: 2502
Raw judgment path: /kaggle/input/legal-datav2/dataset/IN-Abs/train-data/judgement/2502.txt

--- RAW (first 500 chars) ---

ivil Appeal No. 1832 of 1967.
Appeal under section 116 A of the Representation of the People Act, 1951 from the judgment and order dated October 31, 1967 of the Madhya Pradesh High Court, Indore Bench in Election Petition No. 40 of 1967.
S.V. Gupte, Rarneshwar Nath, Mahinder Narainand, Ravinder Nath for the appellant.
Sarjoo Prasad and D.N. Misra for respondent No. 1.
The Judgment of the Court was delivered by, Mitter, J.
This is an appeal from a judgment of the Madhya Pradesh High Court by a re

--- CLEANED (first 500 chars) ---

ivil Appeal No. 1832 of 1967. Appeal under section 116 A of the Representation of the People Act, 1951 from the judgment and order dated October 31, 1967 of the Madhya Pradesh High Court, Indore Bench in Election Petition No. 40 of 1967. S.V. Gupte, Rarneshwar Nath, Mahinder Narainand, Ravinder Nath for the appellant.

In [14]:
# ============================
# PHASE 2 ‚Äì STEP 4
# Clean all judgments and save to /kaggle/working/cleaned/<case_id>.txt
# ============================

from tqdm import tqdm

cleaned_paths = []

for idx, row in tqdm(df_index.iterrows(), total=len(df_index)):
    case_id = str(row["case_id"])
    src_path = row["judgment_path"]
    
    # read raw text
    try:
        with open(src_path, "r", encoding="utf-8", errors="ignore") as f:
            raw_text = f.read()
    except Exception as e:
        print(f"[WARN] Could not read {src_path} for case_id={case_id}: {e}")
        raw_text = ""
    
    # apply cleaning
    cleaned_text = clean_text(raw_text)
    
    # decide output path
    out_path = os.path.join(CLEANED_DIR, f"{case_id}.txt")
    
    # write cleaned text
    try:
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(cleaned_text)
    except Exception as e:
        print(f"[WARN] Could not write cleaned file {out_path} for case_id={case_id}: {e}")
        out_path = None  # mark failure
    
    cleaned_paths.append(out_path)

# attach cleaned paths to df_index
df_index["cleaned_path"] = cleaned_paths

print("Number of rows with missing cleaned_path:", df_index["cleaned_path"].isna().sum())
df_index.head()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1600/1600 [00:10<00:00, 145.91it/s]

Number of rows with missing cleaned_path: 0





Unnamed: 0,case_id,split,corpus,judgment_path,summary_path,cluster_id,cluster_label,cleaned_path
0,2502,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3,/kaggle/working/cleaned/2502.txt
1,4523,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,0,CL_0,/kaggle/working/cleaned/4523.txt
2,6135,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,2,CL_2,/kaggle/working/cleaned/6135.txt
3,3373,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,1,CL_1,/kaggle/working/cleaned/3373.txt
4,787,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3,/kaggle/working/cleaned/787.txt


In [15]:
# ============================
# PHASE 2 ‚Äì STEP 5
# Save updated index with cleaned_path
# ============================

# Quick sanity: check for any missing cleaned_path
missing_cleaned = df_index["cleaned_path"].isna().sum()
print("Rows with missing cleaned_path:", missing_cleaned)

if missing_cleaned > 0:
    print("‚ö†Ô∏è Warning: some rows have no cleaned_path. Inspect before proceeding.")
else:
    print("‚úÖ All rows have cleaned_path set.")

# Save Phase 2 index
INDEX_PHASE2_PATH = "/kaggle/working/cases_index_indabs_1500_phase2.csv"
df_index.to_csv(INDEX_PHASE2_PATH, index=False)

print("‚úÖ Saved Phase 2 index to:", INDEX_PHASE2_PATH)
display(df_index.head())

Rows with missing cleaned_path: 0
‚úÖ All rows have cleaned_path set.
‚úÖ Saved Phase 2 index to: /kaggle/working/cases_index_indabs_1500_phase2.csv


Unnamed: 0,case_id,split,corpus,judgment_path,summary_path,cluster_id,cluster_label,cleaned_path
0,2502,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3,/kaggle/working/cleaned/2502.txt
1,4523,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,0,CL_0,/kaggle/working/cleaned/4523.txt
2,6135,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,2,CL_2,/kaggle/working/cleaned/6135.txt
3,3373,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,1,CL_1,/kaggle/working/cleaned/3373.txt
4,787,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3,/kaggle/working/cleaned/787.txt


In [16]:
# ============================
# PHASE 2 ‚Äì STEP 6
# Optional: Quick quality checks on cleaning
# ============================

import random

def show_raw_vs_cleaned(row, max_chars=400):
    case_id = row["case_id"]
    raw_path = row["judgment_path"]
    clean_path = row["cleaned_path"]
    
    with open(raw_path, "r", encoding="utf-8", errors="ignore") as f:
        raw_text = f.read()
    with open(clean_path, "r", encoding="utf-8", errors="ignore") as f:
        clean_text_str = f.read()
    
    print(f"\n====== case_id: {case_id} ======")
    print(f"Raw length: {len(raw_text)}, Cleaned length: {len(clean_text_str)}")
    
    print("\n--- RAW (first {0} chars) ---".format(max_chars))
    print(raw_text[:max_chars])
    
    print("\n--- CLEANED (first {0} chars) ---".format(max_chars))
    print(clean_text_str[:max_chars])

# 1) Basic length stats
df_index["raw_len"] = df_index["judgment_path"].apply(
    lambda p: len(open(p, "r", encoding="utf-8", errors="ignore").read())
)
df_index["clean_len"] = df_index["cleaned_path"].apply(
    lambda p: len(open(p, "r", encoding="utf-8", errors="ignore").read())
)

print("Raw length (mean, min, max):",
      df_index["raw_len"].mean(), df_index["raw_len"].min(), df_index["raw_len"].max())
print("Cleaned length (mean, min, max):",
      df_index["clean_len"].mean(), df_index["clean_len"].min(), df_index["clean_len"].max())

# 2) Inspect a few random cases
sample_indices = random.sample(range(len(df_index)), k=3)
for i in sample_indices:
    show_raw_vs_cleaned(df_index.iloc[i], max_chars=400)

Raw length (mean, min, max): 24731.178125 1244 466374
Cleaned length (mean, min, max): 23198.43 1080 459193

Raw length: 31291, Cleaned length: 29565

--- RAW (first 400 chars) ---
Civil Appeal No. 2182 of 1984.
Appeal by Special leave from the Judgment and Order dated the 17th April, 1984 of the Punjab and Haryana High Court in W.P. No. Nil of 1984 559 S.S. Ray and Krishnamurthi Swami for the Appellant.
K.G. Bhagat Addl.
General, A.K. Sen, H.B. Singh Advocate of Harayana, A. Subbha Rao, CV. and R.N. Poddar, for Respondent.
General, The following Judgments were delivered CHA

--- CLEANED (first 400 chars) ---
Appeal by Special leave from the Judgment and Order dated the 17th April, 1984 of the Punjab and Haryana High Court in W.P. No. Nil of 1984 559 S.S. Ray and Krishnamurthi Swami for the Appellant. K.G. Bhagat Addl. General, A.K. Sen, H.B. Singh Advocate of Harayana, A. Subbha Rao, CV. and R.N. Poddar, for Respondent. The High Court, by its aforesaid order, had stayed the issuance a

In [17]:
# ============================
# PHASE 3 ‚Äì STEP 1
# Load Phase 2 index + create structured dir
# ============================

import os
import pandas as pd

# Path to Phase 2 index (output of Phase 2)
INDEX_PHASE2_PATH = "/kaggle/working/cases_index_indabs_1500_phase2.csv"

# Load index
df_index = pd.read_csv(INDEX_PHASE2_PATH)
print("Index shape:", df_index.shape)
display(df_index.head())

# Directory to store structured JSON files
STRUCTURED_DIR = "/kaggle/working/structured"
os.makedirs(STRUCTURED_DIR, exist_ok=True)

print("Structured files will be stored in:", STRUCTURED_DIR)

Index shape: (1600, 8)


Unnamed: 0,case_id,split,corpus,judgment_path,summary_path,cluster_id,cluster_label,cleaned_path
0,2502,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3,/kaggle/working/cleaned/2502.txt
1,4523,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,0,CL_0,/kaggle/working/cleaned/4523.txt
2,6135,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,2,CL_2,/kaggle/working/cleaned/6135.txt
3,3373,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,1,CL_1,/kaggle/working/cleaned/3373.txt
4,787,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3,/kaggle/working/cleaned/787.txt


Structured files will be stored in: /kaggle/working/structured


In [18]:
# ============================
# PHASE 3 ‚Äì STEP 2
# Define canonical section schema
# ============================

# We will force every judgment into this fixed set of sections
SECTION_SCHEMA = [
    "FACTS",
    "ISSUES",
    "ARGUMENTS",
    "REASONING",
    "FINAL_ORDER",
]

print("Canonical sections:", SECTION_SCHEMA)

Canonical sections: ['FACTS', 'ISSUES', 'ARGUMENTS', 'REASONING', 'FINAL_ORDER']


In [19]:
# ============================
# PHASE 3 ‚Äì STEP 3
# Define heading / keyword patterns per section
# ============================

import re

# For each canonical section, we list common heading variants
SECTION_PATTERNS = {
    "FACTS": [
        r"\bFACTS OF THE CASE\b",
        r"\bFACTS IN BRIEF\b",
        r"\bBRIEF FACTS\b",
        r"\bBACKGROUND\b",
        r"\bFACTUAL MATRIX\b",
    ],
    "ISSUES": [
        r"\bISSUES FOR DETERMINATION\b",
        r"\bISSUE[S]?\s+FOR\s+CONSIDERATION\b",
        r"\bQUESTION[S]?\s+FOR\s+CONSIDERATION\b",
        r"\bPOINTS?\s+FOR\s+DETERMINATION\b",
    ],
    "ARGUMENTS": [
        r"\bCONTENTIONS OF THE PARTIES\b",
        r"\bCONTENTIONS\b",
        r"\bSUBMISSIONS ON BEHALF OF\b",
        r"\bARGUMENTS ADVANCED\b",
        r"\bRIVAL CONTENTIONS\b",
    ],
    "REASONING": [
        r"\bDISCUSSION AND FINDINGS\b",
        r"\bDISCUSSION\b",
        r"\bFINDINGS\b",
        r"\bREASONS\b",
        r"\bANALYSIS\b",
        r"\bWE ARE OF THE VIEW\b",
    ],
    "FINAL_ORDER": [
        r"\bCONCLUSION\b",
        r"\bFINAL ORDER\b",
        r"\bIN THE RESULT\b",
        r"\bIN THE RESULT, WE\b",
        r"\bTHE APPEAL IS\b",          # "allowed / dismissed"
        r"\bTHE PETITION IS\b",
        r"\bORDER\b",                  # generic, use carefully
    ],
}

# Compile regex patterns (case-insensitive)
SECTION_REGEX = {
    sec: [re.compile(pat, re.IGNORECASE) for pat in patterns]
    for sec, patterns in SECTION_PATTERNS.items()
}

def detect_section_heading(line: str):
    """
    Given a single line of text, return which section (if any)
    this line indicates as a heading.
    """
    stripped = line.strip()
    if not stripped:
        return None
    
    for section, regex_list in SECTION_REGEX.items():
        for rgx in regex_list:
            if rgx.search(stripped):
                return section
    return None

# Quick tiny sanity test
test_lines = [
    "FACTS OF THE CASE:",
    "The issues for determination are as follows:",
    "CONTENTIONS OF THE PARTIES",
    "DISCUSSION AND FINDINGS",
    "In the result, we allow the appeal."
]

for line in test_lines:
    print(f"Line: {line!r}  ->  Section:", detect_section_heading(line))

Line: 'FACTS OF THE CASE:'  ->  Section: FACTS
Line: 'The issues for determination are as follows:'  ->  Section: ISSUES
Line: 'CONTENTIONS OF THE PARTIES'  ->  Section: ARGUMENTS
Line: 'DISCUSSION AND FINDINGS'  ->  Section: REASONING
Line: 'In the result, we allow the appeal.'  ->  Section: FINAL_ORDER


In [20]:
# ============================
# PHASE 3 ‚Äì STEPS 4 & 5
# Segment cleaned text into sections (with fallback)
# ============================

import re

def segment_judgment_text(text: str):
    """
    Split a cleaned judgment into canonical sections using heading detection.
    Fallback: if no headings found, use position-based heuristic (FACTS / REASONING / FINAL_ORDER).
    Returns dict: {section_name: text}
    """
    # Initialize containers
    sections = {sec: [] for sec in SECTION_SCHEMA}
    
    # If text is empty or very short, just dump into FACTS
    if not text or not text.strip():
        sections["FACTS"] = [text.strip()]
        return {sec: " ".join(v).strip() for sec, v in sections.items()}
    
    # Because Phase 2 collapsed newlines, we split into "pseudo-lines" by sentence-ish boundaries
    # You can tweak this splitter if needed
    pseudo_lines = re.split(r'(?<=[.!?])\s+', text)
    pseudo_lines = [ln.strip() for ln in pseudo_lines if ln.strip()]
    
    if not pseudo_lines:
        sections["FACTS"] = [text.strip()]
        return {sec: " ".join(v).strip() for sec, v in sections.items()}

    current_section = None
    any_heading_found = False
    heading_positions = []

    for idx, line in enumerate(pseudo_lines):
        sec = detect_section_heading(line)
        if sec is not None:
            any_heading_found = True
            current_section = sec
            heading_positions.append((idx, sec))
            # Usually we don't include the heading text itself as content
            continue

        # No heading on this line ‚Üí append to current section or default
        if current_section is None:
            # Before any heading: assume these are FACTS / background
            sections["FACTS"].append(line)
        else:
            sections[current_section].append(line)

    # Fallback: if we never saw any heading at all, use positional split
    if not any_heading_found:
        n = len(pseudo_lines)
        if n == 0:
            sections["FACTS"] = [text.strip()]
        else:
            facts_end = int(0.3 * n)       # first 30%
            final_start = int(0.8 * n)     # last 20% ‚Üí FINAL_ORDER

            for i, line in enumerate(pseudo_lines):
                if i < facts_end:
                    sections["FACTS"].append(line)
                elif i >= final_start:
                    sections["FINAL_ORDER"].append(line)
                else:
                    sections["REASONING"].append(line)

    # Optional small tweak: if FINAL_ORDER empty but REASONING very long,
    # move last 2‚Äì3 sentences from REASONING into FINAL_ORDER
    if not sections["FINAL_ORDER"] and len(sections["REASONING"]) > 5:
        sections["FINAL_ORDER"] = sections["REASONING"][-3:]
        sections["REASONING"] = sections["REASONING"][:-3]

    # Join lists into strings
    joined_sections = {
        sec: " ".join(lines).strip()
        for sec, lines in sections.items()
    }

    return joined_sections

# Quick sanity test on one cleaned judgment
sample_row = df_index.iloc[0]
with open(sample_row["cleaned_path"], "r", encoding="utf-8", errors="ignore") as f:
    sample_clean = f.read()

sample_sections = segment_judgment_text(sample_clean)

for sec, txt in sample_sections.items():
    print(f"\n=== {sec} ===")
    print(txt[:400])


=== FACTS ===
ivil Appeal No. 1832 of 1967.

=== ISSUES ===


=== ARGUMENTS ===


=== REASONING ===


=== FINAL_ORDER ===
40 of 1967. S.V. Gupte, Rarneshwar Nath, Mahinder Narainand, Ravinder Nath for the appellant. Sarjoo Prasad and D.N. Misra for respondent No. 1. The Judgment of the Court was delivered by, Mitter, J. This is an appeal from a judgment of the Madhya Pradesh High Court by a returned candidate at an election to Madhya Pradesh Legislative Assembly from Ujjain North Constituency held in February 1967 de


In [21]:
# ============================
# PHASE 3 ‚Äì STEP 5
# Build structured JSON for each case_id
# ============================

import json
from tqdm import tqdm

structured_paths = []

for idx, row in tqdm(df_index.iterrows(), total=len(df_index)):
    case_id = str(row["case_id"])
    clean_path = row["cleaned_path"]
    
    # Read cleaned text
    try:
        with open(clean_path, "r", encoding="utf-8", errors="ignore") as f:
            clean_text_str = f.read()
    except Exception as e:
        print(f"[WARN] Could not read cleaned file {clean_path} for case_id={case_id}: {e}")
        clean_text_str = ""
    
    # Segment into sections
    sections = segment_judgment_text(clean_text_str)
    
    # Build structured dict
    structured_obj = {
        "case_id": case_id,
        "split": row["split"],
        "corpus": row["corpus"],
        "cluster_id": int(row["cluster_id"]) if not pd.isna(row["cluster_id"]) else None,
        "cluster_label": row.get("cluster_label", None) if isinstance(row.get("cluster_label", None), str) else None,
        "sections": sections,
    }
    
    # Output path
    out_path = os.path.join(STRUCTURED_DIR, f"{case_id}.json")
    
    # Save JSON
    try:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(structured_obj, f, ensure_ascii=False, indent=2)
    except Exception as e:
        print(f"[WARN] Could not write structured file {out_path} for case_id={case_id}: {e}")
        out_path = None
    
    structured_paths.append(out_path)

# Attach structured_path column
df_index["structured_path"] = structured_paths

print("Rows with missing structured_path:", df_index["structured_path"].isna().sum())
df_index.head()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1600/1600 [00:17<00:00, 92.78it/s]

Rows with missing structured_path: 0





Unnamed: 0,case_id,split,corpus,judgment_path,summary_path,cluster_id,cluster_label,cleaned_path,structured_path
0,2502,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3,/kaggle/working/cleaned/2502.txt,/kaggle/working/structured/2502.json
1,4523,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,0,CL_0,/kaggle/working/cleaned/4523.txt,/kaggle/working/structured/4523.json
2,6135,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,2,CL_2,/kaggle/working/cleaned/6135.txt,/kaggle/working/structured/6135.json
3,3373,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,1,CL_1,/kaggle/working/cleaned/3373.txt,/kaggle/working/structured/3373.json
4,787,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3,/kaggle/working/cleaned/787.txt,/kaggle/working/structured/787.json


In [22]:
# ============================
# PHASE 3 ‚Äì STEP 6
# Save updated index (Phase 3) and quick checks
# ============================

# Basic sanity: ensure no NaN structured paths
missing_structured = df_index["structured_path"].isna().sum()
print("Rows with missing structured_path:", missing_structured)

if missing_structured > 0:
    print("‚ö†Ô∏è Warning: some rows have no structured_path. Inspect before proceeding.")
else:
    print("‚úÖ All rows have structured_path set.")

# Save Phase 3 index
INDEX_PHASE3_PATH = "/kaggle/working/cases_index_indabs_1500_phase3.csv"
df_index.to_csv(INDEX_PHASE3_PATH, index=False)
print("‚úÖ Saved Phase 3 index to:", INDEX_PHASE3_PATH)

# Optional: load and peek at one structured JSON
sample_struct_path = df_index.iloc[0]["structured_path"]
print("\nSample structured JSON path:", sample_struct_path)

with open(sample_struct_path, "r", encoding="utf-8", errors="ignore") as f:
    sample_struct = json.load(f)

for sec in SECTION_SCHEMA:
    print(f"\n=== {sec} (first 300 chars) ===")
    print(sample_struct["sections"].get(sec, "")[:300])

Rows with missing structured_path: 0
‚úÖ All rows have structured_path set.
‚úÖ Saved Phase 3 index to: /kaggle/working/cases_index_indabs_1500_phase3.csv

Sample structured JSON path: /kaggle/working/structured/2502.json

=== FACTS (first 300 chars) ===
ivil Appeal No. 1832 of 1967.

=== ISSUES (first 300 chars) ===


=== ARGUMENTS (first 300 chars) ===


=== REASONING (first 300 chars) ===


=== FINAL_ORDER (first 300 chars) ===
40 of 1967. S.V. Gupte, Rarneshwar Nath, Mahinder Narainand, Ravinder Nath for the appellant. Sarjoo Prasad and D.N. Misra for respondent No. 1. The Judgment of the Court was delivered by, Mitter, J. This is an appeal from a judgment of the Madhya Pradesh High Court by a returned candidate at an ele


In [23]:
# ============================
# PHASE 3 ‚Äì STEP 7
# Global stats: length & emptiness of sections
# ============================

import json
import numpy as np

# Load Phase 3 index (if not already in memory)
INDEX_PHASE3_PATH = "/kaggle/working/cases_index_indabs_1500_phase3.csv"
df_index_phase3 = pd.read_csv(INDEX_PHASE3_PATH)

section_lengths = {sec: [] for sec in SECTION_SCHEMA}
section_empty_counts = {sec: 0 for sec in SECTION_SCHEMA}

for idx, row in df_index_phase3.iterrows():
    struct_path = row["structured_path"]
    if not isinstance(struct_path, str) or not os.path.exists(struct_path):
        continue
    
    with open(struct_path, "r", encoding="utf-8", errors="ignore") as f:
        obj = json.load(f)
    
    sections = obj.get("sections", {})
    for sec in SECTION_SCHEMA:
        text = sections.get(sec, "") or ""
        length = len(text)
        section_lengths[sec].append(length)
        if length == 0:
            section_empty_counts[sec] += 1

print("=== Section stats (lengths) ===")
for sec in SECTION_SCHEMA:
    lengths = np.array(section_lengths[sec]) if section_lengths[sec] else np.array([0])
    print(
        f"{sec}: count={len(lengths)}, "
        f"mean={lengths.mean():.1f}, "
        f"min={lengths.min()}, "
        f"max={lengths.max()}"
    )

print("\n=== Section empty counts ===")
for sec in SECTION_SCHEMA:
    print(f"{sec}: empty in {section_empty_counts[sec]} cases")

=== Section stats (lengths) ===
FACTS: count=1600, mean=2165.7, min=0, max=51194
ISSUES: count=1600, mean=91.8, min=0, max=14435
ARGUMENTS: count=1600, mean=1154.1, min=0, max=56435
REASONING: count=1600, mean=3531.9, min=0, max=120822
FINAL_ORDER: count=1600, mean=13446.4, min=0, max=235339

=== Section empty counts ===
FACTS: empty in 188 cases
ISSUES: empty in 1542 cases
ARGUMENTS: empty in 1188 cases
REASONING: empty in 665 cases
FINAL_ORDER: empty in 13 cases


In [24]:
# ============================
# PHASE 3 ‚Äì STEP 8
# Manual inspection of a few random structured cases
# ============================

import random

def show_structured_case(row, max_chars=400):
    struct_path = row["structured_path"]
    case_id = row["case_id"]
    split = row["split"]
    cluster_id = row.get("cluster_id", None)
    cluster_label = row.get("cluster_label", None)
    
    print(f"\n====== case_id: {case_id} | split: {split} | cluster_id: {cluster_id} | cluster_label: {cluster_label} ======")
    print("Structured path:", struct_path)
    
    if not isinstance(struct_path, str) or not os.path.exists(struct_path):
        print("‚ö†Ô∏è structured file missing")
        return
    
    with open(struct_path, "r", encoding="utf-8", errors="ignore") as f:
        obj = json.load(f)
    
    sections = obj.get("sections", {})
    for sec in SECTION_SCHEMA:
        txt = (sections.get(sec, "") or "")[:max_chars]
        print(f"\n--- {sec} (first {max_chars} chars) ---")
        print(txt)

# Pick a few random rows
num_samples = 3
indices = random.sample(range(len(df_index_phase3)), k=num_samples)

for i in indices:
    show_structured_case(df_index_phase3.iloc[i], max_chars=400)


Structured path: /kaggle/working/structured/4800.json

--- FACTS (first 400 chars) ---
From the Judgment and Decree dated the 22nd December, 1969 of the Allahabad High Court in Writ Petition . No. 210 of 1967. S.T. Desai and H.S. Parihar for the Appellant. Harbans Lal, Miss A. Subhashini and V.B. Saharya for the Respondent. Mrs. Shobha Dikshit for Respondent. No. 3. The Judgment of the Court was delivered by 532 AMARENDRA NATH SEN, J. Rule 125A makes it clear that the rule 125A is n

--- ISSUES (first 400 chars) ---


--- ARGUMENTS (first 400 chars) ---


--- REASONING (first 400 chars) ---


--- FINAL_ORDER (first 400 chars) ---
The main contention of Mr. section T. However, for the purpose of deciding the question raised in this appeal it does not become necessary to go into any dispute with regard to the facts. Desai that the Mill was factually so closed, it does not become necessary for us to advert to the facts of this particular case. The real question is one of interpretation o

In [25]:
# ============================
# PHASE 4 ‚Äì STEP 1
# Load Phase 3 index + create chunked dir
# ============================

import os
import pandas as pd

# Path to Phase 3 index (output of Phase 3)
INDEX_PHASE3_PATH = "/kaggle/working/cases_index_indabs_1500_phase3.csv"

df_index = pd.read_csv(INDEX_PHASE3_PATH)
print("Index shape:", df_index.shape)
display(df_index.head())

# Directory for chunked outputs
CHUNKED_DIR = "/kaggle/working/chunked"
os.makedirs(CHUNKED_DIR, exist_ok=True)

print("Chunked files will be stored in:", CHUNKED_DIR)

Index shape: (1600, 9)


Unnamed: 0,case_id,split,corpus,judgment_path,summary_path,cluster_id,cluster_label,cleaned_path,structured_path
0,2502,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3,/kaggle/working/cleaned/2502.txt,/kaggle/working/structured/2502.json
1,4523,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,0,CL_0,/kaggle/working/cleaned/4523.txt,/kaggle/working/structured/4523.json
2,6135,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,2,CL_2,/kaggle/working/cleaned/6135.txt,/kaggle/working/structured/6135.json
3,3373,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,1,CL_1,/kaggle/working/cleaned/3373.txt,/kaggle/working/structured/3373.json
4,787,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3,/kaggle/working/cleaned/787.txt,/kaggle/working/structured/787.json


Chunked files will be stored in: /kaggle/working/chunked


In [26]:
# ============================
# PHASE 4 ‚Äì STEP 2
# Chunking config + LED tokenizer
# ============================

!pip install -q transformers

from transformers import AutoTokenizer

# Use the same tokenizer/model you plan for training
LED_MODEL_NAME = "allenai/led-base-16384"   # change if you use another LED

tokenizer = AutoTokenizer.from_pretrained(LED_MODEL_NAME)

# LED max length you plan to use for training
MAX_SOURCE_TOKENS = 4096   # your training max_source_length
# Keep a small buffer for special tokens -> chunk limit a bit smaller
MAX_CHUNK_TOKENS = 3800

print("Loaded tokenizer:", LED_MODEL_NAME)
print("MAX_SOURCE_TOKENS:", MAX_SOURCE_TOKENS)
print("MAX_CHUNK_TOKENS:", MAX_CHUNK_TOKENS)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Loaded tokenizer: allenai/led-base-16384
MAX_SOURCE_TOKENS: 4096
MAX_CHUNK_TOKENS: 3800


In [27]:
# ============================
# PHASE 4 ‚Äì STEP 3
# Flatten sections into a single text per case
# ============================

import json

# Canonical order of sections (same as earlier phases)
SECTION_SCHEMA = ["FACTS", "ISSUES", "ARGUMENTS", "REASONING", "FINAL_ORDER"]

def flatten_sections_from_structured(struct_path: str) -> str:
    """
    Load structured/<case_id>.json and return a single flat string:
    [SECTION=FACTS] ... [SECTION=ISSUES] ... etc.
    """
    with open(struct_path, "r", encoding="utf-8", errors="ignore") as f:
        obj = json.load(f)
    
    sections = obj.get("sections", {})
    
    parts = []
    for sec in SECTION_SCHEMA:
        sec_text = (sections.get(sec, "") or "").strip()
        if not sec_text:
            # still add the tag to keep structure explicit
            parts.append(f"[SECTION={sec}]")
        else:
            parts.append(f"[SECTION={sec}] {sec_text}")
    
    flat_text = " ".join(parts).strip()
    return flat_text

# Quick sanity test on first case
sample_struct_path = df_index.iloc[0]["structured_path"]
print("Sample structured path:", sample_struct_path)

flat_sample = flatten_sections_from_structured(sample_struct_path)
print("\n=== FLATTENED TEXT (first 800 chars) ===")
print(flat_sample[:800])

Sample structured path: /kaggle/working/structured/2502.json

=== FLATTENED TEXT (first 800 chars) ===
[SECTION=FACTS] ivil Appeal No. 1832 of 1967. [SECTION=ISSUES] [SECTION=ARGUMENTS] [SECTION=REASONING] [SECTION=FINAL_ORDER] 40 of 1967. S.V. Gupte, Rarneshwar Nath, Mahinder Narainand, Ravinder Nath for the appellant. Sarjoo Prasad and D.N. Misra for respondent No. 1. The Judgment of the Court was delivered by, Mitter, J. This is an appeal from a judgment of the Madhya Pradesh High Court by a returned candidate at an election to Madhya Pradesh Legislative Assembly from Ujjain North Constituency held in February 1967 declaring the election of the appellant void under section 98 of the Representation of the People Act (hereinafter referred to as the Act). There was no less than eight candidates at the said election, five of whom polled very few votes. The result of the election so far as th


In [28]:
# ============================
# PHASE 4 ‚Äì STEP 4
# Sentence-level splitting of the flattened text
# ============================

import re

def split_into_sentences(text: str):
    """
    Split flat text into rough sentences, preserving section tags like [SECTION=FACTS].
    Very simple regex-based splitter; you can refine later if needed.
    """
    if not text or not text.strip():
        return []
    
    # protect section tags by inserting a newline before them
    text = re.sub(r"(\[SECTION=[A-Z_]+\])", r"\n\1 ", text)
    
    # initial rough split by sentence enders (., ?, !) followed by space or end
    # but keep tags separate
    raw_sents = re.split(r'(?<=[.!?])\s+', text)
    
    # further split on newlines to ensure tags start their own "sentence"
    sentences = []
    for chunk in raw_sents:
        for part in chunk.split("\n"):
            part = part.strip()
            if part:
                sentences.append(part)
    
    return sentences

# Test on the same sample
sentences_sample = split_into_sentences(flat_sample)
print("Number of 'sentences' in sample:", len(sentences_sample))
print("\n=== First 10 sentences ===")
for s in sentences_sample[:10]:
    print("‚Ä¢", s[:200])

Number of 'sentences' in sample: 151

=== First 10 sentences ===
‚Ä¢ [SECTION=FACTS]  ivil Appeal No.
‚Ä¢ 1832 of 1967.
‚Ä¢ [SECTION=ISSUES]
‚Ä¢ [SECTION=ARGUMENTS]
‚Ä¢ [SECTION=REASONING]
‚Ä¢ [SECTION=FINAL_ORDER]  40 of 1967.
‚Ä¢ S.V.
‚Ä¢ Gupte, Rarneshwar Nath, Mahinder Narainand, Ravinder Nath for the appellant.
‚Ä¢ Sarjoo Prasad and D.N.
‚Ä¢ Misra for respondent No.


In [29]:
# ============================
# PHASE 4 ‚Äì STEPS 5 & 6
# Token-aware greedy chunking
# ============================

from typing import List

def sentences_to_led_chunks(sentences: List[str], max_chunk_tokens: int) -> List[str]:
    """
    Greedy sentence-based chunking:
    - Keep adding sentences to current chunk while token count <= max_chunk_tokens.
    - When adding a sentence would exceed the limit, close the chunk and start a new one.
    - If a single sentence itself exceeds max_chunk_tokens, force it as its own chunk.
    Returns: list of chunk_text strings.
    """
    chunks = []
    current_text = ""

    for sent in sentences:
        sent = sent.strip()
        if not sent:
            continue

        if current_text:
            candidate = current_text + " " + sent
        else:
            candidate = sent

        # Tokenize candidate to see length
        input_ids = tokenizer(
            candidate,
            add_special_tokens=False,
            truncation=False
        )["input_ids"]
        token_count = len(input_ids)

        if token_count <= max_chunk_tokens:
            # safe to add
            current_text = candidate
        else:
            # close current chunk if non-empty
            if current_text:
                chunks.append(current_text.strip())
                # start new chunk with this sentence alone
                single_ids = tokenizer(
                    sent,
                    add_special_tokens=False,
                    truncation=False
                )["input_ids"]
                if len(single_ids) <= max_chunk_tokens:
                    current_text = sent
                else:
                    # sentence itself too long -> force as own chunk
                    chunks.append(sent)
                    current_text = ""
            else:
                # current_text empty but sentence already too long -> own chunk
                chunks.append(sent)
                current_text = ""

    # close last chunk
    if current_text:
        chunks.append(current_text.strip())

    return chunks

# Quick sanity test on the sample sentences from earlier
sample_chunks = sentences_to_led_chunks(sentences_sample, MAX_CHUNK_TOKENS)
print("Sample case -> number of chunks:", len(sample_chunks))
for i, ch in enumerate(sample_chunks[:3]):
    print(f"\n=== Chunk {i} (first 400 chars) ===")
    print(ch[:400])

Sample case -> number of chunks: 2

=== Chunk 0 (first 400 chars) ===
[SECTION=FACTS]  ivil Appeal No. 1832 of 1967. [SECTION=ISSUES] [SECTION=ARGUMENTS] [SECTION=REASONING] [SECTION=FINAL_ORDER]  40 of 1967. S.V. Gupte, Rarneshwar Nath, Mahinder Narainand, Ravinder Nath for the appellant. Sarjoo Prasad and D.N. Misra for respondent No. 1. The Judgment of the Court was delivered by, Mitter, J. This is an appeal from a judgment of the Madhya Pradesh High Court by a r

=== Chunk 1 (first 400 chars) ===
The appointment was gazetted in October 1966 by a notification to the effect that the State Government was pleased to constitute the Tribunal as specified below for the purpose of section 73 of the Madhya Pradesh Improvement Trusts Act, 1960, for acquisition of land at Ujjain. President Shri M.C. Joshi, Advocate, Ujjain. Assessors. Shri Chand Narayan Rajdan, Retired Traffic Superintendent Agar Lig


In [30]:
# ============================
# PHASE 4 ‚Äì STEP 7
# Chunk all cases, save chunked/<case_id>.json
# ============================

import json
from tqdm import tqdm

chunked_paths = []
num_chunks_list = []

for idx, row in tqdm(df_index.iterrows(), total=len(df_index)):
    case_id = str(row["case_id"])
    struct_path = row["structured_path"]

    # Fallback if structured file missing
    if not isinstance(struct_path, str) or not os.path.exists(struct_path):
        print(f"[WARN] Missing structured file for case_id={case_id}: {struct_path}")
        chunked_paths.append(None)
        num_chunks_list.append(0)
        continue

    # 1) Flatten sections
    flat_text = flatten_sections_from_structured(struct_path)

    # 2) Sentence splitting
    sentences = split_into_sentences(flat_text)

    # 3) Create chunks (list of strings)
    chunk_texts = sentences_to_led_chunks(sentences, MAX_CHUNK_TOKENS)

    # If somehow no text/sentences, create a single empty chunk
    if not chunk_texts:
        chunk_texts = [""]

    # 4) Build chunk objects with metadata
    chunks = []
    for cid, chunk_text in enumerate(chunk_texts):
        chunk_obj = {
            "case_id": case_id,
            "chunk_id": cid,
            "split": row["split"],
            "corpus": row["corpus"],
            "cluster_id": int(row["cluster_id"]) if not pd.isna(row["cluster_id"]) else None,
            "cluster_label": row.get("cluster_label", None),
            "text": chunk_text,
        }
        chunks.append(chunk_obj)

    # 5) Save per-case chunk file
    out_path = os.path.join(CHUNKED_DIR, f"{case_id}.json")
    try:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(chunks, f, ensure_ascii=False, indent=2)
    except Exception as e:
        print(f"[WARN] Could not write chunked file {out_path} for case_id={case_id}: {e}")
        out_path = None

    chunked_paths.append(out_path)
    num_chunks_list.append(len(chunks))

# Attach to df_index
df_index["chunked_path"] = chunked_paths
df_index["num_chunks"] = num_chunks_list

print("Rows with missing chunked_path:", df_index["chunked_path"].isna().sum())
df_index.head()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1600/1600 [13:41<00:00,  1.95it/s]

Rows with missing chunked_path: 0





Unnamed: 0,case_id,split,corpus,judgment_path,summary_path,cluster_id,cluster_label,cleaned_path,structured_path,chunked_path,num_chunks
0,2502,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3,/kaggle/working/cleaned/2502.txt,/kaggle/working/structured/2502.json,/kaggle/working/chunked/2502.json,2
1,4523,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,0,CL_0,/kaggle/working/cleaned/4523.txt,/kaggle/working/structured/4523.json,/kaggle/working/chunked/4523.json,1
2,6135,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,2,CL_2,/kaggle/working/cleaned/6135.txt,/kaggle/working/structured/6135.json,/kaggle/working/chunked/6135.json,2
3,3373,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,1,CL_1,/kaggle/working/cleaned/3373.txt,/kaggle/working/structured/3373.json,/kaggle/working/chunked/3373.json,1
4,787,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3,/kaggle/working/cleaned/787.txt,/kaggle/working/structured/787.json,/kaggle/working/chunked/787.json,2


In [31]:
# ============================
# PHASE 4 ‚Äì STEP 7 (continued)
# Save Phase 4 index + basic stats
# ============================

# Sanity checks
missing_chunked = df_index["chunked_path"].isna().sum()
print("Rows with missing chunked_path:", missing_chunked)

print("\nnum_chunks stats:")
print("  min:", df_index["num_chunks"].min())
print("  max:", df_index["num_chunks"].max())
print("  mean:", df_index["num_chunks"].mean())

# Save Phase 4 index
INDEX_PHASE4_PATH = "/kaggle/working/cases_index_indabs_1500_phase4.csv"
df_index.to_csv(INDEX_PHASE4_PATH, index=False)
print("‚úÖ Saved Phase 4 index to:", INDEX_PHASE4_PATH)

# Optional: peek at one chunked JSON
sample_chunk_path = df_index.iloc[0]["chunked_path"]
print("\nSample chunked path:", sample_chunk_path)

with open(sample_chunk_path, "r", encoding="utf-8", errors="ignore") as f:
    sample_chunks = json.load(f)

print("Number of chunks in sample case:", len(sample_chunks))
for ch in sample_chunks[:2]:
    print("\n--- chunk_id:", ch["chunk_id"], "| split:", ch["split"], "| cluster:", ch["cluster_id"], ch["cluster_label"])
    print(ch["text"][:400])

Rows with missing chunked_path: 0

num_chunks stats:
  min: 1
  max: 24
  mean: 1.638125
‚úÖ Saved Phase 4 index to: /kaggle/working/cases_index_indabs_1500_phase4.csv

Sample chunked path: /kaggle/working/chunked/2502.json
Number of chunks in sample case: 2

--- chunk_id: 0 | split: train | cluster: 3 CL_3
[SECTION=FACTS]  ivil Appeal No. 1832 of 1967. [SECTION=ISSUES] [SECTION=ARGUMENTS] [SECTION=REASONING] [SECTION=FINAL_ORDER]  40 of 1967. S.V. Gupte, Rarneshwar Nath, Mahinder Narainand, Ravinder Nath for the appellant. Sarjoo Prasad and D.N. Misra for respondent No. 1. The Judgment of the Court was delivered by, Mitter, J. This is an appeal from a judgment of the Madhya Pradesh High Court by a r

--- chunk_id: 1 | split: train | cluster: 3 CL_3
The appointment was gazetted in October 1966 by a notification to the effect that the State Government was pleased to constitute the Tribunal as specified below for the purpose of section 73 of the Madhya Pradesh Improvement Trusts Act, 196

In [32]:
# ============================
# PHASE 4 ‚Äì STEP 8
# Quality checks for chunking
# ============================

import os
import json
import random
import numpy as np
import pandas as pd

# Reload Phase 4 index (just to be safe)
INDEX_PHASE4_PATH = "/kaggle/working/cases_index_indabs_1500_phase4.csv"
df_phase4 = pd.read_csv(INDEX_PHASE4_PATH)

print("Phase 4 index shape:", df_phase4.shape)
print(df_phase4[["case_id", "split", "num_chunks"]].head())

# 1) Inspect a few random cases and their chunks
def inspect_chunked_case(row, max_chars=350):
    chunk_path = row["chunked_path"]
    case_id = row["case_id"]
    split = row["split"]
    
    print(f"\n====== case_id: {case_id} | split: {split} | chunked_path: {chunk_path} ======")
    if not isinstance(chunk_path, str) or not os.path.exists(chunk_path):
        print("‚ö†Ô∏è chunked file missing")
        return
    
    with open(chunk_path, "r", encoding="utf-8", errors="ignore") as f:
        chunks = json.load(f)
    
    print("Total chunks:", len(chunks))
    for ch in chunks[:3]:  # show first 3 chunks
        text = ch["text"]
        print(f"\n--- chunk_id: {ch['chunk_id']} ---")
        print(text[:max_chars])

# Pick 3 random cases
num_samples = 3
indices = random.sample(range(len(df_phase4)), k=min(num_samples, len(df_phase4)))
for i in indices:
    inspect_chunked_case(df_phase4.iloc[i], max_chars=350)


# 2) Token length stats per chunk (global)
all_chunk_lengths = []

for idx, row in df_phase4.iterrows():
    chunk_path = row["chunked_path"]
    if not isinstance(chunk_path, str) or not os.path.exists(chunk_path):
        continue
    
    with open(chunk_path, "r", encoding="utf-8", errors="ignore") as f:
        chunks = json.load(f)
    
    for ch in chunks:
        text = ch["text"]
        # tokenize without special tokens
        token_ids = tokenizer(
            text,
            add_special_tokens=False,
            truncation=False
        )["input_ids"]
        all_chunk_lengths.append(len(token_ids))

all_chunk_lengths = np.array(all_chunk_lengths)
print("\n=== Token length stats per chunk ===")
print("  #chunks:", len(all_chunk_lengths))
print("  min   :", int(all_chunk_lengths.min()) if len(all_chunk_lengths) else None)
print("  max   :", int(all_chunk_lengths.max()) if len(all_chunk_lengths) else None)
print("  mean  :", float(all_chunk_lengths.mean()) if len(all_chunk_lengths) else None)
print("  95th percentile:", float(np.percentile(all_chunk_lengths, 95)) if len(all_chunk_lengths) else None)

# Check if any chunk exceeds MAX_CHUNK_TOKENS
over_limit = (all_chunk_lengths > MAX_CHUNK_TOKENS).sum()
print(f"\nChunks over MAX_CHUNK_TOKENS ({MAX_CHUNK_TOKENS}):", int(over_limit))

Phase 4 index shape: (1600, 11)
   case_id  split  num_chunks
0     2502  train           2
1     4523  train           1
2     6135  train           2
3     3373  train           1
4      787  train           2

Total chunks: 1

--- chunk_id: 0 ---
[SECTION=FACTS] [SECTION=ISSUES] [SECTION=ARGUMENTS] [SECTION=REASONING] [SECTION=FINAL_ORDER]  No. 2087/68. M. C. Bhandare, C. K. Sucharita and M. N. Shroff for the Appellant. Nemo for the Respondent. The Judgment of the Court was delivered by KOSHAL J. (5) of section 17 of the Maharashtra Medical Practitioners Act, 1961 (hereinafter referred to a

Total chunks: 1

--- chunk_id: 0 ---
[SECTION=FACTS]  Appeals Nos. 86 to 97 of 1962. [SECTION=ISSUES] [SECTION=ARGUMENTS] [SECTION=REASONING]  But the High Court held that the plea about the vires of section 2(12) and the Explanation thereto raised a substantial question as to the interpretation of the Constitution, and accordingly granted certificates of fitness under article 132 of 

Total chu

In [33]:
# ============================
# PHASE 5 ‚Äì STEP 1
# Load Phase 4 index + create skeleton dir
# ============================

import os
import pandas as pd

# Path to Phase 4 index (output of Phase 4)
INDEX_PHASE4_PATH = "/kaggle/working/cases_index_indabs_1500_phase4.csv"

df_index = pd.read_csv(INDEX_PHASE4_PATH)
print("Phase 4 index shape:", df_index.shape)
display(df_index.head())

# Directory where we will store per-case skeleton JSON files
SKELETON_DIR = "/kaggle/working/skeleton"
os.makedirs(SKELETON_DIR, exist_ok=True)

print("Skeleton files will be stored in:", SKELETON_DIR)

Phase 4 index shape: (1600, 11)


Unnamed: 0,case_id,split,corpus,judgment_path,summary_path,cluster_id,cluster_label,cleaned_path,structured_path,chunked_path,num_chunks
0,2502,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3,/kaggle/working/cleaned/2502.txt,/kaggle/working/structured/2502.json,/kaggle/working/chunked/2502.json,2
1,4523,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,0,CL_0,/kaggle/working/cleaned/4523.txt,/kaggle/working/structured/4523.json,/kaggle/working/chunked/4523.json,1
2,6135,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,2,CL_2,/kaggle/working/cleaned/6135.txt,/kaggle/working/structured/6135.json,/kaggle/working/chunked/6135.json,2
3,3373,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,1,CL_1,/kaggle/working/cleaned/3373.txt,/kaggle/working/structured/3373.json,/kaggle/working/chunked/3373.json,1
4,787,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3,/kaggle/working/cleaned/787.txt,/kaggle/working/structured/787.json,/kaggle/working/chunked/787.json,2


Skeleton files will be stored in: /kaggle/working/skeleton


In [34]:
# ============================
# PHASE 5 ‚Äì STEP 2
# Sentence splitter for judgments and summaries
# ============================

import re

def split_into_sentences_plain(text: str, min_len: int = 5):
    """
    Split plain text (judgment or summary) into simple sentences.
    - Split on ., ?, ! followed by whitespace or end.
    - Clean up whitespace.
    - Drop very short fragments (length < min_len chars).
    """
    if not text or not isinstance(text, str):
        return []
    
    # Normalize newlines to spaces
    text = text.replace("\r", " ").replace("\n", " ")
    # Collapse multiple spaces
    text = re.sub(r"\s+", " ", text).strip()
    
    if not text:
        return []
    
    # Split on sentence enders
    raw_sents = re.split(r'(?<=[.!?])\s+', text)
    
    sentences = []
    for s in raw_sents:
        s = s.strip()
        if len(s) >= min_len:
            sentences.append(s)
    
    return sentences

# Quick sanity test on one case
sample_row = df_index.iloc[0]

# Judgment: use cleaned_path
with open(sample_row["cleaned_path"], "r", encoding="utf-8", errors="ignore") as f:
    sample_judg = f.read()

# Summary: use summary_path
with open(sample_row["summary_path"], "r", encoding="utf-8", errors="ignore") as f:
    sample_sum = f.read()

src_sents = split_into_sentences_plain(sample_judg)
tgt_sents = split_into_sentences_plain(sample_sum)

print("Sample case_id:", sample_row["case_id"])
print("Judgment sentences:", len(src_sents))
print("Summary sentences:", len(tgt_sents))

print("\n--- First 3 judgment sentences ---")
for s in src_sents[:3]:
    print("‚Ä¢", s[:200])

print("\n--- Summary sentences ---")
for s in tgt_sents:
    print("‚Ä¢", s[:200])

Sample case_id: 2502
Judgment sentences: 151
Summary sentences: 30

--- First 3 judgment sentences ---
‚Ä¢ ivil Appeal No.
‚Ä¢ 1832 of 1967.
‚Ä¢ Appeal under section 116 A of the Representation of the People Act, 1951 from the judgment and order dated October 31, 1967 of the Madhya Pradesh High Court, Indore Bench in Election Petition No.

--- Summary sentences ---
‚Ä¢ The appellant 's election to the M.P.
‚Ä¢ Legislative Assembly in February 1957 was challenged by an election petition mainly on the allegation ,that he was disqualified from being a candidate as he held certain offices of profit under the Go
‚Ä¢ The trial Judge allowed the election petition holding that the appellant held an office, of profit under the Government being on the panel of lawyers prepared by the Central 'and Western Railway Admin
‚Ä¢ 250 per month; it was also held that on the material before the court it could not be said that the appellant held the post of the President Member of a Tribunal constituted un

In [35]:
# ============================
# PHASE 5 ‚Äì STEP 3
# TF-IDF + cosine similarity helpers (per case)
# ============================

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def build_tfidf_for_case(src_sentences, summary_sentences, max_features=8000):
    """
    Build TF-IDF vectors for a single case.
    - src_sentences: list of judgment sentences
    - summary_sentences: list of gold summary sentences
    Returns:
      X_src: TF-IDF matrix for source sentences (shape [M, D])
      X_sum: TF-IDF matrix for summary sentences (shape [N, D])
      vectorizer: fitted TfidfVectorizer (for debugging if needed)
    """
    # Combined corpus for fitting
    corpus = src_sentences + summary_sentences
    if len(corpus) == 0:
        return None, None, None
    
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),     # unigrams + bigrams
        min_df=1,
        max_features=max_features
    )
    X = vectorizer.fit_transform(corpus)  # shape: [M+N, D]
    
    M = len(src_sentences)
    N = len(summary_sentences)
    
    X_src = X[:M]         # first M rows
    X_sum = X[M:M+N]      # next N rows
    return X_src, X_sum, vectorizer


def compute_src_summary_sim(X_src, X_sum):
    """
    Compute cosine similarity between each summary sentence and all src sentences.
    Returns:
      sim_matrix: shape [N, M]
        sim_matrix[j, i] = similarity between summary j and src i
    """
    if X_src is None or X_sum is None:
        return None
    # cosine_similarity: (A, B) -> [A_rows, B_rows]
    sim_matrix = cosine_similarity(X_sum, X_src)  # [N, M]
    return sim_matrix

# Quick test on sample_case to verify shapes
print("\n=== TF-IDF test on sample case ===")
X_src, X_sum, vec = build_tfidf_for_case(src_sents, tgt_sents, max_features=5000)

if X_src is not None and X_sum is not None:
    print("X_src shape:", X_src.shape)   # [M, D]
    print("X_sum shape:", X_sum.shape)   # [N, D]
    
    sim_mat = compute_src_summary_sim(X_src, X_sum)
    print("sim_matrix shape:", sim_mat.shape)  # [N, M]
    
    # Show top-3 aligned src sentences for each summary sentence
    for j in range(sim_mat.shape[0]):
        sims = sim_mat[j]
        top3_idx = np.argsort(-sims)[:3]
        print(f"\nSummary sentence {j}: {tgt_sents[j][:200]}")
        print("Top-3 aligned judgment sentences:")
        for idx_src in top3_idx:
            print(f"  [src {idx_src}] (sim={sims[idx_src]:.3f})", src_sents[idx_src][:200])
else:
    print("TF-IDF could not be built for this sample (empty sentences?).")


=== TF-IDF test on sample case ===
X_src shape: (151, 3307)
X_sum shape: (30, 3307)
sim_matrix shape: (30, 151)

Summary sentence 0: The appellant 's election to the M.P.
Top-3 aligned judgment sentences:
  [src 112] (sim=0.188) The occasion for writing this letter is not quite clear unless it related to his election to the Legislative Assembly because by the last sentence of the letter the writer was asking the Principal as 
  [src 8] (sim=0.180) This is an appeal from a judgment of the Madhya Pradesh High Court by a returned candidate at an election to Madhya Pradesh Legislative Assembly from Ujjain North Constituency held in February 1967 de
  [src 117] (sim=0.146) It was during the hearing of the election petition that a letter dated 9th October 1967 came to be written by the Under Secretary to the Government of Madhya Pradesh,Education Department to the Regist

Summary sentence 1: Legislative Assembly in February 1957 was challenged by an election petition mainly on the allegatio

In [36]:
# ============================
# PHASE 5 ‚Äì STEPS 4‚Äì7
# Build TF-IDF-based skeletons for all cases
# ============================

import os
import json
import numpy as np
from tqdm import tqdm

# Hyperparameters for skeleton extraction
TOP_K_PER_SUMMARY = 3           # how many src sentences per summary sentence
MIN_SIM_THRESHOLD = 0.10        # drop matches with similarity below this
MAX_SKELETON_SENTENCES = 60     # hard cap per case

skeleton_paths = []
num_skeleton_sentences_list = []

for idx, row in tqdm(df_index.iterrows(), total=len(df_index)):
    case_id = str(row["case_id"])
    split = row["split"]
    corpus = row["corpus"]
    cluster_id = int(row["cluster_id"]) if not pd.isna(row["cluster_id"]) else None
    cluster_label = row.get("cluster_label", None)

    cleaned_path = row["cleaned_path"]
    summary_path = row["summary_path"]

    # --- 1) Read judgment & summary text ---
    try:
        with open(cleaned_path, "r", encoding="utf-8", errors="ignore") as f:
            judg_text = f.read()
    except Exception as e:
        print(f"[WARN] Could not read cleaned judgment {cleaned_path} for case_id={case_id}: {e}")
        judg_text = ""

    try:
        with open(summary_path, "r", encoding="utf-8", errors="ignore") as f:
            sum_text = f.read()
    except Exception as e:
        print(f"[WARN] Could not read summary {summary_path} for case_id={case_id}: {e}")
        sum_text = ""

    # --- 2) Split into sentences ---
    src_sents = split_into_sentences_plain(judg_text)
    tgt_sents = split_into_sentences_plain(sum_text)

    # Handle degenerate cases: no sentences
    if len(src_sents) == 0 or len(tgt_sents) == 0:
        skeleton_indices = []
        skeleton_scores = []
    else:
        # --- 3) Build TF-IDF for this case ---
        X_src, X_sum, vec = build_tfidf_for_case(src_sents, tgt_sents, max_features=8000)

        if X_src is None or X_sum is None:
            skeleton_indices = []
            skeleton_scores = []
        else:
            # --- 4) Similarity matrix: [N_summary, M_src] ---
            sim_matrix = compute_src_summary_sim(X_src, X_sum)  # shape [N, M]

            # --- 5) For each summary sentence, pick top-K src sentences ---
            candidate_indices = set()
            N, M = sim_matrix.shape

            for j in range(N):
                sims = sim_matrix[j]  # length M
                # top-K indices by similarity (descending)
                topk_idx = np.argsort(-sims)[:TOP_K_PER_SUMMARY]
                for i_src in topk_idx:
                    if sims[i_src] >= MIN_SIM_THRESHOLD:
                        candidate_indices.add(int(i_src))

            candidate_indices = list(candidate_indices)

            # If still empty (all sims below threshold), fallback to global max per src
            if not candidate_indices:
                max_sims_per_src = sim_matrix.max(axis=0)  # [M]
                # pick top MAX_SKELETON_SENTENCES or all if fewer
                top_all_idx = np.argsort(-max_sims_per_src)[:min(MAX_SKELETON_SENTENCES, M)]
                candidate_indices = list(top_all_idx)

            # --- 6) Rank candidates by importance (max similarity across summaries) ---
            max_sims_per_src = sim_matrix.max(axis=0)  # [M]
            # Only for candidate indices
            cand_with_score = [(i, float(max_sims_per_src[i])) for i in candidate_indices]

            # Sort by score descending, then by index ascending
            cand_with_score.sort(key=lambda x: (-x[1], x[0]))

            # Apply hard cap on number of skeleton sentences
            cand_with_score = cand_with_score[:min(len(cand_with_score), MAX_SKELETON_SENTENCES)]

            # Final indices: sort ascending to keep original order in document
            skeleton_indices = sorted([i for i, s in cand_with_score])
            skeleton_scores = [max_sims_per_src[i] for i in skeleton_indices]

    num_skeleton = len(skeleton_indices)

    # --- 7) Build skeleton object ---
    skeleton_obj = {
        "case_id": case_id,
        "split": split,
        "corpus": corpus,
        "cluster_id": cluster_id,
        "cluster_label": cluster_label,
        "k_per_summary": TOP_K_PER_SUMMARY,
        "min_sim_threshold": MIN_SIM_THRESHOLD,
        "max_skeleton_sentences": MAX_SKELETON_SENTENCES,
        "num_src_sentences": len(src_sents),
        "num_summary_sentences": len(tgt_sents),
        "skeleton_indices": skeleton_indices,
        "skeleton_scores": [float(s) for s in skeleton_scores],
        # optional: store actual skeleton sentences for debugging / paper examples
        "skeleton_sentences": [src_sents[i] for i in skeleton_indices] if skeleton_indices else [],
    }

    # Output path
    out_path = os.path.join(SKELETON_DIR, f"{case_id}.json")

    try:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(skeleton_obj, f, ensure_ascii=False, indent=2)
    except Exception as e:
        print(f"[WARN] Could not write skeleton file {out_path} for case_id={case_id}: {e}")
        out_path = None

    skeleton_paths.append(out_path)
    num_skeleton_sentences_list.append(num_skeleton)

# Attach to df_index
df_index["skeleton_path"] = skeleton_paths
df_index["num_skeleton_sentences"] = num_skeleton_sentences_list

print("Rows with missing skeleton_path:", df_index["skeleton_path"].isna().sum())
print(df_index[["case_id", "num_skeleton_sentences"]].head())

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1600/1600 [00:33<00:00, 47.44it/s]

Rows with missing skeleton_path: 0
   case_id  num_skeleton_sentences
0     2502                      51
1     4523                      51
2     6135                      60
3     3373                      22
4      787                      13





In [37]:
# ============================
# PHASE 5 ‚Äì Save index + stats
# ============================

# Basic stats
print("\nnum_skeleton_sentences stats:")
print("  min :", df_index["num_skeleton_sentences"].min())
print("  max :", df_index["num_skeleton_sentences"].max())
print("  mean:", df_index["num_skeleton_sentences"].mean())

# Save Phase 5 index
INDEX_PHASE5_PATH = "/kaggle/working/cases_index_indabs_1500_phase5.csv"
df_index.to_csv(INDEX_PHASE5_PATH, index=False)
print("‚úÖ Saved Phase 5 index to:", INDEX_PHASE5_PATH)

# Peek at one skeleton JSON
sample_row = df_index.iloc[0]
print("\nSample skeleton path:", sample_row["skeleton_path"])

with open(sample_row["skeleton_path"], "r", encoding="utf-8", errors="ignore") as f:
    sample_skel = json.load(f)

print("Sample skeleton case_id:", sample_skel["case_id"])
print("num_src_sentences:", sample_skel["num_src_sentences"])
print("num_summary_sentences:", sample_skel["num_summary_sentences"])
print("num_skeleton_sentences:", len(sample_skel["skeleton_indices"]))

print("\n--- First few skeleton sentences ---")
for s in sample_skel["skeleton_sentences"][:5]:
    print("‚Ä¢", s[:200])


num_skeleton_sentences stats:
  min : 4
  max : 60
  mean: 40.829375
‚úÖ Saved Phase 5 index to: /kaggle/working/cases_index_indabs_1500_phase5.csv

Sample skeleton path: /kaggle/working/skeleton/2502.json
Sample skeleton case_id: 2502
num_src_sentences: 151
num_summary_sentences: 30
num_skeleton_sentences: 51

--- First few skeleton sentences ---
‚Ä¢ This is an appeal from a judgment of the Madhya Pradesh High Court by a returned candidate at an election to Madhya Pradesh Legislative Assembly from Ujjain North Constituency held in February 1967 de
‚Ä¢ The election petition was filed by the husband of the 2nd respondent, Mrs.
‚Ä¢ The first alleged disqualification is based on a letter of appointment dated February 6, 1962 addressed by the Chief Commercial Superintendent to the appellant who accepted the conditions and terms of
‚Ä¢ The letter of the Commercial Superintendent ' shows that the appellant 's name.
‚Ä¢ Clause (13) of the terms is really the most important one for our presen

In [38]:
# ============================
# PHASE 5 ‚Äì STEP 8
# Skeleton quality checks & manual inspection
# ============================

import os
import json
import random
import numpy as np
import pandas as pd

# Reload Phase 5 index (just in case)
INDEX_PHASE5_PATH = "/kaggle/working/cases_index_indabs_1500_phase5.csv"
df_phase5 = pd.read_csv(INDEX_PHASE5_PATH)

print("Phase 5 index shape:", df_phase5.shape)
print(df_phase5[["case_id", "split", "num_skeleton_sentences"]].head())

# 1) Global stats on skeleton size
num_skel = df_phase5["num_skeleton_sentences"].fillna(0).values
num_skel = np.array(num_skel, dtype=int)

print("\n=== Skeleton size stats (num_skeleton_sentences) ===")
print("  #cases:", len(num_skel))
print("  min   :", int(num_skel.min()) if len(num_skel) else None)
print("  max   :", int(num_skel.max()) if len(num_skel) else None)
print("  mean  :", float(num_skel.mean()) if len(num_skel) else None)
print("  25%   :", float(np.percentile(num_skel, 25)) if len(num_skel) else None)
print("  50%   :", float(np.percentile(num_skel, 50)) if len(num_skel) else None)
print("  75%   :", float(np.percentile(num_skel, 75)) if len(num_skel) else None)

print("\n#cases with empty skeleton (0 sentences):", int((num_skel == 0).sum()))


# 2) Manual inspection helper
def inspect_skeleton_case(row, max_summary_chars=300, max_skeleton_sentences=8):
    case_id = row["case_id"]
    split = row["split"]
    cleaned_path = row["cleaned_path"]
    summary_path = row["summary_path"]
    skeleton_path = row["skeleton_path"]

    print(f"\n====== case_id: {case_id} | split: {split} ======")
    print("cleaned_path :", cleaned_path)
    print("summary_path :", summary_path)
    print("skeleton_path:", skeleton_path)

    if not isinstance(skeleton_path, str) or not os.path.exists(skeleton_path):
        print("‚ö†Ô∏è skeleton file missing")
        return

    # Load skeleton object
    with open(skeleton_path, "r", encoding="utf-8", errors="ignore") as f:
        skel = json.load(f)

    skel_indices = skel.get("skeleton_indices", [])
    skel_scores = skel.get("skeleton_scores", [])
    skel_sentences = skel.get("skeleton_sentences", [])

    print("\nnum_src_sentences       :", skel.get("num_src_sentences"))
    print("num_summary_sentences   :", skel.get("num_summary_sentences"))
    print("num_skeleton_sentences  :", len(skel_indices))

    # Load and split summary text for context
    try:
        with open(summary_path, "r", encoding="utf-8", errors="ignore") as f:
            sum_text = f.read()
    except Exception as e:
        print(f"[WARN] Could not read summary for case_id={case_id}: {e}")
        sum_text = ""

    summary_sents = split_into_sentences_plain(sum_text)

    print("\n--- GOLD SUMMARY SENTENCES ---")
    if not summary_sents:
        print("  (no summary sentences found)")
    else:
        for j, s in enumerate(summary_sents):
            print(f"  [{j}] {s[:max_summary_chars]}")

    print("\n--- SKELETON SENTENCES (aligned judgment sentences) ---")
    if not skel_sentences:
        print("  (no skeleton sentences stored)")
    else:
        for idx, (s_sent, score) in enumerate(zip(skel_sentences[:max_skeleton_sentences], skel_scores[:max_skeleton_sentences])):
            print(f"  [skeleton {idx}] (score={score:.3f}) {s_sent[:400]}")

    if len(skel_sentences) > max_skeleton_sentences:
        print(f"\n  ... ({len(skel_sentences) - max_skeleton_sentences} more skeleton sentences not shown)")


# 3) Inspect a few random cases
num_samples = 3
indices = random.sample(range(len(df_phase5)), k=min(num_samples, len(df_phase5)))

for i in indices:
    inspect_skeleton_case(df_phase5.iloc[i], max_summary_chars=300, max_skeleton_sentences=8)

Phase 5 index shape: (1600, 13)
   case_id  split  num_skeleton_sentences
0     2502  train                      51
1     4523  train                      51
2     6135  train                      60
3     3373  train                      22
4      787  train                      13

=== Skeleton size stats (num_skeleton_sentences) ===
  #cases: 1600
  min   : 4
  max   : 60
  mean  : 40.829375
  25%   : 28.0
  50%   : 40.0
  75%   : 60.0

#cases with empty skeleton (0 sentences): 0

cleaned_path : /kaggle/working/cleaned/6849.txt
summary_path : /kaggle/input/legal-datav2/dataset/IN-Abs/train-data/summary/6849.txt
skeleton_path: /kaggle/working/skeleton/6849.json

num_src_sentences       : 76
num_summary_sentences   : 37
num_skeleton_sentences  : 46

--- GOLD SUMMARY SENTENCES ---
  [0] The appellant plaintiff rented out to the respondent defendant the suit premises allotted to him, by sub lease, by a Housing Co operative Society, which itself held the flat under a 99 years lease grant

In [39]:
# ============================
# PHASE 6 ‚Äì STEP 1
# Load Phase 5 index + create final splits (train/val/test)
# ============================

import pandas as pd

INDEX_PHASE5_PATH = "/kaggle/working/cases_index_indabs_1500_phase5.csv"
df_index = pd.read_csv(INDEX_PHASE5_PATH)

print("Phase 5 index shape:", df_index.shape)
display(df_index.head())

# Current 'split' = only "train" / "test" from original dataset
print("\nOriginal split counts:")
print(df_index["split"].value_counts())

# Create validation split from TRAIN (e.g., 10% of train)
df_train_raw = df_index[df_index["split"] == "train"].copy()
df_test = df_index[df_index["split"] == "test"].copy()

VAL_FRAC = 0.10
df_val = df_train_raw.sample(frac=VAL_FRAC, random_state=42)
df_train = df_train_raw.drop(df_val.index)

print("\nAfter train/val split:")
print("train:", len(df_train))
print("val  :", len(df_val))
print("test :", len(df_test))

# Add a new 'final_split' column
df_index["final_split"] = None
df_index.loc[df_train.index, "final_split"] = "train"
df_index.loc[df_val.index,   "final_split"] = "val"
df_index.loc[df_test.index,  "final_split"] = "test"

print("\nFinal split counts:")
print(df_index["final_split"].value_counts())

display(df_index.head())

Phase 5 index shape: (1600, 13)


Unnamed: 0,case_id,split,corpus,judgment_path,summary_path,cluster_id,cluster_label,cleaned_path,structured_path,chunked_path,num_chunks,skeleton_path,num_skeleton_sentences
0,2502,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3,/kaggle/working/cleaned/2502.txt,/kaggle/working/structured/2502.json,/kaggle/working/chunked/2502.json,2,/kaggle/working/skeleton/2502.json,51
1,4523,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,0,CL_0,/kaggle/working/cleaned/4523.txt,/kaggle/working/structured/4523.json,/kaggle/working/chunked/4523.json,1,/kaggle/working/skeleton/4523.json,51
2,6135,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,2,CL_2,/kaggle/working/cleaned/6135.txt,/kaggle/working/structured/6135.json,/kaggle/working/chunked/6135.json,2,/kaggle/working/skeleton/6135.json,60
3,3373,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,1,CL_1,/kaggle/working/cleaned/3373.txt,/kaggle/working/structured/3373.json,/kaggle/working/chunked/3373.json,1,/kaggle/working/skeleton/3373.json,22
4,787,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3,/kaggle/working/cleaned/787.txt,/kaggle/working/structured/787.json,/kaggle/working/chunked/787.json,2,/kaggle/working/skeleton/787.json,13



Original split counts:
split
train    1500
test      100
Name: count, dtype: int64

After train/val split:
train: 1350
val  : 150
test : 100

Final split counts:
final_split
train    1350
val       150
test      100
Name: count, dtype: int64


Unnamed: 0,case_id,split,corpus,judgment_path,summary_path,cluster_id,cluster_label,cleaned_path,structured_path,chunked_path,num_chunks,skeleton_path,num_skeleton_sentences,final_split
0,2502,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3,/kaggle/working/cleaned/2502.txt,/kaggle/working/structured/2502.json,/kaggle/working/chunked/2502.json,2,/kaggle/working/skeleton/2502.json,51,train
1,4523,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,0,CL_0,/kaggle/working/cleaned/4523.txt,/kaggle/working/structured/4523.json,/kaggle/working/chunked/4523.json,1,/kaggle/working/skeleton/4523.json,51,train
2,6135,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,2,CL_2,/kaggle/working/cleaned/6135.txt,/kaggle/working/structured/6135.json,/kaggle/working/chunked/6135.json,2,/kaggle/working/skeleton/6135.json,60,train
3,3373,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,1,CL_1,/kaggle/working/cleaned/3373.txt,/kaggle/working/structured/3373.json,/kaggle/working/chunked/3373.json,1,/kaggle/working/skeleton/3373.json,22,train
4,787,train,IN-Abs,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,/kaggle/input/legal-datav2/dataset/IN-Abs/trai...,3,CL_3,/kaggle/working/cleaned/787.txt,/kaggle/working/structured/787.json,/kaggle/working/chunked/787.json,2,/kaggle/working/skeleton/787.json,13,train


In [40]:
# ============================
# PHASE 6 ‚Äì STEP 2
# Input format helpers
# ============================

import os
import json

# Canonical section order (same as Phase 3 & 4)
SECTION_SCHEMA = ["FACTS", "ISSUES", "ARGUMENTS", "REASONING", "FINAL_ORDER"]

def flatten_sections_from_structured(struct_path: str) -> str:
    """
    Load structured/<case_id>.json and return a single flat string:
    [SECTION=FACTS] ... [SECTION=ISSUES] ... etc.
    """
    with open(struct_path, "r", encoding="utf-8", errors="ignore") as f:
        obj = json.load(f)
    
    sections = obj.get("sections", {})
    
    parts = []
    for sec in SECTION_SCHEMA:
        sec_text = (sections.get(sec, "") or "").strip()
        if not sec_text:
            parts.append(f"[SECTION={sec}]")
        else:
            parts.append(f"[SECTION={sec}] {sec_text}")
    
    flat_text = " ".join(parts).strip()
    return flat_text


def build_cluster_token(cluster_id):
    """
    Return a cluster token like <CL_2> or <CL_NONE> if cluster_id is missing.
    """
    if cluster_id is None or (isinstance(cluster_id, float) and pd.isna(cluster_id)):
        return "<CL_NONE>"
    try:
        cid = int(cluster_id)
    except Exception:
        return "<CL_NONE>"
    return f"<CL_{cid}>"


def build_skeleton_block(skeleton_sentences, add_hl_tags: bool = True):
    """
    Build the [SKELETON] block text from a list of sentences.
    If list is empty -> just return "[SKELETON]".
    If add_hl_tags=True -> wrap each sentence in <HL> ... </HL>.
    """
    if not skeleton_sentences:
        return "[SKELETON]"
    
    if add_hl_tags:
        wrapped = [f"<HL> {s} </HL>" for s in skeleton_sentences]
    else:
        wrapped = skeleton_sentences
    
    return "[SKELETON] " + " ".join(wrapped)


def build_doc_block(structured_path: str):
    """
    Build the [DOC] block using flattened sections.
    """
    flat_text = flatten_sections_from_structured(structured_path)
    return "[DOC] " + flat_text

In [41]:
# ============================
# PHASE 6 ‚Äì STEP 3
# Load structured + skeleton + summary for a single case
# ============================

# Pick a sample row (you can change the index later)
sample_row = df_index.iloc[0]

case_id = sample_row["case_id"]
final_split = sample_row["final_split"]
structured_path = sample_row["structured_path"]
skeleton_path = sample_row["skeleton_path"]
summary_path = sample_row["summary_path"]
cluster_id = sample_row["cluster_id"]
cluster_label = sample_row.get("cluster_label", None)

print("Sample case_id:", case_id)
print("final_split    :", final_split)
print("structured_path:", structured_path)
print("skeleton_path  :", skeleton_path)
print("summary_path   :", summary_path)
print("cluster_id     :", cluster_id, "| cluster_label:", cluster_label)

# 1) Load structured JSON (for doc block)
with open(structured_path, "r", encoding="utf-8", errors="ignore") as f:
    sample_struct = json.load(f)

# 2) Load skeleton JSON (for skeleton block)
with open(skeleton_path, "r", encoding="utf-8", errors="ignore") as f:
    sample_skeleton = json.load(f)

skeleton_sentences = sample_skeleton.get("skeleton_sentences", []) or []

# 3) Load gold summary (target)
with open(summary_path, "r", encoding="utf-8", errors="ignore") as f:
    target_text = f.read().strip()

print("\n#skeleton_sentences:", len(skeleton_sentences))
print("target_text length:", len(target_text))

Sample case_id: 2502
final_split    : train
structured_path: /kaggle/working/structured/2502.json
skeleton_path  : /kaggle/working/skeleton/2502.json
summary_path   : /kaggle/input/legal-datav2/dataset/IN-Abs/train-data/summary/2502.txt
cluster_id     : 3 | cluster_label: CL_3

#skeleton_sentences: 51
target_text length: 4354


In [42]:
# ============================
# PHASE 6 ‚Äì STEP 4
# Assemble input_text + target_text for a single case
# ============================

# Cluster token
cluster_token = build_cluster_token(cluster_id)

# Skeleton block
skeleton_block = build_skeleton_block(skeleton_sentences, add_hl_tags=True)

# Doc block ([DOC] + flattened sections)
doc_block = build_doc_block(structured_path)

# Final input_text
input_text = f"{cluster_token} {skeleton_block} {doc_block}"

print("=== INPUT_TEXT (first 800 chars) ===")
print(input_text[:800])

print("\n=== TARGET_TEXT (first 400 chars) ===")
print(target_text[:400])

print("\nInput length (chars):", len(input_text))
print("Target length (chars):", len(target_text))

=== INPUT_TEXT (first 800 chars) ===
<CL_3> [SKELETON] <HL> This is an appeal from a judgment of the Madhya Pradesh High Court by a returned candidate at an election to Madhya Pradesh Legislative Assembly from Ujjain North Constituency held in February 1967 declaring the election of the appellant void under section 98 of the Representation of the People Act (hereinafter referred to as the Act). </HL> <HL> The election petition was filed by the husband of the 2nd respondent, Mrs. </HL> <HL> The first alleged disqualification is based on a letter of appointment dated February 6, 1962 addressed by the Chief Commercial Superintendent to the appellant who accepted the conditions and terms of that letter by his reply within a few days thereafter. </HL> <HL> The letter of the Commercial Superintendent ' shows that the appellant 's n

=== TARGET_TEXT (first 400 chars) ===
The appellant 's election to the M.P. Legislative Assembly in February 1957 was challenged by an election petition mainly o

In [43]:
# ============================
# PHASE 6 ‚Äì STEP 5
# Token length limits + truncation helpers
# ============================

!pip install -q transformers

from transformers import AutoTokenizer

# Use same LED model as earlier phases
LED_MODEL_NAME = "allenai/led-base-16384"
tokenizer = AutoTokenizer.from_pretrained(LED_MODEL_NAME)

# LED limits for training
MAX_SOURCE_TOKENS = 4096    # max length for input_text
MAX_TARGET_TOKENS = 512     # max length for target_text (summary)

print("Using tokenizer:", LED_MODEL_NAME)
print("MAX_SOURCE_TOKENS:", MAX_SOURCE_TOKENS)
print("MAX_TARGET_TOKENS:", MAX_TARGET_TOKENS)


def truncate_source_with_budget(cluster_token: str,
                                skeleton_block: str,
                                doc_block: str,
                                max_source_tokens: int,
                                tokenizer) -> str:
    """
    Truncate input_text while preserving cluster_token + skeleton_block as much as possible.
    Strategy:
      prefix = "<CL_x> [SKELETON] ... "
      doc    = "[DOC] full doc text ..."
    If prefix + doc <= max_source_tokens -> keep all.
    Else:
      - keep full prefix
      - truncate doc to fit remaining budget.
    """
    prefix = f"{cluster_token} {skeleton_block}".strip()
    doc_text = doc_block.strip()

    # Full candidate
    full_text = f"{prefix} {doc_text}".strip()
    full_ids = tokenizer(
        full_text,
        add_special_tokens=False,
        truncation=False
    )["input_ids"]

    if len(full_ids) <= max_source_tokens:
        return full_text

    # Otherwise, compute prefix tokens and allocate budget for doc
    prefix_ids = tokenizer(
        prefix,
        add_special_tokens=False,
        truncation=False
    )["input_ids"]

    budget_for_doc = max_source_tokens - len(prefix_ids)
    if budget_for_doc <= 0:
        # Budget doesn't even allow full prefix -> hard truncate prefix itself
        truncated_prefix_ids = prefix_ids[:max_source_tokens]
        truncated_prefix = tokenizer.decode(truncated_prefix_ids, skip_special_tokens=False)
        return truncated_prefix.strip()

    # Truncate doc part to fit remaining budget
    doc_ids = tokenizer(
        doc_text,
        add_special_tokens=False,
        truncation=True,
        max_length=budget_for_doc
    )["input_ids"]

    truncated_doc = tokenizer.decode(doc_ids, skip_special_tokens=False)
    truncated_full = f"{prefix} {truncated_doc}".strip()
    return truncated_full


def truncate_target_text(target_text: str,
                         max_target_tokens: int,
                         tokenizer) -> str:
    """
    Truncate target summary text to max_target_tokens using the same tokenizer.
    """
    if not target_text:
        return ""

    enc = tokenizer(
        target_text,
        add_special_tokens=False,
        truncation=True,
        max_length=max_target_tokens
    )
    ids = enc["input_ids"]
    truncated = tokenizer.decode(ids, skip_special_tokens=False)
    return truncated.strip()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Using tokenizer: allenai/led-base-16384
MAX_SOURCE_TOKENS: 4096
MAX_TARGET_TOKENS: 512


In [44]:
# =========================================
# FIXED truncate_source_with_budget
# =========================================

def truncate_source_with_budget(cluster_token: str,
                                skeleton_block: str,
                                doc_block: str,
                                max_source_tokens: int,
                                tokenizer) -> str:
    """
    Truncate input_text while preserving cluster_token + skeleton_block as much as possible.
    Uses tokenizer truncation so we never exceed model_max_length.
    """
    prefix = f"{cluster_token} {skeleton_block}".strip()
    doc_text = doc_block.strip()

    # 1) Try full text, but clipped to max_source_tokens
    full_text = f"{prefix} {doc_text}".strip()
    full_ids = tokenizer(
        full_text,
        add_special_tokens=False,
        truncation=True,           # üîß important: avoid > model_max_length
        max_length=max_source_tokens
    )["input_ids"]

    # If truncated full_text already fits within max_source_tokens,
    # just return the decoded truncated version (this may be slightly shorter than original).
    if len(full_ids) <= max_source_tokens:
        return tokenizer.decode(full_ids, skip_special_tokens=False).strip()

    # 2) If we‚Äôre here, we know full_text is too long.
    #    Compute prefix length (also safely truncated to max_source_tokens)
    prefix_ids = tokenizer(
        prefix,
        add_special_tokens=False,
        truncation=True,
        max_length=max_source_tokens
    )["input_ids"]

    # Remaining budget for doc tokens
    budget_for_doc = max_source_tokens - len(prefix_ids)
    if budget_for_doc <= 0:
        # Budget doesn't even allow full prefix -> hard-truncate prefix itself
        truncated_prefix_ids = prefix_ids[:max_source_tokens]
        truncated_prefix = tokenizer.decode(truncated_prefix_ids, skip_special_tokens=False)
        return truncated_prefix.strip()

    # 3) Truncate doc part to fit remaining budget
    doc_ids = tokenizer(
        doc_text,
        add_special_tokens=False,
        truncation=True,
        max_length=budget_for_doc
    )["input_ids"]

    truncated_doc = tokenizer.decode(doc_ids, skip_special_tokens=False)
    truncated_full = f"{prefix} {truncated_doc}".strip()
    return truncated_full

In [45]:
# ============================
# PHASE 6 ‚Äì STEP 6
# Build input_text + target_text for all cases
# ============================

import json
import os

train_examples = []
val_examples = []
test_examples = []

for idx, row in df_index.iterrows():
    final_split = row["final_split"]

    # Skip rows without a proper final split
    if final_split not in {"train", "val", "test"}:
        continue

    case_id = str(row["case_id"])
    corpus = row["corpus"]
    cluster_id = row["cluster_id"]
    cluster_label = row.get("cluster_label", None)
    structured_path = row["structured_path"]
    skeleton_path = row["skeleton_path"]
    summary_path = row["summary_path"]

    # ---- Load skeleton sentences ----
    skeleton_sentences = []
    if isinstance(skeleton_path, str) and os.path.exists(skeleton_path):
        try:
            with open(skeleton_path, "r", encoding="utf-8", errors="ignore") as f:
                skel = json.load(f)
            skeleton_sentences = skel.get("skeleton_sentences", []) or []
        except Exception as e:
            print(f"[WARN] Could not load skeleton for case_id={case_id}: {e}")
            skeleton_sentences = []
    else:
        # no skeleton -> empty list
        skeleton_sentences = []

    # ---- Load gold summary (target) ----
    try:
        with open(summary_path, "r", encoding="utf-8", errors="ignore") as f:
            target_text = f.read().strip()
    except Exception as e:
        print(f"[WARN] Could not read summary for case_id={case_id}: {e}")
        target_text = ""

    # ---- Build cluster token, skeleton block, doc block ----
    cluster_token = build_cluster_token(cluster_id)
    skeleton_block = build_skeleton_block(skeleton_sentences, add_hl_tags=True)
    doc_block = build_doc_block(structured_path)   # [DOC] + flattened sections

    # ---- Truncate source & target under token limits ----
    input_text = truncate_source_with_budget(
        cluster_token=cluster_token,
        skeleton_block=skeleton_block,
        doc_block=doc_block,
        max_source_tokens=MAX_SOURCE_TOKENS,
        tokenizer=tokenizer
    )

    target_text = truncate_target_text(
        target_text=target_text,
        max_target_tokens=MAX_TARGET_TOKENS,
        tokenizer=tokenizer
    )

    # ---- Build example dict ----
    example = {
        "case_id": case_id,
        "split": final_split,
        "corpus": corpus,
        "cluster_id": int(cluster_id) if not pd.isna(cluster_id) else None,
        "cluster_label": cluster_label,
        "input_text": input_text,
        "target_text": target_text,
    }

    # ---- Append to correct split list ----
    if final_split == "train":
        train_examples.append(example)
    elif final_split == "val":
        val_examples.append(example)
    elif final_split == "test":
        test_examples.append(example)

print("Built examples:")
print("  train:", len(train_examples))
print("  val  :", len(val_examples))
print("  test :", len(test_examples))

# Quick sanity: show one example from each split (if exists)
def show_example(ex, max_in=400, max_tgt=300):
    print("\n=== Example ===")
    print("case_id    :", ex["case_id"])
    print("split      :", ex["split"])
    print("cluster_id :", ex["cluster_id"], "| cluster_label:", ex["cluster_label"])
    print("\n-- input_text (first", max_in, "chars) --")
    print(ex["input_text"][:max_in])
    print("\n-- target_text (first", max_tgt, "chars) --")
    print(ex["target_text"][:max_tgt])

if train_examples:
    print("\nSample TRAIN example:")
    show_example(train_examples[0])

if val_examples:
    print("\nSample VAL example:")
    show_example(val_examples[0])

if test_examples:
    print("\nSample TEST example:")
    show_example(test_examples[0])

Built examples:
  train: 1350
  val  : 150
  test : 100

Sample TRAIN example:

=== Example ===
case_id    : 2502
split      : train
cluster_id : 3 | cluster_label: CL_3

-- input_text (first 400 chars) --
<CL_3> [SKELETON] <HL> This is an appeal from a judgment of the Madhya Pradesh High Court by a returned candidate at an election to Madhya Pradesh Legislative Assembly from Ujjain North Constituency held in February 1967 declaring the election of the appellant void under section 98 of the Representation of the People Act (hereinafter referred to as the Act). </HL> <HL> The election petition was fi

-- target_text (first 300 chars) --
The appellant 's election to the M.P. Legislative Assembly in February 1957 was challenged by an election petition mainly on the allegation ,that he was disqualified from being a candidate as he held certain offices of profit under the Government.
The trial Judge allowed the election petition holdin

Sample VAL example:

=== Example ===
case_id    : 4159

In [46]:
# ============================
# PHASE 6 ‚Äì STEP 7
# Write led_train.jsonl / led_val.jsonl / led_test.jsonl
# ============================

OUTPUT_TRAIN_PATH = "/kaggle/working/led_train.jsonl"
OUTPUT_VAL_PATH   = "/kaggle/working/led_val.jsonl"
OUTPUT_TEST_PATH  = "/kaggle/working/led_test.jsonl"

def write_jsonl(path, examples):
    with open(path, "w", encoding="utf-8") as f:
        for ex in examples:
            line = json.dumps(ex, ensure_ascii=False)
            f.write(line + "\n")

write_jsonl(OUTPUT_TRAIN_PATH, train_examples)
write_jsonl(OUTPUT_VAL_PATH, val_examples)
write_jsonl(OUTPUT_TEST_PATH, test_examples)

print("‚úÖ Wrote JSONL files:")
print("  train:", OUTPUT_TRAIN_PATH, "->", len(train_examples), "examples")
print("  val  :", OUTPUT_VAL_PATH,   "->", len(val_examples), "examples")
print("  test :", OUTPUT_TEST_PATH,  "->", len(test_examples), "examples")


# Extra: verify token lengths for a small random subset
import random
import numpy as np

def sample_token_stats(examples, n=50):
    if not examples:
        return None
    sample = random.sample(examples, k=min(n, len(examples)))
    src_lens = []
    tgt_lens = []

    for ex in sample:
        src_ids = tokenizer(
            ex["input_text"],
            add_special_tokens=True,
            truncation=False
        )["input_ids"]
        tgt_ids = tokenizer(
            ex["target_text"],
            add_special_tokens=True,
            truncation=False
        )["input_ids"]
        src_lens.append(len(src_ids))
        tgt_lens.append(len(tgt_ids))

    src_lens = np.array(src_lens)
    tgt_lens = np.array(tgt_lens)
    return src_lens, tgt_lens

print("\nChecking token length stats on a sample...")

src_train, tgt_train = sample_token_stats(train_examples, n=50)
if src_train is not None:
    print("TRAIN source tokens: min={}, max={}, mean={:.1f}".format(
        src_train.min(), src_train.max(), src_train.mean()))
    print("TRAIN target tokens: min={}, max={}, mean={:.1f}".format(
        tgt_train.min(), tgt_train.max(), tgt_train.mean()))

src_val, tgt_val = sample_token_stats(val_examples, n=50)
if src_val is not None:
    print("\nVAL source tokens: min={}, max={}, mean={:.1f}".format(
        src_val.min(), src_val.max(), src_val.mean()))
    print("VAL target tokens: min={}, max={}, mean={:.1f}".format(
        tgt_val.min(), tgt_val.max(), tgt_val.mean()))

src_test, tgt_test = sample_token_stats(test_examples, n=50)
if src_test is not None:
    print("\nTEST source tokens: min={}, max={}, mean={:.1f}".format(
        src_test.min(), src_test.max(), src_test.mean()))
    print("TEST target tokens: min={}, max={}, mean={:.1f}".format(
        tgt_test.min(), tgt_test.max(), tgt_test.mean()))

‚úÖ Wrote JSONL files:
  train: /kaggle/working/led_train.jsonl -> 1350 examples
  val  : /kaggle/working/led_val.jsonl -> 150 examples
  test : /kaggle/working/led_test.jsonl -> 100 examples

Checking token length stats on a sample...
TRAIN source tokens: min=1078, max=4098, mean=3622.7
TRAIN target tokens: min=142, max=514, mean=483.1

VAL source tokens: min=412, max=4098, mean=3520.7
VAL target tokens: min=130, max=514, mean=473.8

TEST source tokens: min=1287, max=4098, mean=3697.6
TEST target tokens: min=251, max=514, mean=488.9


In [47]:
# ============================================
# PHASE 7 ‚Äì CELL 1
# Install libraries & import
# ============================================

!pip install -q transformers datasets evaluate accelerate

import os
import numpy as np
import torch

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m47.7/47.7 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
pylibcudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
cudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
bigframes 2.12.0 requires rich<14,>=12.4.4

In [48]:
# ============================================
# PHASE 7 ‚Äì CELL 2
# Config: paths, model name, constants
# ============================================

# üî¥ CHANGE THIS to your real Kaggle dataset folder name
# (the one that contains led_train.jsonl, led_val.jsonl, led_test.jsonl)
DATASET_DIR = "/kaggle/working/"  # <-- update this

TRAIN_PATH = os.path.join(DATASET_DIR, "led_train.jsonl")
VAL_PATH   = os.path.join(DATASET_DIR, "led_val.jsonl")
TEST_PATH  = os.path.join(DATASET_DIR, "led_test.jsonl")

print("Train path:", TRAIN_PATH)
print("Val path  :", VAL_PATH)
print("Test path :", TEST_PATH)

LED_MODEL_NAME = "allenai/led-base-16384"

MAX_SOURCE_TOKENS = 4096
MAX_TARGET_TOKENS = 512

OUTPUT_DIR = "./led_finetuned_indabs"

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

Train path: /kaggle/working/led_train.jsonl
Val path  : /kaggle/working/led_val.jsonl
Test path : /kaggle/working/led_test.jsonl
Using device: cuda


In [49]:
# ============================================
# PHASE 7 ‚Äì CELL 3
# Load train/val/test JSONL as HF datasets
# ============================================

data_files = {
    "train": TRAIN_PATH,
    "validation": VAL_PATH,
    "test": TEST_PATH,
}

raw_datasets = load_dataset("json", data_files=data_files)

print(raw_datasets)
print("\nSample train row:")
print(raw_datasets["train"][0])

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['case_id', 'split', 'corpus', 'cluster_id', 'cluster_label', 'input_text', 'target_text'],
        num_rows: 1350
    })
    validation: Dataset({
        features: ['case_id', 'split', 'corpus', 'cluster_id', 'cluster_label', 'input_text', 'target_text'],
        num_rows: 150
    })
    test: Dataset({
        features: ['case_id', 'split', 'corpus', 'cluster_id', 'cluster_label', 'input_text', 'target_text'],
        num_rows: 100
    })
})

Sample train row:
{'case_id': '2502', 'split': 'train', 'corpus': 'IN-Abs', 'cluster_id': 3, 'cluster_label': 'CL_3', 'input_text': '<CL_3> [SKELETON] <HL> This is an appeal from a judgment of the Madhya Pradesh High Court by a returned candidate at an election to Madhya Pradesh Legislative Assembly from Ujjain North Constituency held in February 1967 declaring the election of the appellant void under section 98 of the Representation of the People Act (hereinafter referred to as the Act). </H

In [50]:
# ============================================
# PHASE 7 ‚Äì CELL 4
# Load LED tokenizer & model
# ============================================

tokenizer = AutoTokenizer.from_pretrained(LED_MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(LED_MODEL_NAME)

# Enable gradient checkpointing to save GPU memory
model.gradient_checkpointing_enable()
model.config.gradient_checkpointing = True

# (Optional) ensure attention_window is set (depends on model; LED-base usually has it)
if hasattr(model.config, "attention_window"):
    print("Attention window:", model.config.attention_window)

model.to(device)
print("Model loaded on:", device)

pytorch_model.bin:   0%|          | 0.00/648M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/648M [00:00<?, ?B/s]

Attention window: [1024, 1024, 1024, 1024, 1024, 1024]
Model loaded on: cuda


In [51]:
# ============================================
# PHASE 7 ‚Äì CELL 5
# Preprocessing: tokenize input/target and add global attention mask
# ============================================

# We will keep only these columns after preprocessing
text_column = "input_text"
summary_column = "target_text"

def preprocess_function(batch):
    """
    Tokenize input_text and target_text.
    Also add global_attention_mask for LED (global attention on first token).
    """
    inputs = batch[text_column]
    targets = batch[summary_column]

    # Source
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_SOURCE_TOKENS,
        truncation=True,
        padding="max_length",   # or "longest" if you prefer
    )

    # Add global attention mask: 1 for first token, 0 for others
    # shape: [batch_size, seq_len]
    global_attention_mask = []
    for input_ids in model_inputs["input_ids"]:
        mask = [0] * len(input_ids)
        mask[0] = 1  # first token has global attention
        global_attention_mask.append(mask)
    model_inputs["global_attention_mask"] = global_attention_mask

    # Targets (labels)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=MAX_TARGET_TOKENS,
            truncation=True,
            padding="max_length",
        )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

# Apply preprocessing to all splits
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,  # drop original columns
)

print(tokenized_datasets)
print("\nKeys in a tokenized train example:")
print(tokenized_datasets["train"][0].keys())

Map:   0%|          | 0/1350 [00:00<?, ? examples/s]



Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'global_attention_mask', 'labels'],
        num_rows: 1350
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'global_attention_mask', 'labels'],
        num_rows: 150
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'global_attention_mask', 'labels'],
        num_rows: 100
    })
})

Keys in a tokenized train example:
dict_keys(['input_ids', 'attention_mask', 'global_attention_mask', 'labels'])


In [52]:
# ============================================
# PHASE 7 ‚Äì CELL 6
# Data collator for seq2seq (handles padding & label -100)
# ============================================

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,   # ignore padding tokens in loss
    padding="longest",
)

print("Data collator ready.")

Data collator ready.


In [53]:
# ============================================
# FIXED compute_metrics (robust to logits vs ids)
# ============================================

rouge = evaluate.load("rouge")

def postprocess_text(preds, labels):
    preds = [p.strip() for p in preds]
    labels = [l.strip() for l in labels]
    return preds, labels

def compute_metrics(eval_pred):
    import numpy as np

    predictions, labels = eval_pred

    # If Trainer returns a tuple (e.g. (logits, ...)), take first
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # If predictions are 3D or float -> they are logits
    # shape usually [batch, seq_len, vocab_size]
    if hasattr(predictions, "ndim") and predictions.ndim == 3:
        # take argmax over vocab dimension to get token IDs
        predictions = np.argmax(predictions, axis=-1)

    # Ensure integer type for tokenizer.decode
    predictions = np.asarray(predictions, dtype="int32")

    # Decode predictions
    decoded_preds = tokenizer.batch_decode(
        predictions,
        skip_special_tokens=True
    )

    # Replace -100 in labels with pad_token_id and ensure int
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    labels = np.asarray(labels, dtype="int32")

    decoded_labels = tokenizer.batch_decode(
        labels,
        skip_special_tokens=True
    )

    # Post-process
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Compute ROUGE
    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )
    result = {k: v * 100.0 for k, v in result.items()}

    # Average generated length (in tokens)
    # (If predictions were logits -> we already argmaxed, so it's fine.)
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id)
        for pred in predictions
    ]
    result["gen_len"] = float(np.mean(prediction_lens))

    return result

print("‚úÖ compute_metrics patched.")


Downloading builder script: 0.00B [00:00, ?B/s]

‚úÖ compute_metrics patched.


In [54]:
# ============================================
# PHASE 7 ‚Äì CELL 8 (FINAL FIX)
# TrainingArguments & Seq2SeqTrainer
# ============================================
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

batch_size = 1
gradient_acc_steps = 4   # effective batch size = 4

training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,

    num_train_epochs=2,            # üîÅ from 1 ‚Üí 2 (or 3)
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_acc_steps,

    learning_rate=2e-5,            # üîÅ from 3e-5 ‚Üí 2e-5
    weight_decay=0.01,
    warmup_ratio=0.03,

    logging_steps=50,

    eval_strategy="no",
    save_strategy="no",
    fp16=True,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    # no eval_dataset, no compute_metrics -> no eval during train
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("Trainer initialized.")

Trainer initialized.


  trainer = Seq2SeqTrainer(


In [55]:
train_result = trainer.train()

# Save final model & tokenizer
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("‚úÖ Training finished and model saved at", OUTPUT_DIR)

Step,Training Loss
50,3.3369
100,2.4235
150,2.0243
200,1.8971
250,1.84
300,1.7712
350,1.6253
400,1.5692
450,1.5823
500,1.5807


‚úÖ Training finished and model saved at ./led_finetuned_indabs


In [56]:
# ============================================
# PHASE 7 ‚Äì CELL 9
# Train model + eval on validation
# ============================================

metrics = train_result.metrics
metrics["train_samples"] = len(tokenized_datasets["train"])
print("\n=== Train metrics ===")
print(metrics)

# Evaluate on validation set with best model
eval_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
eval_metrics["eval_samples"] = len(tokenized_datasets["validation"])
print("\n=== Validation metrics ===")
print(eval_metrics)

# Save metrics to disk
os.makedirs(OUTPUT_DIR, exist_ok=True)
trainer.save_metrics("train", metrics)
trainer.save_metrics("eval", eval_metrics)


=== Train metrics ===
{'train_runtime': 2675.5455, 'train_samples_per_second': 1.009, 'train_steps_per_second': 0.253, 'total_flos': 7290553054003200.0, 'train_loss': 1.8606863247572318, 'epoch': 2.0, 'train_samples': 1350}



=== Validation metrics ===
{'eval_loss': 1.6222188472747803, 'eval_runtime': 36.7395, 'eval_samples_per_second': 4.083, 'eval_steps_per_second': 4.083, 'epoch': 2.0, 'eval_samples': 150}


In [57]:
# ============================================
# PHASE 7 ‚Äì CELL 10
# Evaluate on test split
# ============================================

test_metrics = trainer.evaluate(
    eval_dataset=tokenized_datasets["test"],
    metric_key_prefix="test",
)

test_metrics["test_samples"] = len(tokenized_datasets["test"])
print("\n=== Test metrics ===")
print(test_metrics)

trainer.save_metrics("test", test_metrics)


=== Test metrics ===
{'test_loss': 1.6270530223846436, 'test_runtime': 24.3582, 'test_samples_per_second': 4.105, 'test_steps_per_second': 4.105, 'epoch': 2.0, 'test_samples': 100}


In [58]:
# ============================================
# PHASE 7 ‚Äì CELL 11
# Generate some sample summaries from test set
# ============================================

def generate_summary_for_example(ex, max_new_tokens=256, num_beams=4):
    input_text = ex["input_text"]
    # Tokenize input
    model_inputs = tokenizer(
        input_text,
        max_length=MAX_SOURCE_TOKENS,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    # Add global attention on first token
    global_attention_mask = torch.zeros_like(model_inputs["input_ids"])
    global_attention_mask[:, 0] = 1
    model_inputs["global_attention_mask"] = global_attention_mask

    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            no_repeat_ngram_size=3,
            length_penalty=1.0,
        )

    summary = tokenizer.batch_decode(
        generated_ids, skip_special_tokens=True
    )[0].strip()
    return summary

# Load raw test JSONL again so we can see original input_text / target_text easily
test_raw = raw_datasets["test"]

num_samples = min(3, len(test_raw))
indices = np.random.choice(len(test_raw), size=num_samples, replace=False)

for idx in indices:
    ex = test_raw[idx]
    print("\n======================================")
    print(f"case_id: {ex.get('case_id', 'N/A')} | split: {ex.get('split', 'N/A')}")
    print("-------------- GOLD SUMMARY --------------")
    print(ex["target_text"][:1000])

    print("\n-------------- GENERATED SUMMARY --------------")
    gen_sum = generate_summary_for_example(ex)
    print(gen_sum[:1000])


case_id: 5248 | split: test
-------------- GOLD SUMMARY --------------
The Income Tax Officer included in the net wealth of the respondent assessee for the assessement year 1957 58, two sums, viz., Rs. 1,50,000 and Rs.67,560/12/ which the asses see claimed to have gifted.
It is stated that on January 1, 1957 the respondent assessee, by a letter directed a company in which he maintained an account, to debit his account to the extent of Rs. 1,50,000 and credit in the names of his two sons and grandsons various sums, as he had decided to give away these amounts to them out of love and affection.
The company carried out the instructions and relevant debit and credit entries were made in the respective accounts.
On the same day, by two separate letters, the gifts were ac cepted by the sons and later on these amounts were withdrawn by the respective donees.
In the case of second gift, oral instructions were given for transferring the amounts stand ing to his credit.
The respondent assessee 

Input ids are automatically padded from 2842 to 3072 to be a multiple of `config.attention_window`: 1024


The assessee, proprietor of a business who had invested a large amount of capital in it, caused entries to be made in his account books crediting his wife and certain other members of his family with sums which were debited to his capital account.
The entries were followed up by letters to the effect, inter alia, that the sums were en tirely in the nature of personal gifts from C and would bear interest payable half yearly.
There was no evidence that the said sum was available with the said firm of M/s Pearls & Beads.
Aggrieved by the said decision, the revenue has come up in appeal.
In the instant case, the assessee was the karta of a Hindu undivided family.
Though C had the intention of making gifts, the entries in the books of account did not complete the gift.
He was not in a position to make gifts in cash of the amounts credited in favour of his son and this amount should be excluded from his taxable net wealth.
Abba Dada and Company vs Commissioner of Income Tax, Bombay City II, 

In [59]:
import torch
import evaluate

rouge = evaluate.load("rouge")

model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"

def generate_summary_for_example(ex, max_new_tokens=256, num_beams=4):
    input_text = ex["input_text"]

    # Tokenize input
    inputs = tokenizer(
        input_text,
        max_length=MAX_SOURCE_TOKENS,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    # LED: global attention on first token
    global_attention_mask = torch.zeros_like(inputs["input_ids"])
    global_attention_mask[:, 0] = 1
    inputs["global_attention_mask"] = global_attention_mask

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            no_repeat_ngram_size=3,
            length_penalty=1.0,
        )

    summary = tokenizer.batch_decode(
        generated_ids, skip_special_tokens=True
    )[0].strip()
    return summary

In [60]:
def evaluate_split(split_name: str, max_samples: int = None):
    ds = raw_datasets[split_name]
    preds = []
    refs = []

    n = len(ds) if max_samples is None else min(len(ds), max_samples)
    print(f"Evaluating split={split_name} on {n} samples...")

    for i in range(n):
        ex = ds[i]
        pred = generate_summary_for_example(ex, max_new_tokens=256, num_beams=4)
        preds.append(pred)
        refs.append(ex["target_text"])

        if (i + 1) % 10 == 0:
            print(f"  processed {i+1}/{n}")

    metrics = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
    metrics = {k: v * 100.0 for k, v in metrics.items()}  # convert to %
    print(f"\nROUGE for {split_name}:")
    for k, v in metrics.items():
        print(f"  {k}: {v:.2f}")

    return metrics, preds, refs

In [61]:
val_metrics, val_preds, val_refs = evaluate_split("validation")  # or max_samples=50 for quick check

Input ids are automatically padded from 1892 to 2048 to be a multiple of `config.attention_window`: 1024


Evaluating split=validation on 150 samples...


Input ids are automatically padded from 834 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3158 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 4007 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3230 to 4096 to be a multiple of `config.attention_window`: 1024


  processed 10/150


Input ids are automatically padded from 4028 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2575 to 3072 to be a multiple of `config.attention_window`: 1024


  processed 20/150


Input ids are automatically padded from 3655 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2890 to 3072 to be a multiple of `config.attention_window`: 1024


  processed 30/150


Input ids are automatically padded from 2781 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2960 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3038 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3012 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 1190 to 2048 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2570 to 3072 to be a multiple of `config.attention_window`: 1024


  processed 40/150


Input ids are automatically padded from 1659 to 2048 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3760 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3208 to 4096 to be a multiple of `config.attention_window`: 1024


  processed 50/150


Input ids are automatically padded from 584 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2447 to 3072 to be a multiple of `config.attention_window`: 1024


  processed 60/150


Input ids are automatically padded from 3768 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3269 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3265 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2597 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3075 to 4096 to be a multiple of `config.attention_window`: 1024


  processed 70/150


Input ids are automatically padded from 3929 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3040 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3762 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2560 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 1935 to 2048 to be a multiple of `config.attention_window`: 1024


  processed 80/150


Input ids are automatically padded from 3552 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 1446 to 2048 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2167 to 3072 to be a multiple of `config.attention_window`: 1024


  processed 90/150


Input ids are automatically padded from 1504 to 2048 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 1428 to 2048 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2195 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2441 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3289 to 4096 to be a multiple of `config.attention_window`: 1024


  processed 100/150


Input ids are automatically padded from 2340 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2763 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3848 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3723 to 4096 to be a multiple of `config.attention_window`: 1024


  processed 110/150


Input ids are automatically padded from 2882 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3135 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 4037 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2010 to 2048 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3200 to 4096 to be a multiple of `config.attention_window`: 1024


  processed 120/150


Input ids are automatically padded from 3318 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2241 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 1293 to 2048 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 412 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 1538 to 2048 to be a multiple of `config.attention_window`: 1024


  processed 130/150


Input ids are automatically padded from 1260 to 2048 to be a multiple of `config.attention_window`: 1024


  processed 140/150


Input ids are automatically padded from 2379 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2286 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3432 to 4096 to be a multiple of `config.attention_window`: 1024


  processed 150/150

ROUGE for validation:
  rouge1: 50.09
  rouge2: 27.85
  rougeL: 29.69
  rougeLsum: 46.44


In [62]:
test_metrics, test_preds, test_refs = evaluate_split("test")

Evaluating split=test on 100 samples...


Input ids are automatically padded from 2603 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2169 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3313 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3095 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3977 to 4096 to be a multiple of `config.attention_window`: 1024


  processed 10/100


Input ids are automatically padded from 2929 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3998 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2304 to 3072 to be a multiple of `config.attention_window`: 1024


  processed 20/100


Input ids are automatically padded from 2892 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3286 to 4096 to be a multiple of `config.attention_window`: 1024


  processed 30/100


Input ids are automatically padded from 3007 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2943 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2769 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 1287 to 2048 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2702 to 3072 to be a multiple of `config.attention_window`: 1024


  processed 40/100


Input ids are automatically padded from 1002 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3287 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 937 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3719 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 4008 to 4096 to be a multiple of `config.attention_window`: 1024


  processed 50/100


Input ids are automatically padded from 3738 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2298 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3839 to 4096 to be a multiple of `config.attention_window`: 1024


  processed 60/100


Input ids are automatically padded from 2267 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3536 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3663 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 4038 to 4096 to be a multiple of `config.attention_window`: 1024


  processed 70/100


Input ids are automatically padded from 3748 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2391 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3550 to 4096 to be a multiple of `config.attention_window`: 1024


  processed 80/100


Input ids are automatically padded from 3413 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3341 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3077 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 4010 to 4096 to be a multiple of `config.attention_window`: 1024


  processed 90/100


Input ids are automatically padded from 1358 to 2048 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3789 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2789 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3513 to 4096 to be a multiple of `config.attention_window`: 1024


  processed 100/100

ROUGE for test:
  rouge1: 50.14
  rouge2: 28.13
  rougeL: 29.46
  rougeLsum: 46.65
