In [1]:
!pip install -q transformers accelerate datasets torch
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

# ✅ Load data
train_df = pd.read_parquet(r"/kaggle/input/ilsicon/train-00000-of-00001.parquet")
dev_df = pd.read_parquet(r"/kaggle/input/ilsicon/dev-00000-of-00001 (1).parquet")

# ✅ Combine text and labels properly
train_df.rename(columns={'case_description': 'text', 'section_labels': 'labels'}, inplace=True)
dev_df.rename(columns={'case_description': 'text', 'section_labels': 'labels'}, inplace=True)

# ✅ Fix labels parsing
def fix_labels(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            return eval(x)
        except:
            return []
    return []

train_df["labels"] = train_df["labels"].apply(fix_labels)
dev_df["labels"] = dev_df["labels"].apply(fix_labels)

# ✅ Build label space
all_labels = sorted(list({lbl for sublist in train_df["labels"] for lbl in sublist}))
label2id = {lbl: i for i, lbl in enumerate(all_labels)}
num_labels = len(all_labels)

def encode_labels(lbls):
    vec = np.zeros(num_labels)
    for l in lbls:
        if l in label2id:
            vec[label2id[l]] = 1
    return vec.tolist()

train_df["labels_vec"] = train_df["labels"].apply(encode_labels)
dev_df["labels_vec"] = dev_df["labels"].apply(encode_labels)

# ✅ Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df[["text", "labels_vec"]])
dev_dataset = Dataset.from_pandas(dev_df[["text", "labels_vec"]])

# ✅ Tokenizer
model_name = "nlpaueb/legal-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(batch):
    # Convert text elements to clean strings
    texts = [
        " ".join(x) if isinstance(x, (list, np.ndarray)) else str(x)
        for x in batch["text"]
    ]
    enc = tokenizer(texts, truncation=True, padding="max_length", max_length=256)
    enc["labels"] = batch["labels_vec"]
    return enc

train_dataset = train_dataset.map(preprocess, batched=True)
dev_dataset = dev_dataset.map(preprocess, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
dev_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m87.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

2025-10-10 08:46:35.096521: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760085995.278853      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760085995.327032      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/42750 [00:00<?, ? examples/s]

Map:   0%|          | 0/10181 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss


ValueError: unknown is not supported

In [2]:
# ✅ Model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, problem_type="multi_label_classification")

# ✅ Metrics
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids

    # Convert any string or float labels to integers safely
    if labels.dtype.kind not in {'i', 'u'}:  # not integer
        try:
            labels = labels.astype(int)
        except:
            # If string labels exist, map them
            unique_labels = sorted(list(set(labels)))
            label2id = {l: i for i, l in enumerate(unique_labels)}
            labels = np.array([label2id[l] for l in labels])

    return {
        "f1": f1_score(labels, preds, average="micro"),
        "accuracy": accuracy_score(labels, preds),
    }

# ✅ Training arguments — FAST MODE
training_args = TrainingArguments(
    output_dir="./legalbert_ilsi_fast",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,  # simulate batch 8
    num_train_epochs=2,             # quick run
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_steps=50,
    save_total_limit=1,
    fp16=True,                      # faster mixed precision
    do_train=True,
    do_eval=True,
    eval_strategy="epoch",          # shorter eval
    save_strategy="epoch",
    report_to="none"
)

# ✅ Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# ✅ Train
trainer.train()

# ✅ Evaluate
metrics = trainer.evaluate()
print(metrics)

trainer.save_model("./legalbert_ilsi_fast_model")
tokenizer.save_pretrained("./legalbert_ilsi_fast_model")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


ValueError: attempt to get argmax of an empty sequence

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

# ✅ Load data
train_df = pd.read_parquet(r"/kaggle/input/ilsicon/train-00000-of-00001.parquet")
dev_df = pd.read_parquet(r"/kaggle/input/ilsicon/dev-00000-of-00001 (1).parquet")

# ✅ Combine text and labels properly
train_df.rename(columns={'case_description': 'text', 'section_labels': 'labels'}, inplace=True)
dev_df.rename(columns={'case_description': 'text', 'section_labels': 'labels'}, inplace=True)

# ✅ Fix labels parsing
def fix_labels(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            return eval(x)
        except:
            return []
    return []

train_df["labels"] = train_df["labels"].apply(fix_labels)
dev_df["labels"] = dev_df["labels"].apply(fix_labels)

# ✅ Build label space
all_labels = sorted(list({lbl for sublist in train_df["labels"] for lbl in sublist}))
label2id = {lbl: i for i, lbl in enumerate(all_labels)}
num_labels = len(all_labels)

def encode_labels(lbls):
    vec = np.zeros(num_labels)
    for l in lbls:
        if l in label2id:
            vec[label2id[l]] = 1
    return vec.tolist()

train_df["labels_vec"] = train_df["labels"].apply(encode_labels)
dev_df["labels_vec"] = dev_df["labels"].apply(encode_labels)

# ✅ Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df[["text", "labels_vec"]])
dev_dataset = Dataset.from_pandas(dev_df[["text", "labels_vec"]])

# ✅ Tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(batch):
    # Convert text elements to clean strings
    texts = [
        " ".join(x) if isinstance(x, (list, np.ndarray)) else str(x)
        for x in batch["text"]
    ]
    enc = tokenizer(texts, truncation=True, padding="max_length", max_length=256)
    enc["labels"] = batch["labels_vec"]
    return enc

train_dataset = train_dataset.map(preprocess, batched=True)
dev_dataset = dev_dataset.map(preprocess, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
dev_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# ✅ Model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, problem_type="multi_label_classification")

# ✅ Metrics
def compute_metrics(p):
    preds = torch.sigmoid(torch.tensor(p.predictions))
    preds = (preds > 0.5).int().numpy()
    labels = p.label_ids
    return {
        "f1": f1_score(labels, preds, average="micro"),
        "accuracy": accuracy_score(labels, preds)
    }

# ✅ Training arguments — FAST MODE
training_args = TrainingArguments(
    output_dir="./legalbert_ilsi_fast",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,  # simulate batch 8
    num_train_epochs=2,             # quick run
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_steps=50,
    save_total_limit=1,
    fp16=True,                      # faster mixed precision
    do_train=True,
    do_eval=True,
    eval_strategy="epoch",          # shorter eval
    save_strategy="epoch",
    report_to="none"
)

# ✅ Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# ✅ Train
trainer.train()

# ✅ Evaluate
metrics = trainer.evaluate()
print(metrics)

trainer.save_model("./legalbert_ilsi_fast_model")
tokenizer.save_pretrained("./legalbert_ilsi_fast_model")


model 2

In [1]:
import pandas as pd

statutes = pd.read_parquet(r"/kaggle/input/ilsidata/statutes-00000-of-00001.parquet")
train = pd.read_parquet(r"/kaggle/input/ilsicon/train-00000-of-00001.parquet")

statutes = statutes[['id', 'text']]
train = train[['id', 'text', 'labels']]

Load legal sentence transformer

In [3]:
import os
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_DATASETS_OFFLINE"] = "1"

In [5]:
from huggingface_hub import snapshot_download
snapshot_download(repo_id="bert-base-uncased", local_dir="bert-base-uncased")

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

Manifest.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

model.mlmodel:   0%|          | 0.00/165k [00:00<?, ?B/s]

coreml/fill-mask/float32_model.mlpackage(…):   0%|          | 0.00/532M [00:00<?, ?B/s]

flax_model.msgpack:   0%|          | 0.00/438M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/491 [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/532M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

rust_model.ot:   0%|          | 0.00/534M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

'/kaggle/working/bert-base-uncased'

In [8]:
model_path = "/kaggle/working/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModel.from_pretrained(model_path, local_files_only=True)

print("✅ Model and tokenizer loaded successfully from local folder!")

2025-10-10 14:22:46.257742: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760106166.437469      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760106166.485261      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


✅ Model and tokenizer loaded successfully from local folder!


compute embeddings

In [12]:
import torch

def get_embeddings(texts, tokenizer, model):
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**encoded)
    return outputs.last_hidden_state[:, 0, :]  # [CLS] embeddings

# Clean statute text column before encoding
statutes["text"] = statutes["text"].astype(str)  # ensure string type
statutes["text"] = statutes["text"].fillna("")   # replace NaN with empty string

# Now encode safely
statute_embeddings = get_embeddings(statutes["text"].tolist(), tokenizer, model)

In [None]:
case_text = train['text'][0]  # example case
case_embedding = model.encode(case_text, convert_to_tensor=True)

# Compute cosine similarities
cosine_scores = util.cos_sim(case_embedding, statute_embeddings)[0]

# Get top 5 statutes
top_results = torch.topk(cosine_scores, k=5)
for score, idx in zip(top_results.values, top_results.indices):
    print(f"Statute: {statutes.iloc[idx]['section_id']}, Score: {score:.4f}")

# Graph based statute mapping model

In [1]:
!pip install pandas numpy torch transformers sentence-transformers networkx pyvis scikit-learn

Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidi

Loading and preprocessing

In [3]:
import pandas as pd

# Load your Kaggle dataset
df = pd.read_csv("/kaggle/input/indian-penal-code-ipc-sections-information/ipc_sections.csv")

# Inspect
print(df.columns)

# Combine relevant fields for embedding
df['text'] = df['Description'].fillna('') + ' ' + \
             df['Offense'].fillna('') + ' ' + \
             df['Punishment'].fillna('')

df['Section'] = df['Section'].astype(str).str.strip().str.upper()  # ensures IPC_420 etc. is consistent

print(df.head())

Index(['Description', 'Offense', 'Punishment', 'Section'], dtype='object')
                                         Description  \
0  Description of IPC Section 140\nAccording to s...   
1  Description of IPC Section 127\nAccording to s...   
2  Description of IPC Section 128\nAccording to s...   
3  Description of IPC Section 129\nAccording to s...   
4  Description of IPC Section 130\nAccording to s...   

                                             Offense  \
0  Wearing the dress or carrying any token used b...   
1  Receiving property taken by war or depredation...   
2  Public servant voluntarily allowing prisoner o...   
3  Public servant negligently suffering prisoner ...   
4  Aiding escape of, rescuing or harbouring, such...   

                                 Punishment  Section  \
0                  3 Months or Fine or Both  IPC_140   
1   7 Years + Fine + forfeiture of property  IPC_127   
2  Imprisonment for Life or 10 Years + Fine  IPC_128   
3        Simple Imprisonmen

legal embedding generation

In [4]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('law-ai/InLegalBERT')  # legal-tuned model
embeddings = model.encode(df['text'].tolist(), show_progress_bar=True)

2025-10-10 20:59:20.296108: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760129960.466425      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760129960.517743      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/534M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/516 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/534M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

compute similarity matrix

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

similarity_matrix = cosine_similarity(embeddings)

build graph using networkx

In [7]:
import networkx as nx

G = nx.Graph()

# Add nodes
for i, row in df.iterrows():
    G.add_node(
        row['Section'],
        description=row['Description'],
        offense=row['Offense'],
        punishment=row['Punishment']
    )

# Add similarity-based edges
threshold = 0.75  # adjust as needed
for i in range(len(df)):
    for j in range(i+1, len(df)):
        sim = similarity_matrix[i][j]
        if sim > threshold:
            G.add_edge(df.loc[i, 'Section'], df.loc[j, 'Section'], weight=sim)

query traversal (nearest nodes)

In [8]:
def find_related_sections(section_id, k=5):
    section_id = section_id.upper()  # ensure matching
    if section_id not in G:
        print(f"{section_id} not found in graph!")
        return []
    neighbors = [(nbr, G[section_id][nbr]['weight']) for nbr in G.neighbors(section_id)]
    neighbors = sorted(neighbors, key=lambda x: x[1], reverse=True)
    return neighbors[:k]

# Example:
related = find_related_sections('IPC_420', k=5)
print("Top related sections to IPC_420:")
for sec, score in related:
    print(f"{sec} (similarity={score:.3f})")

Top related sections to IPC_420:
IPC_423 (similarity=0.968)
IPC_330 (similarity=0.966)
IPC_477 (similarity=0.966)
IPC_331 (similarity=0.965)
IPC_348 (similarity=0.964)


In [19]:
def find_related_sections(section_id, k=5):
    section_id = section_id.upper()  # ensure matching
    if section_id not in G:
        print(f"{section_id} not found in graph!")
        return []
    neighbors = [(nbr, G[section_id][nbr]['weight']) for nbr in G.neighbors(section_id)]
    neighbors = sorted(neighbors, key=lambda x: x[1], reverse=True)
    return neighbors[:k]

# Example:
related = find_related_sections('IPC_326', k=5)
print("Top related sections to IPC_326:")
for sec, score in related:
    print(f"{sec} (similarity={score:.3f})")

Top related sections to IPC_326:
IPC_324 (similarity=0.716)


Visualization

In [11]:
from pyvis.network import Network

net = Network(height='600px', width='100%', bgcolor='#222222', font_color='white')
net.from_nx(G)
net.show_buttons(filter_=['physics'])  # helps initialize HTML template
net.save_graph('ipc_graph.html')       # ✅ use save_graph instead of show

In [12]:
import os

os.listdir('/kaggle/working')

['lib', 'ipc_graph.html', '.virtual_documents']

In [13]:
from IPython.display import FileLink

FileLink('ipc_graph.html')

In [15]:
import plotly.graph_objects as go

# Take a smaller subgraph for visualization
sub_nodes = list(G.nodes())[:50]
H = G.subgraph(sub_nodes)

# Get positions
pos = nx.spring_layout(H)

# Extract edge coordinates
edge_x, edge_y = [], []
for edge in H.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

# Extract node coordinates
node_x, node_y = [], []
for node in H.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    text=list(H.nodes()),
    textposition="bottom center",
    hoverinfo='text',
    marker=dict(
        showscale=False,
        color='skyblue',
        size=10,
        line_width=2))

fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title='IPC Section Similarity Graph (Plotly)',
                    titlefont_size=16,
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=0,l=0,r=0,t=40),
                    xaxis=dict(showgrid=False, zeroline=False),
                    yaxis=dict(showgrid=False, zeroline=False))
               )

fig.show()

In [17]:
import pandas as pd
import numpy as np
import networkx as nx
import plotly.graph_objects as go
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Example data (replace with your dataset)
df = pd.read_csv("/kaggle/input/indian-penal-code-ipc-sections-information/ipc_sections.csv")  # must have 'section' and 'text' columns

# Step 1: Compute similarity between IPC sections
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['Description'])

similarity_matrix = cosine_similarity(X)

# Step 2: Create graph with edges only if similarity > threshold
threshold = 0.5  # tweak this value (0.4–0.7 is a good range)
G = nx.Graph()

sections = df['Section'].tolist()
for i in range(len(sections)):
    for j in range(i+1, len(sections)):
        sim = similarity_matrix[i, j]
        if sim > threshold:
            G.add_edge(sections[i], sections[j], weight=sim)

# Step 3: Layout (similar nodes closer)
pos = nx.spring_layout(G, weight='weight', k=0.3, iterations=50)

# Step 4: Convert to Plotly scatter
edge_x = []
edge_y = []
for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

node_x = []
node_y = []
text_labels = []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    text_labels.append(node)

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    text=text_labels,
    textposition="top center",
    hoverinfo='text',
    marker=dict(
        showscale=False,
        color='skyblue',
        size=10,
        line_width=1.5))

fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title='IPC Section Similarity Network',
                    titlefont_size=16,
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=0,l=0,r=0,t=40)))

fig.show()

In [20]:
num_nodes = G.number_of_nodes()
print("Total number of nodes:", num_nodes)

Total number of nodes: 111


In [21]:
import pandas as pd
import numpy as np
import networkx as nx
import plotly.graph_objects as go
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Example data (replace with your dataset)
df = pd.read_csv("/kaggle/input/indian-penal-code-ipc-sections-information/ipc_sections.csv")  # must have 'section' and 'text' columns

# Step 1: Compute similarity between IPC sections
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['Description'])

similarity_matrix = cosine_similarity(X)

# Step 2: Create graph with edges only if similarity > threshold
threshold = 0.5  # tweak this value (0.4–0.7 is a good range)
G1 = nx.Graph()
G1.add_nodes_from(sections)  # add all sections first

for i in range(len(sections)):
    for j in range(i+1, len(sections)):
        sim = similarity_matrix[i, j]
        if sim > threshold:
            G1.add_edge(sections[i], sections[j], weight=sim)

# sections = df['Section'].tolist()
# for i in range(len(sections)):
#     for j in range(i+1, len(sections)):
#         sim = similarity_matrix[i, j]
#         if sim > threshold:
#             G.add_edge(sections[i], sections[j], weight=sim)

# Step 3: Layout (similar nodes closer)
pos = nx.spring_layout(G1, weight='weight', k=0.3, iterations=50)

# Step 4: Convert to Plotly scatter
edge_x = []
edge_y = []
for edge in G1.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

node_x = []
node_y = []
text_labels = []
for node in G1.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    text_labels.append(node)

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    text=text_labels,
    textposition="top center",
    hoverinfo='text',
    marker=dict(
        showscale=False,
        color='skyblue',
        size=10,
        line_width=1.5))

fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title='IPC Section Similarity Network',
                    titlefont_size=16,
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=0,l=0,r=0,t=40)))

fig.show()

In [23]:
isolated = list(nx.isolates(G1))
print("Number of isolated nodes:", len(isolated))

Number of isolated nodes: 331
