In [None]:
# Step 1: Install required libraries
!pip install transformers datasets torch scikit-learn pandas numpy matplotlib seaborn



In [None]:
# Step 2: Import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
 import pandas as pd   # table data ke liye library

df = pd.read_csv("indian_bail_judgments.csv")
# CSV ko memory me dataframe (table) banaya

df.shape

(1200, 25)

In [None]:
df.columns

Index(['case_id', 'case_title', 'court', 'date', 'judge', 'ipc_sections',
       'bail_type', 'bail_cancellation_case', 'landmark_case', 'accused_name',
       'accused_gender', 'prior_cases', 'bail_outcome',
       'bail_outcome_label_detailed', 'crime_type', 'facts', 'legal_issues',
       'judgment_reason', 'summary', 'bias_flag', 'parity_argument_used',
       'legal_principles_discussed', 'region', 'source_filename',
       'special_laws'],
      dtype='object')

In [None]:
#HERE IN THIS DATASET, THE " bail_outcome " COLUMN IS THE TARGET COLUMN
#DATATYPE AUR COLUMN KA NAAM AUR REAL WPRLD MEANING SE HINT MILTA H KI USPE KONSA CLASSIFICATION USE KRNA HAI
#AB JAISEE YE DSTASET MEIN JO " bail_outcome " COLUMN HAI USME SIRF 2 HI OUTCOME HAI ---> GRANTED OR REJECTED AUR HUMARE PROJECT MEIN BHI YEHI FIND KRNA HAI
#ISILITYE HUMNE ISSE HI TARGET COLUMN BANAYA HAI

In [None]:
# ================================
#Target Variable Encoding -- TO CONVEERT VALUES OF TARGET COLUMN IN FORM OF 1 AND 0
# ================================

# 'bail_outcome' column me court ka final decision text form me diya hua hai
# Jaise: "Granted" ya "Rejected"
# Machine Learning models text ko directly samajh nahi sakte,
# isliye hume is decision ko numerical form me convert karna padta hai

# Yahan hum ek naya column 'bail_label' bana rahe hainn
# taaki original 'bail_outcome' column safe rahee

df['bail_label'] = df['bail_outcome'].map({
    'Granted': 1,     # Agar bail grant hui ho, toh label = 1
    'Rejected': 0     # Agar bail reject hui ho, toh label = 0
})

# 'map()' function dictionary ke through values ko replace karta hai:
# - Agar value "Granted" milti hai ‚Üí usse 1 me convert karta hai
# - Agar value "Rejected" milti hai ‚Üí usse 0 me convert karta hai

# Is process ko "Label Encoding" kehte hain
# Ye step classification problem ke liye "IMPORTANT AND ZARURUI" hota hai JAHA TEXT KO NUMERIC MEI CONVERT KRNA HO

In [None]:
#VERIFYING THE ABOVE CODE
df[['bail_outcome', 'bail_label']].head()

Unnamed: 0,bail_outcome,bail_label
0,Rejected,0
1,Rejected,0
2,Rejected,0
3,Granted,1
4,Rejected,0


In [None]:
#==========================================
#Step 3: Feature Engineering (Text Creation)
#==========================================

#Court ke decision ko samajhne ke liye
#'facts', 'judgment_reason' aur 'summary' sabse important text columns hain

#NLP models ko ek single text input chahiye hota hai,
#isliye hum multiple text columns ko combine kar rahe hain

df['text'] = (
    df['facts'] + " " +              #Case ke factual details
    df['judgment_reason'] + " " +     #Court ka reasoning
    df['summary']                     #Short judgment summary
)

#Yahan " " (space) isliye add kiya gaya hai
#taaki words aapas me chipak na jaayein

In [None]:
# -----------------------------
# loc vs iloc (Pandas indexing)
# -----------------------------

# iloc = index number ke basis par data select karta hai
# Matlab row ya column ka POSITION (0, 1, 2, 3...) use hota hai
# Example
# df.iloc[0]    -> pehli row
# df.iloc[0:5]  -> pehli 5 rows
# df.iloc[0, 2] -> pehli row, 3rd column

# loc = label/name ke basis par data select karta hai
# Matlab row ka label ya column ka naam use hota hai
# Example
# df.loc[0]                 -> jiska index label 0 ho
# df.loc[:, 'bail_outcome'] -> poora 'bail_outcome' column
# df.loc[5:10, ['court','judge']] -> specific rows & columns

# Simple yaad rakhne ka trick:
# iloc = I for Index (number based)
# loc  = L for Label (name based)

# 'text' column ka pehla row dekh rahe hain
# sirf pehle 500 characters show kar rahe hain
# taaki verify ho sake ki text sahi combine hua hai
df['text'].iloc[0][:500]

'Jibangshu Paul was apprehended carrying Rs. 32,11,000 in cash, suspected to be intended for the DHD(J) militant group. He was earlier granted bail for IPC sections. Later, serious sections under the Unlawful Activities (Prevention) Act were added, and NIA sought his re-arrest. The Special Court directed custody, rejecting his prayer to continue on earlier bail. The court held that newly added serious UA(P) Act offences required separate consideration and the Special Judge could not continue prio'

In [None]:
#TF-IDF ka matlab (yaad rehne wala)
#TF (Term Frequency)
#ek word document me kitni baar aaya
#IDF (Inverse Document Frequency)
#jo word sab documents me aata hai (court, judge)
#uski importance kam
# Aur jo rare but important hai (custody, gravity, anticipatory)
#uski importance zyada

#INSORT MATLAB Common words ignore, meaningful words highlight

In [None]:
# ======================================
# Step 4: Text to Number Conversion
# Using TF-IDF Vectorizer
# ======================================

from sklearn.feature_extraction.text import TfidfVectorizer
# TfidfVectorizer text ko numerical matrix me convert karta hai

# TF-IDF object create kar rahe hain
tfidf = TfidfVectorizer(
    max_features=5000,        # sirf top 5000 most important words rakhenge
    stop_words='english'      # the, is, was jaise common words hata dega
)

# 'text' column ko numbers me convert kar rahe hain
X = tfidf.fit_transform(df['text'])

# Target variable (jo humne pehle banaya tha)
y = df['bail_label']

In [None]:
# X.shape ka use yeh verify karne ke liye hota hai ki
# TF-IDF vectorization ke baad data ka size kya bana hai

# X.shape output deta hai: (number_of_rows, number_of_features)

# number_of_rows  -> total judgments / documents
# number_of_features -> total unique important words (TF-IDF features)

# Example:
# (1200, 5000) ka matlab hai:
# - 1200 judgments ko represent kiya gaya hai
# - har judgment ko 5000 numerical features (words) ke through

X.shape

(1200, 5000)

In [None]:
# ======================================
# Train-Test Split
# ======================================

from sklearn.model_selection import train_test_split
# train_test_split data ko training aur testing parts me divide karta hai

# X = TF-IDF se bana input features (numbers)
# y = target labels (0 = Rejected, 1 = Granted)

X_train, X_test, y_train, y_test = train_test_split(
    X,                 # input features
    y,                 # target labels
    test_size=0.2,     # 20% data testing ke liye
    random_state=42    # same result reproduce karne ke liye
)

In [None]:
# X_train.shape ka use yeh check karne ke liye hota hai ki
# training data me kitne samples (rows) aur kitne features (columns) hain

# Output format hota hai: (number_of_training_samples, number_of_features)
# Example: (960, 5000)
# Matlab
# - 960 judgments model ko train karne ke liye
# - har judgment 5000 TF-IDF features se represent hua hai

# X_test.shape ka use yeh verify karne ke liye hota hai ki
# testing data me kitne unseen samples hain jinpe model ka performance test hoga

# Output format hota hai: (number_of_test_samples, number_of_features)
# Example: (240, 5000)
# Matlab:
# - 240 judgments model ke liye bilkul naye (unseen) hain
# - features training jaise hi hain (5000)
X_train.shape
X_test.shape

(240, 5000)

In [None]:
# ======================================
# Train Logistic Regression Model
# ======================================

from sklearn.linear_model import LogisticRegression
# LogisticRegression ek classification algorithm hai
# jo binary outcomes (0/1) predict karta hai

# Model ka object create kar rahe hain
model = LogisticRegression(
    max_iter=1000        # iterations limit badha rahe hain
)

# Model ko training data pe train kar rahe hain
# X_train = input features (TF-IDF numbers)
# y_train = correct answers (0 or 1)
model.fit(X_train, y_train)

In [None]:
# ======================================
# Prediction on Test Data
# ======================================

# Trained model se test data ke liye prediction kar rahe hain
# model.predict() har sample ke liye 0 ya 1 return karega

y_pred = model.predict(X_test)

# y_pred me:
# 0 = Bail Rejected (model ka prediction)
# 1 = Bail Granted (model ka prediction)

#model.predict(X_test)
#Model ko naya (unseen) data diya
#Model apni learning use karke guess karta hai
#Output sirf labels (0/1) hote hain
#Important baat Ye probabilities nahi deta Sirf final decision deta

In [None]:
y_pred[:10]
# ye Pehle 10 test cases ke predictions
#sirf dekhne ke liye (debugging)

array([1, 1, 0, 0, 0, 1, 0, 1, 1, 1])

In [None]:
# ======================================
# Model Evaluation
# ======================================

from sklearn.metrics import classification_report
# classification_report model ki performance ka detailed summary deta hai

# Actual answers (y_test) vs Model predictions (y_pred)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92        88
           1       0.95      0.95      0.95       152

    accuracy                           0.94       240
   macro avg       0.94      0.94      0.94       240
weighted avg       0.94      0.94      0.94       240



In [None]:
# classification_report ek table hoti hai jo model ki performance ko detail me dikhati hai
# Isme hum actual answers (y_test) aur model ke predictions (y_pred) ko compare karte hain

# Labels ka matlab:
# 0 = Bail Rejected
# 1 = Bail Granted

# ---------------------------------------------
# PRECISION ka matlab:
# Jab model kisi class ka prediction karta hai,
# toh kitni baar woh prediction sahi hoti hai
#
# Example:
# Agar model bole "Bail Granted" 100 baar
# aur unme se 95 baar bail sach me granted ho
# toh precision = 0.95
#
# High precision = kam galat approvals / rejections
# ---------------------------------------------

# ---------------------------------------------
# RECALL ka matlab:
# Jitne cases me bail actually granted / rejected thi,
# unme se model ne kitne cases sahi pakad liye
#
# Example:
# Agar 100 cases me bail actually granted thi
# aur model ne unme se 95 detect kar li
# toh recall = 0.95
#
# High recall = kam genuine cases miss honge
# ---------------------------------------------

# ---------------------------------------------
# F1-SCORE ka matlab:
# Precision aur Recall ka balance
#
# Jab dono important hote hain (jaise legal cases me),
# tab F1-score sabse reliable metric hota hai
# ---------------------------------------------

# ---------------------------------------------
# SUPPORT ka matlab:
# Test dataset me har class ke actual cases ki count
#
# Example:
# 0 (Rejected) = 88 cases
# 1 (Granted)  = 152 cases
# ---------------------------------------------

# ---------------------------------------------
# ACCURACY ka matlab:
# Total test cases me se kitne cases model ne sahi predict kiye
#
# Example:
# 240 test cases me se approx 226 correct
# accuracy = 0.94 (94%)
# ---------------------------------------------

# ---------------------------------------------
# MACRO AVG ka matlab:
# Dono classes ka simple average
# Data imbalance ko ignore karta hai
# Fairness check ke liye use hota hai
# ---------------------------------------------

# ---------------------------------------------
# WEIGHTED AVG ka matlab:
# Har class ke support ke hisaab se weighted average
# Jis class ke zyada cases, uska zyada impact
#
# Real-world performance ko better represent karta hai
# ---------------------------------------------


In [None]:
# Confusion Matrix import kar rahe hain
from sklearn.metrics import confusion_matrix

# Confusion matrix create kar rahe hain
# y_test = actual judge decisions
# y_pred = model ke predicted decisions
cm = confusion_matrix(y_test, y_pred)

cm
#Confusion Matrix batata hai:
#model kahan sahi decision le raha hai
#model kahan galat decision le raha hai
# TN (True Negative):
# Actual bail rejected thi
# Model ne bhi bail rejected predict ki
# ‚úÖ Correct rejection

# TP (True Positive):
# Actual bail granted thi
# Model ne bhi bail granted predict ki
# ‚úÖ Correct approval

# FP (False Positive):
# Actual bail rejected thi
# Model ne bail granted predict kar di
# ‚ùå Dangerous mistake (galat bail)

# FN (False Negative):
# Actual bail granted thi
# Model ne bail reject predict kar di
# ‚ùå Genuine case miss ho gaya

array([[ 81,   7],
       [  7, 145]])

In [None]:
# Confusion Matrix interpretation:
# TN = 81  ‚Üí Correctly predicted bail rejections
# FP = 7   ‚Üí Bail wrongly granted by model (risky errors)
# FN = 7   ‚Üí Bail wrongly rejected by model
# TP = 145 ‚Üí Correctly predicted bail grants
#
# Model performs well overall with very few critical errors,
# which is important for judicial decision support systems.

In [None]:
# ab hum Check karna h ki trained model
#ek bilkul NAYA bail case dekhkar
#kaise decision leta hai

In [None]:
# =====================================================
# New ya Unseen Bail Case Prediction
# =====================================================

# Ek naya bail case ka text likh rahe hain
# Ye case model ne training ke time kabhi nahi dekha
new_case_text = """
The accused has no prior criminal history.
The offense is non-violent in nature.
Investigation is complete and charge sheet is filed.
The accused has cooperated with authorities.
"""

# Explanation:
# - no prior criminal history  ‚Üí bail ke chances badhte hain
# - non-violent offense        ‚Üí court lenient hoti hai
# - investigation complete     ‚Üí evidence tampering ka risk kam
# - cooperation with police    ‚Üí positive factor
#new_case_text ek normal English paragraph hai
#Machine Learning model text directly nahi samajhta Isliye hume ise NUMBERS me convert karna padta hai

# Text ko TF-IDF vector me convert kar rahe hain
# IMPORTANT baat ki ye fit() nahi hai , sirf transform()
# kyunki fit sirf training data pe hota hai
new_case_vector = tfidf.transform([new_case_text])

#tfidf.transform() ka use isliye
#Taaki naye case ko bhi exact same language rules aur vocabulary me
#convert kiya ja sake

In [None]:
new_case_vector.shape

(1, 5000)

In [None]:
#Trained model se final decision nikal rahe hain
prediction = model.predict(new_case_vector)

prediction

array([1])

In [None]:
# this is the output so array([1]) means the bail granted
#ab isse readable form mei deekhne k liye
# Numeric output ko readable decision me convert kar rahe hain
if prediction[0] == 1:
    print("‚úÖ Model Prediction: Bail Granted")
else:
    print("‚ùå Model Prediction: Bail Rejected")

‚úÖ Model Prediction: Bail Granted


In [None]:
#********THRESHOLD TUNNING TAKI FALSE POSITIVE KAM HO
#MODEL 0.5 KA THRESHOLD LE PROBABILTY KRTA THA LEKIN VO 0.55 PR BHI BAIL GRANT KRDETA H

# =====================================================
# Prediction probabilities nikalna
# =====================================================

# predict_proba har sample ke liye probability deta hai
y_prob = model.predict_proba(X_test)

# Pehle 5 samples ki probabilities dekh rahe hain
y_prob[:5]

array([[0.28073563, 0.71926437],
       [0.26462034, 0.73537966],
       [0.52604488, 0.47395512],
       [0.64980013, 0.35019987],
       [0.56468642, 0.43531358]])

In [None]:
# y_prob output format:
# [Probability of Bail Rejected (0), Probability of Bail Granted (1)]

# Example:
# [0.28, 0.71] ‚Üí 71% chance bail granted (strong case)
# [0.52, 0.47] ‚Üí low confidence, risky case

# This probability output allows us to tune threshold
# to reduce false positive bail grants

In [None]:
# =====================================================
# STEP: Apply custom threshold (strict bail decision)
# =====================================================
# Is step ka goal:
# Model normally 0.5 threshold use karta hai (default)
# Lekin judiciary sensitive system hai,
# isliye hum bail sirf tab denge jab confidence kaafi high ho

threshold = 0.7
# threshold = 0.7 ka matlab:
# Agar model ko bail granted hone ka confidence ‚â• 70% ho
# tabhi bail grant karenge
# warna bail reject

# -----------------------------------------------------

# y_prob ka format hota hai:
# y_prob = [
#   [P(bail_rejected), P(bail_granted)],
#   [P(bail_rejected), P(bail_granted)],
#   ...
# ]

# y_prob[:, 1] ka matlab:
# ":"  ‚Üí saare test samples
# "1"  ‚Üí sirf second column (bail granted probability)
# Example:
# y_prob[:,1] = [0.71, 0.73, 0.47, 0.35, 0.43, ...]

# -----------------------------------------------------

y_pred_strict = (y_prob[:, 1] >= threshold).astype(int)

# Breakdown of above line:
#
# Step 1: y_prob[:,1] >= threshold
#   - Har case ke liye check karta hai:
#     kya bail granted probability >= 0.7 hai?
#
# Example:
#   0.71 >= 0.7 ‚Üí True
#   0.47 >= 0.7 ‚Üí False
#
# Step 2: Result becomes Boolean array:
#   [True, True, False, False, False, ...]
#
# Step 3: .astype(int)
#   True  ‚Üí 1 (Bail Granted)
#   False ‚Üí 0 (Bail Rejected)
#
# Final output:
#   [1, 1, 0, 0, 0, ...]

# -----------------------------------------------------

# First 10 predictions dekh rahe hain
# taaki verify ho sake ki strict rule ka effect pada ya nahi
y_pred_strict[:10]

array([1, 1, 0, 0, 0, 0, 0, 1, 1, 1])

In [None]:
#Ab hum old model (0.5) vs strict model (0.7) ka comparison dekhte hain
#taaki clear ho:
#False Positives kam hue ya nahi?
#Bail safety improve hui ya nahi?

In [None]:
# =====================================================
# Confusion Matrix after Threshold Tuning
# =====================================================

from sklearn.metrics import confusion_matrix

# Strict threshold ke predictions vs actual labels
cm_strict = confusion_matrix(y_test, y_pred_strict)

cm_strict
# confusion_matrix(y_test, y_pred_strict) ka matlab:
#
# y_test        ‚Üí actual judge decisions (ground truth)
# y_pred_strict ‚Üí model ke strict predictions (threshold = 0.7)
#
# Output format hota hai:
# [[TN, FP],
#  [FN, TP]]
#
# TN (True Negative):
#   Actual: Bail Rejected
#   Model:  Bail Rejected  ‚úÖ (safe decision)
#
# FP (False Positive) ‚ùå MOST DANGEROUS:
#   Actual: Bail Rejected
#   Model:  Bail Granted
#   ‚Üí Galat bail (judicial risk)
#
# FN (False Negative):
#   Actual: Bail Granted
#   Model:  Bail Rejected
#   ‚Üí Thoda strict but safe
#
# TP (True Positive):
#   Actual: Bail Granted
#   Model:  Bail Granted ‚úÖ

array([[ 88,   0],
       [ 43, 109]])

In [None]:
# Toh ISS OUTPUT ka matlab:
# 88 cases ‚Üí sahi bail reject
# 0 cases  ‚Üí ‚ùå galat bail (FP)  ‚Üê üî• ye kam hona chahiye
# 43 cases ‚Üí thoda zyada strict (FN)
# 109 cases ‚Üí sahi bail granted
#By increasing the decision threshold to 0.7,
#the model completely eliminated false positive bail grants, prioritizing judicial safety over recall.‚Äù

In [None]:
# =====================================================
# AGAIN TRYING New Unseen Bail Case Prediction (AFTER TUNNING)
# =====================================================

# New unseen bail application text (real-world style)
new_case_text = """
The accused has no prior criminal history.
The offence is non-violent in nature.
Investigation is complete and charge sheet is filed.
The accused has cooperated with the authorities.
"""

#Ye ek normal English paragraph hai Court judgement YA bail application jaisa,
#Ye data training me kabhi use nahi hua (UNSEEN DATA)

# Text ko numbers me convert karna (TF-IDF vector)
# IMPORTANT KI
# - yahan .fit() use NAHI karte
# - kyunki fit sirf training data pe hota hai
# - transform() ka matlab: same language rules me convert karna

new_case_vector = tfidf.transform([new_case_text])

# üëâ Output sparse numeric vector hota hai
# üëâ Shape: (1, 5000)
# üëâ 1 = ek case, 5000 = same features jo training me the


# 3Ô∏è‚É£ Model se probability nikalna
# predict_proba() 2 probabilities deta hai:
# [ P(class=0), P(class=1) ]
# class 0 = Bail Rejected
# class 1 = Bail Granted

new_case_prob = model.predict_proba(new_case_vector)

new_case_prob

array([[0.42394394, 0.57605606]])

In [None]:
# predict_proba() model ki confidence batata hai
# Output format: [P(bail_rejected), P(bail_granted)]
# Is case me
# - Bail Rejected probability ‚âà 42%
# - Bail Granted probability ‚âà 58%
# Strict threshold (0.7) ke hisaab se
# bail grant nahi hoti (safety first)