In [1]:
!pip install transformers==4.30.0
!pip install torch torchvision torchaudio
!pip install PyPDF2
!pip install sentence-transformers
!pip install numpy pandas
!pip install gradio

Collecting transformers==4.30.0
  Downloading transformers-4.30.0-py3-none-any.whl.metadata (113 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/113.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m112.6/113.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.6/113.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30.0)
  Downloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.30.0-py3-none-any.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel, pipeline
import PyPDF2
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import re
import warnings
import os
from google.colab import files
import io

warnings.filterwarnings('ignore')

print("✅ All libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU only'}")


✅ All libraries imported successfully!
PyTorch version: 2.6.0+cu124
CUDA available: False
GPU device: CPU only


In [3]:
MODEL_NAME = "law-ai/InLegalBERT"

# Step 2: Load tokenizer
print("\n🔤 STEP 1: Loading InLegalBERT Tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    print("✅ Tokenizer loaded successfully!")
    print(f"   • Vocabulary size: {len(tokenizer.vocab)}")
    print(f"   • Max sequence length: {tokenizer.model_max_length}")
    print(f"   • Special tokens: {list(tokenizer.special_tokens_map.keys())}")
except Exception as e:
    print(f"❌ Tokenizer loading failed: {e}")
    exit()


# Step 3: Load model
print("\n🧠 STEP 2: Loading InLegalBERT Model...")
try:
    model = AutoModel.from_pretrained(MODEL_NAME)
    print("✅ InLegalBERT model loaded successfully!")
    print(f"   • Model type: {type(model).__name__}")
    print(f"   • Hidden size: {model.config.hidden_size}")
    print(f"   • Number of layers: {model.config.num_hidden_layers}")
    print(f"   • Attention heads: {model.config.num_attention_heads}")
    print(f"   • Total parameters: ~{sum(p.numel() for p in model.parameters())/1e6:.1f}M")
except Exception as e:
    print(f" Model loading failed: {e}")
    exit()



🔤 STEP 1: Loading InLegalBERT Tokenizer...


tokenizer_config.json:   0%|          | 0.00/516 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

✅ Tokenizer loaded successfully!
   • Vocabulary size: 30522
   • Max sequence length: 512
   • Special tokens: ['unk_token', 'sep_token', 'pad_token', 'cls_token', 'mask_token']

🧠 STEP 2: Loading InLegalBERT Model...


config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/534M [00:00<?, ?B/s]

✅ InLegalBERT model loaded successfully!
   • Model type: BertModel
   • Hidden size: 768
   • Number of layers: 12
   • Attention heads: 12
   • Total parameters: ~109.5M


In [12]:
class LegalDocumentProcessor:
    def __init__(self, tokenizer, model):
        self.tokenizer = tokenizer
        self.model = model

        # Load additional models
        print("Loading additional models...")
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

        try:
            self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
            print("✅ BART summarizer loaded")
        except:
            print("⚠️ BART failed - using extractive only")
            self.summarizer = None

        try:
            self.qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
            print("✅ Q&A system loaded")
        except:
            print("⚠️ Q&A system failed - using similarity")
            self.qa_pipeline = None

        print("🎯 Legal Document Processor ready!")

    def extract_pdf_text(self, uploaded_file):
        """Extract text from uploaded PDF"""
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file))
            text = ""

            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"

            # Clean text
            text = re.sub(r'\s+', ' ', text)
            text = re.sub(r'Page \d+', '', text)

            return text.strip()
        except Exception as e:
            print(f"❌ PDF extraction failed: {e}")
            return None

    def get_embeddings(self, text):
        """Get InLegalBERT embeddings"""
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(dim=1)

    def extractive_summary(self, text, num_sentences=5):
        """Create extractive summary using InLegalBERT"""
        sentences = [s.strip() for s in text.split('.') if len(s.strip()) > 20]

        if len(sentences) <= num_sentences:
            return text

        try:
            # Get embeddings for each sentence
            embeddings = []
            for sentence in sentences:
                emb = self.get_embeddings(sentence)
                embeddings.append(emb.squeeze().numpy())

            embeddings = np.array(embeddings)

            # Calculate document centroid
            doc_embedding = np.mean(embeddings, axis=0)

            # Score sentences by similarity to document
            scores = []
            for emb in embeddings:
                score = np.dot(emb, doc_embedding) / (np.linalg.norm(emb) * np.linalg.norm(doc_embedding) + 1e-8)
                scores.append(score)

            # Add legal keyword bonus
            legal_keywords = ['court', 'held', 'judgment', 'petitioner', 'respondent', 'article', 'constitution']
            for i, sentence in enumerate(sentences):
                bonus = sum(0.1 for keyword in legal_keywords if keyword.lower() in sentence.lower())
                scores[i] += min(bonus, 0.3)

            # Select top sentences
            top_indices = np.argsort(scores)[-num_sentences:]
            top_indices = sorted(top_indices)

            summary = '. '.join([sentences[i] for i in top_indices]) + '.'
            return summary

        except Exception as e:
            print(f"⚠️ Extractive summary failed: {e}")
            return ' '.join(text.split()[:200]) + "..."

    def abstractive_summary(self, text, max_length=150):
        """Create abstractive summary using BART"""
        if not self.summarizer:
            return self.extractive_summary(text)

        try:
            # Limit input length
            words = text.split()
            if len(words) > 1000:
                text = ' '.join(words[:1000])

            summary = self.summarizer(text, max_length=max_length, min_length=50, do_sample=False)
            return summary[0]['summary_text']
        except Exception as e:
            print(f"⚠️ Abstractive summary failed: {e}")
            return self.extractive_summary(text)

    def answer_question(self, question, context):
        """Answer question about the document"""
        if self.qa_pipeline:
            try:
                result = self.qa_pipeline(question=question, context=context[:2000])
                return {
                    'answer': result['answer'],
                    'confidence': result['score'],
                    'method': 'transformer'
                }
            except Exception as e:
                print(f"⚠️ Q&A failed: {e}")

        # Fallback: similarity-based
        sentences = [s.strip() for s in context.split('.') if len(s.strip()) > 20]
        if not sentences:
            return {'answer': 'No relevant information found.', 'confidence': 0.0}

        question_emb = self.sentence_model.encode([question])
        sentence_embs = self.sentence_model.encode(sentences)

        similarities = np.dot(sentence_embs, question_emb.T).flatten()
        best_idx = np.argmax(similarities)

        return {
            'answer': sentences[best_idx],
            'confidence': float(similarities[best_idx]),
            'method': 'similarity'
        }

    def extract_sections(self, text):
        """Extract key sections from legal document"""
        sections = {}
        patterns = {
            'facts': r'(?i)(facts?|background|case background)',
            'arguments': r'(?i)(arguments?|submissions?|contentions?)',
            'judgment': r'(?i)(judgment|decision|held|ruling|court held)',
            'orders': r'(?i)(orders?|directions?|disposed)',
            'ratio': r'(?i)(ratio|principle|legal principle)'
        }

        for section_name, pattern in patterns.items():
            matches = list(re.finditer(pattern, text))
            if matches:
                start_pos = matches[0].end()
                end_pos = min(start_pos + 800, len(text))
                section_text = text[start_pos:end_pos].strip()
                if len(section_text) > 50:
                    sections[section_name] = section_text

        return sections

# Initialize processor
processor = LegalDocumentProcessor(tokenizer, model)


Loading additional models...


Device set to use cpu


✅ BART summarizer loaded


Device set to use cpu


✅ Q&A system loaded
🎯 Legal Document Processor ready!


In [13]:
def upload_and_process():
    """Upload PDF and process it completely"""
    print("📤 UPLOAD YOUR LEGAL DOCUMENT (PDF)")
    print("="*50)

    # Upload file
    uploaded = files.upload()

    if not uploaded:
        print("❌ No file uploaded")
        return None

    filename = list(uploaded.keys())[0]
    file_content = uploaded[filename]

    print(f"\n📄 Processing: {filename}")
    print("⏳ Extracting text from PDF...")

    # Extract text
    text = processor.extract_pdf_text(file_content)

    if not text:
        print("❌ Failed to extract text")
        return None

    print(f"✅ Extracted {len(text)} characters")

    # Process document
    print("\n🧠 Processing with InLegalBERT...")

    # Extract sections
    sections = processor.extract_sections(text)

    # Generate summaries
    print("📝 Generating summaries...")
    extractive_summary = processor.extractive_summary(text, 5)
    abstractive_summary = processor.abstractive_summary(text)

    # Results
    results = {
        'filename': filename,
        'original_text': text,
        'length': len(text),
        'sections': sections,
        'extractive_summary': extractive_summary,
        'abstractive_summary': abstractive_summary
    }

    # Display results
    print("\n" + "="*60)
    print("📊 PROCESSING RESULTS")
    print("="*60)

    print(f"📄 Document: {filename}")
    print(f"📏 Length: {len(text):,} characters")
    print(f"📑 Sections found: {len(sections)}")

    if sections:
        print(f"   • {', '.join(sections.keys())}")

    print(f"\n📋 EXTRACTIVE SUMMARY ({len(extractive_summary.split())} words):")
    print("-" * 50)
    print(extractive_summary)

    print(f"\n🎯 ABSTRACTIVE SUMMARY ({len(abstractive_summary.split())} words):")
    print("-" * 50)
    print(abstractive_summary)

    if sections:
        print(f"\n📑 KEY SECTIONS:")
        print("-" * 20)
        for section_name, section_text in sections.items():
            print(f"\n🔹 {section_name.upper()}:")
            preview = section_text[:300] + "..." if len(section_text) > 300 else section_text
            print(preview)

    return results

# Run the upload and processing
document_results = upload_and_process()


📤 UPLOAD YOUR LEGAL DOCUMENT (PDF)


Saving case1.pdf to case1.pdf

📄 Processing: case1.pdf
⏳ Extracting text from PDF...
✅ Extracted 1189 characters

🧠 Processing with InLegalBERT...
📝 Generating summaries...

📊 PROCESSING RESULTS
📄 Document: case1.pdf
📏 Length: 1,189 characters
📑 Sections found: 4
   • facts, arguments, judgment, ratio

📋 EXTRACTIVE SUMMARY (73 words):
--------------------------------------------------
, the Hon'ble Supreme Court of India examined the question of breach of contract and commercial obligations. FACTS: The appellant XYZ Corporation entered into a supply agreement with respondent ABC Ltd. The respondent contended that time was of essence and the delay caused significant business losses. JUDGMENT: The Court held that commercial contracts must be performed with due diligence. 50 lakhs to the respondent and directed completion of the contract within 3 months.

🎯 ABSTRACTIVE SUMMARY (55 words):
--------------------------------------------------
 XYZ Corporation vs ABC Ltd., the Hon'ble Supreme 

In [None]:
def ask_your_questions(document_results):
    """Ask your own custom questions about the document"""
    
    if not document_results:
        print("❌ No document processed. Please run the upload cell first.")
        return
    
    text = document_results['original_text']
    
    print("🔥 ASK YOUR OWN QUESTIONS")
    print("="*30)
    print(f"Document: {document_results['filename']}")
    print(f"Length: {len(text):,} characters")
    print("\nType your questions below:")
    print("• Type 'done' when finished")
    print("• Type 'summary' to see document summary again")
    print("• Type 'help' for tips on good questions")
    
    all_answers = {}
    question_count = 1
    
    while True:
        try:
            print(f"\n{'='*50}")
            user_question = input(f"❓ Question {question_count}: ").strip()
            
            if user_question.lower() in ['done', 'exit', 'quit']:
                print("👋 Q&A session completed!")
                break
            
            if user_question.lower() == 'summary':
                print("\n📋 DOCUMENT SUMMARY:")
                print("-" * 25)
                print(document_results['summary'])
                continue
            
            if user_question.lower() == 'help':
                print("\n💡 QUESTION TIPS:")
                print("• What was the main issue in this case?")
                print("• Who were the parties involved?")
                print("• What did the court decide?")
                print("• What legal principles were applied?")
                print("• What damages were awarded?")
                print("• What were the key facts?")
                print("• What was the court's reasoning?")
                continue
            
            if user_question == '':
                print("⚠️ Please enter a question")
                continue
            
            print("🤔 Analyzing with InLegalBERT...")
            answer_data = processor.answer_question(user_question, text)
            all_answers[user_question] = answer_data
            
            print(f"\n💡 ANSWER:")
            print("-" * 15)
            print(f"{answer_data['answer']}")
            print(f"🔧 Method: {answer_data['method']}")
            
            question_count += 1
            
        except KeyboardInterrupt:
            print("\n\n👋 Q&A session ended.")
            break
        except Exception as e:
            print(f"⚠️ Error: {e}")
            continue
    
    return all_answers

# Start your custom Q&A session
if document_results:
    your_answers = ask_your_questions(document_results)
    
    # Show summary of your Q&A session
    if your_answers:
        print(f"\n📊 SESSION SUMMARY:")
        print("="*25)
        print(f"Questions asked: {len(your_answers)}")
        print(f"Document: {document_results['filename']}")
        
        high_confidence = sum(1 for ans in your_answers.values() if ans['confidence'] > 0.5)
        print(f"High confidence answers: {high_confidence}/{len(your_answers)}")
else:
    print("⚠️ Please upload a document first by running the previous cell")
