In [None]:
!pip install transformers==4.30.0
!pip install torch torchvision torchaudio
!pip install PyPDF2
!pip install sentence-transformers
!pip install numpy pandas
!pip install gradio

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel, pipeline
import PyPDF2
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import re
import warnings
import os
from google.colab import files
import io

# Silence library warnings for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Report the runtime environment: import status, PyTorch build and GPU (if any).
for report_line in (
    "‚úÖ All libraries imported successfully!",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    f"GPU device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU only'}",
):
    print(report_line)


ModuleNotFoundError: No module named 'torch'

In [3]:
# Hugging Face Hub identifier of the checkpoint used for tokenizer and encoder.
MODEL_NAME = "law-ai/InLegalBERT"

# Step 1: Load tokenizer
print("\nüî§ STEP 1: Loading InLegalBERT Tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
except Exception as e:
    print(f"‚ùå Tokenizer loading failed: {e}")
    # Re-raise rather than exit(): the cell still stops here, but the kernel
    # stays alive and the full traceback is shown.
    raise

# Reported outside the try block so a problem in these prints is not
# mislabelled as a loading failure.
print("‚úÖ Tokenizer loaded successfully!")
print(f"   ‚Ä¢ Vocabulary size: {len(tokenizer.vocab)}")
print(f"   ‚Ä¢ Max sequence length: {tokenizer.model_max_length}")
print(f"   ‚Ä¢ Special tokens: {list(tokenizer.special_tokens_map.keys())}")


# Step 2: Load model
print("\nüß† STEP 2: Loading InLegalBERT Model...")
try:
    model = AutoModel.from_pretrained(MODEL_NAME)
except Exception as e:
    print(f" Model loading failed: {e}")
    # Same reasoning as above: fail loudly without killing the kernel.
    raise

print("‚úÖ InLegalBERT model loaded successfully!")
print(f"   ‚Ä¢ Model type: {type(model).__name__}")
print(f"   ‚Ä¢ Hidden size: {model.config.hidden_size}")
print(f"   ‚Ä¢ Number of layers: {model.config.num_hidden_layers}")
print(f"   ‚Ä¢ Attention heads: {model.config.num_attention_heads}")
print(f"   ‚Ä¢ Total parameters: ~{sum(p.numel() for p in model.parameters())/1e6:.1f}M")



üî§ STEP 1: Loading InLegalBERT Tokenizer...


tokenizer_config.json:   0%|          | 0.00/516 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

‚úÖ Tokenizer loaded successfully!
   ‚Ä¢ Vocabulary size: 30522
   ‚Ä¢ Max sequence length: 512
   ‚Ä¢ Special tokens: ['unk_token', 'sep_token', 'pad_token', 'cls_token', 'mask_token']

üß† STEP 2: Loading InLegalBERT Model...


config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/534M [00:00<?, ?B/s]

‚úÖ InLegalBERT model loaded successfully!
   ‚Ä¢ Model type: BertModel
   ‚Ä¢ Hidden size: 768
   ‚Ä¢ Number of layers: 12
   ‚Ä¢ Attention heads: 12
   ‚Ä¢ Total parameters: ~109.5M


In [12]:
class LegalDocumentProcessor:
    """Extract, summarise, section and query the text of a legal document.

    The InLegalBERT tokenizer/encoder are supplied by the caller. ``__init__``
    additionally loads a MiniLM sentence encoder, a BART summarisation pipeline
    and a RoBERTa question-answering pipeline. The two pipelines are optional:
    if one cannot be loaded its attribute is set to ``None`` and the matching
    method falls back to an embedding-based approach.
    """

    # Sentence fragments must be longer than this many characters to be kept.
    MIN_SENTENCE_CHARS = 20

    # Terms that earn a sentence a small ranking bonus in extractive_summary.
    LEGAL_KEYWORDS = ('court', 'held', 'judgment', 'petitioner', 'respondent', 'article', 'constitution')

    # Heading keywords per section. The \b anchors stop a keyword from matching
    # inside a longer word (e.g. "ratio" inside "Corporation", "held" inside
    # "withheld", "order" inside "border").
    SECTION_PATTERNS = {
        'facts': r'(?i)\b(facts?|background|case background)\b',
        'arguments': r'(?i)\b(arguments?|submissions?|contentions?)\b',
        'judgment': r'(?i)\b(judgment|decision|held|ruling|court held)\b',
        'orders': r'(?i)\b(orders?|directions?|disposed)\b',
        'ratio': r'(?i)\b(ratio|principle|legal principle)\b'
    }

    def __init__(self, tokenizer, model):
        """Store the InLegalBERT tokenizer/model and load the helper models.

        Args:
            tokenizer: Tokenizer matching ``model``; called in ``get_embeddings``.
            model: Encoder whose output exposes ``last_hidden_state``.
        """
        self.tokenizer = tokenizer
        self.model = model

        # Load additional models
        print("Loading additional models...")
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Both pipelines are best-effort. Catch Exception (not a bare except,
        # which would also trap KeyboardInterrupt) and show why loading failed.
        try:
            self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
            print("‚úÖ BART summarizer loaded")
        except Exception as e:
            print("‚ö†Ô∏è BART failed - using extractive only")
            print(f"   reason: {e}")
            self.summarizer = None

        try:
            self.qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
            print("‚úÖ Q&A system loaded")
        except Exception as e:
            print("‚ö†Ô∏è Q&A system failed - using similarity")
            print(f"   reason: {e}")
            self.qa_pipeline = None

        print("üéØ Legal Document Processor ready!")

    @staticmethod
    def _split_sentences(text, min_chars=MIN_SENTENCE_CHARS):
        """Split ``text`` on '.' and keep stripped fragments longer than ``min_chars``.

        This is a deliberately simple splitter: any period ends a fragment, so
        abbreviations such as "Rs." or "v." also split the text.
        """
        fragments = (piece.strip() for piece in text.split('.'))
        return [piece for piece in fragments if len(piece) > min_chars]

    @staticmethod
    def _l2_normalize(rows):
        """Scale each row of a 2-D array to unit length (zero rows stay zero)."""
        norms = np.linalg.norm(rows, axis=1, keepdims=True)
        return rows / np.clip(norms, 1e-12, None)

    def extract_pdf_text(self, uploaded_file):
        """Extract and lightly clean the text of a PDF.

        Args:
            uploaded_file: Raw PDF content as bytes.

        Returns:
            The whitespace-normalised text with "Page N" markers removed, or
            ``None`` if the PDF could not be read.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file))

            # Guard against a page yielding no text (None or empty), which
            # would otherwise abort extraction of the whole document.
            page_texts = [(page.extract_text() or "") for page in pdf_reader.pages]
            text = "\n".join(page_texts)

            # Clean text
            text = re.sub(r'\s+', ' ', text)
            text = re.sub(r'Page \d+', '', text)

            return text.strip()
        except Exception as e:
            print(f"‚ùå PDF extraction failed: {e}")
            return None

    def get_embeddings(self, text):
        """Return mean-pooled InLegalBERT embeddings for ``text``.

        Args:
            text: A string, or a list of strings to embed as one batch.

        Returns:
            Tensor with one row per input text. Input is truncated to 512 tokens.
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Average over real tokens only. For a single text the mask is all
        # ones, so this equals a plain mean; for a padded batch it keeps the
        # padding positions from diluting the shorter texts.
        hidden = outputs.last_hidden_state
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        return (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)

    def extractive_summary(self, text, num_sentences=5):
        """Pick the sentences closest to the document centroid.

        Sentences are scored by cosine similarity between their embedding and
        the mean of all sentence embeddings, plus a keyword bonus capped at
        0.3. The best ``num_sentences`` are returned in document order.

        Args:
            text: Document text.
            num_sentences: Number of sentences to keep.

        Returns:
            The summary; ``text`` unchanged when it has no more than
            ``num_sentences`` usable sentences; ``""`` when ``num_sentences``
            is not positive; the first 200 words plus "..." if scoring fails.
        """
        # A slice of [-0:] would select every sentence, so handle this up front.
        if num_sentences <= 0:
            return ""

        sentences = self._split_sentences(text)

        if len(sentences) <= num_sentences:
            return text

        try:
            # One embedding per sentence
            embeddings = np.array([
                self.get_embeddings(sentence).squeeze(0).cpu().numpy()
                for sentence in sentences
            ])

            # Document centroid and cosine similarity of each sentence to it
            doc_embedding = embeddings.mean(axis=0)
            norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(doc_embedding)
            scores = embeddings @ doc_embedding / (norms + 1e-8)

            # Legal keyword bonus: 0.1 per keyword present, capped at 0.3
            for i, sentence in enumerate(sentences):
                lowered = sentence.lower()
                bonus = sum(0.1 for keyword in self.LEGAL_KEYWORDS if keyword in lowered)
                scores[i] += min(bonus, 0.3)

            # Top sentences, restored to their original order
            top_indices = sorted(np.argsort(scores)[-num_sentences:])

            return '. '.join(sentences[i] for i in top_indices) + '.'

        except Exception as e:
            print(f"‚ö†Ô∏è Extractive summary failed: {e}")
            return ' '.join(text.split()[:200]) + "..."

    def abstractive_summary(self, text, max_length=150):
        """Summarise ``text`` with BART, falling back to ``extractive_summary``.

        Args:
            text: Document text; only the first 1000 words are summarised.
            max_length: Upper bound on the generated summary length.

        Returns:
            The generated summary, or the extractive summary when the
            summariser is unavailable or raises.
        """
        if not self.summarizer:
            return self.extractive_summary(text)

        try:
            # Limit input length
            words = text.split()
            if len(words) > 1000:
                text = ' '.join(words[:1000])

            summary = self.summarizer(
                text,
                max_length=max_length,
                # Keep min_length feasible when the caller asks for a short summary.
                min_length=min(50, max_length),
                do_sample=False,
                # 1000 words can still exceed the model's token limit; let the
                # tokenizer truncate instead of failing.
                truncation=True,
            )
            return summary[0]['summary_text']
        except Exception as e:
            print(f"‚ö†Ô∏è Abstractive summary failed: {e}")
            return self.extractive_summary(text)

    def answer_question(self, question, context, max_context_chars=2000):
        """Answer ``question`` from ``context``.

        Uses the question-answering pipeline when it is loaded; otherwise, or
        if it raises, returns the context sentence most similar to the question.

        Args:
            question: The user's question.
            context: Document text to search.
            max_context_chars: Only this many leading characters of ``context``
                are passed to the QA pipeline; ``None`` passes all of it.

        Returns:
            Dict with keys ``'answer'``, ``'confidence'`` and ``'method'``
            (``'transformer'``, ``'similarity'`` or ``'none'``).
        """
        if self.qa_pipeline:
            try:
                result = self.qa_pipeline(question=question, context=context[:max_context_chars])
                return {
                    'answer': result['answer'],
                    'confidence': result['score'],
                    'method': 'transformer'
                }
            except Exception as e:
                print(f"‚ö†Ô∏è Q&A failed: {e}")

        # Fallback: similarity-based
        sentences = self._split_sentences(context)
        if not sentences:
            # Same keys as every other return path, so callers can index 'method'.
            return {'answer': 'No relevant information found.', 'confidence': 0.0, 'method': 'none'}

        # Normalise so the score is a cosine similarity whatever the encoder returns.
        question_emb = self._l2_normalize(np.asarray(self.sentence_model.encode([question]), dtype=float))
        sentence_embs = self._l2_normalize(np.asarray(self.sentence_model.encode(sentences), dtype=float))

        similarities = (sentence_embs @ question_emb.T).flatten()
        best_idx = int(np.argmax(similarities))

        return {
            'answer': sentences[best_idx],
            'confidence': float(similarities[best_idx]),
            'method': 'similarity'
        }

    def extract_sections(self, text, max_chars=800):
        """Extract key sections from a legal document by heading keyword.

        For each entry of ``SECTION_PATTERNS`` the first whole-word match is
        located and up to ``max_chars`` characters following it are taken as
        the section text.

        Args:
            text: Document text.
            max_chars: Maximum number of characters captured per section.

        Returns:
            Dict mapping section name to its text; sections whose text is 50
            characters or shorter are omitted.
        """
        sections = {}

        for section_name, pattern in self.SECTION_PATTERNS.items():
            first_match = re.search(pattern, text)
            if first_match is None:
                continue

            start_pos = first_match.end()
            end_pos = min(start_pos + max_chars, len(text))
            section_text = text[start_pos:end_pos].strip()
            if len(section_text) > 50:
                sections[section_name] = section_text

        return sections

# Initialize the shared processor used by the upload and Q&A cells below.
# Instantiation loads the helper models and prints their load status.
processor = LegalDocumentProcessor(tokenizer, model)


Loading additional models...


Device set to use cpu


‚úÖ BART summarizer loaded


Device set to use cpu


‚úÖ Q&A system loaded
üéØ Legal Document Processor ready!


In [13]:
def upload_and_process():
    """Prompt for a PDF upload, then extract, section and summarise it.

    Only the first uploaded file is processed. Progress and results are
    printed as the function runs.

    Returns:
        Dict with keys 'filename', 'original_text', 'length', 'sections',
        'extractive_summary' and 'abstractive_summary', or None when nothing
        was uploaded or no text could be extracted.
    """
    print("üì§ UPLOAD YOUR LEGAL DOCUMENT (PDF)")
    print("=" * 50)

    # Colab upload widget: maps each uploaded file name to its content
    uploads = files.upload()
    if not uploads:
        print("‚ùå No file uploaded")
        return None

    # First uploaded file only
    filename, pdf_bytes = next(iter(uploads.items()))

    print(f"\nüìÑ Processing: {filename}")
    print("‚è≥ Extracting text from PDF...")

    text = processor.extract_pdf_text(pdf_bytes)
    if not text:
        print("‚ùå Failed to extract text")
        return None

    print(f"‚úÖ Extracted {len(text)} characters")

    # Sections first, then both summaries
    print("\nüß† Processing with InLegalBERT...")
    sections = processor.extract_sections(text)

    print("üìù Generating summaries...")
    extractive_summary = processor.extractive_summary(text, 5)
    abstractive_summary = processor.abstractive_summary(text)

    # ---- Report ----
    summary_rule = "-" * 50

    print("\n" + "=" * 60)
    print("üìä PROCESSING RESULTS")
    print("=" * 60)

    print(f"üìÑ Document: {filename}")
    print(f"üìè Length: {len(text):,} characters")
    print(f"üìë Sections found: {len(sections)}")

    if sections:
        print(f"   ‚Ä¢ {', '.join(sections.keys())}")

    print(f"\nüìã EXTRACTIVE SUMMARY ({len(extractive_summary.split())} words):")
    print(summary_rule)
    print(extractive_summary)

    print(f"\nüéØ ABSTRACTIVE SUMMARY ({len(abstractive_summary.split())} words):")
    print(summary_rule)
    print(abstractive_summary)

    if sections:
        print(f"\nüìë KEY SECTIONS:")
        print("-" * 20)
        for section_name, section_text in sections.items():
            print(f"\nüîπ {section_name.upper()}:")
            # Show at most the first 300 characters of each section
            if len(section_text) > 300:
                print(section_text[:300] + "...")
            else:
                print(section_text)

    return {
        'filename': filename,
        'original_text': text,
        'length': len(text),
        'sections': sections,
        'extractive_summary': extractive_summary,
        'abstractive_summary': abstractive_summary
    }

# Run the upload and processing. The result is a dict of extracted text,
# sections and summaries, or None when the upload or text extraction failed.
document_results = upload_and_process()


üì§ UPLOAD YOUR LEGAL DOCUMENT (PDF)


Saving case1.pdf to case1.pdf

üìÑ Processing: case1.pdf
‚è≥ Extracting text from PDF...
‚úÖ Extracted 1189 characters

üß† Processing with InLegalBERT...
üìù Generating summaries...

üìä PROCESSING RESULTS
üìÑ Document: case1.pdf
üìè Length: 1,189 characters
üìë Sections found: 4
   ‚Ä¢ facts, arguments, judgment, ratio

üìã EXTRACTIVE SUMMARY (73 words):
--------------------------------------------------
, the Hon'ble Supreme Court of India examined the question of breach of contract and commercial obligations. FACTS: The appellant XYZ Corporation entered into a supply agreement with respondent ABC Ltd. The respondent contended that time was of essence and the delay caused significant business losses. JUDGMENT: The Court held that commercial contracts must be performed with due diligence. 50 lakhs to the respondent and directed completion of the contract within 3 months.

üéØ ABSTRACTIVE SUMMARY (55 words):
--------------------------------------------------
 XYZ Corporation 

In [None]:
def ask_your_questions(document_results):
    """Run an interactive question-and-answer loop over a processed document.

    Commands: 'done' / 'exit' / 'quit' end the session, 'summary' reprints the
    document summary and 'help' lists example questions. Any other non-empty
    input is answered with ``processor.answer_question``.

    Args:
        document_results: Dict with at least 'filename' and 'original_text';
            a summary is read from 'summary', 'abstractive_summary' or
            'extractive_summary', whichever is present first.

    Returns:
        Dict mapping each question asked to its answer dict, or None when
        ``document_results`` is empty.
    """

    if not document_results:
        print("‚ùå No document processed. Please run the upload cell first.")
        return

    text = document_results['original_text']

    print("üî• ASK YOUR OWN QUESTIONS")
    print("="*30)
    print(f"Document: {document_results['filename']}")
    print(f"Length: {len(text):,} characters")
    print("\nType your questions below:")
    print("‚Ä¢ Type 'done' when finished")
    print("‚Ä¢ Type 'summary' to see document summary again")
    print("‚Ä¢ Type 'help' for tips on good questions")

    all_answers = {}
    question_count = 1

    while True:
        try:
            print(f"\n{'='*50}")
            try:
                user_question = input(f"‚ùì Question {question_count}: ").strip()
            except Exception as e:
                # Input cannot be read (e.g. EOF or no interactive stdin).
                # Retrying would fail the same way forever, so end the session.
                print(f"‚ö†Ô∏è Error: {e}")
                print("\n\nüëã Q&A session ended.")
                break

            command = user_question.lower()

            if command in ['done', 'exit', 'quit']:
                print("üëã Q&A session completed!")
                break

            if command == 'summary':
                print("\nüìã DOCUMENT SUMMARY:")
                print("-" * 25)
                # The upload step stores its summaries under
                # 'abstractive_summary' / 'extractive_summary'; accept a plain
                # 'summary' key too and use the first one that has content.
                summary_text = next(
                    (
                        document_results[key]
                        for key in ('summary', 'abstractive_summary', 'extractive_summary')
                        if document_results.get(key)
                    ),
                    None,
                )
                print(summary_text if summary_text else "No summary available for this document.")
                continue

            if command == 'help':
                print("\nüí° QUESTION TIPS:")
                print("‚Ä¢ What was the main issue in this case?")
                print("‚Ä¢ Who were the parties involved?")
                print("‚Ä¢ What did the court decide?")
                print("‚Ä¢ What legal principles were applied?")
                print("‚Ä¢ What damages were awarded?")
                print("‚Ä¢ What were the key facts?")
                print("‚Ä¢ What was the court's reasoning?")
                continue

            if user_question == '':
                print("‚ö†Ô∏è Please enter a question")
                continue

            print("ü§î Analyzing with InLegalBERT...")
            answer_data = processor.answer_question(user_question, text)
            all_answers[user_question] = answer_data

            print(f"\nüí° ANSWER:")
            print("-" * 15)
            print(f"{answer_data['answer']}")
            # Read defensively: not every answer dict is guaranteed a 'method'.
            print(f"üîß Method: {answer_data.get('method', 'unknown')}")

            question_count += 1

        except KeyboardInterrupt:
            print("\n\nüëã Q&A session ended.")
            break
        except Exception as e:
            # A failure while answering one question should not end the session.
            print(f"‚ö†Ô∏è Error: {e}")
            continue

    return all_answers

# Start your custom Q&A session.
# The name is looked up through globals() so that running this cell before the
# upload cell shows the hint at the bottom instead of raising NameError.
session_document = globals().get('document_results')

if session_document:
    your_answers = ask_your_questions(session_document)

    # Show summary of your Q&A session
    if your_answers:
        print(f"\nüìä SESSION SUMMARY:")
        print("="*25)
        print(f"Questions asked: {len(your_answers)}")
        print(f"Document: {session_document['filename']}")

        # Answers scoring above 0.5 count as high confidence; a missing score counts as 0.
        high_confidence = sum(1 for ans in your_answers.values() if ans.get('confidence', 0.0) > 0.5)
        print(f"High confidence answers: {high_confidence}/{len(your_answers)}")
else:
    print("‚ö†Ô∏è Please upload a document first by running the previous cell")
