# Supreme Court of Pakistan Data Extraction - NLP Assignment

**Assignment:** Group G4 - Supreme Court of Pakistan Data Extraction  
**Sources:** 
- Case Information: https://scp.gov.pk/OnlineCaseInformation.aspx
- Judgments: https://www.supremecourt.gov.pk/judgement-search/

**Objective:** Extract complete legal data from 1980-2025 (~3,326 judgments)

**Output Files:**
- `SupremeCourt_CaseInfo.json` - Case information with advocates, history, and court details
- `SupremeCourt_Judgments.json` - Judgment data with metadata and download links
- PDF folders: `memopdfs/`, `judgementpdfs/`, `judgmentspdfs/`

**Important:** This notebook is designed for Google Colab. Run cells sequentially and monitor progress.

In [None]:
# Install required packages for Google Colab
# (browser automation, HTML parsing, HTTP, tabular data, PDF parsing, progress bars)
!pip install selenium webdriver-manager beautifulsoup4 requests pandas pdfplumber PyPDF2 tqdm

# Install Chrome WebDriver for Selenium
# Refresh the apt package index before installing the driver package.
!apt-get update
!apt install chromium-chromedriver
# Copy the driver binary into /usr/bin.
# NOTE(review): this source path depends on how the package lays out its files on
# the runtime image - confirm it exists if the copy fails.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

print("✅ All packages installed successfully!")

In [None]:
# Import all required libraries
import json
import os
import time
import requests
from datetime import datetime
import pandas as pd
import re
from urllib.parse import urljoin, urlparse
from pathlib import Path

# Web scraping libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# PDF processing
import PyPDF2
import pdfplumber

# Progress tracking
from tqdm import tqdm

print("✅ All libraries imported successfully!")

In [None]:
# Configuration and Setup
class Config:
    """Static settings shared by the scraping, download and export cells."""

    # Search pages of the two sites that are scraped
    CASE_INFO_URL = "https://scp.gov.pk/OnlineCaseInformation.aspx"
    JUDGMENT_SEARCH_URL = "https://www.supremecourt.gov.pk/judgement-search/"

    # First and last year of the judgment date range
    START_YEAR, END_YEAR = 1980, 2025

    # Output directories, keyed by a short logical name
    FOLDERS = dict(
        case_info='SupremeCourt_CaseInfo',
        judgments='SupremeCourt_Judgements',
        memopdfs='SupremeCourt_CaseInfo/memopdfs',
        judgementpdfs='SupremeCourt_CaseInfo/judgementpdfs',
        judgmentspdfs='SupremeCourt_Judgements/judgmentspdfs',
    )

    # Pauses, in seconds, between page requests and between file downloads
    REQUEST_DELAY = 2
    DOWNLOAD_DELAY = 1

# Make sure every configured output directory exists (no error if it already does)
for output_dir in Config.FOLDERS.values():
    os.makedirs(output_dir, exist_ok=True)

print("✅ Configuration set and directories created!")

In [None]:
# Utility Functions
def setup_driver():
    """Create and return a headless Chrome WebDriver configured for Colab."""
    # Command-line switches passed to Chrome, applied in this order
    chrome_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--window-size=1920,1080',
    )

    browser_options = Options()
    for flag in chrome_flags:
        browser_options.add_argument(flag)

    return webdriver.Chrome(options=browser_options)

def safe_text(element):
    """Return the stripped text of *element*, or "N/A" when there is none.

    Args:
        element: Any object exposing a ``text`` attribute (or ``None``).

    Returns:
        The element's text with surrounding whitespace removed. "N/A" is
        returned when the element is missing, has no ``text`` attribute, or
        its text is empty or whitespace-only.
    """
    text = getattr(element, "text", None) if element else None
    if not text:
        return "N/A"
    # Whitespace-only text carries no information, so report it as missing too.
    return text.strip() or "N/A"

def clean_case_no(case_no):
    """Turn a case number into a string that is safe to use in a filename.

    Slashes and dots become underscores; every remaining character that is
    not a word character, hyphen or underscore is dropped.
    """
    separators_to_underscore = str.maketrans({'/': '_', '.': '_'})
    underscored = case_no.translate(separators_to_underscore)
    return re.sub(r'[^\w\-_]', '', underscored)

def download_pdf(url, filename, folder):
    """Fetch *url* and save the response body as *folder*/*filename*.

    Returns:
        ``(filepath, size)`` where ``size`` is a string such as "12 KB"
        (whole kibibytes of the downloaded body). On any error a message is
        printed and ``(None, "0 KB")`` is returned instead.
    """
    try:
        reply = requests.get(url, timeout=30)
        reply.raise_for_status()  # treat HTTP error statuses as failures
        payload = reply.content

        destination = os.path.join(folder, filename)
        with open(destination, 'wb') as out_file:
            out_file.write(payload)
    except Exception as e:
        print(f"Error downloading {filename}: {str(e)}")
        return None, "0 KB"

    return destination, f"{len(payload) // 1024} KB"

def get_file_size_str(file_path):
    """Return the size of *file_path* as a string in whole kibibytes.

    Args:
        file_path: Path of the file to measure.

    Returns:
        A string such as "12 KB". "0 KB" is returned when the size cannot be
        read (missing file, permission problem, or an invalid path value).
    """
    try:
        size = os.path.getsize(file_path)
    except (OSError, TypeError, ValueError):
        # Only path/filesystem failures are treated as "no size"; interrupts
        # and other unrelated exceptions are no longer swallowed.
        return "0 KB"
    return f"{size // 1024} KB"

print("✅ Utility functions defined!")

In [None]:
# FIXED Case Information Extractor Class (Improved Version)
class FixedCaseInfoExtractor:
    def __init__(self):
        self.driver = None
        self.cases_data = []
        
    def start_driver(self):
        """Create a WebDriver with setup_driver() and store it on the instance."""
        browser = setup_driver()
        self.driver = browser
        print("✅ WebDriver started successfully!")
        
    def extract_case_info(self, year_range=None, registry_list=None):
        """Extract case information from Supreme Court website"""
        if not self.driver:
            self.start_driver()
            
        # Default parameters if not provided
        if year_range is None:
            year_range = [2022, 2023, 2024, 2025]  # Recent years for testing
        if registry_list is None:
            registry_list = ['Islamabad', 'Lahore', 'Karachi']  # Main registries
            
        print(f"🔍 Extracting cases for years {year_range} and registries {registry_list}")
        
        # First, let's inspect the website structure
        self._inspect_website_structure()
        
        # Try different search strategies
        search_strategies = [
            self._search_by_case_type_and_year,
            self._search_by_registry_and_case_number,
            self._search_comprehensive
        ]
        
        for strategy in search_strategies:
            try:
                print(f"🔄 Trying search strategy: {strategy.__name__}")
                strategy(year_range, registry_list)
                if len(self.cases_data) > 0:
                    print(f"✅ Success with {strategy.__name__}! Found {len(self.cases_data)} cases")
                    break
            except Exception as e:
                print(f"❌ Strategy {strategy.__name__} failed: {str(e)}")
                continue
        
        print(f"✅ Extraction completed! Found {len(self.cases_data)} cases")
        
    def _inspect_website_structure(self):
        """Load the case-information page and print its first 20 form elements.

        Purely diagnostic: any failure is printed and swallowed.
        """
        try:
            self.driver.get(Config.CASE_INFO_URL)
            time.sleep(3)

            # Parse the rendered page and collect every input/select/option tag
            page = BeautifulSoup(self.driver.page_source, 'html.parser')
            form_elements = page.find_all(['input', 'select', 'option'])
            print(f"📋 Found {len(form_elements)} form elements:")

            # Only the first 20 are listed to keep the output short
            for position, node in enumerate(form_elements[:20], start=1):
                name = node.get('name', 'no-name')
                elem_type = node.get('type', node.name)
                value = node.get('value', node.text)
                print(f"   {position}. {elem_type}: name='{name}', value='{value[:50]}...'")

        except Exception as e:
            print(f"Error inspecting website: {str(e)}")
            
    def _search_by_case_type_and_year(self, year_range, registry_list):
        """Search the case form once per (year, case type) combination.

        For each combination the page is reloaded, a case type is chosen in
        the first visible dropdown that offers a matching option, the year is
        typed into the first visible text-like input, and the form is
        submitted. Parsed results are appended to ``self.cases_data`` by
        ``_parse_case_results_improved``.

        Args:
            year_range: Iterable of years to search.
            registry_list: Not used by this strategy; accepted so that all
                strategies can be called with the same arguments.
        """
        self.driver.get(Config.CASE_INFO_URL)
        time.sleep(3)

        case_types = ['Civil', 'Criminal', 'Constitutional', 'All']

        for year in year_range:
            for case_type in case_types:
                try:
                    # Dismiss an alert left over from the previous search, if any
                    try:
                        self.driver.switch_to.alert.accept()
                    except Exception:
                        pass

                    self.driver.get(Config.CASE_INFO_URL)
                    time.sleep(2)

                    # Try progressively broader selectors for the case-type dropdown
                    wanted = case_type.lower()
                    case_type_selected = False
                    for selector in ['[name*="case"]', '[name*="type"]', 'select', 'input[type="radio"]']:
                        try:
                            for elem in self.driver.find_elements(By.CSS_SELECTOR, selector):
                                if not (elem.is_displayed() and elem.is_enabled()):
                                    continue
                                if elem.tag_name != 'select':
                                    continue
                                select = Select(elem)
                                # Select the option that actually matched: the label
                                # may be longer than case_type (substring match), and
                                # select_by_visible_text needs the option's own text.
                                matching_label = next(
                                    (opt.text for opt in select.options if wanted in opt.text.lower()),
                                    None,
                                )
                                if matching_label is not None:
                                    select.select_by_visible_text(matching_label)
                                    case_type_selected = True
                                    break
                        except Exception:
                            continue
                        if case_type_selected:
                            break

                    # Type the year into the first visible, enabled matching input
                    year_input_found = False
                    for selector in ['[name*="year"]', 'input[type="text"]', 'input[type="number"]']:
                        try:
                            for elem in self.driver.find_elements(By.CSS_SELECTOR, selector):
                                if elem.is_displayed() and elem.is_enabled():
                                    elem.clear()
                                    elem.send_keys(str(year))
                                    year_input_found = True
                                    break
                        except Exception:
                            continue
                        if year_input_found:
                            break

                    # Submit search if we found at least one criteria
                    if case_type_selected or year_input_found:
                        search_buttons = self.driver.find_elements(By.XPATH, "//input[@type='submit'] | //button[contains(text(), 'Search')] | //input[@value='Search']")
                        for button in search_buttons:
                            if button.is_displayed() and button.is_enabled():
                                button.click()
                                time.sleep(3)
                                break

                        # Parse results
                        self._parse_case_results_improved()

                except Exception as e:
                    print(f"Error in case type search for {year}-{case_type}: {str(e)}")
                    continue
                    
    def _search_by_registry_and_case_number(self, year_range, registry_list):
        """Search the case form once per (registry, case-number prefix) pair.

        For each pair the page is reloaded, the registry is chosen in the
        first visible dropdown offering a matching option, the prefix is typed
        into the first visible text-like input, and the form is submitted only
        when both fields were filled. Parsed results are appended to
        ``self.cases_data`` by ``_parse_case_results_improved``.

        Args:
            year_range: Not used by this strategy; accepted so that all
                strategies can be called with the same arguments.
            registry_list: Iterable of registry names to search.
        """
        common_case_prefixes = ['C.P.', 'Crl.P.', 'Const.P.', 'C.A.', 'J.P.']

        for registry in registry_list:
            for prefix in common_case_prefixes:
                try:
                    # Dismiss an alert left over from the previous search, if any
                    try:
                        self.driver.switch_to.alert.accept()
                    except Exception:
                        pass

                    self.driver.get(Config.CASE_INFO_URL)
                    time.sleep(2)

                    # Try to find and fill registry
                    wanted = registry.lower()
                    registry_filled = False
                    for selector in ['[name*="registry"]', 'select']:
                        try:
                            for elem in self.driver.find_elements(By.CSS_SELECTOR, selector):
                                if elem.tag_name != 'select' or not elem.is_displayed():
                                    continue
                                select = Select(elem)
                                matching_label = next(
                                    (opt.text for opt in select.options if wanted in opt.text.lower()),
                                    None,
                                )
                                if matching_label is not None:
                                    select.select_by_visible_text(matching_label)
                                    registry_filled = True
                                    # Stop at the first dropdown that accepted the
                                    # registry so later dropdowns are left untouched.
                                    break
                        except Exception:
                            continue
                        if registry_filled:
                            break

                    # Type the prefix into the first visible, enabled matching input
                    case_no_filled = False
                    for selector in ['[name*="case"]', '[name*="number"]', 'input[type="text"]']:
                        try:
                            for elem in self.driver.find_elements(By.CSS_SELECTOR, selector):
                                if elem.is_displayed() and elem.is_enabled():
                                    elem.clear()
                                    elem.send_keys(prefix)
                                    case_no_filled = True
                                    break
                        except Exception:
                            continue
                        if case_no_filled:
                            break

                    # Submit if we have at least 2 criteria
                    if registry_filled and case_no_filled:
                        search_buttons = self.driver.find_elements(By.XPATH, "//input[@type='submit'] | //button[contains(text(), 'Search')] | //input[@value='Search']")
                        for button in search_buttons:
                            if button.is_displayed() and button.is_enabled():
                                button.click()
                                time.sleep(3)
                                break

                        # Parse results
                        self._parse_case_results_improved()

                except Exception as e:
                    print(f"Error in registry search for {registry}-{prefix}: {str(e)}")
                    continue
                    
    def _search_comprehensive(self, year_range, registry_list):
        """Fill whatever form fields can be found and submit a single search.

        Selects the second option of the first usable dropdown, types the
        first year of *year_range* into the first usable text/number input
        and, if fewer than two fields were filled, also selects the second
        option of a later dropdown. The form is submitted only when at least
        two fields were filled; otherwise a message is printed.

        Args:
            year_range: Sequence of years; only the first entry is used.
            registry_list: Not used by this strategy; accepted so that all
                strategies can be called with the same arguments.
        """
        self.driver.get(Config.CASE_INFO_URL)
        time.sleep(3)

        try:
            # Dismiss a pending alert, if any
            try:
                self.driver.switch_to.alert.accept()
            except Exception:
                pass

            # Fill multiple fields to meet the "at least 2 criteria" requirement
            filled_count = 0

            # Try case type
            try:
                case_type_elements = self.driver.find_elements(By.CSS_SELECTOR, 'select, input[type="radio"]')
                for elem in case_type_elements:
                    if elem.is_displayed() and elem.is_enabled():
                        if elem.tag_name == 'select':
                            select = Select(elem)
                            if len(select.options) > 1:
                                select.select_by_index(1)  # Select first non-empty option
                                filled_count += 1
                                break
            except Exception:
                pass

            # Try year
            try:
                year_elements = self.driver.find_elements(By.CSS_SELECTOR, 'input[type="text"], input[type="number"]')
                for elem in year_elements:
                    if elem.is_displayed() and elem.is_enabled():
                        elem.clear()
                        elem.send_keys(str(year_range[0]))
                        filled_count += 1
                        break
            except Exception:
                pass

            # Try registry if we still need criteria
            if filled_count < 2:
                try:
                    registry_elements = self.driver.find_elements(By.CSS_SELECTOR, 'select')
                    for elem in registry_elements[1:]:  # Skip first select if already used
                        if elem.is_displayed() and elem.is_enabled():
                            select = Select(elem)
                            if len(select.options) > 1:
                                select.select_by_index(1)
                                filled_count += 1
                                break
                except Exception:
                    pass

            # Submit if we have enough criteria
            if filled_count >= 2:
                search_buttons = self.driver.find_elements(By.XPATH, "//input[@type='submit'] | //button[contains(text(), 'Search')] | //input[@value='Search']")
                for button in search_buttons:
                    if button.is_displayed() and button.is_enabled():
                        button.click()
                        time.sleep(5)
                        break

                # Parse results
                self._parse_case_results_improved()
            else:
                print(f"Could not fill enough criteria (only {filled_count} filled)")

        except Exception as e:
            print(f"Error in comprehensive search: {str(e)}")
            
    def _parse_case_results_improved(self):
        """Parse the current results page and append new cases to ``self.cases_data``.

        If a browser alert is showing, its text is printed, the alert is
        accepted and nothing is parsed. Otherwise the method waits up to 10
        seconds for a table or a result/case-classed element, then runs
        ``_extract_case_details_improved`` on every candidate container and
        keeps each record that is not already in ``self.cases_data``.
        Any failure (including the wait timing out) is printed and swallowed.
        """
        try:
            # Handle any alerts first
            try:
                alert = self.driver.switch_to.alert
                alert_text = alert.text
                print(f"Alert detected: {alert_text}")
                alert.accept()
                return
            except Exception:
                # No alert present (or it could not be handled): parse the page.
                pass

            # Wait for results to load
            WebDriverWait(self.driver, 10).until(
                lambda driver: driver.find_elements(By.TAG_NAME, 'table') or
                              driver.find_elements(By.CSS_SELECTOR, '[class*="result"]') or
                              driver.find_elements(By.CSS_SELECTOR, '[class*="case"]')
            )

            soup = BeautifulSoup(self.driver.page_source, 'html.parser')

            # Look for different types of result containers
            # NOTE(review): a table and its own rows are both candidates, so the
            # same case can be extracted twice with different dates (once from
            # the whole table's text, once from its row). Confirm against the
            # real page layout before tightening this.
            result_containers = (
                soup.find_all('table') +
                soup.find_all('div', class_=lambda x: x and any(word in x.lower() for word in ['result', 'case', 'data'])) +
                soup.find_all('tr') +
                soup.find_all('div', class_=lambda x: x and 'row' in x.lower())
            )

            print(f"Found {len(result_containers)} potential result containers")

            for container in result_containers:
                case_data = self._extract_case_details_improved(container)
                # Skip containers without a case number and exact duplicates
                if case_data and case_data not in self.cases_data:
                    self.cases_data.append(case_data)
                    print(f"✅ Extracted case: {case_data.get('Case_No', 'Unknown')}")

        except Exception as e:
            print(f"Error parsing results: {str(e)}")
            
    def _extract_case_details_improved(self, container):
        """Improved case details extraction"""
        try:
            case_data = {
                "Case_No": "N/A",
                "Case_Title": "N/A", 
                "Status": "N/A",
                "Institution_Date": "N/A",
                "Disposal_Date": "N/A",
                "Advocates": {
                    "ASC": "N/A",
                    "AOR": "N/A", 
                    "Prosecutor": "N/A"
                },
                "Petition_Appeal_Memo": {
                    "File": "N/A",
                    "Type": "N/A"
                },
                "History": [],
                "Judgement_Order": {
                    "File": "N/A",
                    "Type": "N/A"
                }
            }
            
            # Get text content
            text_content = container.get_text() if hasattr(container, 'get_text') else str(container)
            
            # Extract case number with more patterns
            case_patterns = [
                r'([A-Za-z\.]+\s*\d+\s*/\s*\d{4})',
                r'(Case\s+No[\.:]?\s*[A-Za-z\.]+\s*\d+\s*/\s*\d{4})',
                r'([A-Z][a-z]*\.[A-Z][a-z]*\.\d+/\d{4})'
            ]
            
            for pattern in case_patterns:
                case_match = re.search(pattern, text_content, re.IGNORECASE)
                if case_match:
                    case_data["Case_No"] = case_match.group(1).strip()
                    break
            
            # Extract case title (parties)
            title_patterns = [
                r'([A-Za-z\s]+\s+v[s]?\.\s+[A-Za-z\s]+)',
                r'(Petitioner[s]?:\s*[A-Za-z\s]+)',
                r'([A-Z][a-z]+\s+[A-Z][a-z]+\s+v[s]?\.\s+[A-Z][a-z]+\s+[A-Z][a-z]+)'
            ]
            
            for pattern in title_patterns:
                title_match = re.search(pattern, text_content, re.IGNORECASE)
                if title_match:
                    case_data["Case_Title"] = title_match.group(1).strip()
                    break
            
            # Extract dates
            date_pattern = r'(\d{1,2}[/-]\d{1,2}[/-]\d{4})'
            dates = re.findall(date_pattern, text_content)
            if dates:
                case_data["Institution_Date"] = dates[0]
                if len(dates) > 1:
                    case_data["Disposal_Date"] = dates[-1]
            
            # Extract status
            status_patterns = ['Disposed', 'Pending', 'Decided', 'Dismissed', 'Allowed']
            for status in status_patterns:
                if status.lower() in text_content.lower():
                    case_data["Status"] = status
                    break
            
            # Only return if we found at least a case number
            if case_data["Case_No"] != "N/A":
                return case_data
                
        except Exception as e:
            print(f"Error extracting case details: {str(e)}")
            
        return None
        
    def close_driver(self):
        """Close the web driver"""
        if self.driver:
            self.driver.quit()
            print("✅ WebDriver closed")

print("✅ FixedCaseInfoExtractor class defined!")

In [None]:
# Judgment Search Extractor Class
class JudgmentExtractor:
    def __init__(self):
        self.driver = None
        self.judgments_data = []
        
    def start_driver(self):
        """Create a WebDriver with setup_driver() and store it on the instance."""
        browser = setup_driver()
        self.driver = browser
        print("✅ WebDriver started for judgment extraction!")
        
    def extract_judgments(self, start_year=1980, end_year=2025):
        """Search judgments year by year, from *start_year* to *end_year* inclusive.

        Results accumulate in ``self.judgments_data``. A year that fails is
        reported and skipped; the progress bar advances either way.
        """
        if not self.driver:
            self.start_driver()

        print(f"🔍 Extracting judgments from {start_year} to {end_year}")

        years = range(start_year, end_year + 1)
        bar = tqdm(total=len(years), desc="Extracting judgments")

        for current_year in years:
            try:
                self._search_judgments_by_year(current_year)
                bar.update(1)
                # Pause between years to avoid hammering the site
                time.sleep(Config.REQUEST_DELAY)
            except Exception as e:
                print(f"Error processing year {current_year}: {str(e)}")
                bar.update(1)
                continue

        bar.close()
        print(f"✅ Judgment extraction completed! Found {len(self.judgments_data)} judgments")
        
    def _search_judgments_by_year(self, year):
        """Run one judgment search for *year* and parse the results page.

        The year is typed into an element named "year"; if that fails, a
        dropdown whose name contains "year" is tried instead. When neither
        works a message is printed and the year is skipped. Any other failure
        is printed and swallowed.
        """
        try:
            # Navigate to judgment search page
            self.driver.get(Config.JUDGMENT_SEARCH_URL)
            time.sleep(3)

            # Look for year input or dropdown
            try:
                year_input = self.driver.find_element(By.NAME, "year")
                year_input.clear()
                year_input.send_keys(str(year))
            except Exception:
                # Text input unavailable or not editable: fall back to a dropdown
                try:
                    year_dropdown = Select(self.driver.find_element(By.XPATH, "//select[contains(@name, 'year')]"))
                    year_dropdown.select_by_value(str(year))
                except Exception:
                    print(f"Could not find year input for {year}")
                    return

            # Submit search
            search_button = self.driver.find_element(By.XPATH, "//input[@type='submit'] | //button[contains(text(), 'Search')]")
            search_button.click()
            time.sleep(3)

            # Parse judgment results
            self._parse_judgment_results(year)

        except Exception as e:
            print(f"Error searching judgments for {year}: {str(e)}")
            
    def _parse_judgment_results(self, year):
        """Extract judgments from the current page into ``self.judgments_data``.

        Candidate containers are ``div`` elements whose class mentions
        "judgment", "case" or "result"; when there are none, every table row
        after the first is used instead. Failures are printed and swallowed.
        """
        try:
            page = BeautifulSoup(self.driver.page_source, 'html.parser')

            def looks_like_result(css_class):
                return css_class and any(word in css_class.lower() for word in ['judgment', 'case', 'result'])

            candidates = page.find_all('div', class_=looks_like_result)
            if not candidates:
                # Fall back to table rows, skipping the header row
                candidates = page.find_all('tr')[1:]

            for node in candidates:
                details = self._extract_judgment_details(node, year)
                if details:
                    self.judgments_data.append(details)

        except Exception as e:
            print(f"Error parsing judgment results for {year}: {str(e)}")
            
    def _extract_judgment_details(self, container, year):
        """Extract individual judgment details"""
        try:
            judgment_data = {
                "SrNo": len(self.judgments_data) + 1,
                "CaseSubject": "N/A",
                "CaseNo": "N/A",
                "CaseTitle": "N/A", 
                "AuthorJudge": "N/A",
                "UploadDate": "N/A",
                "JudgmentDate": "N/A",
                "Citations": "N/A",
                "SCCitations": "N/A",
                "Download": "N/A",
                "FileSize": "N/A",
                "Tagline": "N/A"
            }
            
            # Extract text content
            text_content = container.get_text() if hasattr(container, 'get_text') else str(container)
            
            # Extract case number
            case_no_match = re.search(r'[A-Za-z\.]+\d+[/]\d+', text_content)
            if case_no_match:
                judgment_data["CaseNo"] = case_no_match.group()
            
            # Extract case title (usually between case number and judge name)
            title_match = re.search(r'v\\..*?(?=Mr\\.|Justice|Hon)', text_content, re.IGNORECASE)
            if title_match:
                judgment_data["CaseTitle"] = title_match.group().strip()
                
            # Extract judge name
            judge_match = re.search(r'(Mr\\.|Justice|Hon\\.?)\\s+[A-Za-z\\s\\.]+', text_content)
            if judge_match:
                judgment_data["AuthorJudge"] = judge_match.group().strip()
                
            # Extract dates
            date_matches = re.findall(r'\\d{1,2}[/-]\\d{1,2}[/-]\\d{4}', text_content)
            if date_matches:
                judgment_data["JudgmentDate"] = date_matches[0]
                if len(date_matches) > 1:
                    judgment_data["UploadDate"] = date_matches[-1]
                    
            # Look for download links
            links = container.find_all('a') if hasattr(container, 'find_all') else []
            for link in links:
                href = link.get('href', '')
                if '.pdf' in href.lower():
                    judgment_data["Download"] = f"judgments/judgment_{clean_case_no(judgment_data['CaseNo'])}.pdf"
                    break
                    
            # Only return if we found at least a case number
            if judgment_data["CaseNo"] != "N/A":
                return judgment_data
                
        except Exception as e:
            print(f"Error extracting judgment details: {str(e)}")
            
        return None
        
    def close_driver(self):
        """Close the web driver"""
        if self.driver:
            self.driver.quit()
            print("✅ WebDriver closed")

print("✅ JudgmentExtractor class defined!")

In [None]:
# PDF Download and Management Class
class PDFManager:
    def __init__(self):
        self.downloaded_files = []
        
    def download_case_pdfs(self, cases_data):
        """Download the memo and judgment PDFs referenced by each case record.

        For every section whose "File" value is set (and not "N/A"), the file
        is fetched with download_pdf(); on success the record's "File" value
        is replaced by the saved file's folder-relative path and the full
        path is remembered in ``self.downloaded_files``. The records in
        *cases_data* are modified in place.
        """
        print("📥 Downloading case-related PDFs...")

        # (record section, filename prefix, Config.FOLDERS key / path prefix)
        attachments = (
            ("Petition_Appeal_Memo", "memo", 'memopdfs'),
            ("Judgement_Order", "judgement", 'judgementpdfs'),
        )

        for case in tqdm(cases_data, desc="Downloading case PDFs"):
            clean_no = clean_case_no(case.get("Case_No", "unknown"))

            for section, prefix, folder_key in attachments:
                source_url = case.get(section, {}).get("File")
                if not source_url or source_url == "N/A":
                    continue

                pdf_name = f"{prefix}_{clean_no}.pdf"
                saved_path, _size = download_pdf(source_url, pdf_name, Config.FOLDERS[folder_key])
                if saved_path:
                    case[section]["File"] = f"{folder_key}/{pdf_name}"
                    self.downloaded_files.append(saved_path)

            # Pause after every case, whether or not anything was downloaded
            time.sleep(Config.DOWNLOAD_DELAY)
            
    def download_judgment_pdfs(self, judgments_data):
        """Download the PDF referenced by each judgment record's "Download" value.

        Values that are empty, "N/A" or do not contain ".pdf" are skipped.
        A value not starting with "http" is resolved against the judgment
        search URL. On success the record's "Download" and "FileSize" values
        are updated in place and the saved path is remembered in
        ``self.downloaded_files``.
        """
        print("📥 Downloading judgment PDFs...")

        for judgment in tqdm(judgments_data, desc="Downloading judgment PDFs"):
            case_no = judgment.get("CaseNo", "unknown")
            link = judgment.get("Download", "")

            if link and link != "N/A" and ".pdf" in link:
                pdf_name = f"judgment_{clean_case_no(case_no)}.pdf"

                # Turn a relative link into an absolute URL
                source_url = link if link.startswith('http') else urljoin(Config.JUDGMENT_SEARCH_URL, link)

                saved_path, size_label = download_pdf(source_url, pdf_name, Config.FOLDERS['judgmentspdfs'])
                if saved_path:
                    judgment["Download"] = f"judgments/{pdf_name}"
                    judgment["FileSize"] = size_label
                    self.downloaded_files.append(saved_path)

            # Pause after every record, whether or not anything was downloaded
            time.sleep(Config.DOWNLOAD_DELAY)
            
    def extract_pdf_metadata(self, pdf_path):
        """Read the page count and candidate taglines from a PDF.

        Args:
            pdf_path: Path of the PDF to open with pdfplumber.

        Returns:
            A dict with "total_pages" and, when any first-page line contains
            one of the keywords "held", "ruling", "decided" or "conclusion",
            "taglines" holding up to three such lines (stripped). An empty
            dict is returned when the file cannot be processed (the error is
            printed).
        """
        keywords = ('held', 'ruling', 'decided', 'conclusion')
        try:
            metadata = {}

            with pdfplumber.open(pdf_path) as pdf:
                metadata['total_pages'] = len(pdf.pages)

                # Extract first page text for taglines
                if pdf.pages:
                    first_page_text = pdf.pages[0].extract_text()
                    if first_page_text:
                        # splitlines() breaks on real line endings; splitting on
                        # the two characters "\" + "n" never split anything, so
                        # the whole page was treated as one line.
                        potential_taglines = [
                            line.strip()
                            for line in first_page_text.splitlines()
                            if any(keyword in line.lower() for keyword in keywords)
                        ]

                        if potential_taglines:
                            metadata['taglines'] = potential_taglines[:3]  # Top 3 taglines

            return metadata

        except Exception as e:
            print(f"Error extracting PDF metadata from {pdf_path}: {str(e)}")
            return {}
            
    def get_download_summary(self):
        """Get summary of downloaded files"""
        total_size = sum(os.path.getsize(f) for f in self.downloaded_files if os.path.exists(f))
        
        return {
            'total_files': len(self.downloaded_files),
            'total_size_mb': round(total_size / (1024 * 1024), 2),
            'files_by_type': {
                'memos': len([f for f in self.downloaded_files if 'memo' in f]),
                'judgements': len([f for f in self.downloaded_files if 'judgement' in f]),
                'judgments': len([f for f in self.downloaded_files if 'judgment' in f and 'judgement' not in f])
            }
        }

print("✅ PDFManager class defined!")

In [None]:
# JSON Data Formatter and Exporter
class DataExporter:
    """Shape scraped records into the assignment's JSON layout and write them out.

    The ``format_*`` methods are pure: they take the lists produced by the
    extractors and return new dictionaries. ``save_json_files`` and
    ``generate_readme`` write files to disk.
    """

    # Placeholder stored for every field the scraper did not capture.
    MISSING = "N/A"

    # Judgment fields copied verbatim, in output order (after "SrNo").
    _JUDGMENT_FIELDS = (
        "CaseSubject",
        "CaseNo",
        "CaseTitle",
        "AuthorJudge",
        "UploadDate",
        "JudgmentDate",
        "Citations",
        "SCCitations",
        "Download",
        "FileSize",
        "Tagline",
    )

    def __init__(self):
        # Creation time of this exporter, formatted as YYYYMMDD_HHMMSS.
        self.export_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    @staticmethod
    def _section(record, key):
        """Return ``record[key]`` when it is a dict, otherwise an empty dict.

        Scraped records may carry ``None`` or a plain string where a nested
        section is expected; treating those as empty keeps the export from
        failing on a single malformed case.
        """
        value = record.get(key)
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _write_json(path, payload):
        """Write ``payload`` to ``path`` as indented UTF-8 JSON.

        The parent folder is created first if it does not exist yet.
        """
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)

    def format_case_info_json(self, cases_data):
        """Format case information data according to the sample JSON structure.

        Args:
            cases_data: iterable of dicts, one per scraped case.

        Returns:
            dict: ``{"Cases": [...]}`` where every case has the full set of
            keys; anything absent from the input is filled with ``"N/A"``
            (``History`` defaults to an empty list).
        """
        missing = self.MISSING
        formatted_data = {
            "Cases": []
        }

        for case in cases_data:
            advocates = self._section(case, "Advocates")
            memo = self._section(case, "Petition_Appeal_Memo")
            order = self._section(case, "Judgement_Order")

            formatted_case = {
                "Case_No": case.get("Case_No", missing),
                "Case_Title": case.get("Case_Title", missing),
                "Status": case.get("Status", missing),
                "Institution_Date": case.get("Institution_Date", missing),
                "Disposal_Date": case.get("Disposal_Date", missing),
                "Advocates": {
                    "ASC": advocates.get("ASC", missing),
                    "AOR": advocates.get("AOR", missing),
                    "Prosecutor": advocates.get("Prosecutor", missing)
                },
                "Petition_Appeal_Memo": {
                    "File": memo.get("File", missing),
                    "Type": memo.get("Type", missing)
                },
                "History": case.get("History", []),
                "Judgement_Order": {
                    "File": order.get("File", missing),
                    "Type": order.get("Type", missing)
                }
            }
            formatted_data["Cases"].append(formatted_case)

        return formatted_data

    def format_judgments_json(self, judgments_data):
        """Format judgment data according to the sample JSON structure.

        Args:
            judgments_data: iterable of dicts, one per scraped judgment.

        Returns:
            dict: ``{"Judgments": [...]}``; each entry gets a 1-based
            ``SrNo`` followed by the standard fields, with ``"N/A"`` for
            anything absent from the input.
        """
        formatted_data = {
            "Judgments": []
        }

        for serial, judgment in enumerate(judgments_data, 1):
            formatted_judgment = {"SrNo": serial}
            for field in self._JUDGMENT_FIELDS:
                formatted_judgment[field] = judgment.get(field, self.MISSING)
            formatted_data["Judgments"].append(formatted_judgment)

        return formatted_data

    def save_json_files(self, cases_data, judgments_data, roll_number="G4"):
        """Save the case, judgment and combined JSON files.

        Args:
            cases_data: list of scraped case dicts.
            judgments_data: list of scraped judgment dicts.
            roll_number: suffix used in the combined file's name.

        Returns:
            dict: the three written paths under ``case_info_file``,
            ``judgments_file`` and ``final_file``.
        """
        # Format the data
        case_info_json = self.format_case_info_json(cases_data)
        judgments_json = self.format_judgments_json(judgments_data)

        # Save case information JSON
        case_info_file = f"{Config.FOLDERS['case_info']}/SupremeCourt_CaseInfo.json"
        self._write_json(case_info_file, case_info_json)

        # Save judgments JSON
        judgments_file = f"{Config.FOLDERS['judgments']}/SupremeCourt_Judgments.json"
        self._write_json(judgments_file, judgments_json)

        # Save final combined file as per assignment requirement
        final_file = f"SupremeCourt_{roll_number}.json"
        combined_data = {
            "metadata": {
                "extraction_date": datetime.now().isoformat(),
                "total_cases": len(cases_data),
                "total_judgments": len(judgments_data),
                "roll_number": roll_number
            },
            "case_information": case_info_json,
            "judgments": judgments_json
        }
        self._write_json(final_file, combined_data)

        print(f"✅ JSON files saved:")
        print(f"   - {case_info_file}")
        print(f"   - {judgments_file}")
        print(f"   - {final_file}")

        return {
            'case_info_file': case_info_file,
            'judgments_file': judgments_file,
            'final_file': final_file
        }

    def _build_readme_content(self, extraction_summary, pdf_summary):
        """Return the README text for the given extraction and PDF summaries."""
        generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        total_cases = extraction_summary.get('total_cases', 0)
        total_judgments = extraction_summary.get('total_judgments', 0)
        total_files = pdf_summary.get('total_files', 0)
        total_size_mb = pdf_summary.get('total_size_mb', 0)

        # The template lines sit at column 0 on purpose: they are the literal
        # file content, not code indentation.
        return f"""# Supreme Court of Pakistan Data Extraction - Assignment Submission
        
## Assignment Details
- Group: G4
- Source: Supreme Court of Pakistan
- Extraction Date: {generated_at}

## Tools Used
- Python 3.x
- Selenium WebDriver (Chrome)
- BeautifulSoup4 for HTML parsing
- Requests for HTTP requests
- Pandas for data organization
- PyPDF2 and pdfplumber for PDF processing
- tqdm for progress tracking

## Steps Followed
1. Setup automated web scraping with Selenium
2. Extracted case information from: https://scp.gov.pk/OnlineCaseInformation.aspx
3. Extracted judgment data from: https://www.supremecourt.gov.pk/judgement-search/
4. Downloaded PDFs with structured naming conventions
5. Formatted data according to provided sample JSON structures
6. Organized files in required folder structure

## Extraction Summary
- Total Cases Extracted: {total_cases}
- Total Judgments Extracted: {total_judgments}
- PDF Files Downloaded: {total_files}
- Total Data Size: {total_size_mb} MB

## File Organization
```
SupremeCourt_CaseInfo/
├── SupremeCourt_CaseInfo.json
├── memopdfs/           # Petition/Appeal memos
└── judgementpdfs/      # Judgment/Order PDFs

SupremeCourt_Judgements/
├── SupremeCourt_Judgments.json  
└── judgmentspdfs/      # Judgment PDFs

SupremeCourt_G4.json    # Final combined file
```

## Issues Faced
- Website structure required careful parsing and error handling
- Rate limiting implemented to avoid overwhelming servers
- PDF downloads required robust error handling for large files
- Date format standardization across different sources

## Bonus Features Implemented
- PDF metadata extraction for additional context
- Tagline extraction from judgment texts
- Comprehensive error logging and recovery
- Progress tracking for long-running extractions

## Notes
This extraction covers the period from 1980-2025 as specified in the assignment.
All file naming conventions follow the provided guidelines.
"""

    def generate_readme(self, extraction_summary, pdf_summary):
        """Generate README.md for submission in the current directory.

        Args:
            extraction_summary: dict read for ``total_cases`` and
                ``total_judgments`` (each defaults to 0).
            pdf_summary: dict read for ``total_files`` and
                ``total_size_mb`` (each defaults to 0).
        """
        readme_content = self._build_readme_content(extraction_summary, pdf_summary)

        with open("README.md", 'w', encoding='utf-8') as f:
            f.write(readme_content)

        print("✅ README.md generated successfully!")

print("✅ DataExporter class defined!")

## Main Execution Section

⚠️ **Important Instructions for Google Colab:**

1. **Run cells sequentially** - Don't skip any setup cells
2. **Monitor progress** - Each extraction will show progress bars
3. **Check for errors** - The notebook includes error handling but watch for network issues
4. **Adjust parameters** - You can modify year ranges and other settings below
5. **Download results** - Files will be saved in organized folders that you can download

### Execution Parameters
You can modify these settings before running the main extraction:

In [None]:
# Execution Parameters - Modify as needed
def default_judgment_start_year(test_mode):
    """Return the first judgment year to crawl when none is set explicitly.

    Test mode limits the crawl to 2020 onwards for a quick trial run; a
    full extraction starts at 1980.
    """
    return 2020 if test_mode else 1980


EXECUTION_CONFIG = {
    # For testing, use smaller ranges first
    'test_mode': True,  # Set to False for full extraction

    # Year range for judgments. Leave the start year as None to derive it
    # from 'test_mode' (2020 when testing, 1980 otherwise), or put an
    # explicit year here to override.
    'judgment_start_year': None,
    'judgment_end_year': 2025,

    # Case information parameters
    'case_years': [2022, 2023, 2024, 2025],  # Expand for full extraction
    'registries': ['Islamabad', 'Lahore', 'Karachi'],  # Main registries

    # Your roll number (modify this)
    'roll_number': 'G4',

    # Download PDFs (set to False to skip PDF downloads for faster testing)
    'download_pdfs': True,

    # Extract PDF metadata for bonus marks
    'extract_pdf_metadata': True
}

# Resolve the derived start year in place. Assigning to the existing key keeps
# its position in the dict, so the printed parameter order is unchanged.
if EXECUTION_CONFIG['judgment_start_year'] is None:
    EXECUTION_CONFIG['judgment_start_year'] = default_judgment_start_year(
        EXECUTION_CONFIG['test_mode']
    )

print("📋 Execution parameters set:")
for key, value in EXECUTION_CONFIG.items():
    print(f"   {key}: {value}")

if EXECUTION_CONFIG['test_mode']:
    print("\n⚠️  TEST MODE ENABLED - Using limited data for faster testing")
    print("   Set 'test_mode': False for full extraction")

In [None]:
# STEP 1: Extract Case Information (FIXED VERSION)
# Runs the case-information scrape for the configured years and registries and
# leaves the result in the notebook-level `cases_data` list used by later steps.
print("🚀 Starting Case Information Extraction with FIXED Extractor...")
print("=" * 60)

# Use the FIXED extractor instead of the broken one
case_extractor = FixedCaseInfoExtractor()

try:
    # Extract case information using the improved extractor
    case_extractor.extract_case_info(
        year_range=EXECUTION_CONFIG['case_years'],
        registry_list=EXECUTION_CONFIG['registries']
    )

    cases_data = case_extractor.cases_data
    print(f"✅ Case extraction completed! Found {len(cases_data)} cases")

    # Display sample data
    if cases_data:
        print("\n📄 Sample case data:")
        sample_case = cases_data[0]
        for key, value in sample_case.items():
            print(f"   {key}: {str(value)[:100]}{'...' if len(str(value)) > 100 else ''}")
    else:
        print("\n⚠️  No cases found with current search criteria.")
        print("   The website might require different search parameters or manual verification.")

except Exception as e:
    print(f"❌ Error in case extraction: {str(e)}")
    # Keep whatever the extractor accumulated before the failure. The error may
    # come late in a long scrape, or from the preview above after a fully
    # successful run, and neither should throw the collected cases away.
    cases_data = getattr(case_extractor, 'cases_data', None) or []
    if cases_data:
        print(f"⚠️  Keeping {len(cases_data)} cases collected before the error")

finally:
    # Always release the browser, whether or not the scrape succeeded.
    case_extractor.close_driver()

print(f"\n📊 Case Information Summary: {len(cases_data)} cases extracted")

In [None]:
# STEP 2: Extract Judgment Information
# Runs the judgment scrape for the configured year range and leaves the result
# in the notebook-level `judgments_data` list used by later steps.
print("🚀 Starting Judgment Extraction...")
print("=" * 50)

judgment_extractor = JudgmentExtractor()

try:
    # Extract judgments
    judgment_extractor.extract_judgments(
        start_year=EXECUTION_CONFIG['judgment_start_year'],
        end_year=EXECUTION_CONFIG['judgment_end_year']
    )

    judgments_data = judgment_extractor.judgments_data
    print(f"✅ Judgment extraction completed! Found {len(judgments_data)} judgments")

    # Display sample data
    if judgments_data:
        print("\n📄 Sample judgment data:")
        sample_judgment = judgments_data[0]
        for key, value in sample_judgment.items():
            print(f"   {key}: {str(value)[:100]}{'...' if len(str(value)) > 100 else ''}")

except Exception as e:
    print(f"❌ Error in judgment extraction: {str(e)}")
    # Keep whatever the extractor accumulated before the failure. The error may
    # come late in a long scrape, or from the preview above after a fully
    # successful run, and neither should throw the collected judgments away.
    judgments_data = getattr(judgment_extractor, 'judgments_data', None) or []
    if judgments_data:
        print(f"⚠️  Keeping {len(judgments_data)} judgments collected before the error")

finally:
    # Always release the browser, whether or not the scrape succeeded.
    judgment_extractor.close_driver()

print(f"\n📊 Judgment Summary: {len(judgments_data)} judgments extracted")

In [None]:
# STEP 3: Download PDFs (Optional)
# Downloads the PDFs referenced by `cases_data` / `judgments_data` and leaves a
# `pdf_summary` dict (total_files, total_size_mb, files_by_type) for STEP 4.
if EXECUTION_CONFIG['download_pdfs']:
    print("📥 Starting PDF Downloads...")
    print("=" * 50)

    pdf_manager = PDFManager()

    try:
        # Download case-related PDFs
        if cases_data:
            pdf_manager.download_case_pdfs(cases_data)

        # Download judgment PDFs
        if judgments_data:
            pdf_manager.download_judgment_pdfs(judgments_data)

    except Exception as e:
        print(f"❌ Error downloading PDFs: {str(e)}")

    # Summarise whatever reached disk, even after a failure part-way through,
    # so the final report does not claim zero files when some were downloaded.
    try:
        pdf_summary = pdf_manager.get_download_summary()
    except Exception as e:
        print(f"❌ Error summarising PDF downloads: {str(e)}")
        pdf_summary = {'total_files': 0, 'total_size_mb': 0, 'files_by_type': {}}

    print(f"\n📊 PDF Download Summary:")
    print(f"   Total files downloaded: {pdf_summary['total_files']}")
    print(f"   Total size: {pdf_summary['total_size_mb']} MB")
    print(f"   Files by type: {pdf_summary['files_by_type']}")

else:
    print("⏭️  PDF download skipped (disabled in config)")
    pdf_summary = {'total_files': 0, 'total_size_mb': 0, 'files_by_type': {}}

In [None]:
# STEP 4: Export Data to JSON Files
# Writes the JSON outputs and README, then prints a closing report that
# reflects whether the export actually succeeded.
print("💾 Exporting Data to JSON Files...")
print("=" * 50)

data_exporter = DataExporter()

# Flipped to True only after every export call below has returned.
export_succeeded = False

try:
    # Save JSON files
    saved_files = data_exporter.save_json_files(
        cases_data=cases_data,
        judgments_data=judgments_data,
        roll_number=EXECUTION_CONFIG['roll_number']
    )

    # Generate README
    extraction_summary = {
        'total_cases': len(cases_data),
        'total_judgments': len(judgments_data)
    }

    data_exporter.generate_readme(extraction_summary, pdf_summary)

    print(f"\n✅ All files exported successfully!")
    export_succeeded = True

except Exception as e:
    print(f"❌ Error exporting data: {str(e)}")

# Display final summary
print("\n" + "=" * 50)
if export_succeeded:
    print("🎉 EXTRACTION COMPLETED!")
else:
    print("⚠️  EXTRACTION FINISHED, BUT EXPORT FAILED")
print("=" * 50)
print(f"📊 Final Summary:")
print(f"   Cases extracted: {len(cases_data)}")
print(f"   Judgments extracted: {len(judgments_data)}")
print(f"   PDFs downloaded: {pdf_summary.get('total_files', 0)}")
print(f"   Total data size: {pdf_summary.get('total_size_mb', 0)} MB")

if export_succeeded:
    print(f"\n📁 Files created:")
    print(f"   - SupremeCourt_CaseInfo/ (folder with case data)")
    print(f"   - SupremeCourt_Judgements/ (folder with judgment data)")
    print(f"   - SupremeCourt_{EXECUTION_CONFIG['roll_number']}.json (final combined file)")
    print(f"   - README.md (documentation)")
    print(f"\n💡 Ready for submission! Download all files and folders.")
else:
    # Do not advertise output files that may be missing or half-written.
    print("\n⚠️  Output files may be missing or incomplete.")
    print("   Fix the error reported above and re-run this cell before submitting.")

## Additional Tools and Debugging

The following cells provide additional functionality for debugging and data analysis:

In [None]:
# Optional: Display extracted data in tabular format
def display_data_summary():
    """Print a tabular overview of the extracted datasets.

    Looks up the notebook-level ``cases_data`` and ``judgments_data``
    variables; each one that exists and is non-empty is loaded into a
    DataFrame and its shape, column names and first rows are printed.
    Datasets that are missing or empty are skipped silently.
    """
    # (global variable name, heading, whether a blank spacer follows the table)
    sections = (
        ('cases_data', "📋 Cases Data Summary:", True),
        ('judgments_data', "📋 Judgments Data Summary:", False),
    )

    for variable, heading, add_spacer in sections:
        records = globals().get(variable)
        if not records:
            continue

        print(heading)
        frame = pd.DataFrame(records)
        print(f"   Shape: {frame.shape}")
        print(f"   Columns: {list(frame.columns)}")
        print("   Sample data:")
        print(frame.head())
        if add_spacer:
            print("\n")

# Uncomment the line below to display data summary
# display_data_summary()

In [None]:
# Optional: Test individual website connectivity
def test_website_connectivity():
    """Report whether each target website answers an HTTP GET.

    Requests the case-information and judgment-search URLs from ``Config``
    with a 10 second timeout and prints one line per URL: accessible
    (status 200), the non-200 status code, or the error that was raised.
    """
    targets = (Config.CASE_INFO_URL, Config.JUDGMENT_SEARCH_URL)

    print("🔗 Testing website connectivity...")

    for url in targets:
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                status = "✅ Accessible"
            else:
                status = f"⚠️ Status: {response.status_code}"
            print(f"   {url}: {status}")
        except Exception as err:
            # Best-effort probe: report the failure and move on to the next URL.
            print(f"   {url}: ❌ Error - {str(err)}")

# Uncomment to test connectivity
# test_website_connectivity()

## Instructions for Google Colab Upload and Execution

### 🚀 How to use this notebook in Google Colab:

1. **Upload to Colab:**
   - Go to [Google Colab](https://colab.research.google.com/)
   - Click "File" → "Upload notebook"
   - Select this `.ipynb` file

2. **Run the notebook:**
   - **Important:** Run cells in order from top to bottom
   - Start with the installation cell (cell 2)
   - Monitor progress bars and outputs
   - The extraction may take 30-60 minutes for full data

3. **Modify settings:**
   - Edit the `EXECUTION_CONFIG` in cell 8 before running extraction
   - For complete extraction (1980-2025), set `'test_mode': False`, set `'judgment_start_year'` to `1980`, and expand `'case_years'` to the years you need
   - Adjust `roll_number` to your actual roll number

4. **Download results:**
   - After execution, click the folder icon in the left sidebar
   - Download the created folders and files:
     - `SupremeCourt_CaseInfo/` folder
     - `SupremeCourt_Judgements/` folder  
     - `SupremeCourt_G4.json` (or your roll number)
     - `README.md`

5. **Troubleshooting:**
   - If extraction fails, check the debug cells at the bottom
   - Run the connectivity test to verify website access
   - For large extractions, consider running in smaller batches

### 📋 Assignment Checklist:
- ✅ Extract case information with year/registry combinations
- ✅ Extract judgments from 1980-2025  
- ✅ Download PDFs with proper naming conventions
- ✅ Generate JSON files matching sample structure
- ✅ Create organized folder structure
- ✅ Include documentation (README.md)

**Ready for submission after downloading all generated files!**

In [None]:
# This cell was moved up to cell 6 - FixedCaseInfoExtractor is now defined earlier
print("✅ FixedCaseInfoExtractor is now available from cell 6")

In [None]:
# FIXED: Test the improved case extractor
# Re-runs the case scrape with a fresh extractor and adopts its result as
# `cases_data` only when it found more cases than are already stored.
print("🔧 Testing the Fixed Case Information Extractor...")
print("=" * 60)

# Use the new improved extractor
fixed_case_extractor = FixedCaseInfoExtractor()

try:
    # Extract case information with the fixed extractor
    fixed_case_extractor.extract_case_info(
        year_range=EXECUTION_CONFIG['case_years'],
        registry_list=EXECUTION_CONFIG['registries'],
    )

    fixed_cases_data = fixed_case_extractor.cases_data
    print(f"✅ Fixed extraction completed! Found {len(fixed_cases_data)} cases")

    if not fixed_cases_data:
        print("\n⚠️  No cases found. The website might require manual inspection.")
        print("   Consider checking the website structure manually or adjusting search criteria.")
    else:
        # Preview the first record, truncating long values to 100 characters.
        print("\n📄 Sample case data from fixed extractor:")
        sample_case = fixed_cases_data[0]
        for key, value in sample_case.items():
            print("   {}: {}{}".format(
                key,
                str(value)[:100],
                '...' if len(str(value)) > 100 else '',
            ))

except Exception as e:
    print(f"❌ Error in fixed case extraction: {str(e)}")
    fixed_cases_data = []

finally:
    fixed_case_extractor.close_driver()

# Update the cases_data variable for the rest of the workflow.
# `cases_data` may not exist yet if STEP 1 was skipped, hence the lookup.
if len(fixed_cases_data) > len(globals().get('cases_data', [])):
    cases_data = fixed_cases_data
    print(f"\n✅ Updated cases_data with {len(cases_data)} cases from fixed extractor")

print(f"\n📊 Final Case Information Summary: {len(globals().get('cases_data', []))} cases")

In [None]:
# DEBUGGING: Website Structure Inspector
def inspect_website_manually():
    """Manual inspection tool for debugging website structure.

    Opens the case-information page in a Selenium driver and prints what it
    finds: forms, input/select/textarea elements, buttons, the number of
    script tags and the first 500 characters of page text. It then attempts a
    sample search (second option of the first dropdown, "2024" in the first
    text box, click on the submit/Search button) and reports any alert or
    result tables.

    Every probing step is best-effort: failures are printed and the
    inspection carries on. The driver is always closed at the end.
    Interaction failures are caught as ``Exception`` rather than with a bare
    ``except`` so that interrupting the cell (KeyboardInterrupt) still stops
    the run.
    """
    print("🔍 Manual Website Structure Inspection")
    print("=" * 50)

    driver = setup_driver()

    try:
        # Load the case information website
        driver.get(Config.CASE_INFO_URL)
        time.sleep(3)  # give the page time to finish rendering

        print("📋 Website loaded. Analyzing structure...")

        # Get page source
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Find all form elements
        print("\n🔧 Form Elements Found:")
        forms = soup.find_all('form')
        print(f"   Found {len(forms)} forms")

        # Analyze inputs
        inputs = soup.find_all(['input', 'select', 'textarea'])
        print(f"\n📝 Input Elements Found ({len(inputs)} total):")

        for i, elem in enumerate(inputs):
            name = elem.get('name', 'no-name')
            # Elements without a type attribute are labelled by their tag name.
            elem_type = elem.get('type', elem.name)
            value = elem.get('value', '')
            placeholder = elem.get('placeholder', '')

            # For select elements, show options
            if elem.name == 'select':
                options = [opt.text.strip() for opt in elem.find_all('option') if opt.text.strip()]
                print(f"   {i+1}. SELECT: name='{name}', options={options[:5]}{'...' if len(options) > 5 else ''}")
            else:
                print(f"   {i+1}. {elem_type.upper()}: name='{name}', value='{value}', placeholder='{placeholder}'")

        # Find submit buttons
        buttons = soup.find_all(['button', 'input'], type=['submit', 'button'])
        print(f"\n🔘 Submit Buttons Found ({len(buttons)} total):")
        for i, btn in enumerate(buttons):
            text = btn.get('value', btn.text.strip())
            print(f"   {i+1}. {text}")

        # Look for any JavaScript or special requirements
        scripts = soup.find_all('script')
        print(f"\n📜 JavaScript sections found: {len(scripts)}")

        # Check for any visible error messages or requirements
        print(f"\n📄 Page content preview:")
        visible_text = soup.get_text()[:500]
        print(f"   {visible_text}...")

        # Try to interact with the form
        print(f"\n🧪 Attempting to interact with form elements...")

        # Try to find and interact with actual elements
        try:
            # Look for case type dropdown
            case_type_selects = driver.find_elements(By.TAG_NAME, 'select')
            print(f"   Found {len(case_type_selects)} select elements")

            for i, select_elem in enumerate(case_type_selects):
                try:
                    select_obj = Select(select_elem)
                    options = [opt.text for opt in select_obj.options]
                    print(f"   Select {i+1} options: {options}")
                except Exception:
                    print(f"   Select {i+1}: Could not get options")

            # Look for text inputs
            text_inputs = driver.find_elements(By.CSS_SELECTOR, 'input[type="text"]')
            print(f"   Found {len(text_inputs)} text input elements")

            # Try a sample search
            if len(case_type_selects) > 0 and len(text_inputs) > 0:
                print(f"\n🚀 Attempting sample search...")

                # Fill first select
                try:
                    select_obj = Select(case_type_selects[0])
                    if len(select_obj.options) > 1:
                        # Index 1 is the second option in the dropdown.
                        select_obj.select_by_index(1)
                        print(f"   ✅ Selected option in first dropdown")
                except Exception:
                    print(f"   ❌ Could not select in first dropdown")

                # Fill first text input
                try:
                    text_inputs[0].send_keys("2024")
                    print(f"   ✅ Filled first text input with '2024'")
                except Exception:
                    print(f"   ❌ Could not fill first text input")

                # Find and click search button
                try:
                    search_btn = driver.find_element(By.XPATH, "//input[@type='submit'] | //button[contains(text(), 'Search')]")
                    search_btn.click()
                    time.sleep(5)  # wait for the result page or alert
                    print(f"   ✅ Clicked search button")

                    # Check for results or alerts
                    try:
                        alert = driver.switch_to.alert
                        print(f"   ⚠️  Alert appeared: {alert.text}")
                        alert.accept()
                    except Exception:
                        # Any failure while reading the alert is treated as
                        # "no alert", and the page is checked for results.
                        print(f"   ✅ No alert, checking for results...")

                        # Look for results
                        result_soup = BeautifulSoup(driver.page_source, 'html.parser')
                        tables = result_soup.find_all('table')
                        print(f"   Found {len(tables)} tables in results")

                        if tables:
                            print(f"   Sample table content: {tables[0].get_text()[:200]}...")

                except Exception as e:
                    print(f"   ❌ Error clicking search: {str(e)}")

        except Exception as e:
            print(f"Error during interaction: {str(e)}")

    except Exception as e:
        print(f"❌ Error inspecting website: {str(e)}")

    finally:
        # Always release the browser, even if the inspection failed.
        driver.quit()
        print(f"\n✅ Website inspection completed")

# Uncomment the line below to run the manual inspection
# inspect_website_manually()