In [19]:
import json
import logging
import re
from collections import Counter
import numpy as np
import pandas as pd # Used for easy frequency tables
from transformers import AutoTokenizer
# --- Configuration ---
# Path of the scraped-laws JSON file; the __main__ block passes it to load_data().
JSON_FILE_PATH = "saudi_laws_scraped.json"

# Load once and reuse (don't load inside the function for performance)
# count_tokens() reads this module-level tokenizer on every call.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-14B")

# Configure logging (kept simple)
# INFO and above go to a single StreamHandler as "timestamp - level - message".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()]
)

# --- Helper Functions ---

def load_data(file_path):
    """Load the main JSON data file.

    Args:
        file_path: Path of a UTF-8 encoded JSON file.

    Returns:
        The parsed JSON content, or ``None`` when the file cannot be read or
        does not contain valid JSON (the failure is logged at CRITICAL level).
    """
    logging.info(f"Loading data from {file_path}...")
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # OSError covers a missing or unreadable file. ValueError covers
        # malformed JSON (json.JSONDecodeError) and bytes that are not valid
        # UTF-8 (UnicodeDecodeError). Anything else is a programming error and
        # should surface instead of being reported as a load failure.
        logging.critical(f"ERROR loading data: {e}")
        return None

    # Only reached when both the read and the parse succeeded.
    logging.info("Data loaded successfully.")
    return data


def count_tokens(text):
    """Return how many tokens the module-level tokenizer produces for ``text``.

    Falsy input (``None``, empty string) counts as zero tokens and never
    reaches the tokenizer.
    """
    if text:
        # encode() yields the list of token IDs; special tokens are excluded
        # so only the text itself is measured.
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        return len(token_ids)
    return 0


def print_stats(name, data_list):
    """Print mean, median, range, selected quantiles and IQR for a list of numbers.

    When ``data_list`` is empty, a warning is logged and a "No data available."
    placeholder is printed under the section header instead.
    """
    if data_list:
        values = np.array(data_list)
        print(f"\n--- Statistics for {name} ---")

        # Centre and extremes
        print(f"  Mean:   {values.mean():,.2f}")
        print(f"  Median: {np.median(values):,.0f}")
        print(f"  Min:    {values.min():,.0f}")
        print(f"  Max:    {values.max():,.0f}")

        # First quartile, third quartile and 90th percentile from one call
        quartile_1, quartile_3, pct_90 = np.quantile(values, [0.25, 0.75, 0.90])

        print("  --- Quantiles ---")
        print(f"  25th (Q1): {quartile_1:,.0f}")
        print(f"  75th (Q3): {quartile_3:,.0f}")
        print(f"  90th:      {pct_90:,.0f}")

        # Interquartile range as a dispersion measure
        spread = quartile_3 - quartile_1
        print(f"  IQR (Q3-Q1): {spread:,.0f}")
    else:
        logging.warning(f"No data found for '{name}' statistics.")
        print(f"\n--- Statistics for {name} ---")
        print("  No data available.")

def _accumulate_law(stats, law_title, law_data):
    """Fold one law's parts, articles and token counts into ``stats`` (mutated in place)."""
    parts = law_data.get('parts', {})
    brief = law_data.get('brief', '')

    # --- Law level: number of parts and brief length ---
    num_parts = len(parts)
    stats['parts_per_law_counts'].append(num_parts)
    # Strict ">" throughout: on ties the first item seen keeps the record.
    if num_parts > stats['max_parts_law']['count']:
        stats['max_parts_law'] = {'title': law_title, 'count': num_parts}

    stats['brief_token_lengths'].append(count_tokens(brief))

    law_article_count = 0
    law_token_count = 0

    for part_title, articles_list in parts.items():
        # --- Part level: article count ---
        stats['total_parts'] += 1
        num_articles_in_part = len(articles_list)
        stats['articles_per_part_counts'].append(num_articles_in_part)
        law_article_count += num_articles_in_part

        if num_articles_in_part > stats['max_articles_part']['count']:
            stats['max_articles_part'] = {
                'law': law_title,
                'part': part_title,
                'count': num_articles_in_part,
            }

        part_token_count = 0
        for article in articles_list:
            # --- Article level: token length and status ---
            stats['total_articles'] += 1
            article_token_len = count_tokens(article.get('Article_Text', ''))
            stats['tokens_per_article_counts'].append(article_token_len)
            part_token_count += article_token_len

            if article_token_len > stats['longest_article']['length']:
                stats['longest_article'] = {
                    'law': law_title,
                    'part': part_title,
                    'title': article.get('Article_Title', 'N/A'),
                    'length': article_token_len,
                }

            stats['article_status_counter'][article.get('status', 'Unknown')] += 1

        # --- Part level: token total (sum of its articles) ---
        stats['tokens_per_part_counts'].append(part_token_count)
        law_token_count += part_token_count

        if part_token_count > stats['max_tokens_part']['length']:
            stats['max_tokens_part'] = {
                'law': law_title,
                'part': part_title,
                'length': part_token_count,
            }

    # --- Law level: article and token totals ---
    stats['articles_per_law_counts'].append(law_article_count)
    stats['tokens_per_law_counts'].append(law_token_count)

    if law_article_count > stats['max_articles_law']['count']:
        stats['max_articles_law'] = {'title': law_title, 'count': law_article_count}

    if law_token_count > stats['max_tokens_law']['length']:
        stats['max_tokens_law'] = {'title': law_title, 'length': law_token_count}


def _collect_eda_stats(data):
    """Walk main category -> sub-category -> law and return the raw EDA measurements."""
    stats = {
        # Totals
        'total_laws': 0,
        'total_parts': 0,
        'total_articles': 0,
        # Distributions (one entry per law / part / article)
        'parts_per_law_counts': [],
        'articles_per_law_counts': [],
        'tokens_per_law_counts': [],
        'tokens_per_article_counts': [],
        'articles_per_part_counts': [],
        'tokens_per_part_counts': [],
        'brief_token_lengths': [],
        # Record holders
        'max_parts_law': {'title': '', 'count': 0},
        'max_articles_law': {'title': '', 'count': 0},
        'max_tokens_law': {'title': '', 'length': 0},
        'max_articles_part': {'law': '', 'part': '', 'count': 0},
        'max_tokens_part': {'law': '', 'part': '', 'length': 0},
        'longest_article': {'law': '', 'part': '', 'title': '', 'length': 0},
        # Frequency of each article 'status' value
        'article_status_counter': Counter(),
    }

    # Category and sub-category names are not used in any statistic,
    # so only the values are iterated.
    for sub_categories in data.values():
        for laws in sub_categories.values():
            stats['total_laws'] += len(laws)
            for law_title, law_data in laws.items():
                _accumulate_law(stats, law_title, law_data)

    return stats


def _print_eda_report(stats):
    """Print the four-section EDA report from the measurements in ``stats``."""
    total_laws = stats['total_laws']
    total_parts = stats['total_parts']
    total_articles = stats['total_articles']
    max_parts_law = stats['max_parts_law']
    max_articles_law = stats['max_articles_law']
    max_tokens_law = stats['max_tokens_law']
    max_articles_part = stats['max_articles_part']
    max_tokens_part = stats['max_tokens_part']
    longest_article = stats['longest_article']
    article_status_counter = stats['article_status_counter']

    print("=============================================")
    print(" ๐ Concise Exploratory Data Analysis (EDA) Report")
    print("=============================================")

    # --- Section 1: overall dataset totals ---
    print("\n## 1. Overall Dataset Statistics โ๏ธ")
    print("---------------------------------------------")
    print(f"  Total Number of Laws:  {total_laws:,}")
    print(f"  Total Number of Parts: {total_parts:,}")
    print(f"  Total Number of Articles: {total_articles:,}")

    # --- Section 2: structural analysis (law and part level) ---
    print("\n## 2. Structural Analysis (Law & Part Level) ๐")
    print("---------------------------------------------")

    print_stats("Articles per Part", stats['articles_per_part_counts'])
    print(f"  Part with the Most Articles: Law: \"{max_articles_part['law']}\", Part: \"{max_articles_part['part']}\" ({max_articles_part['count']} articles)")

    print_stats("Articles per Law", stats['articles_per_law_counts'])
    print(f"  Law with Most Articles: \"{max_articles_law['title']}\" ({max_articles_law['count']:,} articles)")

    print_stats("Parts per Law", stats['parts_per_law_counts'])
    print(f"  Law with Most Parts: \"{max_parts_law['title']}\" ({max_parts_law['count']} parts)")

    # --- Section 3: token / length analysis ---
    print("\n## 3. Token/Length Analysis โ๏ธ")
    print("---------------------------------------------")

    print_stats("Tokens per Article", stats['tokens_per_article_counts'])
    print("  Longest Single Article (Tokens):")
    print(f"    Law:     \"{longest_article['law']}\"")
    print(f"    Article: \"{longest_article['title']}\"")
    print(f"    Length:  {longest_article['length']:,} tokens")

    print_stats("Tokens per Part", stats['tokens_per_part_counts'])
    print("  Longest Part (by tokens):")
    print(f"    Law:     \"{max_tokens_part['law']}\"")
    print(f"    Part:    \"{max_tokens_part['part']}\"")
    print(f"    Length:  {max_tokens_part['length']:,} tokens")

    print_stats("Tokens per Law", stats['tokens_per_law_counts'])
    print(f"  Longest Law (by tokens): \"{max_tokens_law['title']}\" ({max_tokens_law['length']:,} tokens)")

    print_stats("Law Brief Token Length", stats['brief_token_lengths'])

    # --- Section 4: article status distribution ---
    print("\n## 4. Article Status โน๏ธ")
    print("---------------------------------------------")

    print("\n  Article Status Distribution:")
    # A small DataFrame gives a count + percentage table for free.
    article_status_df = pd.DataFrame(
        article_status_counter.items(),
        columns=['Status', 'Count']
    ).set_index('Status')
    article_status_df['Percentage'] = (article_status_df['Count'] / total_articles * 100).round(2)
    # NOTE(review): floatfmt=",.0f" applies to every float cell, so the
    # Percentage column (rounded to 2 decimals above) is presumably rendered
    # without decimals -- confirm that is the intended presentation.
    print(article_status_df.to_markdown(floatfmt=",.0f"))

    print("\n=============================================")
    print(" โ Concise EDA Report Complete")
    print("=============================================")


def run_eda(data):
    """Run the concise Exploratory Data Analysis and print the report.

    Args:
        data: Nested mapping of main category -> sub-category -> law title ->
            law record. Each law record is read through ``.get('parts', {})``
            (mapping of part title -> list of article dicts) and
            ``.get('brief', '')``. Each article dict is read through
            ``'Article_Text'``, ``'Article_Title'`` and ``'status'``, all
            optional.

    Returns:
        None. The report is written to stdout and progress is logged at INFO.
    """
    logging.info("Starting EDA processing loop...")
    stats = _collect_eda_stats(data)
    logging.info("...EDA processing loop finished.")

    logging.info("Generating report...")
    _print_eda_report(stats)


if __name__ == "__main__":
    # Load the scraped laws and hand them to the EDA. A falsy result
    # (None on load failure, or an empty document) skips the analysis.
    scraped_data = load_data(JSON_FILE_PATH)

    if not scraped_data:
        logging.error("Could not run EDA because data failed to load.")
    else:
        run_eda(scraped_data)

2025-10-19 04:31:56,810 - INFO - Loading data from saudi_laws_scraped.json...
2025-10-19 04:31:56,865 - INFO - Data loaded successfully.
2025-10-19 04:31:56,870 - INFO - Starting EDA processing loop...
2025-10-19 04:32:00,568 - INFO - ...EDA processing loop finished.
2025-10-19 04:32:00,569 - INFO - Generating report...


 ๐ Concise Exploratory Data Analysis (EDA) Report

## 1. Overall Dataset Statistics โ๏ธ
---------------------------------------------
  Total Number of Laws:  517
  Total Number of Parts: 2,039
  Total Number of Articles: 16,371

## 2. Structural Analysis (Law & Part Level) ๐
---------------------------------------------

--- Statistics for Articles per Part ---
  Mean:   8.03
  Median: 5
  Min:    1
  Max:    222
  --- Quantiles ---
  25th (Q1): 2
  75th (Q3): 11
  90th:      18
  IQR (Q3-Q1): 9
  Part with the Most Articles: Law: "نظام الإجراءات الجزائية", Part: "main" (222 articles)

--- Statistics for Articles per Law ---
  Mean:   31.67
  Median: 18
  Min:    0
  Max:    721
  --- Quantiles ---
  25th (Q1): 14
  75th (Q3): 30
  90th:      51
  IQR (Q3-Q1): 16
  Law with Most Articles: "نظام المعاملات المدنية" (721 articles)

--- Statistics for Parts per Law ---
  Mean:   3.94
  Median: 1
  Min:    0
  Max:    133
  --- Quantiles --

In [10]:
def build_search_index(file_path="saudi_laws_scraped.json"):
    """
    Read the hierarchical laws JSON and flatten it into one dict keyed by
    law title.

    The two category levels are discarded. If the same title occurs more
    than once, the entry encountered last replaces the earlier one.
    Returns an empty dict (after printing an error) when the file is missing.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            hierarchy = json.load(handle)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return {}

    flat_index = {}

    # main category -> sub-category -> {law title: law record}
    for sub_categories in hierarchy.values():
        for laws in sub_categories.values():
            flat_index.update(laws.items())

    return flat_index

# Build the law-title -> law-record lookup once, from the default JSON path.
LAW_SEARCH_INDEX = build_search_index()


In [11]:
# Inspect one law record by title (key restored from mis-decoded UTF-8).
LAW_SEARCH_INDEX["نظام الإجراءات الجزائية"]

{'brief': 'ูุชุถูู ุงููุธุงู:\nุฃุญูุงู ุนุงูุฉุ (ูุทุงู ุชุทุจูู ุงููุธุงูุ ุชูููู ุงูููุงุทู ูุชูุชูุดูุ ุญููู ุงููุชูู..ุฅูุฎ) ุ ุงูุฏุนูู ุงูุฌุฒุงุฆูุฉ (ุฑูุน ุงูุฏุนููุ ุงููุถุงุก ุงูุฏุนูู)ุ ุฅุฌุฑุงุกุงุช ุงูุงุณุชุฏูุงู (ุฌูุน ุงููุนูููุงุช ูุถุจุทูุงุ ุงูุชูุจุณ ุจุงูุฌุฑููุฉุ ุงููุจุถ ุนูู ุงููุชููุ ุชูุชูุด ุงูุฃุดุฎุงุต ูุงููุณุงููุ ุถุจุท ุงูุฑุณุงุฆู ููุฑุงูุจุฉ ุงูููุงููุงุช)ุ ุฅุฌุฑุงุกุงุช ุงูุชุญููู (ุชุตุฑูุงุช ุงููุญููุ ูุฏุจ ุงูุฎุจุฑุงุกุ ุงูุงูุชูุงู ูุงููุนุงููุฉ ูุงูุชูุชูุด ูุถุจุท ุงูุฃุดูุงุก ุงููุชุนููุฉ ุจุงูุฌุฑููุฉุ ุงูุชุตุฑู ูู ุงูุฃุดูุงุก ุงููุถุจูุทุฉุ ุงูุงุณุชูุงุน ุฅูู ุงูุดููุฏุ ุงูุงุณุชุฌูุงุจ ูุงูููุงุฌูุฉุ ุงูุชูููู ุจุงูุญุถูุฑ ูุฃูุฑ ุงูุถุจุท ูุงูุฅุญุถุงุฑุ ุฃูุฑ ุงูุชููููุ ุงูุฅูุฑุงุฌ ุงููุคูุชุ ุงูุชูุงุก ุงูุชุญููู ูุงูุชุตุฑู ูู ุงู

In [17]:
# Inspect the article list of one part of one law (keys restored from mis-decoded UTF-8).
LAW_SEARCH_INDEX["نظام البيئة"]['parts']['الفصل الأول ( أحكام عامة )']

[{'id': 6635,
  'Article_Title': 'ุงููุงุฏุฉ ุงูุฃููู',
  'status': 'Modified',
  'Article_Text': 'ููุตุฏ ุจุงูุฃููุงุธ ูุงูุนุจุงุฑุงุช ุงูุขุชูุฉ -ุฃูููุง ูุฑุฏุช ูู ูุฐุง ุงููุธุงู- ุงููุนุงูู ุงููุจููุฉ ุฃูุงู ูู ูููุงุ ูุง ูู ููุชุถ ุงูุณูุงู ุบูุฑ ุฐูู:\nุงููุธุงู: ูุธุงู ุงูุจูุฆุฉ.\nุงูููุงุฆุญ: ุงูููุงุฆุญ ุงูุชูููุฐูุฉ ูููุธุงู.\nุงููุฒุงุฑุฉ: ูุฒุงุฑุฉ ุงูุจูุฆุฉ ูุงูููุงู ูุงูุฒุฑุงุนุฉ.\nุงููุฒูุฑ: ูุฒูุฑ ุงูุจูุฆุฉ ูุงูููุงู ูุงูุฒุฑุงุนุฉ.\nุงูุฌูุฉ ุงููุฎุชุตุฉ: ุงููุฒุงุฑุฉุ ุฃู ุฃู ูู ุงููุฑุงูุฒ ุงููุทููุฉ ููุทุงุน ุงูุจูุฆุฉุ ูู ุจุญุณุจ ุงุฎุชุตุงุตูุ ููููุงู ููุง ุชุญุฏุฏู ุงูููุงุฆุญ.\nุงูุฌูุฉ ุงููุดุฑูุฉ: ุฃู ุฌูุฉ ุญููููุฉ ููุฎููููุฉ ูุธุงูุงู ุจุงูุฅุดุฑุงู ุนูู ูุทุงุน ุชููููุ ูุชุฎุชุต ุจุฅุตุฏุงุฑ ุชุฑุงุฎูุต ูููุงุฑุณุฉ ุงูุฃูุดุทุฉ ุงูุชู ุชุฎุถุน ูุฅุดุฑุงููุง.\nุงูุดุฎุต: ุฃ

In [23]:
# Number of laws that are split into more than one part.
sum(1 for law in LAW_SEARCH_INDEX.values() if len(law['parts']) > 1)

116