<a href="https://colab.research.google.com/github/Layantt/Data-Science-Project/blob/main/DataScience.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Required Libraries Installation
import subprocess
import sys

def install_packages():
    packages = ["requests", "beautifulsoup4", "pandas", "lxml"]
    for package in packages:
        try:
            __import__(package.replace("beautifulsoup4", "bs4"))
        except ImportError:
            subprocess.run([sys.executable, "-m", "pip", "install", package], check=True)

install_packages()

# Import Libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
import re
from datetime import datetime
from google.colab import files

# Create Output Directory
output_dir = "raw_data"
os.makedirs(output_dir, exist_ok=True)

# Saudi Personal Data Protection Law URL
url = "https://laws.boe.gov.sa/boelaws/laws/lawdetails/b7cfae89-828e-4994-b167-adaa00e37188/1"

print("Starting enhanced data collection with article status detection...")

try:
    # Fetch Web Page
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    # Save Raw HTML (Unstructured Data)
    html_file = os.path.join(output_dir, "pdpl_raw_html.html")
    with open(html_file, "w", encoding="utf-8") as f:
        f.write(response.text)
    print(f"Raw HTML saved: {html_file}")

    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(response.text, "lxml")

    # NEW: Extract articles using HTML structure first (if available)
    articles_from_html = []
    article_divs = soup.find_all("div", class_="article_item")

    if article_divs:
        print(f"Found {len(article_divs)} article divs in HTML structure")

        for div in article_divs:
            # Determine article status based on CSS classes
            css_classes = div.get("class", [])

            # Check if article is modified
            if "changed-article" in css_classes:
                article_status = "معدلة"
            elif "no_alternate" in css_classes:
                article_status = "أصلية"
            else:
                article_status = "غير محدد"

            # Extract article title
            title_element = div.find("h3")
            article_title = title_element.get_text(strip=True) if title_element else "عنوان غير محدد"

            # Extract article content
            content_div = div.find("div", class_="HTMLContainer")
            if content_div:
                article_text = content_div.get_text(separator=" ", strip=True)

                # NEW: Extract modification text from popup if exists
                modification_text = "لا يوجد نص معدل"  # Default for original articles

                if article_status == "معدلة":
                    # Look for article_item_popup within this article div
                    popup_div = div.find("div", class_="article_item_popup")
                    if popup_div:
                        popup_content = popup_div.find("div", class_="HTMLContainer")
                        if popup_content:
                            popup_text = popup_content.get_text(separator=" ", strip=True)
                            if popup_text and len(popup_text) > 10:
                                modification_text = popup_text
                        else:
                            # If no HTMLContainer in popup, get all text from popup
                            popup_text = popup_div.get_text(separator=" ", strip=True)
                            if popup_text and len(popup_text) > 10:
                                modification_text = popup_text

                # Clean content from administrative text
                if len(article_text) > 10:  # Ensure meaningful content
                    articles_from_html.append({
                        "article_title": article_title,
                        "article_text": article_text,
                        "article_status": article_status,
                        "modification_text": modification_text
                    })

    # Fallback: Extract from text if no HTML structure found
    articles_from_text = []
    if not articles_from_html:
        print("No structured HTML found, falling back to text extraction...")

        text = soup.get_text(separator="\n", strip=True)
        lines = text.split("\n")
        current_article = None
        current_content = []
        article_found = False

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Check for New Article Header
            if re.match(r"^المادة\s+", line):
                # Save Previous Article if exists
                if current_article and current_content and article_found:
                    article_text = " ".join(current_content).strip()
                    if len(article_text) > 10:

                        # Determine status from text content
                        article_status = "أصلية"  # Default
                        modification_text = "لا يوجد نص معدل"  # Default for original articles

                        if any(indicator in article_text for indicator in [
                            "عُدلت هذه المادة", "المرسوم الملكي", "بموجب", "معدلة"
                        ]):
                            article_status = "معدلة"
                            # Try to extract modification text from content
                            modification_patterns = [
                                r'عُدلت هذه المادة.*?"(.*?)"',
                                r'بموجب.*?المرسوم الملكي.*?(.*?)(?:\.|$)',
                                r'لتكون بالنص الآتي.*?"(.*?)"'
                            ]

                            for pattern in modification_patterns:
                                match = re.search(pattern, article_text, re.DOTALL)
                                if match:
                                    modification_text = match.group(1).strip()
                                    break

                            # If no specific pattern found, use general modification text
                            if modification_text == "لا يوجد نص معدل":
                                mod_start = article_text.find("عُدلت هذه المادة")
                                if mod_start != -1:
                                    modification_text = article_text[mod_start:mod_start+200] + "..."

                        articles_from_text.append({
                            "article_title": current_article,
                            "article_text": article_text,
                            "article_status": article_status,
                            "modification_text": modification_text
                        })

                # Start New Article
                current_article = line
                current_content = []
                article_found = True

            else:
                # Add Content to Current Article
                if current_article and article_found:
                    # Smart Filtering
                    line_lower = line.lower().strip()

                    # Skip Very Short Lines
                    if len(line) <= 3:
                        continue

                    # Skip Administrative Lines
                    skip_line = False
                    if (line.startswith("تاريخ") or
                        line.startswith("رقم") or
                        line.startswith("الجريدة الرسمية") or
                        line.startswith("*") or
                        line.startswith("-")):
                        skip_line = True

                    # Administrative patterns
                    admin_only_patterns = [
                        "تعديلات المادة",
                        "مادة معدلة",
                        "مادة ملغية"
                    ]

                    for pattern in admin_only_patterns:
                        if (pattern in line_lower and
                            len(line.strip()) < 150 and
                            line.count('.') <= 1):
                            skip_line = True
                            break

                    if not skip_line:
                        current_content.append(line)

        # Save Last Article
        if current_article and current_content and article_found:
            article_text = " ".join(current_content).strip()
            if len(article_text) > 10:
                # Determine status from text content
                article_status = "أصلية"  # Default
                modification_text = "لا يوجد نص معدل"  # Default for original articles

                if any(indicator in article_text for indicator in [
                    "عُدلت هذه المادة", "المرسوم الملكي", "بموجب", "معدلة"
                ]):
                    article_status = "معدلة"
                    # Try to extract modification text from content
                    modification_patterns = [
                        r'عُدلت هذه المادة.*?"(.*?)"',
                        r'بموجب.*?المرسوم الملكي.*?(.*?)(?:\.|$)',
                        r'لتكون بالنص الآتي.*?"(.*?)"'
                    ]

                    for pattern in modification_patterns:
                        match = re.search(pattern, article_text, re.DOTALL)
                        if match:
                            modification_text = match.group(1).strip()
                            break

                    # If no specific pattern found, use general modification text
                    if modification_text == "لا يوجد نص معدل":
                        mod_start = article_text.find("عُدلت هذه المادة")
                        if mod_start != -1:
                            modification_text = article_text[mod_start:mod_start+200] + "..."

                articles_from_text.append({
                    "article_title": current_article,
                    "article_text": article_text,
                    "article_status": article_status,
                    "modification_text": modification_text
                })

    # Use HTML extraction if available, otherwise use text extraction
    articles = articles_from_html if articles_from_html else articles_from_text

    # Convert to DataFrame
    df = pd.DataFrame(articles)

    # Data Cleaning
    if not df.empty:
        df = df[df["article_text"].str.len() > 10]  # Meaningful content
        df = df.drop_duplicates(subset=["article_title"])  # Remove duplicates

        # Special handling for first article duplicates
        first_articles = df[df["article_title"].str.contains("المادة الأولى|المادة \\(1\\)|المادة ١", case=False, na=False, regex=True)]
        if len(first_articles) > 1:
            print("Detected duplicate first article - fixing...")
            best_first = first_articles.loc[first_articles["article_text"].str.len().idxmax()]
            df = df[~df["article_title"].str.contains("المادة الأولى|المادة \\(1\\)|المادة ١", case=False, na=False, regex=True)]
            df = pd.concat([pd.DataFrame([best_first]), df]).reset_index(drop=True)

    # Save Enhanced Structured Data
    csv_file = os.path.join(output_dir, "pdpl_articles_enhanced.csv")
    df.to_csv(csv_file, index=False, encoding="utf-8-sig")

    print(f"Successfully extracted {len(df)} articles with status information!")
    print(f"Enhanced data saved to: {csv_file}")

    # Display Statistics
    if not df.empty:
        print("\n📊 Article Status Summary:")
        print("-" * 40)
        status_counts = df['article_status'].value_counts()
        for status, count in status_counts.items():
            print(f"• {status}: {count} مادة")

        print(f"\n📝 Total Articles: {len(df)}")

        # Display Sample Data
        print("\n🔍 Sample of collected data:")
        print("-" * 50)
        for i, row in df.head(3).iterrows():
            status_emoji = "✏️" if row['article_status'] == "معدلة" else "📋"
            print(f"{status_emoji} {row['article_title']} ({row['article_status']})")
            print(f"   Content: {row['article_text'][:100]}...")
            print(f"   Modification: {row['modification_text'][:100]}{'...' if len(row['modification_text']) > 100 else ''}")
            print()

except Exception as e:
    print(f"❌ Error in data collection: {str(e)}")
    import traceback
    traceback.print_exc()

print(f"\n📁 Files saved in directory: {output_dir}")
print("✅ Enhanced data collection completed successfully!")

# Download files (for Google Colab)
try:
    files.download(f'{output_dir}/pdpl_raw_html.html')
    files.download(f'{output_dir}/pdpl_articles_enhanced.csv')
except:
    print("📥 Files ready for download in the output directory")

Starting enhanced data collection with article status detection...
❌ Error in data collection: HTTPSConnectionPool(host='laws.boe.gov.sa', port=443): Max retries exceeded with url: /boelaws/laws/lawdetails/b7cfae89-828e-4994-b167-adaa00e37188/1 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1010)')))

📁 Files saved in directory: raw_data
✅ Enhanced data collection completed successfully!
📥 Files ready for download in the output directory


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connectionpool.py", line 464, in _make_request
    self._validate_conn(conn)
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connectionpool.py", line 1093, in _validate_conn
    conn.connect()
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connection.py", line 790, in connect
    sock_and_verified = _ssl_wrap_socket_and_match_hostname(
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/urllib3/connection.py", line 969, in _ssl_wrap_socket_and_match_hostname
    ssl_sock = ssl_wrap_socket(
               ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/urllib3/util/ssl_.py", line 480, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(sock, context, tls_in_tls, server_hostname)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-package

# Data Processing and Cleaning

In [2]:
# Import required libraries
import pandas as pd

# Load the structured dataset extracted from the scraping code
df = pd.read_csv("raw_data/pdpl_articles_enhanced.csv")

# Display basic info
print("Initial shape:", df.shape)
df.info()


FileNotFoundError: [Errno 2] No such file or directory: 'raw_data/pdpl_articles_enhanced.csv'

In [None]:
# Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# Check for duplicates
duplicates = df.duplicated(subset=["article_title"]).sum()
print(f"\nDuplicate articles found: {duplicates}")

# Display sample
df.head()


In [None]:
import re

def clean_text(text):
    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text)
    # Remove non-Arabic or special symbols if any
    text = re.sub(r'[^\u0600-\u06FF\s.,؛:،]', '', text)
    return text.strip()

# Apply cleaning
df["article_text"] = df["article_text"].apply(clean_text)
df["modification_text"] = df["modification_text"].apply(clean_text)


In [None]:
# Normalize status column to ensure consistency
df["article_status"] = df["article_status"].replace({
    "معدلة": "Modified",
    "أصلية": "Original",
    "غير محدد": "Unknown"
})


In [None]:
# Save the cleaned and processed dataset
cleaned_file = "raw_data/pdpl_cleaned.csv"
df.to_csv(cleaned_file, index=False, encoding="utf-8-sig")

print(f"✅ Cleaned dataset saved to: {cleaned_file}")
