<a href="https://colab.research.google.com/github/Layantt/Data-Science-Project/blob/main/DataScience.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Required Libraries Installation
import subprocess
import sys

def install_packages(packages=None):
    """Install any missing third-party packages with pip.

    Each package is probed by importing it; pip is only invoked (through the
    running interpreter, so the install targets the active environment) when
    the import fails.

    Args:
        packages: Iterable of pip distribution names to check. When omitted,
            the packages this notebook needs are used
            (requests, beautifulsoup4, pandas, lxml).

    Returns:
        List of the distribution names that had to be installed, in the order
        they were processed; empty when everything was already importable.

    Raises:
        subprocess.CalledProcessError: If a pip invocation exits non-zero.
    """
    # pip distribution names whose importable module is named differently
    import_names = {"beautifulsoup4": "bs4"}
    if packages is None:
        packages = ["requests", "beautifulsoup4", "pandas", "lxml"]

    installed = []
    for package in packages:
        try:
            __import__(import_names.get(package, package))
        except ImportError:
            subprocess.run([sys.executable, "-m", "pip", "install", package], check=True)
            installed.append(package)
    return installed

# Run the dependency check first so the third-party imports that follow
# (requests, pandas, bs4) can succeed.
install_packages()

# Import Libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
import re
from datetime import datetime
from google.colab import files

# Page to scrape: the Saudi Personal Data Protection Law
url = "https://laws.boe.gov.sa/boelaws/laws/lawdetails/b7cfae89-828e-4994-b167-adaa00e37188/1"

# Folder for everything this run writes; created up front, reused if present
output_dir = "raw_data"
os.makedirs(output_dir, exist_ok=True)

print("Starting data collection from official source...")

# ---------------------------------------------------------------------------
# Parsing / cleaning helpers
# ---------------------------------------------------------------------------

# A line that starts with "المادة" ("Article") followed by whitespace opens a
# new article.
_ARTICLE_HEADER = re.compile(r"^المادة\s+")

# Line prefixes treated as administrative lines rather than article text.
_ADMIN_PREFIXES = ("تاريخ", "رقم", "الجريدة الرسمية", "*", "-")

# Phrases from amendment boxes (pure administrative information).
_ADMIN_PATTERNS = (
    "تعديلات المادة",
    "مادة معدلة",
    "مادة ملغية",
    "معدلة بموجب المرسوم الملكي",
    "ملغاة بموجب المرسوم الملكي",
    "وتاريخ",
)

# Titles that denote Article 1. The negative lookahead keeps "المادة ١" from
# also matching articles 10-19 written in Arabic-Indic digits ("المادة ١٠" ...).
_FIRST_ARTICLE_PATTERN = r"المادة الأولى|المادة \(1\)|المادة ١(?![٠-٩])"

# Article bodies of this length or shorter are considered empty.
_MIN_ARTICLE_CHARS = 10


def is_administrative_line(line):
    """Return True when a stripped content line should be left out of an article.

    A line is dropped when it is very short (3 characters or fewer), starts
    with one of the administrative prefixes, or is a short (< 150 characters),
    at-most-one-period line containing an amendment-box phrase. Long or
    multi-sentence lines containing those phrases are kept as legal text.
    """
    if len(line) <= 3:
        return True
    if line.startswith(_ADMIN_PREFIXES):
        return True
    looks_like_note = len(line) < 150 and line.count(".") <= 1
    return looks_like_note and any(pattern in line for pattern in _ADMIN_PATTERNS)


def parse_articles(text, source_url, collected_at):
    """Split the page's plain text into one record per article.

    Args:
        text: Newline-separated text extracted from the page.
        source_url: URL stored in every record's ``source_url`` field.
        collected_at: Timestamp string stored in every record's
            ``collection_date`` field (computed once per run by the caller so
            all rows agree).

    Returns:
        List of dicts with keys ``article_title``, ``article_text``,
        ``source_url`` and ``collection_date``, in document order. Text that
        precedes the first article header is ignored, and articles whose
        remaining text is 10 characters or fewer are omitted.
    """
    sections = []  # (title, [content lines]) in document order
    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue
        if _ARTICLE_HEADER.match(line):
            sections.append((line, []))
        elif sections and not is_administrative_line(line):
            sections[-1][1].append(line)

    records = []
    for title, content_lines in sections:
        article_text = " ".join(content_lines).strip()
        if len(article_text) > _MIN_ARTICLE_CHARS:
            records.append({
                "article_title": title,
                "article_text": article_text,
                "source_url": source_url,
                "collection_date": collected_at,
            })
    return records


def clean_articles(df):
    """Drop empty and duplicate articles and collapse repeated first articles.

    Args:
        df: DataFrame with at least ``article_title`` and ``article_text``.

    Returns:
        A DataFrame without near-empty rows or repeated titles. If Article 1
        appears under more than one title form, only the variant with the
        longest text is kept and it is moved to the top (index is reset).
    """
    df = df[df["article_text"].str.len() > _MIN_ARTICLE_CHARS]
    df = df.drop_duplicates(subset=["article_title"])

    is_first = df["article_title"].str.contains(_FIRST_ARTICLE_PATTERN, na=False, regex=True)
    if is_first.sum() > 1:
        print("Detected duplicate first article - fixing...")
        candidates = df[is_first]
        # The longest variant is taken to be the most complete one
        best_first = candidates.loc[candidates["article_text"].str.len().idxmax()]
        df = pd.concat([pd.DataFrame([best_first]), df[~is_first]]).reset_index(drop=True)
    return df


# ---------------------------------------------------------------------------
# Fetch, parse, save
# ---------------------------------------------------------------------------
try:
    # Fetch Web Page
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    # Without a charset in the HTTP headers, requests decodes text responses
    # as ISO-8859-1, which would garble the Arabic text; fall back to the
    # encoding detected from the body in that case.
    if response.encoding is None or response.encoding.lower() == "iso-8859-1":
        response.encoding = response.apparent_encoding

    # Save Raw HTML (Unstructured Data)
    html_file = os.path.join(output_dir, "pdpl_raw_html.html")
    with open(html_file, "w", encoding="utf-8") as f:
        f.write(response.text)
    print(f"Raw HTML saved: {html_file}")

    # Parse HTML and Extract Articles
    soup = BeautifulSoup(response.text, "lxml")
    text = soup.get_text(separator="\n", strip=True)

    # One timestamp for the whole run so every row carries the same value
    collected_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    articles = parse_articles(text, url, collected_at)
    if not articles:
        # An empty list would otherwise surface later as KeyError: 'article_text'
        raise ValueError(
            "No articles were found in the downloaded page; the page layout may have changed"
        )

    # Convert to DataFrame and remove empty / duplicate articles
    df = clean_articles(pd.DataFrame(articles))

    # Save Structured Data (utf-8-sig writes a BOM at the start of the file)
    csv_file = os.path.join(output_dir, "pdpl_articles.csv")
    df.to_csv(csv_file, index=False, encoding="utf-8-sig")

    print(f"Successfully extracted {len(df)} articles and saved to: {csv_file}")

    # Display Data Sample
    print("\nSample of collected data:")
    print("-" * 50)
    for _, row in df.head(3).iterrows():
        print(f"• {row['article_title']}")
        print(f"   Content: {row['article_text'][:100]}...")
        print()

except Exception as e:
    # Top-level boundary of the notebook cell: report the failure instead of
    # dumping a traceback, and skip the success message / downloads below.
    print(f"Error in data collection: {str(e)}")
else:
    # Only reached when every step above succeeded, so both files exist.
    print(f"\nFiles saved in directory: {output_dir}")
    print("Data collection completed successfully!")
    # Download exactly the files written above rather than hard-coded paths
    files.download(html_file)
    files.download(csv_file)