In [1]:
import requests
import pandas as pd
import os
import subprocess
import random
import string
import base64
import glob
import time
import re
import logging

from tqdm.auto import tqdm, trange
from bs4 import BeautifulSoup
from pprint import pprint

from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
from pathlib import Path

from io import BytesIO
from pypdf import PdfReader
from pypdf.errors import PdfReadError

pd.options.display.max_rows = 500
pd.options.display.max_colwidth = 500

- Metadata is the most important thing. DON'T MESS UP THE METADATA!!!

In [2]:
# len(os.listdir("../pdf_documents"))

In [3]:
def is_valid_pdf(response, logger):
    """
    Decide whether an HTTP response carries a parseable PDF document.

    Three checks run in order: the status code must be 200, the
    ``Content-Type`` header must mention "pdf", and ``PdfReader`` must be
    able to open the body. Every rejection is reported via ``logger.error``.

    Args:
        response (requests.Response): Response to inspect. The whole body is
            read through ``response.content`` for the parse check.
        logger (logging.Logger): Receives one error message per rejection.

    Returns:
        bool: True when all three checks pass; False otherwise, including
        when an unexpected exception is raised while validating.
    """
    verdict = False
    try:
        if response.status_code != 200:
            logger.error(f"Invalid response status code: {response.status_code}")
        else:
            content_type = response.headers.get("Content-Type", "").lower()
            if "pdf" in content_type:
                # Constructing the reader is the actual validity probe.
                PdfReader(BytesIO(response.content))
                verdict = True
            else:
                logger.error(f"Invalid content type: {content_type}")
    except PdfReadError:
        logger.error("Failed to parse PDF. The file is not a valid PDF.")
    except Exception as e:
        logger.error(f"Unexpected error while validating PDF: {e}")
    return verdict


def download_pdf(
    session: requests.Session,
    logger,
    url: str,
    save_path_str: str,
    verify: bool = False,
):
    """
    Download ``url`` and store it at ``save_path_str`` if it is a valid PDF.

    Args:
        session (requests.Session): Session used to issue the GET request.
        logger (logging.Logger): Receives progress and error messages.
        url (str): Address of the document to fetch.
        save_path_str (str): Destination file path.
        verify (bool): Forwarded to ``session.get`` as ``verify``. The default
            of False turns TLS certificate verification off.

    Returns:
        bool: True when the file was written, False on any failure (request
        error, HTTP error status, body that is not a valid PDF, or a failure
        while writing the file).
    """
    logger.info(f"Starting download from: {url}")

    try:
        response = session.get(url, stream=True, verify=verify, timeout=20)
    except requests.exceptions.RequestException as e:
        logger.error(f"Request failed: {e}")
        return False

    # With stream=True the connection is only handed back to the pool once the
    # body has been read or the response is closed, so close it on every exit
    # path (the early returns below never read the body).
    try:
        try:
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.error(f"Request failed: {e}")
            return False

        if not is_valid_pdf(response, logger):
            logger.error(f"Download aborted: {url} is not a valid PDF.")
            return False

        save_path = Path(save_path_str)
        file_opened = False

        try:
            with save_path.open("wb") as file:
                file_opened = True
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        file.write(chunk)
        except OSError as e:
            logger.error(f"Failed to save file: {e}")
            if file_opened:
                # The target was truncated by open("wb"); do not leave a
                # partial PDF behind that later steps could mistake for a
                # completed download.
                try:
                    save_path.unlink(missing_ok=True)
                except OSError as cleanup_error:
                    logger.error(f"Failed to remove partial file: {cleanup_error}")
            return False

        logger.info(f"File successfully downloaded to: {save_path}")
        return True
    finally:
        response.close()


In [4]:
# Directory that downloaded PDF files are written into.
BASE_PATH_SAVE = "../pdf_documents"
# Directory that metadata CSV files are written into.
BASE_PATH_SAVE_META = "../pdf_meta"

In [5]:
def wget_pdf(command: list) -> None:
    """
    Run a download command as a subprocess and print the outcome.

    Args:
        command (list): Argument list handed to ``subprocess.run`` as-is.

    Returns:
        None. A non-zero exit status is reported on stdout instead of being
        raised to the caller.
    """
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"An error occurred: {e}")
    else:
        print("File downloaded successfully!")


def save_pdf_meta(meta: dict, meta_save_path: str):
    """
    Write a single metadata record to ``meta_save_path`` as a one-row CSV.

    Args:
        meta (dict): Record whose keys become the CSV columns.
        meta_save_path (str): Destination CSV path.
    """
    single_row = pd.DataFrame([meta])
    single_row.to_csv(meta_save_path, index=False)


def generate_random_string(length=10):
    """
    Build a random string of ASCII letters and digits.

    Args:
        length (int): Number of characters to draw (default 10).

    Returns:
        str: ``length`` characters sampled with replacement via ``random.choices``.
    """
    alphabet = string.ascii_letters + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)

In [6]:
def string_to_code(input_string: str) -> str:
    """
    Turn a string into a deterministic URL-safe Base64 code.

    Spaces are removed before encoding, and a code of 255 characters or more
    is cut down to its last 64 characters. Because of both steps the mapping
    is not reversible in general: inputs that differ only in spaces share a
    code, and a truncated code no longer represents the whole input.

    Args:
        input_string (str): Text to encode.

    Returns:
        str: The Base64 code, or its 64-character tail when it was too long.
    """
    compact_bytes = input_string.replace(" ", "").encode("utf-8")
    encoded_string = base64.urlsafe_b64encode(compact_bytes).decode("utf-8")
    print(f"lenght encode string: {len(encoded_string)}")

    if len(encoded_string) < 255:
        return encoded_string

    print("name too long, truncate name")
    return encoded_string[-64:]


def code_to_string(encoded_string: str) -> str:
    """
    Decode a URL-safe Base64 code back into text.

    Args:
        encoded_string (str): URL-safe Base64 text to decode.

    Returns:
        str: The decoded bytes interpreted as UTF-8.
    """
    raw = encoded_string.encode("utf-8")
    return base64.urlsafe_b64decode(raw).decode("utf-8")

In [7]:
# Example usage
original_string = "Hello, World!?"
encoded = string_to_code(original_string)
decoded = code_to_string(encoded)

print("Original String:", original_string)
print("Encoded Code:", encoded)
print("Decoded String:", decoded)

lenght encode string: 20
Original String: Hello, World!?
Encoded Code: SGVsbG8sV29ybGQhPw==
Decoded String: Hello,World!?


In [8]:
def remove_file(file_path: str) -> None:
    """
    Delete the file at ``file_path`` and print what happened.

    Args:
        file_path (str): Path of the file to delete.

    Returns:
        None. A missing file and any error raised while deleting are reported
        on stdout rather than raised.
    """
    try:
        if not os.path.exists(file_path):
            print(f"File '{file_path}' does not exist.")
            return
        os.remove(file_path)
        print(f"File '{file_path}' has been removed successfully.")
    except Exception as e:
        print(f"An error occurred while trying to remove the file: {e}")

In [9]:
def get_pdf_and_save_meta(
    title: str,
    url: str,
    source: str,
    license_: str,
    filename: str,
    command: list,
    meta_save_path: str,
) -> None:
    """
    Run a download command and write the matching metadata record.

    The record (title, url, source, license, filename) is printed first, then
    ``command`` is executed through ``wget_pdf`` and the record is saved to
    ``meta_save_path`` through ``save_pdf_meta``.

    NOTE(review): the record is saved after ``wget_pdf`` returns, with no
    check of whether the command succeeded - confirm that is intended.

    Args:
        title (str): Document title.
        url (str): Document URL.
        source (str): Where the document comes from.
        license_ (str): License stored under the ``license`` key.
        filename (str): File name stored in the record.
        command (list): Argument list passed to ``wget_pdf``.
        meta_save_path (str): CSV path the record is written to.
    """
    record = dict(
        title=title,
        url=url,
        source=source,
        license=license_,
        filename=filename,
    )
    print("meta view")
    pprint(record)

    wget_pdf(command=command)
    save_pdf_meta(meta=record, meta_save_path=meta_save_path)

In [10]:
# web_url = "https://gdcatalog.go.th/dataset/gdpublish-dsb21-1"
# # Use a session to reuse the TCP connection for faster requests
# session = requests.Session()

# # Fetch and parse the main page
# response = session.get(web_url)
# response.raise_for_status()
# soup = BeautifulSoup(response.text, "html.parser")

# # Extract resource links with titles
# base_url = "https://gdcatalog.go.th"
# resources_section = soup.find("section", id="dataset-resources")
# resource_links = [
#     {
#         "title": a["title"].strip(),
#         "href": base_url + a["href"],
#         "source": "Goverment data catalog smart plus",
#         "license": "Open Data Common",
#     }
#     for a in resources_section.find_all("a", class_="heading")
# ]

# # Process each resource to extract the PDF URL
# for i, resource in enumerate(tqdm(resource_links, desc="Processing resources")):
#     pdf_page = session.get(resource["href"])
#     pdf_page.raise_for_status()
#     pdf_soup = BeautifulSoup(pdf_page.text, "html.parser")
#     pdf_url = pdf_soup.find("a", class_="resource-url-analytics")["href"]
#     resource["url"] = pdf_url
#     resource["filename"] = f"pdf_มูลค่าและระดับการเติบโตของกิจกรรมทางเศรษฐกิจดิจิทัล_0{i}.pdf"
#     resource.pop("href", None)

# resource_links = list(filter(lambda x: x["url"].endswith(".pdf"), resource_links))
# pd.DataFrame(resource_links).to_csv(
#     "../pdf_documents_meta_tmp/pdf_มูลค่าและระดับการเติบโตของกิจกรรมทางเศรษฐกิจดิจิทัล.csv",
#     index=False,
# )
# for item in tqdm(resource_links, desc="donwloading pdf "):
#     download_pdf(
#         url=item["url"], save_path=os.path.join(BASE_PATH_SAVE, item["filename"])
#     )

In [11]:
# USE ONLY FOR GD DATACATALOG!!!
# Site root that resource hrefs are appended to in `fetch_resource_links`.
BASE_GD_CATALOG_URL = "https://gdcatalog.go.th"
# Value stored in the "source" field of every GD catalog metadata record.
GD_CATALOG_SOURCE = "Government data catalog smart plus"


def fetch_resource_links(
    request_session: requests.Session, web_url: str, license: str
) -> list[dict]:
    """
    Collect the resource entries listed on a GD catalog dataset page.

    Args:
        request_session (requests.Session): Session used to fetch ``web_url``.
        web_url (str): Dataset page to read.
        license (str): License text copied into every returned record.

    Returns:
        list[dict]: One dict per ``<a class="heading">`` inside the
        ``dataset-resources`` section, with keys ``title``, ``href``,
        ``source`` and ``license``; an empty list when that section is absent.

    Raises:
        requests.HTTPError: Propagated from ``raise_for_status``.
    """
    page = request_session.get(web_url)
    page.raise_for_status()

    parsed = BeautifulSoup(page.text, "html.parser")
    section = parsed.find("section", id="dataset-resources")
    if not section:
        return []

    records = []
    for anchor in section.find_all("a", class_="heading"):
        records.append(
            {
                "title": anchor["title"].strip(),
                "href": BASE_GD_CATALOG_URL + anchor["href"],
                "source": GD_CATALOG_SOURCE,
                "license": license,
            }
        )
    return records


def extract_pdf_urls(
    request_session: requests.Session, resource_links: list[dict], batch_code: str
):
    """
    Visit every resource page and record the download URL it points to.

    Each dict in ``resource_links`` is modified in place: when the page has an
    ``<a class="resource-url-analytics">`` with an href, ``url`` and
    ``filename`` (``pdf_{batch_code}_{index:05}.pdf``) are added; the ``href``
    key is always removed.

    Args:
        request_session (requests.Session): Session used to fetch each page.
        resource_links (list[dict]): Records carrying an ``href`` key.
        batch_code (str): Tag embedded in the generated file names.

    Returns:
        list: The records whose ``url`` ends with ``.pdf``.

    Raises:
        requests.HTTPError: Propagated from ``raise_for_status``.
    """
    for i, resource in enumerate(tqdm(resource_links, desc="Processing resources")):
        page = request_session.get(resource["href"])
        page.raise_for_status()

        anchor = BeautifulSoup(page.text, "html.parser").find(
            "a", class_="resource-url-analytics"
        )
        if anchor and "href" in anchor.attrs:
            resource["url"] = anchor["href"]
            resource["filename"] = f"pdf_{batch_code}_{i:05}.pdf"

        resource.pop("href", None)

    return [
        resource
        for resource in resource_links
        if resource.get("url", "").endswith(".pdf")
    ]


def save_metadata(resource_links, output_path):
    """
    Write resource records to a CSV file and return them as a DataFrame.

    Args:
        resource_links: Records accepted by ``pd.DataFrame`` (a list of dicts
            in this notebook).
        output_path: Destination CSV path; the index is not written.

    Returns:
        pd.DataFrame: The frame that was written.
    """
    metadata = pd.DataFrame(resource_links)
    metadata.to_csv(output_path, index=False)
    return metadata


def download_pdfs(resource_links, save_path, session=None, logger=None):
    """
    Download every PDF described by ``resource_links`` into ``save_path``.

    Each item is saved as ``save_path/<item["filename"]>`` through
    ``download_pdf``, which is given the session, logger, URL and per-item
    target path it requires.

    Args:
        resource_links: Iterable of dicts with ``url`` and ``filename`` keys.
        save_path: Directory the PDFs are written into.
        session (requests.Session, optional): Session to download with. A
            temporary one is created (and closed afterwards) when omitted.
        logger (logging.Logger, optional): Logger handed to ``download_pdf``.
            Defaults to ``logging.getLogger(__name__)``.

    Returns:
        list: The items whose download failed (empty when all succeeded).
    """
    owns_session = session is None
    if owns_session:
        session = requests.Session()
    if logger is None:
        logger = logging.getLogger(__name__)

    failed_items = []
    try:
        for item in tqdm(resource_links, desc="Downloading PDFs"):
            # One file per item; a shared path would make every download
            # overwrite the previous one.
            target_path = os.path.join(save_path, item["filename"])
            succeeded = download_pdf(
                session=session,
                logger=logger,
                url=item["url"],
                save_path_str=target_path,
            )
            if not succeeded:
                failed_items.append(item)
    finally:
        if owns_session:
            session.close()

    return failed_items


def get_multiple_pdf_from_gd_catalog(
    request_session: requests.Session, web_url: str, license: str, batch_code: str
) -> list[dict]:
    """
    Build the PDF metadata records for one GD catalog dataset page.

    Chains ``fetch_resource_links`` (list the resources on ``web_url``) and
    ``extract_pdf_urls`` (resolve each resource to its download URL).

    Args:
        request_session (requests.Session): Session used for all requests.
        web_url (str): Dataset page URL.
        license (str): License text stored in every record.
        batch_code (str): Tag embedded in the generated file names.

    Returns:
        list[dict]: The records returned by ``extract_pdf_urls``.
    """
    listed = fetch_resource_links(request_session, web_url, license)
    return extract_pdf_urls(request_session, listed, batch_code)

In [12]:
# get pdf from open base



In [13]:
from datetime import datetime

def get_current_datetime():
    """Return ``datetime.now()`` formatted as ``YYYY-MM-DD_HH_MM_SS``."""
    return datetime.now().strftime("%Y-%m-%d_%H_%M_%S")

# Example usage
print(get_current_datetime())

2025-02-07_11_38_32


In [14]:
def create_filename_column(df: pd.DataFrame, batch_code: str) -> pd.DataFrame:
    """
    Add a ``filename`` column of the form ``pdf_doc_{batch_code}_{row:05}.pdf``.

    The column is added to ``df`` itself (the frame is modified in place) and
    rows are numbered by position starting at 0.

    Args:
        df (pd.DataFrame): Frame to extend.
        batch_code (str): Tag embedded in every file name.

    Returns:
        pd.DataFrame: The same ``df`` object, now with the ``filename`` column.
    """
    names = []
    for i in range(len(df)):
        names.append(f"pdf_doc_{batch_code}_{i:05}.pdf")
    df["filename"] = names
    return df

In [15]:
def setup_logger(
    prefix: str,
    log_file: str = None,
    console_level: str = "DEBUG",
    file_level: str = "WARNING",
):
    """
    Create (or reconfigure) a logger that writes to the console and a file.

    Each ``prefix`` gets its own logger, so loggers built for different tasks
    do not share handlers. Calling the function again with the same prefix
    replaces the previous handlers instead of stacking new ones, which would
    otherwise emit every message multiple times.

    Args:
        prefix (str): Identifies the logger and is used in the default log
            file name.
        log_file (str, optional): Path of the log file. When omitted,
            ``../log/{prefix}_{timestamp}.log`` is used.
        console_level (str): Minimum level shown on the console.
        file_level (str): Minimum level written to the log file.

    Returns:
        logging.Logger: The configured logger.
    """
    logger = logging.getLogger(f"{__name__}.{prefix}")
    # The logger itself lets everything through; each handler filters by its
    # own level.
    logger.setLevel(logging.DEBUG)

    # Drop handlers left over from an earlier call (e.g. a re-run cell).
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    if log_file is None:
        log_file = f"../log/{prefix}_{get_current_datetime()}.log"
    # FileHandler does not create missing directories.
    Path(log_file).parent.mkdir(parents=True, exist_ok=True)

    console_handler = logging.StreamHandler()
    console_handler.setLevel(console_level)

    file_handler = logging.FileHandler(log_file, mode="a", encoding="utf-8")
    file_handler.setLevel(file_level)

    # Comma-separated records: timestamp, level, message.
    formatter = logging.Formatter(
        "{asctime},{levelname},{message}",
        style="{",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    console_handler.setFormatter(formatter)
    file_handler.setFormatter(formatter)

    logger.addHandler(console_handler)
    logger.addHandler(file_handler)

    return logger

In [16]:
# for i in range(102):  # Change the range as needed
#     filename = f"save_{i:03}"  # Format the number as three digits with leading zeros
#     print(filename)

# Single PDF

- First, create a new meta file. Use the same format as the files in the `pdf_meta` dir.
- Then, download each PDF using `download_pdf`.
- Return the name and link of every PDF that cannot be downloaded.

- meta data ตอนแรกน่าจะต้องทำมือโดยมี columns: title, url, source, license จากนั้นค่อยอ่านไฟล์มาสร้าง columns ใหม่คือ filename
- ให้สร้างไฟล์ใน folder pdf_meta_pre
- filename จะต้องมีเลข batch ด้วย ชื่อไฟล์จะได้ไม่ซ้ำกัน
- จากนั้นให้ save dataframe meta แบบที่มี filename เข้าไปใน pdf_meta

- folder pdf_meta ห้ามมีไฟล์ที่ไม่ใช้

In [None]:
# files = os.listdir("../notuse_pdf_documents_meta_tmp")
# df  = pd.concat([pd.read_csv(os.path.join("../notuse_pdf_documents_meta_tmp", f)) for f in files]).drop(columns="filename")
# filenames = [f"pdf_doc_{i:05}.pdf" for i in range(len(df))]
# df["filename"] = filenames
# df.head(5)
# df.to_csv(f"../pdf_meta/meta_{get_current_datetime()}.csv", index=False)

In [None]:
# path = "../pdf_meta/meta_2025-01-30_10_15_23.csv" # do not delete this line; comment it out instead
# Metadata CSV to process; each row becomes one dict in `meta`.
path = "../pdf_meta/meta_2025-01-31_09_18_50.csv"
meta = pd.read_csv(path).to_dict(orient="records")
# Preview the first three records.
meta[:3]

In [None]:
# Delete any file in BASE_PATH_SAVE whose name appears in the loaded metadata.
# NOTE(review): presumably this clears earlier downloads so the next cell
# starts clean - confirm before running, the files are removed from disk.
delete_filename = [os.path.join(BASE_PATH_SAVE ,item["filename"]) for item in meta]
for f in delete_filename:
    remove_file(f)

In [None]:
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


# Session whose HTTP and HTTPS adapters use a urllib3 Retry policy
# (connect=3, backoff_factor=0.5).
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount("http://", adapter)
session.mount("https://", adapter)

In [None]:
# (title, url, filename) of every PDF that could not be downloaded.
fail_item = []

# download_pdf requires a logger and takes the target path as `save_path_str`.
single_pdf_logger = setup_logger(prefix="single_pdf_download")

for item in tqdm(meta, total=len(meta)):
    pdf_save_path = os.path.join(BASE_PATH_SAVE, item["filename"])
    check = download_pdf(
        session=session,
        logger=single_pdf_logger,
        url=item["url"],
        save_path_str=pdf_save_path,
    )

    # Random pause between requests to avoid hammering the remote servers.
    sleep_time = random.randint(0, 20)
    print(f"sleep for {sleep_time} secs")
    time.sleep(sleep_time)

    if not check:
        fail_item.append((item["title"], item["url"], item["filename"]))

In [None]:
if not fail_item:
    print("every pdf donwloaded successfully")
else:
    print("some pdf have a problem")

In [None]:
fail_item

# Multiple PDF for one site

## GD CATALOG

- We can scrape PDFs from this website.
- Use `get_multiple_pdf_from_gd_catalog` to get the metadata for each dataset page, then concatenate the results.
- Use `download_pdfs` to download all the PDFs.

In [None]:
session = requests.Session()

In [None]:
generate_random_string(5)

In [None]:
n = 5
# format: url, license, batch_code
# batch_code can be anything; it is only used to distinguish file names
# urls = [
#     ("https://gdcatalog.go.th/dataset/gdpublish-abc1-1", "Open Data Common", "A"),
#     ("https://gdcatalog.go.th/dataset/gdpublish-xc6-4", "Open Data Common", "B")
# ]
urls = [
    (
        "https://gdcatalog.go.th/dataset/gdpublish-itd65_42_01",
        "Open Data Common",
        generate_random_string(n),
    ),
    (
        "https://gdcatalog.go.th/dataset/gdpublish-nsoloei",
        "Open Data Common",
        generate_random_string(n),
    ),
    (
        "https://gdcatalog.go.th/dataset/gdpublish-https-www2-uttaradit-go-th-news_devpro",
        "Open Data Common",
        generate_random_string(n),
    ),
    (
        "https://gdcatalog.go.th/dataset/gdpublish-nan01",
        "Open Data Common",
        generate_random_string(n),
    ),
    (
        "https://gdcatalog.go.th/dataset/gdpublish-cop",
        "Open Data Common",
        generate_random_string(n),
    ),
    (
        "https://gdcatalog.go.th/dataset/gdpublish-dataset-20-015",
        "Open Data Common",
        generate_random_string(n),
    ),
    (
        "https://gdcatalog.go.th/dataset/gdpublish-dataset-15-013",
        "Creative Commons Attributions",
        generate_random_string(n),
    ),
    (
        "https://gdcatalog.go.th/dataset/gdpublish-research-21-02",
        "Open Data Common",
        generate_random_string(n),
    ),
    (
        "https://gdcatalog.go.th/dataset/gdpublish-ta-11-04",
        "Open Data Common",
        generate_random_string(n),
    ),
    (
        "https://gdcatalog.go.th/dataset/gdpublish-ta-11-03",
        "Open Data Common",
        generate_random_string(n),
    ),
    (
        "https://gdcatalog.go.th/dataset/gdpublish-ta-11-02",
        "Open Data Common",
        generate_random_string(n),
    ),
    (
        "https://gdcatalog.go.th/dataset/gdpublish-ta-11-01",
        "Open Data Common",
        generate_random_string(n),
    ),
]

In [None]:
# Accumulates the PDF metadata records of every dataset page in `urls`.
tmp = []
for url, license, batch_code in tqdm(urls):
    # Records for one dataset page; batch_code goes into the file names.
    resource_links = get_multiple_pdf_from_gd_catalog(
        session, url, license, batch_code
    )
    tmp.extend(resource_links)

In [None]:
# Fix the column order of the metadata file, then write it under a
# timestamped name in ../pdf_meta.
col_order = ["title", "url", "source", "license", "filename"]
df = pd.DataFrame(tmp).loc[:, col_order]
df.to_csv(f"../pdf_meta/meta_{get_current_datetime()}.csv", index=False)

## Special case

### แจกเอกสารประกอบการสอนเพื่อการศึกษา (PDF) KongRuksiam Studio

In [None]:
filename = [
    "พัฒนาเว็บด้วย Python & Flask (Free).pdf",
    "เรียนรู้การใช้งาน Git & GitHub สำหรับผู้เริ่มต้น.pdf",
    "JSON เบื้องต้น (Update).pdf",
    "Python & OpenCV Computer Vision & Image Processing (Free).pdf",
    "รวมเล่มพัฒนาเว็บด้วย PHP สำหรับผู้เริ่มต้น.pdf",
    "เรียนรู้การใช้งาน Visual Studio Code.pdf",
    "ปูพื้นฐาน HTML CSS JavaScript (Free).pdf",
    "รวมเล่มเขียนโปรแกรมภาษา C เบื้องต้น.pdf",
    "รวมเล่มเขียนโปรแกรมภาษา Python (อัปเดตล่าสุด).pdf",
    "เขียนโปรแกรมเชิงวัตถุด้วยภาษา Java (Update).pdf",
    "Python OOP.pdf",
]

url = [
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/Flask%20Framework%20%E0%B9%80%E0%B8%9A%E0%B8%B7%E0%B9%89%E0%B8%AD%E0%B8%87%E0%B8%95%E0%B9%89%E0%B8%99/%E0%B8%9E%E0%B8%B1%E0%B8%92%E0%B8%99%E0%B8%B2%E0%B9%80%E0%B8%A7%E0%B9%87%E0%B8%9A%E0%B8%94%E0%B9%89%E0%B8%A7%E0%B8%A2%20Python%20%26%20Flask%20(Free).pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/Git%20%26%20GitHub%20%E0%B9%80%E0%B8%9A%E0%B8%B7%E0%B9%89%E0%B8%AD%E0%B8%87%E0%B8%95%E0%B9%89%E0%B8%99/%E0%B9%80%E0%B8%A3%E0%B8%B5%E0%B8%A2%E0%B8%99%E0%B8%A3%E0%B8%B9%E0%B9%89%E0%B8%81%E0%B8%B2%E0%B8%A3%E0%B9%83%E0%B8%8A%E0%B9%89%E0%B8%87%E0%B8%B2%E0%B8%99%20Git%20%26%20GitHub%20%E0%B8%AA%E0%B8%B3%E0%B8%AB%E0%B8%A3%E0%B8%B1%E0%B8%9A%E0%B8%9C%E0%B8%B9%E0%B9%89%E0%B9%80%E0%B8%A3%E0%B8%B4%E0%B9%88%E0%B8%A1%E0%B8%95%E0%B9%89%E0%B8%99.pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/JSON%20%E0%B9%80%E0%B8%9A%E0%B8%B7%E0%B9%89%E0%B8%AD%E0%B8%87%E0%B8%95%E0%B9%89%E0%B8%99/JSON%20%E0%B9%80%E0%B8%9A%E0%B8%B7%E0%B9%89%E0%B8%AD%E0%B8%87%E0%B8%95%E0%B9%89%E0%B8%99%20(Update).pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/OpenCV%20%26%20Python%20%E0%B8%AA%E0%B8%B3%E0%B8%AB%E0%B8%A3%E0%B8%B1%E0%B8%9A%E0%B8%81%E0%B8%B2%E0%B8%A3%E0%B8%9B%E0%B8%A3%E0%B8%B0%E0%B8%A1%E0%B8%A7%E0%B8%A5%E0%B8%9C%E0%B8%A5%E0%B8%A0%E0%B8%B2%E0%B8%9E%20(Image%20Processing)/Python%20%26%20OpenCV%20Computer%20Vision%20%26%20Image%20Processing%20(Free).pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/PHP%20%E0%B9%80%E0%B8%9A%E0%B8%B7%E0%B9%89%E0%B8%AD%E0%B8%87%E0%B8%95%E0%B9%89%E0%B8%99/%E0%B8%A3%E0%B8%A7%E0%B8%A1%E0%B9%80%E0%B8%A5%E0%B9%88%E0%B8%A1%E0%B8%9E%E0%B8%B1%E0%B8%92%E0%B8%99%E0%B8%B2%E0%B9%80%E0%B8%A7%E0%B9%87%E0%B8%9A%E0%B8%94%E0%B9%89%E0%B8%A7%E0%B8%A2%20PHP%20%E0%B8%AA%E0%B8%B3%E0%B8%AB%E0%B8%A3%E0%B8%B1%E0%B8%9A%E0%B8%9C%E0%B8%B9%E0%B9%89%E0%B9%80%E0%B8%A3%E0%B8%B4%E0%B9%88%E0%B8%A1%E0%B8%95%E0%B9%89%E0%B8%99.pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/Visual%20Studio%20Code%20%E0%B9%80%E0%B8%9A%E0%B8%B7%E0%B9%89%E0%B8%AD%E0%B8%87%E0%B8%95%E0%B9%89%E0%B8%99/%E0%B9%80%E0%B8%A3%E0%B8%B5%E0%B8%A2%E0%B8%99%E0%B8%A3%E0%B8%B9%E0%B9%89%E0%B8%81%E0%B8%B2%E0%B8%A3%E0%B9%83%E0%B8%8A%E0%B9%89%E0%B8%87%E0%B8%B2%E0%B8%99%20Visual%20Studio%20Code.pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/%E0%B8%9E%E0%B8%B7%E0%B9%89%E0%B8%99%E0%B8%90%E0%B8%B2%E0%B8%99%20HTML%20%2CCSS%20%2C%20JavaScript/%E0%B8%9B%E0%B8%B9%E0%B8%9E%E0%B8%B7%E0%B9%89%E0%B8%99%E0%B8%90%E0%B8%B2%E0%B8%99%20HTML%20CSS%20JavaScript%20(Free).pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/%E0%B9%80%E0%B8%82%E0%B8%B5%E0%B8%A2%E0%B8%99%E0%B9%82%E0%B8%9B%E0%B8%A3%E0%B9%81%E0%B8%81%E0%B8%A3%E0%B8%A1%E0%B8%A0%E0%B8%B2%E0%B8%A9%E0%B8%B2%20C%20%E0%B9%80%E0%B8%9A%E0%B8%B7%E0%B9%89%E0%B8%AD%E0%B8%87%E0%B8%95%E0%B9%89%E0%B8%99/%E0%B8%A3%E0%B8%A7%E0%B8%A1%E0%B9%80%E0%B8%A5%E0%B9%88%E0%B8%A1%E0%B9%80%E0%B8%82%E0%B8%B5%E0%B8%A2%E0%B8%99%E0%B9%82%E0%B8%9B%E0%B8%A3%E0%B9%81%E0%B8%81%E0%B8%A3%E0%B8%A1%E0%B8%A0%E0%B8%B2%E0%B8%A9%E0%B8%B2%20C%20%E0%B9%80%E0%B8%9A%E0%B8%B7%E0%B9%89%E0%B8%AD%E0%B8%87%E0%B8%95%E0%B9%89%E0%B8%99.pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/%E0%B9%80%E0%B8%82%E0%B8%B5%E0%B8%A2%E0%B8%99%E0%B9%82%E0%B8%9B%E0%B8%A3%E0%B9%81%E0%B8%81%E0%B8%A3%E0%B8%A1%E0%B8%A0%E0%B8%B2%E0%B8%A9%E0%B8%B2%20Python%20(%E0%B8%AD%E0%B8%B1%E0%B8%9B%E0%B9%80%E0%B8%94%E0%B8%95%E0%B8%A5%E0%B9%88%E0%B8%B2%E0%B8%AA%E0%B8%B8%E0%B8%94)/%E0%B8%A3%E0%B8%A7%E0%B8%A1%E0%B9%80%E0%B8%A5%E0%B9%88%E0%B8%A1%E0%B9%80%E0%B8%82%E0%B8%B5%E0%B8%A2%E0%B8%99%E0%B9%82%E0%B8%9B%E0%B8%A3%E0%B9%81%E0%B8%81%E0%B8%A3%E0%B8%A1%E0%B8%A0%E0%B8%B2%E0%B8%A9%E0%B8%B2%20Python%20(%E0%B8%AD%E0%B8%B1%E0%B8%9B%E0%B9%80%E0%B8%94%E0%B8%95%E0%B8%A5%E0%B9%88%E0%B8%B2%E0%B8%AA%E0%B8%B8%E0%B8%94).pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/%E0%B9%80%E0%B8%82%E0%B8%B5%E0%B8%A2%E0%B8%99%E0%B9%82%E0%B8%9B%E0%B8%A3%E0%B9%81%E0%B8%81%E0%B8%A3%E0%B8%A1%E0%B9%80%E0%B8%8A%E0%B8%B4%E0%B8%87%E0%B8%A7%E0%B8%B1%E0%B8%95%E0%B8%96%E0%B8%B8%E0%B8%94%E0%B9%89%E0%B8%A7%E0%B8%A2%20Java/%E0%B9%80%E0%B8%82%E0%B8%B5%E0%B8%A2%E0%B8%99%E0%B9%82%E0%B8%9B%E0%B8%A3%E0%B9%81%E0%B8%81%E0%B8%A3%E0%B8%A1%E0%B9%80%E0%B8%8A%E0%B8%B4%E0%B8%87%E0%B8%A7%E0%B8%B1%E0%B8%95%E0%B8%96%E0%B8%B8%E0%B8%94%E0%B9%89%E0%B8%A7%E0%B8%A2%E0%B8%A0%E0%B8%B2%E0%B8%A9%E0%B8%B2%20Java%20(Update).pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/%E0%B9%80%E0%B8%82%E0%B8%B5%E0%B8%A2%E0%B8%99%E0%B9%82%E0%B8%9B%E0%B8%A3%E0%B9%81%E0%B8%81%E0%B8%A3%E0%B8%A1%E0%B9%80%E0%B8%8A%E0%B8%B4%E0%B8%87%E0%B8%A7%E0%B8%B1%E0%B8%95%E0%B8%96%E0%B8%B8%E0%B8%94%E0%B9%89%E0%B8%A7%E0%B8%A2%20Python/Python%20OOP.pdf",
]
# Every file in this batch shares the same source repository and license.
source = ["https://github.com/kongruksiamza/ebook-for-education"] * len(filename)
license = ["CC BY-NC"] * len(filename)
# The title is the file name without its ".pdf" suffix.
title = [f.replace(".pdf", "") for f in filename]
group = "kongruksiam"

# NOTE(review): the URLs listed above are GitHub "/blob/" page links. These
# presumably return an HTML viewer page rather than the PDF bytes, in which
# case the Content-Type check in `is_valid_pdf` would reject them - confirm,
# and consider storing the "/raw/" form of each link instead.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, f"meta_{group}_{get_current_datetime()}.csv"
)
# The lists are combined by position, one row per document.
df = pd.DataFrame(
    {
        "title": title,
        "url": url,
        "source": source,
        "license": license,
        "filename": filename,
    }
)
df.to_csv(meta_save_path, index=False)

### Open BASE

In [None]:
# Site root used to build page URLs and to absolutise relative links.
BASE_URL_OPENBASE = "https://www.openbase.in.th"
session = requests.Session()
# Retry policy for GET and POST on the listed status codes.
# NOTE(review): 404 is in the retry list - confirm this is deliberate; a
# missing page is retried with backoff before the request finally fails.
RETRY_STRATEGY = Retry(
    total=4,
    backoff_factor=3,
    status_forcelist=[500, 502, 503, 504, 404],
    allowed_methods={"POST", "GET"},
)
session.mount("https://", HTTPAdapter(max_retries=RETRY_STRATEGY))
session.mount("http://", HTTPAdapter(max_retries=RETRY_STRATEGY))

#### Get Pages

In [None]:
# url = "https://www.openbase.in.th/categories/search?page=1"
# response = session.get(url)
# soup = BeautifulSoup(response.text, "html.parser")
# soup.find_all("ttt")

In [None]:
openbase_page_logger = setup_logger(prefix="openbase_page_logger")

In [None]:
# ROWS_INFO = []
# ROWS_HREF = []
# PAGES_NO = []

# for i in trange(0, 278):
#     url = f"{BASE_URL_OPENBASE}/categories/search?page={i}"

#     try:
#         response = session.get(url)
#         response.raise_for_status()
#     except requests.exceptions.RequestException as e:
#         openbase_page_logger.error(f"Error fetching {url}: {e}")
#         continue  # Skip to the next iteration

#     soup = BeautifulSoup(response.text, "html.parser")

#     # Find all tables and check if they exist
#     tables = soup.find_all("table")
#     if not tables:
#         openbase_page_logger.warning(f"No tables found on page {i}")
#         continue

#     last_table = tables[-1]
#     tbody = last_table.find("tbody")

#     if not tbody:
#         openbase_page_logger.warning(f"No tbody found in table on page {i}")
#         continue

#     rows = tbody.find_all("tr")
#     if not rows:
#         openbase_page_logger.warning(f"No rows found in tbody on page {i}")
#         continue

#     # Extract table data and links
#     rows_info = [[cell.text.strip() for cell in row.find_all("td")] for row in rows]
#     rows_href = [
#         BASE_URL_OPENBASE + row.find("a").get("href", "") if row.find("a") else None
#         for row in rows
#     ]

#     ROWS_INFO.extend(rows_info)
#     ROWS_HREF.extend(filter(None, rows_href))  # Removes None values
#     PAGES_NO.extend([i + 1] * len(rows_href))

# print(f"Scraped {len(ROWS_INFO)} rows of info and {len(ROWS_HREF)} links successfully.")

In [None]:
def openbase_scrape_pages(session, logger, max_pages=278, timeout=20):
    """Scrape data from paginated category search results.

    For every page the last ``<table>`` is read and each ``<tr>`` in its
    ``<tbody>`` that contains a link yields one entry in each of the returned
    lists. Rows without a usable link are skipped entirely, so the three
    lists always have the same length and stay aligned row by row.

    Args:
        session (requests.Session): A configured requests session.
        logger (logging.Logger): Logger for error tracking.
        max_pages (int): Number of pages to scrape (default: 278). Pages are
            requested with ``page=0`` up to ``page=max_pages - 1``.
        timeout (float): Per-request timeout in seconds (default: 20).

    Returns:
        dict: ``rows_info`` (stripped cell texts of each kept row),
        ``rows_href`` (absolute link of each kept row) and ``pages_no``
        (1-based page number of each kept row).
    """
    rows_info_list = []
    rows_href_list = []
    pages_no_list = []

    for page_num in trange(0, max_pages, desc="Scraping Pages"):
        url = f"{BASE_URL_OPENBASE}/categories/search?page={page_num}"

        try:
            response = session.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching {url}: {e}")
            continue  # Skip to next page

        soup = BeautifulSoup(response.text, "html.parser")

        # Only the last table on the page is read.
        tables = soup.find_all("table")
        if not tables:
            logger.warning(f"No tables found on page {page_num}")
            continue

        tbody = tables[-1].find("tbody")
        if not tbody:
            logger.warning(f"No tbody found in table on page {page_num}")
            continue

        rows = tbody.find_all("tr")
        if not rows:
            logger.warning(f"No rows found in tbody on page {page_num}")
            continue

        skipped_rows = 0
        for row in rows:
            anchor = row.find("a")
            href = anchor.get("href") if anchor else None
            if not href:
                # Dropping the link alone would shift every later link onto
                # the wrong row, so the whole row is left out.
                skipped_rows += 1
                continue

            rows_info_list.append([cell.text.strip() for cell in row.find_all("td")])
            rows_href_list.append(f"{BASE_URL_OPENBASE}{href}")
            pages_no_list.append(page_num + 1)

        if skipped_rows:
            logger.warning(
                f"Skipped {skipped_rows} rows without a link on page {page_num}"
            )

    logger.info(
        f"Scraped {len(rows_info_list)} rows of info and {len(rows_href_list)} links successfully."
    )

    return {
        "rows_info": rows_info_list,
        "rows_href": rows_href_list,
        "pages_no": pages_no_list,
    }

In [None]:
ROW_INFO = openbase_scrape_pages(session, openbase_page_logger)

In [None]:
ROW_INFO.keys()

In [None]:
# One row per scraped table row; the four names label the cell texts.
cols = ["title", "category", "last_update", "create_date"]
df = pd.DataFrame(ROW_INFO["rows_info"], columns=cols)
# These assignments are positional: both lists must have exactly as many
# entries as `rows_info`, otherwise pandas raises a length-mismatch error.
df["link"] = ROW_INFO["rows_href"]
df["page_no"] = ROW_INFO["pages_no"]

In [None]:
df.info()

In [None]:
# df["category"].value_counts()
# df.loc[df["category"].eq("")]

In [None]:
df.sample(5)

In [None]:
meta_save_path = os.path.join(
    "../pdf_meta_openbase", 
    f"meta_openbase_link_to_pdfs_{get_current_datetime()}.csv"
)
df.to_csv(meta_save_path, index=False)

#### Get PDF Link

In [None]:
path = "../pdf_meta_openbase/meta_openbase_link_to_pdfs_2025-02-06_18_45_09.csv"
df = pd.read_csv(path); print(df.shape)
links = df["link"].to_list()

In [None]:
df.info()

In [None]:
df.sample(3)

In [None]:
# # Define session and regex pattern for PDFs
# PDF_LINK_PAT = re.compile(r'href="([^"]+\.pdf)"')  # Extracts href content directly
# MAPPING = {}

# for url in tqdm(links):
#     for attempt in range(2):  # Try twice before failing
#         try:
#             response = session.get(url, timeout=30)
#             response.raise_for_status()
#             break  # If successful, exit retry loop
#         except requests.exceptions.RequestException as e:
#             if attempt == 0:
#                 time.sleep(5)  # Retry after delay
#             else:
#                 print(f"Failed to fetch {url}: {e}")
#                 MAPPING[url] = []  # Store an empty list for failed URLs
#                 continue

#     pdf_links = PDF_LINK_PAT.findall(response.text)
#     if pdf_links:
#         pdf_links = [
#             BASE_URL_OPENBASE + match for match in pdf_links if match.endswith(".pdf")
#         ]  # can still be an empty list if there is no pdf
#         MAPPING[url] = pdf_links
#     else:
#         MAPPING[url] = []


# print(f"Scraped {len(MAPPING)} pages successfully.")

In [None]:
openbase_pdf_logger = setup_logger("openbase_get_pdf_link")

In [None]:
# # Constants
# PDF_LINK_PAT = re.compile(r'href="([^"]+\.pdf)"')  # Extracts href content directly

# def fetch_pdfs(url) -> tuple[str, list]:
#     """Fetch a URL and extract PDF links."""
#     try:
#         response = session.get(url, timeout=10)  # Lower timeout for responsiveness
#         response.raise_for_status()
#         pdf_links = [
#             BASE_URL_OPENBASE + match for match in PDF_LINK_PAT.findall(response.text)
#         ]
#         return url, pdf_links
#     except requests.exceptions.RequestException as e:
#         openbase_pdf_logger.error(f"Failed to fetch {url}: {e}")
#         return url, []


# def scrape_all(links, max_workers=10) -> dict[str, list]:
#     """Scrape all links concurrently."""
#     MAPPING = {}

#     with ThreadPoolExecutor(max_workers=max_workers) as executor:
#         future_to_url = {executor.submit(fetch_pdfs, url): url for url in links}

#         for future in tqdm(
#             as_completed(future_to_url), total=len(links), desc="Scraping PDFs"
#         ):
#             url, pdf_links = future.result()
#             MAPPING[url] = pdf_links

#     openbase_pdf_logger.info(f"Scraped {len(MAPPING)} pages successfully.")
#     return MAPPING


# pdf_mapping = scrape_all(links)

In [None]:
PDF_LINK_PAT = re.compile(r'href="([^"]+\.pdf)"')  # Extracts href content directly


def fetch_pdfs(url: str, session: requests.Session, logger) -> tuple[str, list]:
    """Download one page and collect the PDF hrefs found in its HTML.

    Args:
        url (str): Page to fetch.
        session (requests.Session): Session used for the request (20 s timeout).
        logger (logging.Logger): Receives an error message when the request fails.

    Returns:
        tuple[str, list]: ``url`` together with every ``PDF_LINK_PAT`` match
        in the page text, exactly as written in the page; the list is empty
        when the request fails.
    """
    pdf_links = []
    try:
        page = session.get(url, timeout=20)
        page.raise_for_status()
        pdf_links = PDF_LINK_PAT.findall(page.text)
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to fetch {url}: {e}")
    return url, pdf_links


def openbase_scrape_pdf(
    links: list, session: requests.Session, logger, max_workers=10
) -> dict[str, list]:
    """Scrape multiple URLs concurrently to extract PDF links.

    Each URL is handed to ``fetch_pdfs`` on a thread pool. A page that fails,
    either inside ``fetch_pdfs`` or with an unexpected exception in the worker,
    is mapped to an empty list, so one bad page cannot discard the results
    already collected for the others.

    Args:
        links (list): URLs to scrape.
        session (requests.Session): Session shared by all worker threads.
        logger (logging.Logger): Logger for errors and the final summary.
        max_workers (int, optional): Number of concurrent threads. Defaults to 10.

    Returns:
        dict[str, list]: Maps each URL to the PDF hrefs extracted from it
        (an empty list when nothing was found or the page failed).
    """
    pdf_mapping = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {
            executor.submit(fetch_pdfs, url, session, logger): url for url in links
        }

        for future in tqdm(
            as_completed(future_to_url),
            # Count the submitted futures, not `links`, so any iterable works.
            total=len(future_to_url),
            desc="Scraping PDFs",
            unit="page",
        ):
            # Look the URL up from the future so it is still known on failure.
            url = future_to_url[future]
            try:
                _, pdf_links = future.result()
            except Exception as e:
                # fetch_pdfs only handles requests errors; anything else is
                # re-raised by result() and would otherwise abort the scrape.
                logger.error(f"Unexpected error while scraping {url}: {e}")
                pdf_links = []
            pdf_mapping[url] = pdf_links

    pages_with_links = sum(1 for found in pdf_mapping.values() if found)
    logger.info(
        f"Scraped {len(pdf_mapping)} pages; "
        f"{pages_with_links} had at least one PDF link."
    )
    return pdf_mapping

In [None]:
# Collect the PDF hrefs for every page in `links` (page URL -> list of hrefs).
pdf_mapping = openbase_scrape_pdf(
    links=links,
    session=session,
    logger=openbase_pdf_logger,
)

In [None]:
# Look up the scraped hrefs for each row's page URL; pages that produced an
# empty list are marked as missing so they can be dropped later.
scraped_links = df["link"].map(pdf_mapping)
df["pdf_link"] = scraped_links.apply(lambda found: found or pd.NA)
df.info()

In [None]:
# Checkpoint the scrape result under a timestamped name before reshaping it.
tmp_parquet_path = f"./openbase_pdf_link_tmp_{get_current_datetime()}.parquet"
df.to_parquet(tmp_parquet_path, index=False)

In [None]:
# One row per PDF link: drop pages without links, then unpack each list.
df = df.dropna(subset="pdf_link")
df = df.explode("pdf_link")
df["source"] = "openbase.in.th"
df["license"] = "CC-BY-NC-SA"

# Links that do not already carry an http(s) scheme get the site base URL
# prepended. The mask is computed once and reused on both sides.
has_scheme = df["pdf_link"].str.startswith("https://") | df[
    "pdf_link"
].str.startswith("http://")
needs_base = ~has_scheme
df.loc[needs_base, "pdf_link"] = BASE_URL_OPENBASE + df.loc[needs_base, "pdf_link"]


def create_filename_column(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``filename`` column numbered by row position.

    Names follow ``pdf_doc_openbase_00000.pdf`` with a zero-padded, five-digit
    position. The frame is modified in place and also returned.
    """
    # Assign a plain list so values are placed by position, not by index label.
    df["filename"] = list(map("pdf_doc_openbase_{:05}.pdf".format, range(len(df))))
    return df


# Filenames are assigned before filtering, so the numbering can have gaps.
df = create_filename_column(df)
is_thailife = df["pdf_link"].str.contains("thailife", regex=False)
df = df.loc[~is_thailife].drop_duplicates(subset="pdf_link")

In [None]:
# Show the null counts first, then give rows without a category a placeholder.
df.info()
category_filled = df["category"].fillna("NOCATEGORY")
df["category"] = category_filled

In [None]:
# Write the final metadata table to a timestamped CSV.
# to_csv does not create missing parent directories, so create the target
# folder first; otherwise the scraped metadata is lost on a fresh checkout.
os.makedirs("../pdf_meta_openbase", exist_ok=True)
meta_save_path = os.path.join(
    "../pdf_meta_openbase", 
    f"meta_openbase_{get_current_datetime()}.csv"
)
df.to_csv(meta_save_path, index=False)

#### Download PDF

In [None]:
# Reload a specific, previously saved metadata snapshot (pinned by timestamp).
path = "../pdf_meta_openbase/meta_openbase_2025-02-07_09_52_59.csv"
df = pd.read_csv(filepath_or_buffer=path)
df.info()

In [None]:
# One dict per metadata row; preview the first three.
meta = df.to_dict("records")
meta[0:3]

In [None]:
# open_base_pdf_download_logger = setup_logger("openbase_pdf_download_logger")
# fail_item = []
# BASE_PATH_SAVE_OPENBASE = "../pdf_documents_openbase"

# for item in tqdm(meta, total=len(meta)):
#     pdf_save_path = os.path.join(BASE_PATH_SAVE_OPENBASE, item["filename"])
#     check = download_pdf(
#         session=session,
#         logger=open_base_pdf_download_logger,
#         url=item["pdf_link"],
#         save_path_str=pdf_save_path,
#         verify=True
#     )

#     sleep_time = random.randint(0, 3)
#     open_base_pdf_download_logger.info(f"sleep for {sleep_time} secs")
#     time.sleep(sleep_time)

#     if not check:
#         fail_item.append((item["title"], item["pdf_link"], item["filename"]))

In [None]:
# Logger, output folder, and failure list shared by the download tasks.
open_base_pdf_download_logger = setup_logger("openbase_pdf_download_logger")
BASE_PATH_SAVE_OPENBASE = Path("../pdf_documents_openbase")
# Ensure the directory exists (no-op when it is already there).
BASE_PATH_SAVE_OPENBASE.mkdir(parents=True, exist_ok=True)
# (title, pdf_link, filename) for every item whose download did not succeed.
fail_items = []


def download_pdf_task(item):
    """Download the PDF described by one metadata record.

    Reads ``filename``, ``pdf_link`` and (on failure) ``title`` from ``item``.
    When ``download_pdf`` reports failure, the item is appended to the
    module-level ``fail_items`` list and an error is logged.

    Returns:
        The item's ``filename``, whether or not the download succeeded, so the
        caller can use it for progress tracking.
    """
    filename = item["filename"]

    succeeded = download_pdf(
        session=session,
        logger=open_base_pdf_download_logger,
        url=item["pdf_link"],
        save_path_str=str(BASE_PATH_SAVE_OPENBASE / filename),
        verify=True,
    )
    if succeeded:
        return filename

    fail_items.append((item["title"], item["pdf_link"], filename))
    open_base_pdf_download_logger.error(
        f"Failed to download: {item['title']} ({item['pdf_link']})"
    )
    return filename


# Define the number of concurrent threads
MAX_WORKERS = 16  # Adjust based on your system/network

# Run downloads in parallel. `futures` maps each future back to its metadata
# record so a task that raises can still be attributed to an item.
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(download_pdf_task, item): item for item in meta}

    for future in tqdm(
        as_completed(futures), total=len(futures), desc="Downloading PDFs"
    ):
        try:
            filename = future.result()
            open_base_pdf_download_logger.info(f"Completed: {filename}")
        except Exception as e:
            # download_pdf_task only records failures it detects itself. An
            # exception raised inside the task arrives here, so record it too;
            # otherwise the item is missing from both the output folder and
            # fail_items.
            failed_item = futures[future]
            fail_items.append(
                (
                    failed_item.get("title"),
                    failed_item.get("pdf_link"),
                    failed_item.get("filename"),
                )
            )
            open_base_pdf_download_logger.error(
                f"Error during download of {failed_item.get('pdf_link')}: {e}"
            )

In [24]:
# Try to open every downloaded file with pypdf and record whether it parses.
# bool_list[i] corresponds to pdf_files[i].
pdf_files = os.listdir("../pdf_documents_openbase")
bool_list = []
for pdf_f in tqdm(pdf_files):
    path = os.path.join("../pdf_documents_openbase", pdf_f)
    try:
        PdfReader(path)
    except PdfReadError:
        is_readable = False
    else:
        is_readable = True
    bool_list.append(is_readable)

  0%|          | 0/4873 [00:00<?, ?it/s]

incorrect startxref pointer(1)
parsing for Object Streams
Ignoring wrong pointing object 0 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 36 0 (offset 0)
incorrect startxref pointer(2)
parsing for Object Streams
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 523 0 (offset 0)
Ignoring wrong pointing object 0 0 (offset 0)
incorrect startxref pointer(1)
parsing for Object Streams
parsing for Object Streams
incorrect startxref pointer(2)
parsing for Object Streams
incorrect startxref pointer(2)
parsing for Object Streams
Ignoring wrong pointing object 21 0 (offset 0)
incorrect startxref pointer(1)
parsing for Object Streams
Ignoring wrong pointing object 0 0 (offset 0)
Ignoring wrong pointing object 25 0 (offset 0)
Ignoring wrong pointing object 27 0 (offset 0)
Ignoring wrong pointing object 29 0 (offset 0)
Ignoring wrong pointing object 51 0 (offset 0)
Ignoring wrong p

In [26]:
pd.Series(bool_list).value_counts()

True    4873
Name: count, dtype: int64

# Check