In [1]:
import requests
import pandas as pd
import os
import subprocess
import random
import string
import base64

from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from pprint import pprint

In [2]:
def download_pdf(url, save_path, verify: bool = False):
    """
    Downloads a PDF file from the given URL and saves it to the specified path.

    The response body is streamed to disk in 1 KiB chunks. If writing to
    ``save_path`` raises ``OSError``, the error is printed and the write is
    retried once against ``save_path`` with its last 20 characters removed
    (note that this also drops the file extension).

    Args:
        url (str): The URL of the PDF file to download.
        save_path (str): The local path where the PDF file should be saved.
        verify (bool): Forwarded to ``requests.get(verify=...)``. The default
            of False disables TLS certificate verification.

    Returns:
        bool: True if the download was successful, False if ``requests``
        raised a ``RequestException`` (connection failure, bad status code).

    Raises:
        OSError: If the retry against the shortened path fails as well.
    """
    try:
        # The context manager releases the streamed connection on every exit path.
        with requests.get(url, stream=True, verify=verify) as response:
            response.raise_for_status()  # Raise an error for bad status codes

            # Save the file in chunks to handle large files
            try:
                with open(save_path, "wb") as file:
                    for chunk in response.iter_content(chunk_size=1024):
                        if chunk:  # Filter out keep-alive chunks
                            file.write(chunk)

                print(f"Downloaded file saved to: {save_path}")
            except OSError as e:
                print(e)
                # Retry with a shorter path (presumably for "file name too
                # long" errors) and report where the file actually ended up.
                fallback_path = save_path[:-20]
                with open(fallback_path, "wb") as file:
                    for chunk in response.iter_content(chunk_size=1024):
                        if chunk:  # Filter out keep-alive chunks
                            file.write(chunk)
                print(f"Downloaded file saved to: {fallback_path}")
        return True

    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
        return False

In [3]:
# Directory that downloaded PDF files are written to.
BASE_PATH_SAVE = "../pdf_documents_tmp"
# Directory that the metadata CSV files are written to.
BASE_PATH_SAVE_META = "../pdf_documents_meta_tmp"

In [4]:
def wget_pdf(command: list) -> None:
    """Run a download command and report the outcome on stdout.

    Args:
        command (list): Argument vector handed to ``subprocess.run``.

    Returns:
        None. A non-zero exit status is printed, not raised.
    """
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"An error occurred: {e}")
    else:
        print("File downloaded successfully!")


def save_pdf_meta(meta: dict, meta_save_path: str):
    """Write one metadata record to a CSV file (header row plus one data row).

    Args:
        meta (dict): Column name -> value mapping for the single row.
        meta_save_path (str): Destination path of the CSV file.
    """
    single_row = pd.DataFrame([meta])
    single_row.to_csv(meta_save_path, index=False)


def generate_random_string(length=10):
    """Return ``length`` characters drawn at random from ASCII letters and digits.

    Args:
        length (int): Number of characters to generate. Defaults to 10.

    Returns:
        str: The generated string.
    """
    alphabet = string.ascii_letters + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)

In [5]:
def string_to_code(
    input_string: str, max_length: int = 255, keep_last: int = 64
) -> str:
    """
    Converts a string into a deterministic code using URL-safe Base64 encoding.

    Spaces are removed before encoding, and a code whose length reaches
    ``max_length`` is cut down to its last ``keep_last`` characters. The
    result is therefore deterministic but NOT guaranteed to be reversible or
    unique: inputs that differ only in spaces, or long inputs that share the
    same ending, can produce the same code.

    Args:
        input_string (str): The input string to be converted.
        max_length (int): Encoded length at which truncation starts. The
            check covers the code only, so callers that add a prefix or
            extension may want a smaller value. Defaults to 255.
        keep_last (int): Number of trailing characters kept when truncating.
            Defaults to 64.

    Returns:
        str: The encoded (and possibly truncated) string.

    Raises:
        ValueError: If ``keep_last`` is negative.
    """
    if keep_last < 0:
        raise ValueError("keep_last must be non-negative")

    # Encode the string into bytes
    string_bytes = input_string.replace(" ", "").encode("utf-8")
    # Convert the bytes to a Base64 encoded string
    encoded_string = base64.urlsafe_b64encode(string_bytes).decode("utf-8")
    print(f"lenght encode string: {len(encoded_string)}")
    if len(encoded_string) >= max_length:
        print("name too long, truncate name")
        # Slice from an explicit start index: a plain [-keep_last:] would
        # return the whole string when keep_last is 0.
        start = max(len(encoded_string) - keep_last, 0)
        encoded_string = encoded_string[start:]
    return encoded_string


def code_to_string(encoded_string: str) -> str:
    """
    Decodes a URL-safe Base64 code back into text.

    Args:
        encoded_string (str): The Base64 code to decode.

    Returns:
        str: The UTF-8 text carried by the code.
    """
    raw_bytes = base64.urlsafe_b64decode(encoded_string.encode("utf-8"))
    return raw_bytes.decode("utf-8")

In [None]:
# Round-trip demo: encode a sample string, decode it again, show all three values.
original_string = "Hello, World!?"
encoded = string_to_code(original_string)
decoded = code_to_string(encoded)

for label, value in (
    ("Original String:", original_string),
    ("Encoded Code:", encoded),
    ("Decoded String:", decoded),
):
    print(label, value)

In [7]:
def remove_file(file_path: str) -> None:
    """
    Deletes the file at ``file_path`` and reports the outcome on stdout.

    Args:
        file_path (str): The path to the file to be removed.

    Returns:
        None. A missing file or a failure during removal is printed, not raised.
    """
    try:
        # Guard clause: nothing to delete.
        if not os.path.exists(file_path):
            print(f"File '{file_path}' does not exist.")
            return
        os.remove(file_path)
        print(f"File '{file_path}' has been removed successfully.")
    except Exception as e:
        print(f"An error occurred while trying to remove the file: {e}")

In [8]:
def get_pdf_and_save_meta(
    title: str,
    url: str,
    source: str,
    license_: str,
    filename: str,
    command: list,
    meta_save_path: str,
) -> None:
    """Run the download command for one document and store its metadata.

    Args:
        title (str): Document title recorded in the metadata.
        url (str): Source URL recorded in the metadata.
        source (str): Name of the publishing portal recorded in the metadata.
        license_ (str): License label, stored under the "license" key.
        filename (str): Local PDF file name recorded in the metadata.
        command (list): Argument vector passed to ``wget_pdf``.
        meta_save_path (str): Destination of the one-row metadata CSV.
    """
    meta_record = dict(
        title=title,
        url=url,
        source=source,
        license=license_,
        filename=filename,
    )
    print("meta view")
    pprint(meta_record)

    # Metadata is written after the download attempt regardless of its outcome.
    wget_pdf(command=command)
    save_pdf_meta(meta=meta_record, meta_save_path=meta_save_path)

In [None]:
# web_url = "https://gdcatalog.go.th/dataset/gdpublish-dsb21-1"
# # Use a session to reuse the TCP connection for faster requests
# session = requests.Session()

# # Fetch and parse the main page
# response = session.get(web_url)
# response.raise_for_status()
# soup = BeautifulSoup(response.text, "html.parser")

# # Extract resource links with titles
# base_url = "https://gdcatalog.go.th"
# resources_section = soup.find("section", id="dataset-resources")
# resource_links = [
#     {
#         "title": a["title"].strip(),
#         "href": base_url + a["href"],
#         "source": "Goverment data catalog smart plus",
#         "license": "Open Data Common",
#     }
#     for a in resources_section.find_all("a", class_="heading")
# ]

# # Process each resource to extract the PDF URL
# for i, resource in enumerate(tqdm(resource_links, desc="Processing resources")):
#     pdf_page = session.get(resource["href"])
#     pdf_page.raise_for_status()
#     pdf_soup = BeautifulSoup(pdf_page.text, "html.parser")
#     pdf_url = pdf_soup.find("a", class_="resource-url-analytics")["href"]
#     resource["url"] = pdf_url
#     resource["filename"] = f"pdf_มูลค่าและระดับการเติบโตของกิจกรรมทางเศรษฐกิจดิจิทัล_0{i}.pdf"
#     resource.pop("href", None)

# resource_links = list(filter(lambda x: x["url"].endswith(".pdf"), resource_links))
# pd.DataFrame(resource_links).to_csv(
#     "../pdf_documents_meta_tmp/pdf_มูลค่าและระดับการเติบโตของกิจกรรมทางเศรษฐกิจดิจิทัล.csv",
#     index=False,
# )
# for item in tqdm(resource_links, desc="donwloading pdf "):
#     download_pdf(
#         url=item["url"], save_path=os.path.join(BASE_PATH_SAVE, item["filename"])
#     )

In [None]:
# Site root prepended to the resource hrefs scraped from a dataset page.
BASE_GD_CATALOG_URL = "https://gdcatalog.go.th"

def fetch_resource_links(
    request_session: requests.Session, web_url: str, license: str
) -> list[dict]:
    """Collect the resource entries listed on a dataset page.

    Args:
        request_session: Session used to fetch ``web_url``.
        web_url: URL of the dataset page.
        license: License label copied into every returned record.

    Returns:
        One dict per ``<a class="heading">`` inside the "dataset-resources"
        section, with keys "title", "href", "source" and "license"; an empty
        list when the section is absent.
    """
    page = request_session.get(web_url)
    page.raise_for_status()
    parsed_page = BeautifulSoup(page.text, "html.parser")

    resources_section = parsed_page.find("section", id="dataset-resources")
    if not resources_section:
        return []

    records = []
    for anchor in resources_section.find_all("a", class_="heading"):
        records.append(
            {
                "title": anchor["title"].strip(),
                "href": BASE_GD_CATALOG_URL + anchor["href"],
                "source": "Government data catalog smart plus",
                "license": license,
            }
        )
    return records


def extract_pdf_urls(
    request_session: requests.Session, resource_links: list[dict], doc_name: str
):
    """Extract PDF URLs from each resource page.

    Every dict in ``resource_links`` is updated in place: its "href" page is
    fetched, the "resource-url-analytics" link (when present) is stored under
    "url" together with a generated "filename", and "href" is removed.
    Entries without "href" are treated as already processed and skipped, so
    calling the function twice on the same list does not fail.

    Args:
        request_session: Session used to fetch each resource page.
        resource_links: Records carrying an "href" key.
        doc_name: Name embedded in every generated file name.

    Returns:
        list[dict]: The records whose "url" ends with ".pdf", compared
        case-insensitively so that ".PDF" links are kept as well.
    """
    for i, resource in enumerate(tqdm(resource_links, desc="Processing resources")):
        href = resource.get("href")
        if href is None:
            # "href" is popped after a successful fetch; nothing left to do.
            continue

        pdf_page = request_session.get(href)
        pdf_page.raise_for_status()
        pdf_soup = BeautifulSoup(pdf_page.text, "html.parser")
        pdf_url_tag = pdf_soup.find("a", class_="resource-url-analytics")

        if pdf_url_tag and "href" in pdf_url_tag.attrs:
            resource["url"] = pdf_url_tag["href"]
            resource["filename"] = f"pdf_{doc_name}_0{i}.pdf"

        resource.pop("href", None)

    return [
        resource
        for resource in resource_links
        if resource.get("url", "").lower().endswith(".pdf")
    ]


def save_metadata(resource_links, output_path):
    """Write the resource records to a CSV file and return them as a DataFrame.

    Args:
        resource_links: Records to tabulate (one row per record).
        output_path: Destination path of the CSV file.

    Returns:
        pandas.DataFrame: The frame that was written.
    """
    metadata_frame = pd.DataFrame(resource_links)
    metadata_frame.to_csv(output_path, index=False)
    return metadata_frame


def download_pdfs(resource_links, save_path):
    """Download PDFs from the extracted links.

    Args:
        resource_links: Records carrying a "url" and, normally, a "filename".
        save_path: Directory the files are written to. Each record is saved
            as ``save_path/<filename>`` so that successive downloads no
            longer overwrite one another. A record without "filename" is
            written to ``save_path`` itself.
    """
    for item in tqdm(resource_links, desc="Downloading PDFs"):
        if "filename" in item:
            target_path = os.path.join(save_path, item["filename"])
        else:
            target_path = save_path
        download_pdf(url=item["url"], save_path=target_path)


def get_multiple_pdf_from_gd_catalog(
    web_url,
    mate_save_path,
    license="Open Data Common",
    doc_name=None,
    pdf_save_dir=None,
    request_session=None,
):
    """Scrape a dataset page, save the PDF metadata and optionally download the PDFs.

    Args:
        web_url: URL of the dataset page.
        mate_save_path: Destination path of the metadata CSV.
        license: License label stored with every record.
        doc_name: Name embedded in the generated PDF file names. Defaults to
            the base name of ``mate_save_path`` without its extension and
            without a leading "pdf_".
        pdf_save_dir: Directory to download the PDFs into. When None (the
            default) only the metadata is written.
        request_session: Session to reuse. A temporary one is created and
            closed when omitted.

    Returns:
        list[dict]: The PDF resource records that were written to the CSV.
    """
    if doc_name is None:
        stem = os.path.splitext(os.path.basename(mate_save_path))[0]
        doc_name = stem[len("pdf_"):] if stem.startswith("pdf_") else stem

    owns_session = request_session is None
    session = requests.Session() if owns_session else request_session
    try:
        # Fetch and process resource links
        resource_links = fetch_resource_links(session, web_url, license)
        resource_links = extract_pdf_urls(session, resource_links, doc_name)

        # Save metadata and download PDFs
        save_metadata(resource_links, mate_save_path)

        if pdf_save_dir is not None:
            for item in tqdm(resource_links, desc="Downloading PDFs"):
                download_pdf(
                    url=item["url"],
                    save_path=os.path.join(pdf_save_dir, item["filename"]),
                )
    finally:
        # Only close a session this function created itself.
        if owns_session:
            session.close()

    return resource_links

# การประมงปลากะตักในอ่าวไทย

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "การประมงปลากะตักในอ่าวไทย"
url = "https://opendata.nesdc.go.th/dataset/7cb4549b-6efa-43c6-868d-612acaf06c5b/resource/765af6b1-2697-4a18-8382-0d750ff8f4e0/download/09-.pdf"
source = "Open-D"
license = "Open Government"

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q", "--no-check-certificate"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

In [None]:
# remove_file(pdf_save_path)

# รวมบทคัดย่อ งานวิจัยพัฒนาที่อยู่อาศัยและเมือง การเคหะแห่งชาติ

In [None]:
# !wget -O "../pdf_documents_tmp/รวมบทคัดย่องานวิจัยพัฒนาที่อยู่อาศัยและเมืองการเคหะแห่งชาติ.pdf" "https://opendata.nesdc.go.th/dataset/867ef9e5-2522-45a6-8204-06551cfbc8de/resource/3c2ea193-2dad-4092-b795-0dc5acec929c/download/-.pdf"

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "รวมบทคัดย่อ งานวิจัยพัฒนาที่อยู่อาศัยและเมือง การเคหะแห่งชาติ"
url = "https://opendata.nesdc.go.th/dataset/867ef9e5-2522-45a6-8204-06551cfbc8de/resource/3c2ea193-2dad-4092-b795-0dc5acec929c/download/-.pdf"
source = "Open-D"
license = "Open Government"

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q", "--no-check-certificate"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

In [None]:
# remove_file(pdf_save_path)

# รวมบทคัดย่อ งานวิจัยพัฒนาที่อยู่อาศัยและเมือง การเคหะแห่งชาติ พ.ศ. 2547 - 2560

In [None]:
# !wget -O "../pdf_documents/รวมบทคัดย่องานวิจัยพัฒนาที่อยู่อาศัยและเมืองการเคหะแห่งชาติ2547-2560.pdf" "https://opendata.nesdc.go.th/dataset/28248759-f2ef-4e2f-b0c2-38417e6598e1/resource/14d8efe8-42ee-44cf-bbea-5afd89b189d5/download/-.pdf"

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "รวมบทคัดย่อ งานวิจัยพัฒนาที่อยู่อาศัยและเมือง การเคหะแห่งชาติ พ.ศ. 2547 - 2560"
url = "https://opendata.nesdc.go.th/dataset/28248759-f2ef-4e2f-b0c2-38417e6598e1/resource/14d8efe8-42ee-44cf-bbea-5afd89b189d5/download/-.pdf"
source = "Open-D"
license = "Open Government"

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q", "--no-check-certificate"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

In [None]:
# remove_file(pdf_save_path)

# การพัฒนาคลังเอกสารดิจิทัลระบบเปิดด้วยโอเพนซอร์สซอฟต์แวร์ Drupal

In [None]:
# NOTE(review): unlike the other direct "!wget" lines in this notebook, this one
# is not commented out and writes to ../pdf_documents rather than
# BASE_PATH_SAVE; confirm it is still meant to run.
!wget -O "../pdf_documents/การพัฒนาคลังเอกสารดิจิทัลระบบเปิดด้วยโอเพนซอร์สซอฟต์แวร์Drupal.pdf" "https://oer.learn.in.th/search_detail/ZipDownload/11164"

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "การพัฒนาคลังเอกสารดิจิทัลระบบเปิดด้วยโอเพนซอร์สซอฟต์แวร์ Drupal"
url = "https://oer.learn.in.th/search_detail/ZipDownload/11164"
source = "คลังทรัพยากรการศึกษาแบบเปิด"
license = "ให้เผยแพร่-ดัดแปลง-โดยต้องระบุที่มา 3.0 Thailand"

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

# คลังเอกสารดิจิทัลด้วย Omeka

In [9]:
# !wget -O "../pdf_documents/คลังเอกสารดิจิทัลด้วยOmeka.pdf" "https://oer.learn.in.th/search_detail/ZipDownload/1002"

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "คลังเอกสารดิจิทัลด้วย Omeka"
url = "https://oer.learn.in.th/search_detail/ZipDownload/1002"
source = "คลังทรัพยากรการศึกษาแบบเปิด"
license = "ให้เผยแพร่-ดัดแปลง-โดยต้องระบุที่มา 3.0 Thailand"

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

# โครงร่างวิจัย (ฉบับย่อ) การเพิ่มประสิทธิภาพไทยต่อการเพิ่มมูลค่าการค้าชายแดน:กัมพูชา สปป.ลาว เมียนมา

error <br>
--2024-12-30 07:09:20--  https://catalog.neda.or.th/dataset/8bb91112-84b9-41c0-bd67-f77841892126/resource/beddda0a-26da-46a4-bd40-3015266a288f/download/1-_.pdf
Resolving catalog.neda.or.th (catalog.neda.or.th)... failed: Temporary failure in name resolution.
wget: unable to resolve host address ‘catalog.neda.or.th’ <br>

Opt for manual download

In [11]:
# !(wget -O "../pdf_documents/ฉบับย่อ_การเพิ่มประสิทธิภาพไทยต่อการเพิ่มมูลค่าการค้าชายแดน.pdf" \
#   "https://catalog.neda.or.th/dataset/8bb91112-84b9-41c0-bd67-f77841892126/resource/beddda0a-26da-46a4-bd40-3015266a288f/download/1-_.pdf")

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = (
    "โครงร่างวิจัย (ฉบับย่อ) การเพิ่มประสิทธิภาพไทยต่อการเพิ่มมูลค่าการค้าชายแดน:กัมพูชา สปป.ลาว เมียนมา"
)
url = "https://catalog.neda.or.th/dataset/8bb91112-84b9-41c0-bd67-f77841892126/resource/beddda0a-26da-46a4-bd40-3015266a288f/download/1-_.pdf"
source = "Goverment data catalog smart plus"
license = "Open Data Common"

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q", "--no-check-certificate"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

In [None]:
# remove_file(pdf_save_path)

# องค์ความรู้ด้านการอนุรักษ์และจัดการต้นน้ำ 

In [None]:
# !(wget --no-check-certificate -O \
#   "../pdf_documents/tmp.pdf" \
#   "https://catalog.dnp.go.th/dataset/52cbb6de-0e1b-4240-81f7-2ee02fbf3bed/resource/add851e5-769b-4eb1-8e99-4caa7bd136e8/download/r085101.pdf")

In [None]:
# Scrape the dataset page for its resources, then visit every resource page to
# find the direct download link.
web_url = "https://gdcatalog.go.th/dataset/gdpublish-67-dnp08-21-01"

# Use a session to reuse the TCP connection for faster requests
session = requests.Session()

# Fetch and parse the main page
response = session.get(web_url)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Extract resource links with titles
base_url = "https://gdcatalog.go.th"
resources_section = soup.find("section", id="dataset-resources")
# A page without the resources section yields an empty list instead of
# failing with AttributeError on None.
resource_anchors = (
    resources_section.find_all("a", class_="heading")
    if resources_section is not None
    else []
)
resource_links = [
    {
        "title": a["title"].strip(),
        "href": base_url + a["href"],
        "source": "Goverment data catalog smart plus",
        "license": "Open Data Common",
    }
    for a in resource_anchors
]

# Process each resource to extract the PDF URL
for i, resource in enumerate(tqdm(resource_links, desc="Processing resources")):
    pdf_page = session.get(resource["href"])
    pdf_page.raise_for_status()
    pdf_soup = BeautifulSoup(pdf_page.text, "html.parser")
    pdf_url_tag = pdf_soup.find("a", class_="resource-url-analytics")
    resource.pop("href", None)
    if pdf_url_tag is None or "href" not in pdf_url_tag.attrs:
        # Skip resources without a download link instead of indexing None.
        print(f"No download link found for: {resource['title']}")
        continue
    pdf_url = pdf_url_tag["href"]
    resource["url"] = pdf_url
    resource["filename"] = f"pdf_องค์ความรู้ด้านการอนุรักษ์และจัดการต้นน้ำ_0{i}.pdf"

# Keep only resources that have a link, so code reading "url" and "filename"
# from these records does not hit a KeyError.
resource_links = [r for r in resource_links if "url" in r]

In [None]:
# Preview the first three scraped resource records.
resource_links[:3]

In [11]:
# Persist the scraped resource records as one metadata CSV.
meta_csv_path = "../pdf_documents_meta_tmp/pdf_องค์ความรู้ด้านการอนุรักษ์และจัดการต้นน้ำ.csv"
resource_frame = pd.DataFrame(resource_links)
resource_frame.to_csv(meta_csv_path, index=False)

In [None]:
# Download every scraped resource into BASE_PATH_SAVE under its generated file name.
for item in tqdm(resource_links, desc="donwloading pdf "):
    target_path = os.path.join(BASE_PATH_SAVE, item["filename"])
    download_pdf(url=item["url"], save_path=target_path)

# โครงร่างวิจัย (ฉบับเต็ม) พลวัตของจีน: กรณีศึกษาการให้ความร่วมมือเพื่อพัฒนาอย่างเป็นทางการของ สปป.ลาว 

In [17]:
# !wget -O "../pdf_documents_tmp/การศึกษาพลวัตของจีน.pdf" "https://catalog.neda.or.th/dataset/246abcc2-dc23-4431-88cf-c56cfecdeb8f/resource/30b18057-8186-47cd-8b11-9ef9b1b82881/download/2-_.pdf"

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "โครงร่างวิจัย (ฉบับเต็ม) พลวัตของจีน: กรณีศึกษาการให้ความร่วมมือเพื่อพัฒนาอย่างเป็นทางการของ สปป.ลาว"
url = "https://catalog.neda.or.th/dataset/246abcc2-dc23-4431-88cf-c56cfecdeb8f/resource/30b18057-8186-47cd-8b11-9ef9b1b82881/download/2-_.pdf"
source = "Goverment data catalog smart plus"
license = "Open Data Common"

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q", "--no-check-certificate"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

In [None]:
# remove_file(pdf_save_path); remove_file(meta_save_path);

# โครงร่างวิจัย (ฉบับเต็ม) การเพิ่มประสิทธิภาพไทยต่อการเพิ่มมูลค่าการค้าชายแดน: กัมพูชา สปป.ลาว เมียนมา 

In [None]:
# !wget -O "../pdf_documents/ฉบับเต็ม_การเพิ่มประสิทธิภาพไทยต่อการเพิ่มมูลค่าการค้าชายแดน.pdf" "https://catalog.neda.or.th/dataset/246abcc2-dc23-4431-88cf-c56cfecdeb8f/resource/30b18057-8186-47cd-8b11-9ef9b1b82881/download/2-_.pdf"

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "โครงร่างวิจัย (ฉบับเต็ม) การเพิ่มประสิทธิภาพไทยต่อการเพิ่มมูลค่าการค้าชายแดน: กัมพูชา สปป.ลาว เมียนมา"
url = "https://catalog.neda.or.th/dataset/6a7d9d79-bc02-4720-ba0b-49002fa13bc9/resource/ddc869e9-b40a-4a11-86e2-4a4d6f4dfb06/download/1-_.pdf"
source = "Goverment data catalog smart plus"
license = "Open Data Common"

encoded_name = string_to_code(title)
# NOTE(review): the 'abc' suffix presumably keeps this name apart from another
# title whose truncated code could be identical; confirm.
filename = f"pdf_{encoded_name + 'abc'}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q", "--no-check-certificate", "-nc"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

In [None]:
# remove_file(pdf_save_path)

# คำแถลงนโนบายรัฐบาล

In [21]:
# !wget "https://www.thaigov.go.th/uploads/thumbnail/statement/2025/01/_20250102092556000000.pdf" -O "../pdf_documents/คำแถลงนโนบายรัฐบาล.pdf"

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "คำแถลงนโนบายรัฐบาล"
url = "https://www.thaigov.go.th/uploads/thumbnail/statement/2025/01/_20250102092556000000.pdf"
source = "Royal Thai Goverment"
license = ""

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q", "--no-check-certificate"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

In [None]:
# remove_file(pdf_save_path); remove_file(meta_save_path)

# องค์ความรู้และผลการดำเนินงานสำคัญของศูนย์ศึกษาและพัฒนาป่าไม้ 

In [None]:
# Scrape the dataset page for its resources, then visit every resource page to
# find the direct download link.
web_url = "https://gdcatalog.go.th/dataset/gdpublish-67-dnp40-11-02"

# Use a session to reuse the TCP connection for faster requests
session = requests.Session()

# Fetch and parse the main page
response = session.get(web_url)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Extract resource links with titles
base_url = "https://gdcatalog.go.th"
resources_section = soup.find("section", id="dataset-resources")
# A page without the resources section yields an empty list instead of
# failing with AttributeError on None.
resource_anchors = (
    resources_section.find_all("a", class_="heading")
    if resources_section is not None
    else []
)
resource_links = [
    {
        "title": a["title"].strip(),
        "href": base_url + a["href"],
        "source": "Goverment data catalog smart plus",
        "license": "Open Data Common",
    }
    for a in resource_anchors
]

# Process each resource to extract the PDF URL
for i, resource in enumerate(tqdm(resource_links, desc="Processing resources")):
    pdf_page = session.get(resource["href"])
    pdf_page.raise_for_status()
    pdf_soup = BeautifulSoup(pdf_page.text, "html.parser")
    pdf_url_tag = pdf_soup.find("a", class_="resource-url-analytics")
    resource.pop("href", None)
    if pdf_url_tag is None or "href" not in pdf_url_tag.attrs:
        # Skip resources without a download link instead of indexing None.
        print(f"No download link found for: {resource['title']}")
        continue
    pdf_url = pdf_url_tag["href"]
    resource["url"] = pdf_url
    resource["filename"] = (
        f"pdf_องค์ความรู้และผลการดำเนินงานสำคัญของศูนย์ศึกษาและพัฒนาป่าไม้_0{i}.pdf"
    )

# Keep only resources that have a link, so code reading "url" and "filename"
# from these records does not hit a KeyError.
resource_links = [r for r in resource_links if "url" in r]

In [None]:
# Display every scraped resource record.
resource_links

In [15]:
# Persist the scraped resource records as one metadata CSV.
meta_csv_path = "../pdf_documents_meta_tmp/pdf_องค์ความรู้และผลการดำเนินงานสำคัญของศูนย์ศึกษาและพัฒนาป่าไม้.csv"
resource_frame = pd.DataFrame(resource_links)
resource_frame.to_csv(meta_csv_path, index=False)

In [None]:
# Download every scraped resource into BASE_PATH_SAVE under its generated file name.
for item in tqdm(resource_links, desc="donwloading pdf "):
    target_path = os.path.join(BASE_PATH_SAVE, item["filename"])
    download_pdf(url=item["url"], save_path=target_path)

# แผนปฏิบัติการการคุ้มครองแหล่งซากดึกดําบรรพ์และซากดึกดําบรรพ์ 

In [26]:
# !wget "https://data.dmr.go.th/dataset/0e30821e-dafe-4848-b7b7-74339ec2a813/resource/fc5db2f4-3799-46b6-ad5a-dcf0a7a5a905/download/-2566-2570.pdf" -O "../pdf_documents/แผนปฏิบัติการการคุ้มครองแหล่งซากดึกดําบรรพ์และซากดึกดําบรรพ์.pdf"

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "แผนปฏิบัติการการคุ้มครองแหล่งซากดึกดําบรรพ์และซากดึกดําบรรพ์"
url = "https://data.dmr.go.th/dataset/0e30821e-dafe-4848-b7b7-74339ec2a813/resource/fc5db2f4-3799-46b6-ad5a-dcf0a7a5a905/download/-2566-2570.pdf"
source = "data.go.th"
license = "Creative Commons Attributions"

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

# เอกสารเผยแพร่ด้านพฤกษศาสตร์

## หนังสือพรรณไม้สำคัญในระบบนิเวศเขาหินปูนประเทศไทย

In [None]:
# !wget --no-check-certificate "https://catalog.dnp.go.th/dataset/818f6fe3-1576-4606-84c6-0a2d738fa97e/resource/dcc7329e-ac76-46f1-8cc3-ac0c82602bab/download/limestonebook.pdf" -O "../pdf_documents/หนังสือพรรณไม้สำคัญในระบบนิเวศเขาหินปูนประเทศไทย.pdf"

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "หนังสือพรรณไม้สำคัญในระบบนิเวศเขาหินปูนประเทศไทย"
url = "https://catalog.dnp.go.th/dataset/818f6fe3-1576-4606-84c6-0a2d738fa97e/resource/dcc7329e-ac76-46f1-8cc3-ac0c82602bab/download/limestonebook.pdf"
source = "data.go.th"
license = "Open Data Commons"

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q", "--no-check-certificate"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

## หนังสือต้นไม้ลดฝุ่น

In [31]:
# !wget --no-check-certificate "https://www.dnp.go.th/botany/PDF/publications/PM25.pdf" -O "../pdf_documents/หนังสือต้นไม้ลดฝุ่น.pdf"

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "หนังสือต้นไม้ลดฝุ่น"
url = "https://www.dnp.go.th/botany/PDF/publications/PM25.pdf"
source = "data.go.th"
license = "Open Data Commons"

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q", "--no-check-certificate"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

## หนังสือพืชป่าสมุนไพร

In [None]:
# !wget --no-check-certificate "https://www.dnp.go.th/botany/PDF/publications/HerbBook.pdf" -O "../pdf_documents/หนังสือพืชป่าสมุนไพร.pdf"

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "หนังสือพืชป่าสมุนไพร"
url = "https://www.dnp.go.th/botany/PDF/publications/HerbBook.pdf"
source = "data.go.th"
license = "Open Data Commons"

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q", "--no-check-certificate"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

## หนังสือพรรณไม้ไทย-ลาว สองฝั่งโขง

# แผนแม่บทการคุ้มครองแหล่งซากดึกดำบรรพ์และซากดึกดำบรรพ์

In [None]:
# !wget "https://data.dmr.go.th/dataset/0f830783-1894-43b7-b129-7554d4f2a933/resource/31408fed-cee7-4266-a902-fd6b39a2d7d3/download/-2566-2580.pdf" -O "../pdf_documents/แผนแม่บทการคุ้มครองแหล่งซากดึกดำบรรพ์และซากดึกดำบรรพ์.pdf"

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "แผนแม่บทการคุ้มครองแหล่งซากดึกดำบรรพ์และซากดึกดำบรรพ์"
url = "https://data.dmr.go.th/dataset/0f830783-1894-43b7-b129-7554d4f2a933/resource/31408fed-cee7-4266-a902-fd6b39a2d7d3/download/-2566-2580.pdf"
source = "data.go.th"
license = "Creative Commons Attributions"

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q", "--no-check-certificate"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

# รายงานคุณภาพชีวิตคนจังหวัดหนองคาย ปี 2562

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "รายงานคุณภาพชีวิตของคนหนองคายปี 2562"
url = "https://data.go.th/dataset/e818f10c-e8bf-4e01-855b-70d6bf02f04f/resource/b3dfb787-eedf-401c-827a-0cd70aeb375c/download/kunnapapcheewit2562.pdf"
source = "data.go.th"
license = "Creative Commons Attributions"

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

# Booklet ชุดองค์ความรู้ ยกระดับการขายสไตล์ท่องเที่ยวโดยชุมชน


In [None]:
# !wget "https://www.dasta.or.th/uploads/file/202312/1703236058_c9a80f495848cef41f79.pdf" -O "../pdf_documents/Bookletชุดองค์ความรู้ยกระดับการขายสไตล์ท่องเที่ยวโดยชุมชน.pdf"b

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "Booklet ชุดองค์ความรู้ ยกระดับการขายสไตล์ท่องเที่ยวโดยชุมชน"
url = "https://www.dasta.or.th/uploads/file/202312/1703236058_c9a80f495848cef41f79.pdf"
source = "data.go.th"
license = "Creative Commons Attributions"

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

# ข้อมูลผ้าไหมลายอัตลักษณ์ท้องถิ่นที่ได้รับการพัฒนา


## พัฒนาลวดลายผ้าและผลิตภัณฑ์ไหมไทย

In [37]:
# !wget "https://qsds.go.th/newocss/wp-content/uploads/sites/74/2021/04/id-silk.pdf" -O "../pdf_documents/พัฒนาลวดลายผ้าและผลิตภัณฑ์ไหมไทย.pdf"

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "ข้อมูลผ้าไหมลายอัตลักษณ์ท้องถิ่นที่ได้รับการพัฒนา"
url = "https://qsds.go.th/newocss/wp-content/uploads/sites/74/2021/04/id-silk.pdf"
source = "data.go.th"
license = "Open Data Common"

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

## ลวดลายผ้า หัตถกรรมเชิงเอกลักษณ์ท้องถิ่น

In [None]:
# !wget "https://qsds.go.th/newocss/wp-content/uploads/sites/74/2022/05/ID-SILK1.pdf" -O "../pdf_documents/ลวดลายผ้าหัตถกรรมเชิงเอกลักษณ์ท้องถิ่น.pdf"

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "ลวดลายผ้า หัตถกรรมเชิงเอกลักษณ์ท้องถิ่น"
url = "https://qsds.go.th/newocss/wp-content/uploads/sites/74/2022/05/ID-SILK1.pdf"
source = "data.go.th"
license = "Open Data Common"

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

# ชุดความรู้ เรื่องGastronomy Tourism ท่องเที่ยวเชิงอาหาร

In [40]:
# !wget "https://data.go.th/dataset/138b9e9e-408b-4d88-bb35-65b1f7af0e47/resource/15e5e0fa-5868-4489-85f3-9b9c11e29da3/download/gastronomy-tourism.pdf" -O "../pdf_documents/ชุดความรู้เรื่องGastronomyTourismท่องเที่ยวเชิงอาหาร.pdf"

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "ชุดความรู้ เรื่องGastronomy Tourism ท่องเที่ยวเชิงอาหาร"
url = "https://data.go.th/dataset/138b9e9e-408b-4d88-bb35-65b1f7af0e47/resource/15e5e0fa-5868-4489-85f3-9b9c11e29da3/download/gastronomy-tourism.pdf"
source = "data.go.th"
license = "Creative Commons Attributions"

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

# เอกสารบทสรุปเชิงนโยบายโครงการพัฒนากลไกความร่วมมือด้านอุตสาหกรรมปาล์มน้ำมันภายใต้ IMT-GT

In [42]:
# !wget "https://www.itd.or.th/wp-content/uploads/2022/05/ITD65_44_01.pdf" -O "../pdf_documents/เอกสารบทสรุปเชิงนโยบายโครงการพัฒนากลไกความร่วมมือด้านอุตสาหกรรมปาล์มน้ำมันภายใต้IMT-GT.pdf"

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "เอกสารบทสรุปเชิงนโยบายโครงการพัฒนากลไกความร่วมมือด้านอุตสาหกรรมปาล์มน้ำมันภายใต้ IMT-GT"
url = "https://www.itd.or.th/wp-content/uploads/2022/05/ITD65_44_01.pdf"
source = "data.go.th"
license = "Open Data Common"

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

# ข้อเสนอแนะเชิงนโยบายโครงการพัฒนาศักยภาพระเบียงเศรษฐกิจภาคใต้เชื่อมโยงการค้าและการลงทุนกับประตูการค้าฝั่งตะวันตก

In [44]:
# !wget "https://www.itd.or.th/wp-content/uploads/2023/03/ITD66_13_02.pdf" -O "../pdf_documents/ข้อเสนอแนะเชิงนโยบายโครงการพัฒนาศักยภาพระเบียงเศรษฐกิจภาคใต้.pdf"

ready

In [None]:
# Download this document with wget and store a one-row metadata CSV for it.
title = "ข้อเสนอแนะเชิงนโยบายโครงการพัฒนาศักยภาพระเบียงเศรษฐกิจภาคใต้เชื่อมโยงการค้าและการลงทุนกับประตูการค้าฝั่งตะวันตก"
url = "https://www.itd.or.th/wp-content/uploads/2023/03/ITD66_13_02.pdf"
source = "data.go.th"
license = "Open Data Common"

encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# splitext removes only the ".pdf" extension; str.rstrip(".pdf") strips a
# character set and could also eat trailing "p"/"d"/"f" of the encoded name.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

command = ["wget", url, "-O", pdf_save_path, "-q"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

# ปฏิทินการจัดงานพิธี และงานวันสำคัญ ต่างๆ

ready

In [None]:
# Describe one PDF (title, URL, provenance), derive its local file names, and hand
# the wget command plus metadata to get_pdf_and_save_meta (defined earlier in this
# notebook).
title = "ปฏิทินการจัดงานพิธี และงานวันสำคัญ ต่างๆ"
url = "https://data.go.th/dataset/7def714b-04ba-47e7-8a69-b7957d1ba6eb/resource/8d74c61d-80f7-44ab-a59f-6109da46a195/download/x-.pdf"
source = "data.go.th"
license = "Creative Commons Attributions"

# The on-disk name is built from the encoded title, not the raw title.
encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# Strip the extension with splitext rather than str.rstrip(".pdf"): rstrip removes
# any trailing run of the characters ".", "p", "d", "f", so it would also eat the
# end of an encoded name that happens to finish with one of those characters.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

# wget: -O writes to the chosen path, -q suppresses progress output.
command = ["wget", url, "-O", pdf_save_path, "-q"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

# การปฏิบัติงานการจัดทำข้อมูลปริมาณการผลิตสินค้าเกษตรที่สำคัญ (มันสำปะหลัง)


ready

In [None]:
# Describe one PDF (title, URL, provenance), derive its local file names, and hand
# the wget command plus metadata to get_pdf_and_save_meta (defined earlier in this
# notebook).
title = "การปฏิบัติงานการจัดทำข้อมูลปริมาณการผลิตสินค้าเกษตรที่สำคัญ (มันสำปะหลัง)"
url = "https://www.oae.go.th/assets/portals/1/files/oaegdcatalog/other/13-01%E0%B8%84%E0%B8%B9%E0%B9%88%E0%B8%A1%E0%B8%B7%E0%B8%AD%E0%B8%AA%E0%B8%B3%E0%B8%A3%E0%B8%A7%E0%B8%88%E0%B8%A1%E0%B8%B1%E0%B8%99%E0%B8%AA%E0%B8%B3%E0%B8%9B%E0%B8%B0%E0%B8%AB%E0%B8%A5%E0%B8%B1%E0%B8%87%E0%B9%82%E0%B8%A3%E0%B8%87%E0%B8%87%E0%B8%B2%E0%B8%99%20%E0%B8%9B%E0%B8%B5%202564.pdf"
source = "data.go.th"
license = "Creative Commons Attributions"

# The on-disk name is built from the encoded title, not the raw title.
encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# Strip the extension with splitext rather than str.rstrip(".pdf"): rstrip removes
# any trailing run of the characters ".", "p", "d", "f", so it would also eat the
# end of an encoded name that happens to finish with one of those characters.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

# wget: -O writes to the chosen path, -q suppresses progress output.
command = ["wget", url, "-O", pdf_save_path, "-q"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

# นโยบายและแผนระดับชาติว่าด้วยการพัฒนาดิจิทัลเพื่อเศรษฐกิจและสังคม


ready

In [None]:
# Describe one PDF (title, URL, provenance), derive its local file names, and hand
# the wget command plus metadata to get_pdf_and_save_meta (defined earlier in this
# notebook).
title = "นโยบายและแผนระดับชาติว่าด้วยการพัฒนาดิจิทัลเพื่อเศรษฐกิจและสังคม"
url = "https://www.onde.go.th/assets/portals/1/files/%E0%B8%99%E0%B9%82%E0%B8%A2%E0%B8%9A%E0%B8%B2%E0%B8%A2%E0%B9%81%E0%B8%A5%E0%B8%B0%E0%B9%81%E0%B8%9C%E0%B8%99%E0%B8%A3%E0%B8%B0%E0%B8%94%E0%B8%B1%E0%B8%9A%E0%B8%8A%E0%B8%B2%E0%B8%95%E0%B8%B4.PDF"
source = "data.go.th"
license = "Open Data Common"

# The on-disk name is built from the encoded title, not the raw title.
encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# Strip the extension with splitext rather than str.rstrip(".pdf"): rstrip removes
# any trailing run of the characters ".", "p", "d", "f", so it would also eat the
# end of an encoded name that happens to finish with one of those characters.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

# wget: -O writes to the chosen path, -q suppresses progress output.
command = ["wget", url, "-O", pdf_save_path, "-q"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

# แนวปฏิบัติในการรักษาความมั่นคงปลอดภัยด้านสารสนเทศ (Policy & Instruction for IT Security)

## นโยบายและแนวปฏิบัติรักษาความมั่นคงปลอดภัย

ready

In [None]:
# Describe one PDF (title, URL, provenance), derive its local file names, and hand
# the wget command plus metadata to get_pdf_and_save_meta (defined earlier in this
# notebook).
title = "นโยบายและแนวปฏิบัติรักษาความมั่นคงปลอดภัย"
url = "https://www.onde.go.th/assets/portals//files/%E0%B8%99%E0%B9%82%E0%B8%A2%E0%B8%9A%E0%B8%B2%E0%B8%A2%E0%B9%81%E0%B8%A5%E0%B8%B0%E0%B9%81%E0%B8%99%E0%B8%A7%E0%B8%9B%E0%B8%8F%E0%B8%B4%E0%B8%9A%E0%B8%B1%E0%B8%95%E0%B8%B4%E0%B8%A3%E0%B8%B1%E0%B8%81%E0%B8%A9%E0%B8%B2%E0%B8%84%E0%B8%A7%E0%B8%B2%E0%B8%A1%E0%B8%A1%E0%B8%B1%E0%B9%88%E0%B8%99%E0%B8%84%E0%B8%87%E0%B8%9B%E0%B8%A5%E0%B8%AD%E0%B8%94%E0%B8%A0.pdf"
source = "data.go.th"
license = "Open Data Common"

# The on-disk name is built from the encoded title, not the raw title.
encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# Strip the extension with splitext rather than str.rstrip(".pdf"): rstrip removes
# any trailing run of the characters ".", "p", "d", "f", so it would also eat the
# end of an encoded name that happens to finish with one of those characters.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

# wget: -O writes to the chosen path, -q suppresses progress output.
command = ["wget", url, "-O", pdf_save_path, "-q"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

## แผนรับมือภัยคุกคามทางไซเบอร์ ของ สดช.

ready

In [None]:
# Describe one PDF (title, URL, provenance), derive its local file names, and hand
# the wget command plus metadata to get_pdf_and_save_meta (defined earlier in this
# notebook).
title = "แผนรับมือภัยคุกคามทางไซเบอร์ ของ สดช."
url = "https://datacatalog.onde.go.th/dataset/ddb9dff8-9d80-4a83-80b6-87830b69d767/resource/86c25b0d-4d93-4d1c-ba0a-df685442926b/download/-.pdf"
source = "data.go.th"
license = "Open Data Common"

# The on-disk name is built from the encoded title, not the raw title.
encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# Strip the extension with splitext rather than str.rstrip(".pdf"): rstrip removes
# any trailing run of the characters ".", "p", "d", "f", so it would also eat the
# end of an encoded name that happens to finish with one of those characters.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

# wget: -O writes to the chosen path, -q suppresses progress output.
command = ["wget", url, "-O", pdf_save_path, "-q"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

## แผนสำรองและกู้คืนระบบ (Disaster Recovery Plan)

ready

In [None]:
# Describe one PDF (title, URL, provenance), derive its local file names, and hand
# the wget command plus metadata to get_pdf_and_save_meta (defined earlier in this
# notebook).
title = "แผนสำรองและกู้คืนระบบ (Disaster Recovery Plan)"
url = "https://datacatalog.onde.go.th/dataset/ddb9dff8-9d80-4a83-80b6-87830b69d767/resource/39450d18-c3b0-477f-ac1f-edb2b0b7e918/download/-disaster-recovery-plan.pdf"
source = "data.go.th"
license = "Open Data Common"

# The on-disk name is built from the encoded title, not the raw title.
encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# Strip the extension with splitext rather than str.rstrip(".pdf"): rstrip removes
# any trailing run of the characters ".", "p", "d", "f", so it would also eat the
# end of an encoded name that happens to finish with one of those characters.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

# wget: -O writes to the chosen path, -q suppresses progress output.
command = ["wget", url, "-O", pdf_save_path, "-q"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

# มาตรฐานทักษะวิชาชีพไอซีที

# ข้อมูลแผนเฉพาะด้านโครงสร้างพื้นฐานดิจิทัล


ready

In [None]:
# Describe one PDF (title, URL, provenance), derive its local file names, and hand
# the wget command plus metadata to get_pdf_and_save_meta (defined earlier in this
# notebook).
title = "ข้อมูลแผนเฉพาะด้านโครงสร้างพื้นฐานดิจิทัล"
url = "https://datacatalog.onde.go.th/dataset/777bee7b-ccc5-4a3f-a317-c1866d0b2e5b/resource/ef337efc-a02f-422f-bbbd-e21f3a9f8605/download/onde-digital-infra-master-plan_revised-draft-master-plan_v82_hl-1.pdf"
source = "data.go.th"
license = "Open Data Common"

# The on-disk name is built from the encoded title, not the raw title.
encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# Strip the extension with splitext rather than str.rstrip(".pdf"): rstrip removes
# any trailing run of the characters ".", "p", "d", "f", so it would also eat the
# end of an encoded name that happens to finish with one of those characters.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

# wget: -O writes to the chosen path, -q suppresses progress output.
command = ["wget", url, "-O", pdf_save_path, "-q"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

# รายงานการประเมินความเสี่ยงการทุจริตประจำปี


ready

In [None]:
# Describe one PDF (title, URL, provenance), derive its local file names, and hand
# the wget command plus metadata to get_pdf_and_save_meta (defined earlier in this
# notebook).
title = "รายงานการประเมินความเสี่ยงการทุจริตประจำปี"
url = "https://opendata.sme.go.th/dataset/06b2591e-2088-46bf-99cc-6e19a7712ac4/resource/098b8ea5-3c37-4908-bebd-55d0447ede3c/download/-2567.pdf"
source = "data.go.th"
license = "Creative Commons Attributions"

# The on-disk name is built from the encoded title, not the raw title.
encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# Strip the extension with splitext rather than str.rstrip(".pdf"): rstrip removes
# any trailing run of the characters ".", "p", "d", "f", so it would also eat the
# end of an encoded name that happens to finish with one of those characters.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

# wget: -O writes to the chosen path, -q suppresses progress output, and
# --no-check-certificate skips TLS certificate validation for this download.
command = ["wget", url, "-O", pdf_save_path, "-q", "--no-check-certificate"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

# หนังสือเที่ยวชุมชนไปกับคน อพท. (ภาษาอังกฤษ/ภาษาไทย)


ready

In [None]:
# Describe one PDF (title, URL, provenance), derive its local file names, and hand
# the wget command plus metadata to get_pdf_and_save_meta (defined earlier in this
# notebook).
title = "หนังสือเที่ยวชุมชนไปกับคน อพท. (ภาษาไทย)"
url = "https://data.go.th/dataset/d9dc4283-3ddd-4da6-9fda-4c6a0e7d9c1c/resource/0e3cd6d3-b1b7-4304-9d56-51cf660cb146/download/booklet-__thai.pdf"
source = "data.go.th"
license = "Creative Commons Attributions"

# The on-disk name is built from the encoded title, not the raw title.
encoded_name = string_to_code(title)
filename = f"pdf_{encoded_name}.pdf"
print(f"filename: {filename}")

pdf_save_path = os.path.join(BASE_PATH_SAVE, filename)
# Strip the extension with splitext rather than str.rstrip(".pdf"): rstrip removes
# any trailing run of the characters ".", "p", "d", "f", so it would also eat the
# end of an encoded name that happens to finish with one of those characters.
meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, os.path.splitext(filename)[0] + "_meta.csv"
)

# wget: -O writes to the chosen path, -q suppresses progress output, and
# --no-check-certificate skips TLS certificate validation for this download.
command = ["wget", url, "-O", pdf_save_path, "-q", "--no-check-certificate"]
get_pdf_and_save_meta(title, url, source, license, filename, command, meta_save_path)

# ข้อมูลรถโดยสารประจำทาง หมวด 1 กทม. (เส้นทางปฏิรูป)


In [34]:
# Read the prepared list of bus-route documents and write one metadata CSV
# covering all of them.
df = pd.read_csv("../ข้อมูลรถโดยสารประจำทาง หมวด 1 กทม. (เส้นทางปฏิรูป).csv")

# Titles lose their spaces; URLs lose surrounding whitespace.
title_column = df["title"].str.replace(" ", "", regex=False)
url_column = df["url"].str.strip()
title = title_column.to_list()
url = url_column.to_list()

# Source and license are the same for every row.
row_count = len(title)
source = ["data.go.th"] * row_count
license = ["GNU Free Documentation License"] * row_count
filename = [name + ".pdf" for name in title]

meta_save_path = os.path.join(
    BASE_PATH_SAVE_META, "pdf_ข้อมูลรถโดยสารประจำทาง หมวด 1 กทม. (เส้นทางปฏิรูป)_meta.csv"
)
bus_meta = pd.DataFrame(
    dict(title=title, url=url, source=source, license=license, filename=filename)
)
bus_meta.to_csv(meta_save_path, index=False)

In [35]:
# Rebuild the metadata frame from the column lists and turn it into a list of
# per-document dicts for the download loop.
df = pd.DataFrame(
    dict(title=title, url=url, source=source, license=license, filename=filename)
)
resource_links = df.to_dict("records")

In [None]:
# Fetch every listed document into BASE_PATH_SAVE, showing a progress bar.
progress = tqdm(resource_links, desc="donwloading pdf ")
for item in progress:
    pdf_save_path = os.path.join(BASE_PATH_SAVE, item["filename"])
    download_pdf(item["url"], pdf_save_path)

# แผนการตรวจสอบระยะยาว

In [None]:
# Scrape one gdcatalog.go.th dataset page and build a metadata record
# (title, source, license, url, filename) for each resource it lists.
web_url = "https://gdcatalog.go.th/dataset/gdpublish-xc6-4"

# Seconds to wait on the server; without a timeout a stalled connection would
# block the cell indefinitely.
REQUEST_TIMEOUT = 30

# Use a session to reuse the TCP connection for faster requests
session = requests.Session()

# Fetch and parse the main page
response = session.get(web_url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Extract resource links with titles
base_url = "https://gdcatalog.go.th"
resources_section = soup.find("section", id="dataset-resources")
if resources_section is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise RuntimeError(f"No 'dataset-resources' section found at {web_url}")
resource_links = [
    {
        "title": a["title"].strip(),
        "href": base_url + a["href"],
        "source": "Goverment data catalog smart plus",
        "license": "Open Data Common",
    }
    for a in resources_section.find_all("a", class_="heading")
]

# Visit each resource page and replace the page link with the file URL
for i, resource in enumerate(tqdm(resource_links, desc="Processing resources")):
    pdf_page = session.get(resource["href"], timeout=REQUEST_TIMEOUT)
    pdf_page.raise_for_status()
    pdf_soup = BeautifulSoup(pdf_page.text, "html.parser")
    download_link = pdf_soup.find("a", class_="resource-url-analytics")
    if download_link is None:
        # Name the offending page rather than failing on None["href"].
        raise RuntimeError(f"No download link found on {resource['href']}")
    pdf_url = download_link["href"]
    resource["url"] = pdf_url
    resource["filename"] = f"pdf_แผนการตรวจสอบระยะยาว_0{i}.pdf"
    resource.pop("href", None)

In [None]:
# Preview the first three scraped records.
resource_links[:3]

In [19]:
# Write the scraped records to the metadata directory. The directory comes from
# BASE_PATH_SAVE_META instead of a repeated literal, so changing that constant
# moves this file together with the other metadata files.
pd.DataFrame(resource_links).to_csv(
    os.path.join(BASE_PATH_SAVE_META, "pdf_แผนการตรวจสอบระยะยาว.csv"), index=False
)

In [None]:
# Fetch every scraped resource into BASE_PATH_SAVE, showing a progress bar.
progress = tqdm(resource_links, desc="donwloading pdf ")
for item in progress:
    destination = os.path.join(BASE_PATH_SAVE, item["filename"])
    download_pdf(item["url"], destination)

# มูลค่าและระดับการเติบโตของกิจกรรมทางเศรษฐกิจดิจิทัล


In [None]:
# Scrape one gdcatalog.go.th dataset page and build a metadata record
# (title, source, license, url, filename) for each resource it lists.
web_url = "https://gdcatalog.go.th/dataset/gdpublish-dsb21-1"

# Seconds to wait on the server; without a timeout a stalled connection would
# block the cell indefinitely.
REQUEST_TIMEOUT = 30

# Use a session to reuse the TCP connection for faster requests
session = requests.Session()

# Fetch and parse the main page
response = session.get(web_url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Extract resource links with titles
base_url = "https://gdcatalog.go.th"
resources_section = soup.find("section", id="dataset-resources")
if resources_section is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise RuntimeError(f"No 'dataset-resources' section found at {web_url}")
resource_links = [
    {
        "title": a["title"].strip(),
        "href": base_url + a["href"],
        "source": "Goverment data catalog smart plus",
        "license": "Open Data Common",
    }
    for a in resources_section.find_all("a", class_="heading")
]

# Visit each resource page and replace the page link with the file URL
for i, resource in enumerate(tqdm(resource_links, desc="Processing resources")):
    pdf_page = session.get(resource["href"], timeout=REQUEST_TIMEOUT)
    pdf_page.raise_for_status()
    pdf_soup = BeautifulSoup(pdf_page.text, "html.parser")
    download_link = pdf_soup.find("a", class_="resource-url-analytics")
    if download_link is None:
        # Name the offending page rather than failing on None["href"].
        raise RuntimeError(f"No download link found on {resource['href']}")
    pdf_url = download_link["href"]
    resource["url"] = pdf_url
    resource["filename"] = f"pdf_มูลค่าและระดับการเติบโตของกิจกรรมทางเศรษฐกิจดิจิทัล_0{i}.pdf"
    resource.pop("href", None)

In [28]:
# Keep only resources whose URL points at a PDF. The extension check ignores
# case so links ending in ".PDF" are kept as well.
resource_links = [
    resource
    for resource in resource_links
    if resource["url"].lower().endswith(".pdf")
]

In [None]:
# Display the current resource records.
resource_links

In [30]:
# Write the scraped records to the metadata directory. The directory comes from
# BASE_PATH_SAVE_META instead of a repeated literal, so changing that constant
# moves this file together with the other metadata files.
pd.DataFrame(resource_links).to_csv(
    os.path.join(
        BASE_PATH_SAVE_META, "pdf_มูลค่าและระดับการเติบโตของกิจกรรมทางเศรษฐกิจดิจิทัล.csv"
    ),
    index=False,
)

In [None]:
# Fetch every remaining resource into BASE_PATH_SAVE, showing a progress bar.
progress = tqdm(resource_links, desc="donwloading pdf ")
for item in progress:
    destination = os.path.join(BASE_PATH_SAVE, item["filename"])
    download_pdf(item["url"], destination)

# มาตรฐานทักษะวิชาชีพไอซีที


In [None]:
# Scrape one gdcatalog.go.th dataset page and build a metadata record
# (title, source, license, url, filename) for each resource it lists.
web_url = "https://gdcatalog.go.th/dataset/gdpublish-abc1-1"

# Seconds to wait on the server; without a timeout a stalled connection would
# block the cell indefinitely.
REQUEST_TIMEOUT = 30

# Use a session to reuse the TCP connection for faster requests
session = requests.Session()

# Fetch and parse the main page
response = session.get(web_url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Extract resource links with titles
base_url = "https://gdcatalog.go.th"
resources_section = soup.find("section", id="dataset-resources")
if resources_section is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise RuntimeError(f"No 'dataset-resources' section found at {web_url}")
resource_links = [
    {
        "title": a["title"].strip(),
        "href": base_url + a["href"],
        "source": "Goverment data catalog smart plus",
        "license": "Open Data Common",
    }
    for a in resources_section.find_all("a", class_="heading")
]

# Visit each resource page and replace the page link with the file URL
for i, resource in enumerate(tqdm(resource_links, desc="Processing resources")):
    pdf_page = session.get(resource["href"], timeout=REQUEST_TIMEOUT)
    pdf_page.raise_for_status()
    pdf_soup = BeautifulSoup(pdf_page.text, "html.parser")
    download_link = pdf_soup.find("a", class_="resource-url-analytics")
    if download_link is None:
        # Name the offending page rather than failing on None["href"].
        raise RuntimeError(f"No download link found on {resource['href']}")
    pdf_url = download_link["href"]
    resource["url"] = pdf_url
    resource["filename"] = f"pdf_มาตรฐานทักษะวิชาชีพไอซีที_0{i}.pdf"
    resource.pop("href", None)

In [35]:
# Keep only resources whose URL points at a PDF. The extension check ignores
# case so links ending in ".PDF" are kept as well.
resource_links = [
    resource
    for resource in resource_links
    if resource["url"].lower().endswith(".pdf")
]

In [None]:
# Display the current resource records.
resource_links

In [37]:
# Write the scraped records to the metadata directory. The directory comes from
# BASE_PATH_SAVE_META instead of a repeated literal, so changing that constant
# moves this file together with the other metadata files.
pd.DataFrame(resource_links).to_csv(
    os.path.join(BASE_PATH_SAVE_META, "pdf_มาตรฐานทักษะวิชาชีพไอซีที.csv"), index=False
)

In [None]:
# Fetch every remaining resource into BASE_PATH_SAVE, showing a progress bar.
progress = tqdm(resource_links, desc="donwloading pdf ")
for item in progress:
    destination = os.path.join(BASE_PATH_SAVE, item["filename"])
    download_pdf(item["url"], destination)

# เอกสารแนะนำการท่องเที่ยวของชุมชนท่องเที่ยว

In [None]:
# Scrape one gdcatalog.go.th dataset page and build a metadata record
# (title, source, license, url, filename) for each resource it lists.
web_url = "https://gdcatalog.go.th/dataset/gdpublish-14-02-66"

# Seconds to wait on the server; without a timeout a stalled connection would
# block the cell indefinitely.
REQUEST_TIMEOUT = 30

# Use a session to reuse the TCP connection for faster requests
session = requests.Session()

# Fetch and parse the main page
response = session.get(web_url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Extract resource links with titles
base_url = "https://gdcatalog.go.th"
resources_section = soup.find("section", id="dataset-resources")
if resources_section is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise RuntimeError(f"No 'dataset-resources' section found at {web_url}")
resource_links = [
    {
        "title": a["title"].strip(),
        "href": base_url + a["href"],
        "source": "Goverment data catalog smart plus",
        "license": "Creative Commons Attributions",
    }
    for a in resources_section.find_all("a", class_="heading")
]

# Visit each resource page and replace the page link with the file URL
for i, resource in enumerate(tqdm(resource_links, desc="Processing resources")):
    pdf_page = session.get(resource["href"], timeout=REQUEST_TIMEOUT)
    pdf_page.raise_for_status()
    pdf_soup = BeautifulSoup(pdf_page.text, "html.parser")
    download_link = pdf_soup.find("a", class_="resource-url-analytics")
    if download_link is None:
        # Name the offending page rather than failing on None["href"].
        raise RuntimeError(f"No download link found on {resource['href']}")
    pdf_url = download_link["href"]
    resource["url"] = pdf_url
    resource["filename"] = f"pdf_เอกสารแนะนำการท่องเที่ยวของชุมชนท่องเที่ยว_0{i}.pdf"
    resource.pop("href", None)

In [None]:
# Display the scraped resource records.
resource_links

In [21]:
# Write the scraped records as a CSV inside the metadata directory.
meta_file_name = "pdf_เอกสารแนะนำการท่องเที่ยวของชุมชนท่องเที่ยว.csv"
meta_save_path = os.path.join(BASE_PATH_SAVE_META, meta_file_name)
tourism_meta = pd.DataFrame(resource_links)
tourism_meta.to_csv(meta_save_path, index=False)

In [None]:
# Fetch every scraped resource into BASE_PATH_SAVE, showing a progress bar.
progress = tqdm(resource_links, desc="donwloading pdf ")
for item in progress:
    pdf_save_path = os.path.join(BASE_PATH_SAVE, item["filename"])
    download_pdf(item["url"], pdf_save_path)

# แจกเอกสารประกอบการสอนเพื่อการศึกษา (PDF) KongRuksiam Studio

In [None]:
# File names of the e-books. Entries pair up by position with the `url` list
# defined next, because both become columns of the same metadata frame.
filename = [
    "พัฒนาเว็บด้วย Python & Flask (Free).pdf",
    "เรียนรู้การใช้งาน Git & GitHub สำหรับผู้เริ่มต้น.pdf",
    "JSON เบื้องต้น (Update).pdf",
    "Python & OpenCV Computer Vision & Image Processing (Free).pdf",
    "รวมเล่มพัฒนาเว็บด้วย PHP สำหรับผู้เริ่มต้น.pdf",
    "เรียนรู้การใช้งาน Visual Studio Code.pdf",
    "ปูพื้นฐาน HTML CSS JavaScript (Free).pdf",
    "รวมเล่มเขียนโปรแกรมภาษา C เบื้องต้น.pdf",
    "รวมเล่มเขียนโปรแกรมภาษา Python (อัปเดตล่าสุด).pdf",
    "เขียนโปรแกรมเชิงวัตถุด้วยภาษา Java (Update).pdf",
    "Python OOP.pdf",
]

# GitHub links for each e-book, in the same order as `filename`.
# NOTE(review): these are ".../blob/main/..." page URLs; fetching the PDF bytes
# presumably needs the raw-file URL instead — confirm before downloading from them.
url = [
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/Flask%20Framework%20%E0%B9%80%E0%B8%9A%E0%B8%B7%E0%B9%89%E0%B8%AD%E0%B8%87%E0%B8%95%E0%B9%89%E0%B8%99/%E0%B8%9E%E0%B8%B1%E0%B8%92%E0%B8%99%E0%B8%B2%E0%B9%80%E0%B8%A7%E0%B9%87%E0%B8%9A%E0%B8%94%E0%B9%89%E0%B8%A7%E0%B8%A2%20Python%20%26%20Flask%20(Free).pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/Git%20%26%20GitHub%20%E0%B9%80%E0%B8%9A%E0%B8%B7%E0%B9%89%E0%B8%AD%E0%B8%87%E0%B8%95%E0%B9%89%E0%B8%99/%E0%B9%80%E0%B8%A3%E0%B8%B5%E0%B8%A2%E0%B8%99%E0%B8%A3%E0%B8%B9%E0%B9%89%E0%B8%81%E0%B8%B2%E0%B8%A3%E0%B9%83%E0%B8%8A%E0%B9%89%E0%B8%87%E0%B8%B2%E0%B8%99%20Git%20%26%20GitHub%20%E0%B8%AA%E0%B8%B3%E0%B8%AB%E0%B8%A3%E0%B8%B1%E0%B8%9A%E0%B8%9C%E0%B8%B9%E0%B9%89%E0%B9%80%E0%B8%A3%E0%B8%B4%E0%B9%88%E0%B8%A1%E0%B8%95%E0%B9%89%E0%B8%99.pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/JSON%20%E0%B9%80%E0%B8%9A%E0%B8%B7%E0%B9%89%E0%B8%AD%E0%B8%87%E0%B8%95%E0%B9%89%E0%B8%99/JSON%20%E0%B9%80%E0%B8%9A%E0%B8%B7%E0%B9%89%E0%B8%AD%E0%B8%87%E0%B8%95%E0%B9%89%E0%B8%99%20(Update).pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/OpenCV%20%26%20Python%20%E0%B8%AA%E0%B8%B3%E0%B8%AB%E0%B8%A3%E0%B8%B1%E0%B8%9A%E0%B8%81%E0%B8%B2%E0%B8%A3%E0%B8%9B%E0%B8%A3%E0%B8%B0%E0%B8%A1%E0%B8%A7%E0%B8%A5%E0%B8%9C%E0%B8%A5%E0%B8%A0%E0%B8%B2%E0%B8%9E%20(Image%20Processing)/Python%20%26%20OpenCV%20Computer%20Vision%20%26%20Image%20Processing%20(Free).pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/PHP%20%E0%B9%80%E0%B8%9A%E0%B8%B7%E0%B9%89%E0%B8%AD%E0%B8%87%E0%B8%95%E0%B9%89%E0%B8%99/%E0%B8%A3%E0%B8%A7%E0%B8%A1%E0%B9%80%E0%B8%A5%E0%B9%88%E0%B8%A1%E0%B8%9E%E0%B8%B1%E0%B8%92%E0%B8%99%E0%B8%B2%E0%B9%80%E0%B8%A7%E0%B9%87%E0%B8%9A%E0%B8%94%E0%B9%89%E0%B8%A7%E0%B8%A2%20PHP%20%E0%B8%AA%E0%B8%B3%E0%B8%AB%E0%B8%A3%E0%B8%B1%E0%B8%9A%E0%B8%9C%E0%B8%B9%E0%B9%89%E0%B9%80%E0%B8%A3%E0%B8%B4%E0%B9%88%E0%B8%A1%E0%B8%95%E0%B9%89%E0%B8%99.pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/Visual%20Studio%20Code%20%E0%B9%80%E0%B8%9A%E0%B8%B7%E0%B9%89%E0%B8%AD%E0%B8%87%E0%B8%95%E0%B9%89%E0%B8%99/%E0%B9%80%E0%B8%A3%E0%B8%B5%E0%B8%A2%E0%B8%99%E0%B8%A3%E0%B8%B9%E0%B9%89%E0%B8%81%E0%B8%B2%E0%B8%A3%E0%B9%83%E0%B8%8A%E0%B9%89%E0%B8%87%E0%B8%B2%E0%B8%99%20Visual%20Studio%20Code.pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/%E0%B8%9E%E0%B8%B7%E0%B9%89%E0%B8%99%E0%B8%90%E0%B8%B2%E0%B8%99%20HTML%20%2CCSS%20%2C%20JavaScript/%E0%B8%9B%E0%B8%B9%E0%B8%9E%E0%B8%B7%E0%B9%89%E0%B8%99%E0%B8%90%E0%B8%B2%E0%B8%99%20HTML%20CSS%20JavaScript%20(Free).pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/%E0%B9%80%E0%B8%82%E0%B8%B5%E0%B8%A2%E0%B8%99%E0%B9%82%E0%B8%9B%E0%B8%A3%E0%B9%81%E0%B8%81%E0%B8%A3%E0%B8%A1%E0%B8%A0%E0%B8%B2%E0%B8%A9%E0%B8%B2%20C%20%E0%B9%80%E0%B8%9A%E0%B8%B7%E0%B9%89%E0%B8%AD%E0%B8%87%E0%B8%95%E0%B9%89%E0%B8%99/%E0%B8%A3%E0%B8%A7%E0%B8%A1%E0%B9%80%E0%B8%A5%E0%B9%88%E0%B8%A1%E0%B9%80%E0%B8%82%E0%B8%B5%E0%B8%A2%E0%B8%99%E0%B9%82%E0%B8%9B%E0%B8%A3%E0%B9%81%E0%B8%81%E0%B8%A3%E0%B8%A1%E0%B8%A0%E0%B8%B2%E0%B8%A9%E0%B8%B2%20C%20%E0%B9%80%E0%B8%9A%E0%B8%B7%E0%B9%89%E0%B8%AD%E0%B8%87%E0%B8%95%E0%B9%89%E0%B8%99.pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/%E0%B9%80%E0%B8%82%E0%B8%B5%E0%B8%A2%E0%B8%99%E0%B9%82%E0%B8%9B%E0%B8%A3%E0%B9%81%E0%B8%81%E0%B8%A3%E0%B8%A1%E0%B8%A0%E0%B8%B2%E0%B8%A9%E0%B8%B2%20Python%20(%E0%B8%AD%E0%B8%B1%E0%B8%9B%E0%B9%80%E0%B8%94%E0%B8%95%E0%B8%A5%E0%B9%88%E0%B8%B2%E0%B8%AA%E0%B8%B8%E0%B8%94)/%E0%B8%A3%E0%B8%A7%E0%B8%A1%E0%B9%80%E0%B8%A5%E0%B9%88%E0%B8%A1%E0%B9%80%E0%B8%82%E0%B8%B5%E0%B8%A2%E0%B8%99%E0%B9%82%E0%B8%9B%E0%B8%A3%E0%B9%81%E0%B8%81%E0%B8%A3%E0%B8%A1%E0%B8%A0%E0%B8%B2%E0%B8%A9%E0%B8%B2%20Python%20(%E0%B8%AD%E0%B8%B1%E0%B8%9B%E0%B9%80%E0%B8%94%E0%B8%95%E0%B8%A5%E0%B9%88%E0%B8%B2%E0%B8%AA%E0%B8%B8%E0%B8%94).pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/%E0%B9%80%E0%B8%82%E0%B8%B5%E0%B8%A2%E0%B8%99%E0%B9%82%E0%B8%9B%E0%B8%A3%E0%B9%81%E0%B8%81%E0%B8%A3%E0%B8%A1%E0%B9%80%E0%B8%8A%E0%B8%B4%E0%B8%87%E0%B8%A7%E0%B8%B1%E0%B8%95%E0%B8%96%E0%B8%B8%E0%B8%94%E0%B9%89%E0%B8%A7%E0%B8%A2%20Java/%E0%B9%80%E0%B8%82%E0%B8%B5%E0%B8%A2%E0%B8%99%E0%B9%82%E0%B8%9B%E0%B8%A3%E0%B9%81%E0%B8%81%E0%B8%A3%E0%B8%A1%E0%B9%80%E0%B8%8A%E0%B8%B4%E0%B8%87%E0%B8%A7%E0%B8%B1%E0%B8%95%E0%B8%96%E0%B8%B8%E0%B8%94%E0%B9%89%E0%B8%A7%E0%B8%A2%E0%B8%A0%E0%B8%B2%E0%B8%A9%E0%B8%B2%20Java%20(Update).pdf",
    "https://github.com/kongruksiamza/ebook-for-education/blob/main/%E0%B9%80%E0%B8%82%E0%B8%B5%E0%B8%A2%E0%B8%99%E0%B9%82%E0%B8%9B%E0%B8%A3%E0%B9%81%E0%B8%81%E0%B8%A3%E0%B8%A1%E0%B9%80%E0%B8%8A%E0%B8%B4%E0%B8%87%E0%B8%A7%E0%B8%B1%E0%B8%95%E0%B8%96%E0%B8%B8%E0%B8%94%E0%B9%89%E0%B8%A7%E0%B8%A2%20Python/Python%20OOP.pdf",
]
# Every e-book shares one repository source and one license.
doc_count = len(filename)
source = ["https://github.com/kongruksiamza/ebook-for-education"] * doc_count
license = ["CC BY-NC"] * doc_count
# Titles are the file names with the ".pdf" text removed.
title = [name.replace(".pdf", "") for name in filename]

meta_file_name = "pdf_แจกเอกสารประกอบการสอนเพื่อการศึกษา_kong.csv"
meta_save_path = os.path.join(BASE_PATH_SAVE_META, meta_file_name)
kong_meta = pd.DataFrame(
    dict(title=title, url=url, source=source, license=license, filename=filename)
)
kong_meta.to_csv(meta_save_path, index=False)

# tmp