In [None]:
import os
import pdfplumber
import pandas as pd
import logging
from pathlib import Path


In [3]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, one page after another.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        A single string made of each page's extracted text followed by a
        newline. A page that yields no text contributes just the newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page with no extractable text
        # (e.g. a scanned image); coerce to "" so one such page does not raise
        # TypeError and discard the text of the whole document.
        page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
    # Join once instead of growing a string with += on every page.
    return "".join(page_texts)
def extract_text_from_folder(folder_path, output_folder='txt'):
    """Extract the text of every PDF in a folder into sibling-named .txt files.

    Args:
        folder_path: Folder whose entries are scanned (not recursively).
        output_folder: Folder that receives the .txt files; created if it
            does not exist. When None, ``<folder_path>/extracted_texts`` is
            used instead.

    Returns:
        None. One status line is printed per PDF, either a success message
        or the error that prevented extraction.
    """
    # Fall back to a subfolder of the input folder when the caller passes None.
    target_dir = (
        os.path.join(folder_path, "extracted_texts")
        if output_folder is None
        else output_folder
    )
    os.makedirs(target_dir, exist_ok=True)

    for entry in os.listdir(folder_path):
        # Only entries with a .pdf extension (any letter case) are processed.
        if not entry.lower().endswith('.pdf'):
            continue

        source_pdf = os.path.join(folder_path, entry)
        try:
            text = extract_text_from_pdf(source_pdf)

            # Same base name as the PDF, with a .txt extension.
            stem, _ext = os.path.splitext(entry)
            destination = os.path.join(target_dir, stem + ".txt")
            with open(destination, "w", encoding="utf-8") as out_file:
                out_file.write(text)

            print(f"Extracted text from {entry}")
        except Exception as err:
            # Best effort: report the failure and carry on with the next file.
            print(f"Error processing {entry}: {err}")
# Example usage: convert every PDF in the 2023 Q4 folder, writing the .txt
# files to the function's default output folder.
folder_path = "2023_Q4"

extract_text_from_folder(folder_path=folder_path)

Extracted text from 24CS_86500201.pdf
Extracted text from 2S_86215801.pdf
Extracted text from 3BBIF_85806601.pdf
Extracted text from 3K-BAT_85852201.pdf
Extracted text from A5_86520501.pdf
Extracted text from AAI_86094601.pdf
Extracted text from AAV_86270501.pdf
Extracted text from ABM_86368601.pdf
Extracted text from ACC_86663601.pdf
Extracted text from ACE_86706701.pdf
Extracted text from ACG_86119901.pdf
Extracted text from ADB_86150001.pdf
Extracted text from ADD_86212601.pdf
Extracted text from ADVANC_85814801.pdf
Extracted text from ADVANC_86685701.pdf
Extracted text from ADVICE_85728701.pdf
Extracted text from ADVICE_85729401.pdf
Extracted text from ADVICE_86439501.pdf
Extracted text from AEONTS_85538501.pdf
Extracted text from AE_86469401.pdf
Extracted text from AFC_85874901.pdf
Extracted text from AF_86136601.pdf
Extracted text from AGE_86062001.pdf
Extracted text from AGE_86072501.pdf
Extracted text from AHC_86247501.pdf
Extracted text from AH_86765601.pdf
Extracted text from

In [None]:
def create_csv_from_txt(folder_path, output_csv_path):
    """Collect every .txt file of a folder into a two-column CSV file.

    Each row holds the file name ('Filename') and the file's whitespace-
    stripped content ('เนื้อหาด้านใน'). Rows are ordered by file name.

    Args:
        folder_path: Folder whose .txt files are read (not recursively).
        output_csv_path: Path of the CSV file to write.

    Returns:
        None. The CSV is written to ``output_csv_path`` and a confirmation
        line is printed.
    """
    columns = ['Filename', 'เนื้อหาด้านใน']
    data = []

    # os.listdir gives entries in arbitrary order; sort so that re-running the
    # cell always produces the same CSV.
    for filename in sorted(os.listdir(folder_path)):
        filepath = os.path.join(folder_path, filename)
        # Match the extension case-insensitively so files such as NOTES.TXT are
        # not silently skipped, and ignore directories that merely end in .txt.
        if not filename.lower().endswith('.txt') or not os.path.isfile(filepath):
            continue
        with open(filepath, 'r', encoding='utf-8') as file:
            # Read the whole file and trim surrounding whitespace.
            content = file.read().strip()
        data.append({columns[0]: filename, columns[1]: content})

    # Naming the columns explicitly keeps the header row even when the folder
    # contains no .txt file at all.
    df = pd.DataFrame(data, columns=columns)

    # Save as CSV; escapechar is supplied to the CSV writer for characters
    # that need escaping in the extracted text.
    df.to_csv(output_csv_path, index=False, encoding='utf-8-sig', escapechar='\\')
    print(f"บันทึกไฟล์ CSV เรียบร้อย: {output_csv_path}")

# Example usage
folder_path = "txt"  # change to the path of the folder that holds the .txt files
output_csv_path = "txt_csv.csv"  # change to the path where the .csv file should be saved
create_csv_from_txt(folder_path=folder_path, output_csv_path=output_csv_path)
