In [None]:
## NO.X 버전 덤프 텍스트 추출기

import os
import re
from PyPDF2 import PdfReader

# Name of the source dump PDF; passed to extract_and_clean_questions() in the
# __main__ guard of this cell.
pdf_filename = "Data-Cloud-Consultant V13.35.pdf"

def extract_and_clean_questions(pdf_filename):
    """Extract the question text of a "NO.<n>" style dump PDF into a text file.

    The PDF path is built by joining the current working directory with
    ``pdf_filename``.  Every span that starts at ``NO.<digits>`` and ends
    right before the next ``Answer:`` is kept, so the answers themselves are
    not written.  The result is saved in the working directory as
    ``<pdf_filename without extension>_cleaned.txt`` with a blank line after
    each question.

    Args:
        pdf_filename: File name of the source PDF.

    Returns:
        None. The output path is reported with ``print``.
    """
    current_dir = os.getcwd()
    pdf_path = os.path.join(current_dir, pdf_filename)

    # e.g. "Data-Cloud-Consultant V13.35.pdf" -> "Data-Cloud-Consultant V13.35_cleaned.txt"
    base_name = os.path.splitext(pdf_filename)[0]
    output_path = os.path.join(current_dir, f"{base_name}_cleaned.txt")

    reader = PdfReader(pdf_path)
    # Join pages with a newline so the last line of one page is not glued to
    # the first line of the next one.  `or ""` is a defensive guard for a page
    # whose extraction yields nothing.
    full_text = "\n".join(page.extract_text() or "" for page in reader.pages)

    # Lazy match with DOTALL: each block runs from "NO.<n>" to the nearest "Answer:".
    questions_only = re.findall(r"(NO\.\d+.*?)Answer:", full_text, re.DOTALL)

    def clean_line_breaks(text):
        """Turn a lone newline into a space unless "A."-"D." sits right before or after it."""
        return re.sub(r'(?<!\n)(?<![A-D]\.)\n(?!\n)(?![A-D]\.)', ' ', text)

    # Collapse runs of newlines first, then unwrap the remaining soft line breaks.
    cleaned_questions = [
        clean_line_breaks(re.sub(r'\n+', '\n', q)).strip()
        for q in questions_only
    ]

    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(question + "\n\n" for question in cleaned_questions)

    print(f"✅ 완료: {output_path} 에 저장됨.")

# Usage example: runs the extraction for the PDF configured in pdf_filename.
if __name__ == "__main__":
    extract_and_clean_questions(pdf_filename)


✅ 완료: c:\Concentrix\[personal_rep]\develop\dump_question_extractor\Data-Cloud-Consultant V13.95_cleaned.txt 에 저장됨.


In [None]:
## NO.X 버전 덤프 텍스트 추출 + PDF 변환 with Answer(Gray)

import os
import re
from PyPDF2 import PdfReader
from fpdf import FPDF

# Name of the source dump PDF processed by the __main__ guard of this cell.
pdf_filename = "Data-Cloud-Consultant V13.35.pdf"
# TrueType font file registered by PDFWithAnswerHint.__init__ as the "Custom" font.
font_path = "C:\\Windows\\Fonts\\malgun.ttf"  # Malgun Gothic

# PDF 클래스 정의
class PDFWithAnswerHint(FPDF):
    """Single-font FPDF document that renders question blocks.

    On construction the first page is opened, automatic page breaks are
    enabled with a margin of 15, and the TrueType file named by the
    module-level ``font_path`` is registered as the "Custom" font, selected
    at size 12 with black text.
    """

    def __init__(self):
        super().__init__()
        self.add_page()
        self.set_auto_page_break(auto=True, margin=15)
        # Register the font before selecting it as the default.
        self.add_font("Custom", "", font_path, uni=True)
        self.set_font("Custom", size=12)
        self.set_text_color(0, 0, 0)

    def add_question_text(self, question_text):
        """Write one question block, one ``multi_cell`` call per line.

        Lines starting with ``NO.<digits>`` (case-insensitive) are printed at
        size 13, lines starting with ``Answer:`` in light gray
        (240, 240, 240), and every other line with the current defaults.
        ``ln(5)`` is called once after the block.

        Args:
            question_text: Newline-separated text of a single question.
        """
        for raw_line in question_text.strip().split('\n'):
            text = raw_line.strip()
            is_heading = re.match(r'NO\.\d+', text, re.IGNORECASE) is not None
            is_answer = not is_heading and text.startswith("Answer:")

            # Switch to the emphasis / light-gray style for this line only.
            if is_heading:
                self.set_font("Custom", size=13)
            elif is_answer:
                self.set_text_color(240, 240, 240)

            self.multi_cell(0, 10, text)

            # Restore the defaults changed above.
            if is_heading:
                self.set_font("Custom", size=12)
            elif is_answer:
                self.set_text_color(0, 0, 0)
        self.ln(5)

# 1단계: PDF에서 NO.X~Answer 블록 추출, Explanation 제거
def extract_and_clean_questions(pdf_filename):
    """Extract "NO.<n> ... Answer: ..." blocks from a dump PDF into a text file.

    The PDF path is built by joining the current working directory with
    ``pdf_filename``.  Each block starts at ``NO.<digits>``, must contain
    ``Answer:``, and stops right before the first ``Explanation:``, the next
    line starting with ``NO.``, or the end of the text, so explanations are
    dropped.  The blocks are written to
    ``<pdf_filename without extension>_cleaned.txt`` in the working
    directory, each followed by a blank line.

    Args:
        pdf_filename: File name of the source PDF.

    Returns:
        The PDF file name without its extension.
    """
    current_dir = os.getcwd()
    pdf_path = os.path.join(current_dir, pdf_filename)

    base_name = os.path.splitext(pdf_filename)[0]
    output_path = os.path.join(current_dir, f"{base_name}_cleaned.txt")

    reader = PdfReader(pdf_path)
    # Join pages with a newline so a "NO.<n>" heading at the top of a page is
    # still preceded by "\n" and the page boundary does not glue words
    # together.  `or ""` is a defensive guard for a page without text.
    full_text = "\n".join(page.extract_text() or "" for page in reader.pages)

    # The terminator is a lookahead on purpose: consuming "\nNO." would strip
    # the prefix of the following question, and re.findall would then skip
    # that question entirely whenever no "Explanation:" separates the two.
    questions = re.findall(
        r"(NO\.\d+.*?Answer:.*?)(?=Explanation:|\nNO\.|\Z)",
        full_text,
        re.DOTALL,
    )

    def clean_line_breaks(text):
        """Turn a newline into a space unless it borders an "A."-"D." marker or precedes "Answer:"."""
        return re.sub(r'(?<!\n)(?<![A-D]\.)\n(?![A-D]\.|Answer:)', ' ', text)

    # Collapse runs of newlines first, then unwrap the remaining soft line breaks.
    cleaned = [
        clean_line_breaks(re.sub(r'\n+', '\n', q)).strip()
        for q in questions
    ]

    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(q + "\n\n" for q in cleaned)

    print(f"✅ 텍스트 저장 완료: {output_path}")
    return base_name

# 2단계: 텍스트를 PDF로 변환
def txt_to_pdf(base_name):
    """Render ``<base_name>_cleaned.txt`` as ``<base_name>_cleaned.pdf``.

    Both files live in the current working directory.  The text is split on
    blank lines and every chunk is handed to
    ``PDFWithAnswerHint.add_question_text``.

    Args:
        base_name: Shared file-name stem of the text input and PDF output.
    """
    work_dir = os.getcwd()
    source_txt = os.path.join(work_dir, f"{base_name}_cleaned.txt")
    pdf_path = os.path.join(work_dir, f"{base_name}_cleaned.pdf")

    document = PDFWithAnswerHint()

    with open(source_txt, "r", encoding="utf-8") as handle:
        # Questions were written with a blank line between them.
        for block in handle.read().strip().split('\n\n'):
            document.add_question_text(block)

    document.output(pdf_path)
    print(f"✅ PDF 생성 완료: {pdf_path}")

# Run the whole pipeline: extract the questions to text, then render that text as a PDF.
if __name__ == "__main__":
    base = extract_and_clean_questions(pdf_filename)
    txt_to_pdf(base)


✅ 텍스트 저장 완료: c:\Concentrix\[personal_rep]\develop\dump_question_extractor\Data-Cloud-Consultant V13.35_cleaned.txt
✅ PDF 생성 완료: c:\Concentrix\[personal_rep]\develop\dump_question_extractor\Data-Cloud-Consultant V13.35_cleaned.pdf


In [38]:
## QUESTION NO: X 덤프 문제 추출 + PDF 변환 with Answer(Gray)
import os
import re
from PyPDF2 import PdfReader
from fpdf import FPDF

# Source PDF name without extension; ".pdf" is appended in the __main__ guard of this cell.
origin_filename = "Data-Cloud-Consultant V13.95"
# TrueType font file registered by PDFWithHiddenAnswers.__init__ as the "Custom" font.
font_path = "C:\\Windows\\Fonts\\malgun.ttf"  # Malgun Gothic

# PDF 클래스
class PDFWithHiddenAnswers(FPDF):
    """FPDF document that prints question blocks with faint answer lines.

    On construction the first page is opened, automatic page breaks are
    enabled with a margin of 15, and the TrueType file named by the
    module-level ``font_path`` is registered as the "Custom" font, selected
    at size 12 with black text.
    """

    def __init__(self):
        super().__init__()
        self.add_page()
        self.set_auto_page_break(auto=True, margin=15)
        self.add_font("Custom", "", font_path, uni=True)
        self.set_font("Custom", size=12)
        self.set_text_color(0, 0, 0)

    def add_question_text(self, question_text):
        """Write one question block, one ``multi_cell`` call per line.

        Heading lines are printed at size 13, lines starting with
        ``Answer:`` in light gray (240, 240, 240), everything else with the
        current defaults.  ``ln(5)`` is called once after the block.

        Args:
            question_text: Newline-separated text of a single question.
        """
        for line in question_text.strip().split('\n'):
            stripped = line.strip()

            # Accept the raw "QUESTION NO: 23" heading as well as the
            # "NO: 23" form that normalize_question_number() writes to the
            # text file; matching only the raw form never emphasized the
            # heading of normalized text.
            if re.match(r'(?:QUESTION\s+)?NO[:\.]\s*\d+', stripped, re.IGNORECASE):
                self.set_font("Custom", size=13)
                self.multi_cell(0, 10, stripped)
                self.set_font("Custom", size=12)

            elif stripped.startswith("Answer:"):
                # Light gray: only readable when the text is highlighted.
                self.set_text_color(240, 240, 240)
                self.multi_cell(0, 10, stripped)
                self.set_text_color(0, 0, 0)

            else:
                self.multi_cell(0, 10, stripped)
        self.ln(5)
        
def normalize_question_number(text):
    """Rewrite every "QUESTION NO: <n>" / "QUESTION NO.<n>" heading as "NO: <n>".

    Matching is case-insensitive and tolerates any whitespace between
    "QUESTION" and "NO" as well as before the number.
    """
    heading = re.compile(r'QUESTION\s+NO[:\.]\s*(\d+)', flags=re.IGNORECASE)
    return heading.sub(r'NO: \1', text)

# 1. PDF에서 문제 + Answer만 추출 (Explanation은 제거)
def extract_questions_with_answer(pdf_filename):
    """Extract "QUESTION NO: <n> ... Answer: ..." blocks from a dump PDF.

    The PDF path is built by joining the current working directory with
    ``pdf_filename``.  Each block starts at ``QUESTION NO:``/``QUESTION NO.``
    followed by a space and digits, must contain ``Answer:``, and stops right
    before the first ``Explanation:``, the next line starting with a
    ``QUESTION NO`` heading, or the end of the text.  Headings are rewritten
    to ``NO: <n>`` and the blocks are written to
    ``<pdf_filename without extension>_cleaned.txt`` in the working
    directory, each followed by a blank line.

    Args:
        pdf_filename: File name of the source PDF.

    Returns:
        The PDF file name without its extension.
    """
    current_dir = os.getcwd()
    pdf_path = os.path.join(current_dir, pdf_filename)
    base_name = os.path.splitext(pdf_filename)[0]
    output_path = os.path.join(current_dir, f"{base_name}_cleaned.txt")

    reader = PdfReader(pdf_path)
    # Join pages with a newline so a heading at the top of a page is still
    # preceded by "\n" and the page boundary does not glue words together.
    # `or ""` is a defensive guard for a page without text.
    full_text = "\n".join(page.extract_text() or "" for page in reader.pages)

    # The terminator is a lookahead on purpose: consuming "\nQUESTION NO:"
    # would strip the heading of the following question, and re.findall
    # would then skip that question whenever no "Explanation:" separates them.
    questions_with_answers = re.findall(
        r"(QUESTION NO[:\.] \d+.*?Answer:.*?)(?=Explanation:|\nQUESTION NO[:\.]|\Z)",
        full_text,
        re.DOTALL,
    )

    def clean_line_breaks(text):
        """Turn a newline into a space unless it borders an "A."-"D." marker or precedes "Answer:"."""
        return re.sub(r'(?<!\n)(?<![A-D]\.)\n(?![A-D]\.|Answer:)', ' ', text)

    # Collapse newline runs, unwrap soft line breaks, then shorten the heading.
    cleaned = [
        normalize_question_number(
            clean_line_breaks(re.sub(r'\n+', '\n', q))
        ).strip()
        for q in questions_with_answers
    ]

    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(q + "\n\n" for q in cleaned)

    print(f"✅ 텍스트 저장 완료: {output_path}")
    return base_name

# 2. 텍스트 파일을 PDF로 변환
def txt_to_pdf_unicode(base_name):
    """Convert ``<base_name>_cleaned.txt`` into ``<base_name>_cleaned.pdf``.

    Input and output are both located in the current working directory.
    The text is split on blank lines and each piece is rendered through
    ``PDFWithHiddenAnswers.add_question_text``.

    Args:
        base_name: Shared file-name stem of the text input and PDF output.
    """
    here = os.getcwd()
    text_file = os.path.join(here, f"{base_name}_cleaned.txt")
    pdf_path = os.path.join(here, f"{base_name}_cleaned.pdf")

    writer = PDFWithHiddenAnswers()

    with open(text_file, "r", encoding="utf-8") as src:
        blocks = src.read().strip().split('\n\n')
        # One blank-line-separated block per question.
        for block in blocks:
            writer.add_question_text(block)

    writer.output(pdf_path)
    print(f"✅ PDF 생성 완료: {pdf_path}")

# Run: extract the questions from "<origin_filename>.pdf", then render the text as a PDF.
if __name__ == "__main__":
    base = extract_questions_with_answer(origin_filename + ".pdf")
    txt_to_pdf_unicode(base)


✅ 텍스트 저장 완료: c:\Concentrix\[personal_rep]\develop\dump_question_extractor\Data-Cloud-Consultant V13.95_cleaned.txt
✅ PDF 생성 완료: c:\Concentrix\[personal_rep]\develop\dump_question_extractor\Data-Cloud-Consultant V13.95_cleaned.pdf


In [3]:
## txt to PDF
import os
from fpdf import FPDF
import re

# File-name stem; txt_to_pdf_unicode() appends "_cleaned.txt" / "_cleaned.pdf" to it.
filename = "Data-Cloud-Consultant V13.95"
font_path = "C:\\Windows\\Fonts\\malgun.ttf"  # example path for Windows

# PDF 생성 클래스 정의
class PDFWithHiddenAnswers(FPDF):
    """FPDF document that prints question blocks with faint answer lines.

    On construction the first page is opened, automatic page breaks are
    enabled with a margin of 15, and the TrueType file named by the
    module-level ``font_path`` is registered as the "Custom" font, selected
    at size 12 with black text.
    """

    def __init__(self):
        super().__init__()
        self.add_page()
        self.set_auto_page_break(auto=True, margin=15)
        self.add_font("Custom", "", font_path, uni=True)
        self.set_font("Custom", size=12)
        self.set_text_color(0, 0, 0)

    def add_question_text(self, question_text):
        """Write one question block, one ``multi_cell`` call per line.

        Heading lines (``NO: <n>``, ``NO. <n>`` or ``NO.<n>``,
        case-insensitive) are printed at size 13 as a stand-in for bold,
        lines starting with ``Answer:`` in light gray (240, 240, 240), and
        everything else with the current defaults.  ``ln(5)`` is called once
        after the block.

        Args:
            question_text: Newline-separated text of a single question.
        """
        for line in question_text.strip().split('\n'):
            stripped = line.strip()

            # `\s*` instead of a single mandatory space: "<name>_cleaned.txt"
            # files written by the NO.X extractors use "NO.12" headings with
            # no space, which would otherwise not be emphasized.
            if re.match(r'NO[:\.]\s*\d+', stripped, re.IGNORECASE):
                self.set_font("Custom", size=13)
                self.multi_cell(0, 10, stripped)
                self.set_font("Custom", size=12)

            elif stripped.startswith("Answer:"):
                self.set_text_color(240, 240, 240)  # light gray output
                self.multi_cell(0, 10, stripped)
                self.set_text_color(0, 0, 0)

            else:
                self.multi_cell(0, 10, stripped)
        self.ln(5)

# 텍스트 파일 → PDF 변환 함수 (분리됨)
def txt_to_pdf_unicode(base_name):
    """Build ``<base_name>_cleaned.pdf`` from ``<base_name>_cleaned.txt``.

    Both paths are resolved against the current working directory.  The
    text file is split on blank lines and each resulting block is passed to
    ``PDFWithHiddenAnswers.add_question_text``.

    Args:
        base_name: Shared file-name stem of the text input and PDF output.
    """
    cwd = os.getcwd()
    txt_file = os.path.join(cwd, f"{base_name}_cleaned.txt")
    pdf_path = os.path.join(cwd, f"{base_name}_cleaned.pdf")

    pdf_doc = PDFWithHiddenAnswers()

    with open(txt_file, "r", encoding="utf-8") as reader:
        whole_text = reader.read().strip()
        for question_block in whole_text.split('\n\n'):
            pdf_doc.add_question_text(question_block)

    pdf_doc.output(pdf_path)
    print(f"✅ PDF 생성 완료: {pdf_path}")

# Example run: convert "<filename>_cleaned.txt" in the working directory to a PDF.
if __name__ == "__main__":
    txt_to_pdf_unicode(filename)


✅ PDF 생성 완료: c:\Concentrix\[personal_rep]\develop\dump-extractor\Data-Cloud-Consultant V13.95_cleaned.pdf


In [None]:
## New ver. Explanation 없을 때 예외 처리 추가

import os
import re
from PyPDF2 import PdfReader
from fpdf import FPDF
from tkinter import Tk, filedialog, messagebox

# ==== User settings ====
font_path = "C:\\Windows\\Fonts\\malgun.ttf"  # Windows font with Korean (Hangul) support
# ====================

# ✅ PDF 출력 클래스
class PDFWithHiddenAnswers(FPDF):
    """FPDF document that prints question blocks with faint answer lines.

    Construction opens the first page, enables automatic page breaks with a
    margin of 15 and registers the TrueType file named by the module-level
    ``font_path`` as the "Custom" font (selected at size 12, black text).
    """

    def __init__(self):
        super().__init__()
        self.add_page()
        self.set_auto_page_break(auto=True, margin=15)
        # The font has to be registered before it can be selected.
        self.add_font("Custom", "", font_path, uni=True)
        self.set_font("Custom", size=12)
        self.set_text_color(0, 0, 0)

    def add_question_text(self, question_text):
        """Write one question block, one ``multi_cell`` call per line.

        Lines starting with ``NO: <n>`` / ``NO. <n>`` (case-insensitive) are
        printed at size 13, lines starting with ``Answer:`` in light gray
        (240, 240, 240), all other lines with the current defaults.
        ``ln(5)`` is called once after the block.

        Args:
            question_text: Newline-separated text of a single question.
        """
        for raw_line in question_text.strip().split('\n'):
            text = raw_line.strip()

            if re.match(r'NO[:\.] \d+', text, re.IGNORECASE):
                # Heading: bump the size for this line only.
                self.set_font("Custom", size=13)
                self.multi_cell(0, 10, text)
                self.set_font("Custom", size=12)
                continue

            if text.startswith("Answer:"):
                # Answer: light gray for this line only.
                self.set_text_color(240, 240, 240)
                self.multi_cell(0, 10, text)
                self.set_text_color(0, 0, 0)
                continue

            self.multi_cell(0, 10, text)
        self.ln(5)

# ✅ QUESTION NO → NO 로 변환
def normalize_question_number(text):
    """Shorten each "QUESTION NO: <n>" / "QUESTION NO.<n>" heading to "NO: <n>".

    The match ignores case and allows any whitespace between "QUESTION" and
    "NO" and before the digits.
    """
    pattern = r'QUESTION\s+NO[:\.]\s*(\d+)'
    replacement = r'NO: \1'
    return re.sub(pattern, replacement, text, flags=re.IGNORECASE)

# ✅ PDF에서 문제 + Answer 추출 (Explanation 제거)
def extract_questions_with_answer(pdf_path):
    """Extract "QUESTION NO: <n> ... Answer: ..." blocks from a dump PDF.

    Each block starts at ``QUESTION NO:``/``QUESTION NO.`` followed by a
    space and digits, must contain ``Answer:``, and stops right before the
    first ``Explanation:`` (when there is one), the next line starting with
    a ``QUESTION NO`` heading, or the end of the text.  Headings are
    rewritten to ``NO: <n>`` and the blocks are written to
    ``<PDF base name>_cleaned.txt`` in the current working directory, each
    followed by a blank line.

    Args:
        pdf_path: Path of the source PDF.

    Returns:
        A ``(base_name, question_count)`` tuple, where ``base_name`` is the
        PDF file name without directory and extension.

    Raises:
        ValueError: If no question block could be extracted.
    """
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_path = os.path.join(os.getcwd(), f"{base_name}_cleaned.txt")

    reader = PdfReader(pdf_path)
    # Join pages with a newline so a heading at the top of a page is still
    # preceded by "\n" and the page boundary does not glue words together.
    # `or ""` is a defensive guard for a page without text.
    full_text = "\n".join(page.extract_text() or "" for page in reader.pages)

    # Stop at "Explanation:" as well: without it the lazy match runs on to
    # the next heading and the explanation text ends up in the output.
    # The lookahead keeps the next heading available for the following match.
    questions_with_answers = re.findall(
        r"(QUESTION NO[:\.] \d+.*?Answer:.*?)(?=Explanation:|\nQUESTION NO[:\.]|\Z)",
        full_text,
        re.DOTALL
    )

    def clean_line_breaks(text):
        """Turn a newline into a space unless it borders an "A."-"D." marker or precedes "Answer:"."""
        return re.sub(r'(?<!\n)(?<![A-D]\.)\n(?![A-D]\.|Answer:)', ' ', text)

    # Collapse newline runs, unwrap soft line breaks, then shorten the heading.
    cleaned = [
        normalize_question_number(
            clean_line_breaks(re.sub(r'\n+', '\n', q))
        ).strip()
        for q in questions_with_answers
    ]

    # Fail before touching the output file when nothing matched.
    if not cleaned:
        raise ValueError("❗ 문제를 추출할 수 없습니다. PDF 형식이 예상과 다를 수 있습니다.")

    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(q + "\n\n" for q in cleaned)

    print(f"✅ 텍스트 저장 완료: {output_path} ({len(cleaned)}문제)")
    return base_name, len(cleaned)

# ✅ 텍스트 파일 → PDF 변환
def txt_to_pdf_unicode(base_name):
    """Convert ``<base_name>_cleaned.txt`` into ``<base_name>_cleaned.pdf``.

    Both files are located in the current working directory.  The text is
    split on blank lines and every block is rendered through
    ``PDFWithHiddenAnswers.add_question_text``.

    Args:
        base_name: Shared file-name stem of the text input and PDF output.

    Returns:
        The path of the written PDF.
    """
    work_dir = os.getcwd()
    txt_path = os.path.join(work_dir, f"{base_name}_cleaned.txt")
    result_path = os.path.join(work_dir, f"{base_name}_cleaned.pdf")

    document = PDFWithHiddenAnswers()

    with open(txt_path, "r", encoding="utf-8") as source:
        # One blank-line-separated block per question.
        for block in source.read().strip().split('\n\n'):
            document.add_question_text(block)

    document.output(result_path)
    return result_path

# ✅ GUI 실행
if __name__ == "__main__":
    # A root window is required for the dialogs; keep it hidden so only the
    # file picker and message boxes are shown.
    root = Tk()
    root.withdraw()

    try:
        file_path = filedialog.askopenfilename(
            title="PDF 파일 선택",
            filetypes=[("PDF files", "*.pdf")]
        )

        # An empty result means the dialog was cancelled.
        if file_path:
            base, count = extract_questions_with_answer(file_path)
            pdf_path = txt_to_pdf_unicode(base)
            messagebox.showinfo("완료", f"✅ 변환이 완료되었습니다!\n\n📄 저장 위치:\n{pdf_path}\n\n총 추출된 문제 수: {count}개")
        else:
            messagebox.showwarning("취소됨", "❗ 파일이 선택되지 않았습니다.")
    except Exception as e:
        # Top-level boundary: report any failure to the user instead of a traceback.
        messagebox.showerror("오류 발생", f"🚫 오류가 발생했습니다:\n{str(e)}")
    finally:
        # Destroy the hidden root so it does not linger after the run
        # (e.g. when the cell is executed repeatedly in a notebook kernel).
        root.destroy()
