In [4]:
import re
import pandas as pd
from pathlib import Path

# ============================================================
# 0. PATH (project-specific — simple & stable)
# ============================================================
# The project root is taken to be two levels above the current working
# directory, so this cell must be run from a directory nested two levels
# below the root (e.g. <root>/notebooks/<sub>/).
PROJECT_ROOT = Path.cwd().parents[1]
RAW_DIR = PROJECT_ROOT / "data" / "raw"
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"
# parents=True so a fresh checkout without a data/ directory does not
# fail with FileNotFoundError when creating data/processed.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

print("PROJECT ROOT:", PROJECT_ROOT)
print("RAW DIR:", RAW_DIR)
print("PROCESSED DIR:", PROCESSED_DIR)


# ============================================================
# Dialogue Preprocessor — v1A (ROUGE-optimized version)
# ============================================================
class DialoguePreprocessor:
    """Light-weight normalizer for raw dialogue strings.

    The pipeline (see :meth:`run`) rewrites speaker tags, normalizes quote
    characters, replaces honorific titles with placeholder tokens, strips a
    few decorative symbols and makes sure the text ends with sentence
    punctuation. Number, named-entity and profanity masking are deliberately
    no-ops in this version.
    """

    # One-pass translation table mapping typographic quotes to ASCII quotes.
    _QUOTE_TABLE = str.maketrans({
        "“": '"', "”": '"',
        "‘": "'", "’": "'",
        "「": '"', "」": '"',
        "『": '"', "』": '"',
        "‹": "'", "›": "'",
        "«": '"', "»": '"',
    })

    def __init__(self):

        # Title normalization is kept (it does no harm).
        # The word boundary sits directly after the letters and the optional
        # period comes after it; with the boundary after the period,
        # "Dr. Smith" could never match because there is no word boundary
        # between "." and a space.
        self.title_map = {
            r'\bDr\b\.?': '<Doctor>',
            r'\bMr\b\.?': '<Male>',
            r'\bMrs\b\.?': '<MarriedFemale>',
            r'\bMs\b\.?': '<Female>',
            r'\bMiss\b\.?': '<YoungFemale>',
        }

        # Korean titles are matched as standalone words only: \b does not
        # match between two word characters, so a title with an attached
        # particle (e.g. "의사가") is left as is.
        self.korean_title_map = {
            r'\b의사\s*선생님\b': '<Doctor>',
            r'\b의사\b': '<Doctor>',
            r'\b교수님\b': '<Professor>',
            r'\b간호사\b': '<Nurse>',
            r'\b상담사\b': '<Counselor>',
            r'\b오빠\b': '<OlderBrother>',
            r'\b형\b': '<OlderBrother>',
            r'\b언니\b': '<OlderSister>',
            r'\b누나\b': '<OlderSister>',
            r'\b아저씨\b': '<Mister>',
            r'\b아줌마\b': '<Madam>',
            r'\b사장님\b': '<Boss>',
            r'\b손님\b': '<Customer>',
            r'\b환자\b': '<Patient>',
            r'\b기사님\b': '<Driver>',
        }

        # Split markers are kept for reference; no method in this class
        # currently reads them.
        self.split_markers = [
            "그리고", "근데", "그런데", "하지만",
            "그러나", "그래서", "그러니까",
            "또", "또한", "게다가",
        ]

    # ====================================================
    # Speaker Tag
    # ====================================================
    def normalize_speaker(self, text):
        """Rewrite ``#PersonN#`` (optionally followed by ``:``) as ``<speakerN> ``.

        Any speaker number is handled, not only 1-3. Whitespace that follows
        the colon in the input is left in place.
        """
        return re.sub(r'#Person(\d+)#\s*:?', r'<speaker\1> ', text)

    # ====================================================
    # Quote normalization
    # ====================================================
    def normalize_quotes(self, text):
        """Replace curly, corner and angle quotes with ASCII ``"`` or ``'``."""
        return text.translate(self._QUOTE_TABLE)

    # ====================================================
    # English / Korean title conversion
    # ====================================================
    def normalize_titles(self, text):
        """Replace honorific titles with placeholder tokens.

        English titles are replaced only when followed by whitespace and a
        capitalised name (``Dr. Smith`` -> ``<Doctor> Smith``). Matching is
        case-sensitive: with IGNORECASE the ``[A-Z]`` name check is
        meaningless and ordinary words get rewritten ("I miss you").
        Korean titles are replaced wherever they occur as standalone words.
        """
        for pat, token in self.title_map.items():
            text = re.sub(
                pat + r'\s+([A-Z][a-zA-Z]+)',
                # Bind token per iteration; the name is kept verbatim.
                lambda m, token=token: f"{token} {m.group(1)}",
                text,
            )
        for pat, token in self.korean_title_map.items():
            text = re.sub(pat, token, text)
        return text

    # ====================================================
    # Number / money / entity masking removed
    #  -> leaving them untouched is better for ROUGE
    # ====================================================
    def pass_numbers(self, text):
        """Return ``text`` unchanged (number masking is disabled)."""
        return text

    def pass_named_entities(self, text):
        """Return ``text`` unchanged (entity masking is disabled)."""
        return text

    # ====================================================
    # Profanity masking removed as well (for ROUGE)
    # ====================================================
    def pass_badwords(self, text):
        """Return ``text`` unchanged (profanity masking is disabled)."""
        return text

    # ====================================================
    # Sentence-end cleanup — kept to the bare minimum
    # ====================================================
    def split_by_conjunction(self, text):
        """Append a single ``.`` when the text has no closing ``.``, ``?`` or ``!``.

        Despite the name, no splitting is performed. Empty or
        whitespace-only text is returned untouched so that a blank
        dialogue does not turn into a lone period.
        """
        if not text.strip():
            return text
        # Trailing whitespace after the punctuation is tolerated.
        if not re.search(r'[.?!]\s*$', text):
            text = text + '.'
        return text

    # ====================================================
    # English sentence-end handling — error patch applied
    # ====================================================
    def english_sentence_end(self, text):
        """Add ``.`` after a trailing run of ASCII letters.

        Text that already ends with ``.``, ``?`` or ``!`` (optionally
        followed by whitespace) is returned unchanged, as is text that does
        not end in ASCII letters.
        """
        if re.search(r'[.?!]\s*$', text):
            return text
        return re.sub(r'([A-Za-z]+)$', r'\1.', text)

    # ====================================================
    # Remove decorative bracket / box-drawing noise
    # ====================================================
    def clean_hanja_noise(self, text):
        """Delete corner/double-angle brackets and box-drawing characters."""
        text = re.sub(r"[『』《》]", "", text)
        text = re.sub(r"[─━│┃╭╮╰╯]", "", text)
        return text

    # ====================================================
    # Full pipeline
    # ====================================================
    def run(self, text):
        """Apply every step in order and return the cleaned text.

        Returns ``""`` when ``text`` is not a string (e.g. ``None`` or NaN).
        """
        if not isinstance(text, str):
            return ""

        text = self.normalize_speaker(text)
        text = self.normalize_quotes(text)
        text = self.normalize_titles(text)

        # All masking steps are no-ops (ROUGE optimization).
        text = self.pass_numbers(text)
        text = self.pass_named_entities(text)
        text = self.pass_badwords(text)

        text = self.clean_hanja_noise(text)
        text = self.split_by_conjunction(text)
        text = self.english_sentence_end(text)

        return text



# ============================================================
# Run
# ============================================================
print("\nLoading raw CSV...")
train_df = pd.read_csv(RAW_DIR / "train.csv")
test_df  = pd.read_csv(RAW_DIR / "test.csv")

pre = DialoguePreprocessor()

# Pass the raw cell value to run(): wrapping it in str() would turn a
# missing dialogue (NaN) into the literal text "nan" and bypass run()'s
# own non-string guard, which returns "" for such values.
print("\nProcessing TRAIN...")
train_df["dialogue_clean"] = train_df["dialogue"].apply(pre.run)

print("Processing TEST...")
test_df["dialogue_clean"] = test_df["dialogue"].apply(pre.run)

train_out = PROCESSED_DIR / "v1_train_preprocessed.csv"
test_out  = PROCESSED_DIR / "v1_test_preprocessed.csv"

# index=False keeps the DataFrame index out of the saved files.
train_df.to_csv(train_out, index=False)
test_df.to_csv(test_out, index=False)

print("\nSAVED:")
print(train_out)
print(test_out)
print("\nDONE.")


PROJECT ROOT: /root/nlp
RAW DIR: /root/nlp/data/raw
PROCESSED DIR: /root/nlp/data/processed

Loading raw CSV...

Processing TRAIN...
Processing TEST...

SAVED:
/root/nlp/data/processed/v1_train_preprocessed.csv
/root/nlp/data/processed/v1_test_preprocessed.csv

DONE.
