In [1]:
import re, unicodedata, html
from urllib.parse import urlparse, urlunparse

# Matches a single emoji / pictographic / decorative symbol code point.
# Two character classes are OR-ed together: one for ranges inside the Basic
# Multilingual Plane (BMP, U+0000-U+FFFF) and one for supplementary-plane
# (astral) ranges.
EMOJI_SYMBOL_RE = re.compile(
    "["                         # BMP = Basic Multilingual Plane ranges
    "\u2190-\u21FF"             # arrows
    "\u2300-\u23FF"             # misc technical
    "\u2460-\u24FF"             # enclosed alphanumerics
    "\u25A0-\u25FF"             # geometric shapes (■◆▲○●…)
    "\u2600-\u26FF"             # misc symbols (☀☂☏…)
    "\u2700-\u27BF"             # dingbats (✔✖✈…)
    "\u2B00-\u2BFF"             # misc symbols & arrows
    "\u3000-\u303F"             # CJK symbols and punctuation (、。・《》【】…), incl. U+3000 ideographic space
    "\uFE0F"                    # variation selector-16
    "\u200d"                    # zero width joiner
    "]"
    "|"
    "["                         # supplementary (astral) plane ranges
    "\U0001F000-\U0001FAFF"     # emoticons/pictographs/transport/etc.
    "\U0001FB00-\U0001FBFF"     # U+1FB00-U+1FBFF block
    "]", flags=re.UNICODE)

# URLs: anything starting with http(s):// or www. up to the next whitespace.
URL_RE = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)
# E-mail addresses (loose match: local@domain.tld).
EMAIL_RE = re.compile(r"[\w\.-]+@[\w\.-]+\.\w+")
# Hashtag at start of text or after whitespace; group 1 is the tag without '#'.
HASHTAG_RE   = re.compile(r"(?:^|(?<=\s))#([^\s#]+)")     # '#청원' -> '청원'
# @mention at start of text or after whitespace; group 1 is the handle.
MENTION_RE= re.compile(r"(?:^|(?<=\s))@([A-Za-z0-9_.]+)")

# Zero-width / invisible format characters (ZWSP, ZWNJ, ZWJ, word joiner, BOM)
ZERO_WIDTH_RE = re.compile(r"[\u200B-\u200D\u2060\uFEFF]")

# News sign-offs / promotional boilerplate
# NOTE(review): the `.*` parts are greedy and '.' does not cross newlines, so
# e.g. "YTN ... 입니다." removes everything from "YTN" up to the LAST "입니다."
# on the same line.
NEWS_TAILS_RE = re.compile(
    r"(자세한 내용은 (영상|뉴스|기사)로|구독과\s*좋아요|무단 전재.*금지|"
    r"YTN.*입니다\.|KBS.*입니다\.|MBC.*입니다\.|연합뉴스TV.*입니다\.|JTBC.*입니다\.)"
)

# Tip-off / inquiry / contact / KakaoTalk lines: matches from the keyword to
# the end of the line.
# NOTE(review): there are no word boundaries and IGNORECASE is set, so "Tel"
# also matches inside longer words (e.g. "hotel"); on single-line text the
# whole remainder after a keyword is consumed.
CONTACT_LINE_RE = re.compile(r"(제보|문의|연락|카카오톡|KakaoTalk|전화|☎|Tel)[^\n]*", re.IGNORECASE)

# Reporter sign-off: 2-5 Hangul syllables followed by "기자" (optionally "입니다.")
REPORTER_SIGN_RE = re.compile(r"\b[\uAC00-\uD7A3]{2,5}\s*기자(입니다\.?)?")

# Repeated punctuation / laughter / crying characters
# Collapses a run of the SAME punctuation mark to one occurrence.
REPEAT_PUNCT_RE = re.compile(r"([!?.,])\1{1,}")   # !!??.. → ! ? .
# Runs of 3+ laugh/cry jamo; group 1 holds the LAST character of the run.
# NOTE: the alternatives are Hangul Compatibility Jamo; text that has been
# NFKC-normalized no longer contains those code points.
LAUGH_RE        = re.compile(r"(ㅋ|ㅎ|ㅜ|ㅠ){3,}") # ㅋㅋㅋㅋ → ㅋㅋ

# Keywords that mark bracketed content as removable noise
# (anchor, reporter, caption, video, photo, CG, breaking news, source, ...)
NOISE_TAGS = ["앵커","기자","리포트","자막","영상","사진","자료화면","CG","그래픽",
              "브릿지","현장연결","속보","뉴스","단독","취재","연결","보도국",
              "제작지원","협찬","캡처","출처"]
# Alternation of all noise tags, each escaped so it is matched literally.
NOISE_TAGS_RE = re.compile("|".join([re.escape(t) for t in NOISE_TAGS]))

def keep_bracket(inner:str)->bool:
    """Decide whether the text found inside a bracket pair should be kept.

    Rules are checked in order; the first one that applies wins:

    1. empty / whitespace-only content        -> drop
    2. mentions a law / legislative keyword   -> keep
    3. mentions a court / agency / ruling     -> keep
    4. contains a 4-digit year (19xx / 20xx)  -> keep
    5. contains a noise tag (NOISE_TAGS_RE)   -> drop
    6. otherwise keep only if at least 3 characters long

    Args:
        inner: the text between the brackets (brackets excluded).

    Returns:
        True if the content should stay in the text, False if it should be
        removed together with its brackets.
    """
    s = inner.strip()
    if not s:
        return False
    if re.search(r"(법|조문|조항|제\d+조|처벌|벌금|개정|입법|법안|고시|시행령|시행규칙)", s):
        return True
    if re.search(r"(대법원|헌법재판소|행정심판|판결|판례|행안부|과기정통부|개인정보보호위원회|방통위)", s):
        return True
    # Year check. re.ASCII makes \b treat Hangul as a non-word character, so a
    # year directly followed by Korean text ("2020년") is recognised; in the
    # default Unicode mode there is no word boundary between "0" and "년".
    if re.search(r"\b(19|20)\d{2}\b", s, re.ASCII):
        return True
    if NOISE_TAGS_RE.search(s):
        return False
    return len(s) >= 3

# Innermost bracket spans to inspect: [..], (..) and <..>.
# Group 1 is the content between the brackets.
# The angle-bracket pattern previously closed on ")" and therefore never
# matched a real "<...>" span.
BRACKETS = [r"\[([^\[\]]+)\]", r"\(([^\(\)]+)\)", r"<([^<>]+)>"]

In [2]:
# Law-name alias normalization: regex pattern -> canonical law name.
#
# Korean particles attach directly to the noun ("개보법을", "PIPA의"), and in
# Unicode mode Hangul is a word character, so a trailing \b never matches
# there. The aliases therefore only require a word boundary on the LEFT; the
# Latin alias additionally refuses a following ASCII letter/digit/underscore
# so that longer identifiers (e.g. "PIPAX") are left alone.
LAW_ALIAS = {
    r"\b개보법": "개인정보보호법",
    r"\bPIPA(?![A-Za-z0-9_])": "개인정보보호법",
    r"\b망법": "정보통신망 이용촉진 및 정보보호 등에 관한 법률",
    r"정보통신망법": "정보통신망 이용촉진 및 정보보호 등에 관한 법률",
}
# Pre-compiled (pattern, replacement) pairs, in declaration order.
LAW_ALIAS_COMPILED = [(re.compile(k), v) for k,v in LAW_ALIAS.items()]

# Demand-intent classification: label -> regex source.
# A text receives a label when its pattern is found anywhere in the text;
# a single text may receive several labels.
INTENT_PATTERNS = {
    # enact / newly establish a law
    "제정/신설":  r"(법(을)?\s*만들|제정(하|해|하자)|입법\s*하|신설)",
    # amend / strengthen (tougher punishment, higher sentences or fines)
    "개정/강화":  r"(개정(하|해|해야|안)|처벌\s*강화|형량\s*상향|벌금\s*상향|처벌(을)?\s*강하게)",
    # relax / make more flexible
    "완화/유연":  r"(규제\s*완화|처벌\s*완화|유연(하|해)|완화(하|해))",
    # repeal / oppose / withdraw
    "폐지/반대":  r"(폐지(하|해|하자)|법안\s*반대|철회(하|해))",
    # enforcement / monitoring (crackdowns, enforcement decrees and rules)
    "집행/감시":  r"(단속\s*강화|집행\s*강화|감시\s*강화|시행령|시행규칙)"
}
# Same mapping with every pattern pre-compiled.
INTENT_RES = {k: re.compile(v) for k,v in INTENT_PATTERNS.items()}

def normalize_law_aliases(s: str) -> str:
    """Rewrite every known law-name alias in `s` to its canonical name.

    The (pattern, replacement) pairs of LAW_ALIAS_COMPILED are applied one
    after another, each on the result of the previous substitution.
    """
    result = s
    for pair in LAW_ALIAS_COMPILED:
        pattern, canonical = pair
        result = pattern.sub(canonical, result)
    return result

def extract_intents(s: str):
    """Return the intent labels whose pattern occurs in `s`.

    Labels come back in the iteration order of INTENT_RES; the list is empty
    when nothing matches.
    """
    matched = []
    for label, pattern in INTENT_RES.items():
        if pattern.search(s) is not None:
            matched.append(label)
    return matched

In [3]:
# Translation table for str.translate():
#   * typographic punctuation is folded to its plain ASCII counterpart
#   * middle dots, bullets and decorative markers become a single space
_PUNCT_TO_ASCII = {
    "…": "...",
    "―": "-", "–": "-", "—": "-",
    "“": "\"", "”": "\"",
    "‘": "'", "’": "'",
}
_PUNCT_TO_SPACE = "·•◆■▲△▶▷▼▽※"
PUNCT_TRANSLATE = str.maketrans({**_PUNCT_TO_ASCII, **dict.fromkeys(_PUNCT_TO_SPACE, " ")})

def normalize_korean(text:str, keep_hashtag_tokens=True) -> str:
    """Normalize one raw Korean social/news text into a single clean line.

    Steps, in order: HTML entity decoding, Unicode normalization, removal of
    zero-width characters, URLs and e-mail addresses, hashtag/mention
    handling, removal of emoji/symbols, news boilerplate, contact lines and
    reporter sign-offs, bracket filtering via keep_bracket(), collapsing of
    repeated punctuation and laugh/cry jamo, punctuation folding and finally
    whitespace collapsing.

    Args:
        text: raw text; any non-str value yields ''.
        keep_hashtag_tokens: if True a hashtag keeps its word without '#',
            otherwise the whole hashtag is removed.

    Returns:
        The normalized text with single spaces and no leading/trailing space.
    """
    if not isinstance(text, str):
        return ''
    t = html.unescape(text)  # decode HTML entities (tags are not stripped here)

    # NFKC folds full-width and other compatibility forms. It also maps
    # Hangul Compatibility Jamo (the standalone "ㅋ", "ㅠ" people type) to
    # conjoining jamo and may then compose neighbours into a syllable
    # ("ㅋㅜ" -> "쿠"), which corrupts the text and keeps LAUGH_RE from ever
    # matching. Runs of U+3131-U+3163 are therefore passed through untouched
    # and only the text between them is NFKC-normalized.
    pieces = re.split(r"([\u3131-\u3163]+)", t)  # odd indexes = jamo runs
    t = "".join(
        piece if idx % 2 else unicodedata.normalize("NFKC", piece)
        for idx, piece in enumerate(pieces)
    )
    # NFC composes any remaining decomposed Hangul into precomposed syllables.
    t = unicodedata.normalize('NFC', t)
    # Drop zero-width characters
    t = ZERO_WIDTH_RE.sub("", t)

    t = URL_RE.sub(" ", t)
    t = EMAIL_RE.sub(" ", t)

    # Hashtags: keep the bare token or drop it entirely; mentions are dropped
    if keep_hashtag_tokens:
        t = HASHTAG_RE.sub(lambda m: " " + m.group(1), t)
    else:
        t = HASHTAG_RE.sub(" ", t)
    t = MENTION_RE.sub(" ", t)

    t = EMOJI_SYMBOL_RE.sub(" ", t)
    t = NEWS_TAILS_RE.sub(" ", t)
    t = CONTACT_LINE_RE.sub(" ", t)
    t = REPORTER_SIGN_RE.sub(" ", t)

    # Brackets: keep the inner text (without brackets) when keep_bracket()
    # approves it, otherwise remove the whole span.
    for patt in BRACKETS:
        t = re.sub(patt, lambda m: (" " + m.group(1) + " ") if keep_bracket(m.group(1)) else " ", t)

    t = REPEAT_PUNCT_RE.sub(r"\1", t)
    t = LAUGH_RE.sub(lambda m: m.group(1)*2, t)  # ㅋㅋㅋㅋ → ㅋㅋ

    t = t.translate(PUNCT_TRANSLATE)
    t = re.sub(r"\s+", " ", t).strip()

    return t

In [4]:
import hashlib
from urllib.parse import urlparse

def clean_domain(u: str) -> str:
    """Extract the lower-cased network location (host[:port]) from a URL.

    A missing scheme is tolerated: "http://" is prepended before parsing when
    the value does not already start with http:// or https:// (checked
    case-insensitively). Surrounding whitespace is ignored.

    Args:
        u: URL-like string.

    Returns:
        The lower-cased netloc, or "" for non-string, blank or unparsable
        input.
    """
    if not isinstance(u, str):
        return ""
    url = u.strip()
    if not url:
        return ""
    # Case-insensitive scheme test so "HTTP://host" is not given a second scheme.
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url
    try:
        return urlparse(url).netloc.lower()
    except ValueError:
        # urlparse raises ValueError for malformed netlocs (e.g. bad IPv6 literal).
        return ""

# Fingerprint built from normalized title + body (used to detect duplicates)
def make_clean_hash(title_norm: str, content_norm: str) -> str:
    """Return the MD5 hex digest of "<title>\\n<content>" encoded as UTF-8.

    Falsy arguments (None, "") are treated as empty strings.
    """
    parts = [
        title_norm if title_norm else "",
        content_norm if content_norm else "",
    ]
    digest = hashlib.md5("\n".join(parts).encode("utf-8"))
    return digest.hexdigest()

In [5]:
import pandas as pd

def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `df` enriched with normalized text and derived columns.

    The columns '제목' (title), '본문' (body) and 'url' are coerced to str with
    missing values as ""; a column that does not exist is created as "".

    Added columns, in this order: title_norm, content_norm, clean_hash,
    domain, intent_labels, ko_ratio (share of Hangul syllables in
    content_norm) and len_chars (length of content_norm).
    """
    def _hangul_share(text: str) -> float:
        # Fraction of characters that are precomposed Hangul syllables.
        if not text:
            return 0.0
        n_hangul = len([ch for ch in text if '\uAC00' <= ch <= '\uD7A3'])
        return n_hangul / max(1, len(text))

    frame = df.copy()
    for col in ('제목', '본문', 'url'):
        frame[col] = frame[col].fillna("").astype(str) if col in frame.columns else ""

    for src_col, norm_col in (('제목', 'title_norm'), ('본문', 'content_norm')):
        frame[norm_col] = frame[src_col].map(normalize_korean).map(normalize_law_aliases)

    frame['clean_hash'] = list(map(make_clean_hash, frame['title_norm'], frame['content_norm']))
    frame['domain'] = frame['url'].map(clean_domain)

    title_and_body = frame['title_norm'] + " " + frame['content_norm']
    frame['intent_labels'] = title_and_body.map(extract_intents)

    # Hangul ratio and character count of the normalized body
    frame['ko_ratio'] = frame['content_norm'].map(_hangul_share)
    frame['len_chars'] = frame['content_norm'].str.len()

    return frame

In [10]:
def load_preprocess_data(path: str, sheet_name=0) -> pd.DataFrame:
    """Read one Excel sheet, preprocess it and filter out unusable rows.

    Only the columns category / 날짜 / 제목 / 본문 / url are read, all as str.
    After preprocess_df() the rows are de-duplicated on 'clean_hash', rows
    with a Hangul ratio below 0.3 or fewer than 10 body characters are
    dropped, and '날짜' is parsed into 'date' (unparsable values become NaT)
    plus a monthly period string 'ym'.
    """
    raw = pd.read_excel(
        path,
        sheet_name=sheet_name,
        engine="openpyxl",
        dtype=str,
        usecols=['category', '날짜', '제목', '본문', 'url'],
    )

    # Remove duplicates
    processed = preprocess_df(raw).drop_duplicates(subset=['clean_hash'])

    # Drop rows with Hangul ratio < 0.3 or body shorter than 10 characters
    korean_enough = processed['ko_ratio'] >= 0.3
    long_enough = processed['len_chars'] >= 10
    result = processed[korean_enough & long_enough].reset_index(drop=True)

    # Parse / normalize the date column
    if '날짜' in result.columns:
        result['date'] = pd.to_datetime(result['날짜'], errors='coerce')
        result['ym'] = result['date'].dt.to_period('M').astype(str)
    return result

# Source workbooks, one per law group. Each is expected to contain the sheets
# "twitter", "blog", "insta" and "community" (the sheet names read below).
TARGET_FILES = [
    "1. (SOCIAL)_(개인정보보호법__정보통신망법).xlsx",
    "2. (SOCIAL)_(자본시장법__특정금융정보법__전자금융거래법__전자증권법__금융소비자보호법).xlsx",
    "3. (SOCIAL)_(아동복지법).xlsx",
    "4. (SOCIAL)_(중대재해처벌법).xlsx",
]

# First-pass preprocessing: for every workbook, preprocess each platform sheet
# and write it to preprocess_<platform>_part_<n>.xlsx in the working directory.
for path in TARGET_FILES:
    # The part number is the text before the first '.' of the file name.
    part_num = path.split('.')[0]  # "1", "2", "3", "4"
    print(f"\n=== part {part_num} 시작 ===")

    twitter = load_preprocess_data(path, sheet_name="twitter")
    twitter.to_excel(f"preprocess_twitter_part_{part_num}.xlsx", index=False)

    blog = load_preprocess_data(path, sheet_name="blog")
    blog.to_excel(f"preprocess_blog_part_{part_num}.xlsx", index=False)

    insta = load_preprocess_data(path, sheet_name="insta")
    insta.to_excel(f"preprocess_insta_part_{part_num}.xlsx", index=False)

    community = load_preprocess_data(path, sheet_name="community")
    community.to_excel(f"preprocess_community_part_{part_num}.xlsx", index=False)

    print(f"=== part {part_num} 완료 ===")

print("\n[전체 완료]")


=== part 1 시작 ===
=== part 1 완료 ===

=== part 2 시작 ===
=== part 2 완료 ===

=== part 3 시작 ===
=== part 3 완료 ===

=== part 4 시작 ===
=== part 4 완료 ===

[전체 완료]


2차 전처리

트위터 - RT, QT 분리 / @, # 제거 후 키워드 저장

인스타그램 - 본문,해시태그 꼬리 분리/이모지 구분

커뮤니티,블로그 - 괄호 정보 제거/"무단전재,재배포 금지","이메일 전화 제보" 패턴 제거

In [11]:
import re, hashlib, math
import pandas as pd
from pathlib import Path

# Platform detection
def infer_platform(path: str) -> str:
    """Guess the source platform from the file name (case-insensitive).

    Only the final path component is inspected. Keywords are tried in the
    order twitter, insta, blog, community; the first one contained in the
    name decides. Returns "generic" when none matches.
    """
    filename = Path(path).name.lower()
    keyword_to_platform = (
        ("twitter", "twitter"),
        ("insta", "instagram"),
        ("blog", "blog"),
        ("community", "community"),
    )
    for keyword, platform in keyword_to_platform:
        if keyword in filename:
            return platform
    return "generic"

# --------- Regexes / rules ---------
# Retweet prefix at the very start of the text ("RT @user").
# NOTE(review): not referenced by any function in the visible cells.
RE_RT = re.compile(r'^RT\s+@\w+')
# @mention anywhere in the text; group 1 is the handle.
RE_MENTION = re.compile(r'@(\w+)')
# Hashtag made of ASCII letters, Hangul syllables, digits or '_'; group 1 is the tag.
RE_HASHTAG = re.compile(r'#([A-Za-z가-힣0-9_]+)')
RE_URL = re.compile(r'https?://\S+|www\.\S+')
RE_EMAIL = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
# Phone-like digit groups with optional country code and optional separators.
# NOTE(review): because every separator is optional, any run of seven or more
# digits (e.g. an 8-digit date) and space-separated numbers such as
# "2019 2020 2021" also match — confirm this is acceptable.
RE_PHONE = re.compile(r'(?:\+?\d{1,3}[-.\s]?)?(?:\d{2,4}[-.\s]?){2,3}\d{3,4}')
RE_ZW = re.compile(r'[\u200d\uFE0F]')  # zero-width joiner, variation selector-16
RE_MULTISPACE = re.compile(r'\s+')

# Square-bracket tags treated as noise and removed
# (anchor, reporter, photo, video, KakaoTalk, mail, phone, "무단 ... 금지" notices)
RE_NOISE_BRACKETS = re.compile(
    r'\[(?:앵커|기자|사진|영상|카카오톡|메일|전화|무단.*?금지)\]'
)

# Hangul ratio
def korean_ratio(s: str) -> float:
    """Return the fraction of characters in `s` that are Hangul syllables.

    Only precomposed syllables (U+AC00-U+D7A3) count. Empty or falsy input
    yields 0.0.
    """
    if not s:
        return 0.0
    hangul_chars = [ch for ch in s if '\uac00' <= ch <= '\ud7a3']
    return len(hangul_chars) / len(s)

# Sentence delimiter:
#   * whitespace that follows '.', '!' or '?', or
#   * a line break (or the end of the text) that follows one of the common
#     sentence-final syllables 다 / 요 / 까 / 죠.
# Both alternatives use a look-behind so the delimiter never consumes the
# character that ends the sentence (consuming it would cut the last syllable
# off every sentence split this way).
RE_SENT_SPLIT = re.compile(
    r'(?:(?<=[\.!?])\s+)'
    r'|'
    r'(?:(?<=[다요까죠])\s*(?:\n+|\s*$))'
)

def sentence_split(s: str):
    """Split `s` into stripped, non-empty sentences.

    Returns [] for non-string or blank input.
    """
    if not isinstance(s, str) or not s.strip():
        return []
    # Split on the delimiter, then drop empty fragments
    parts = RE_SENT_SPLIT.split(s)
    return [p.strip() for p in parts if p and p.strip()]

def sentence_count(s: str) -> int:
    """Return the number of sentences in `s`, never less than 1."""
    return max(1, len(sentence_split(s)))

# Separate the body from a trailing block of hashtag-only lines
def split_tail_hashtags(text: str):
    """Split `text` into (body, tail_hashtags).

    The tail is the maximal run of lines at the end of the text in which every
    whitespace-separated token starts with '#'. Blank lines after that run are
    ignored, so a trailing empty line does not hide the hashtag block.

    Returns:
        (body, tags) where `body` is the text before the tail (the original
        `text` is returned when that would be empty) and `tags` are the tail
        hashtags without '#', de-duplicated, in reading order.
    """
    if not text:
        return text, []
    lines = [ln.rstrip() for ln in text.splitlines()]

    # Skip blank lines at the very end.
    end = len(lines)
    while end > 0 and not lines[end - 1]:
        end -= 1

    # Walk upwards while the line consists of hashtag tokens only.
    start = end
    while (
        start > 0
        and lines[start - 1]
        and all(tok.startswith('#') for tok in lines[start - 1].split())
    ):
        start -= 1

    # Collect top-to-bottom so the tags keep their reading order.
    tail_tags = [tag for ln in lines[start:end] for tag in RE_HASHTAG.findall(ln)]
    body = "\n".join(lines[:start]).strip()
    return (body if body else text), list(dict.fromkeys(tail_tags))  # unique, order preserved

# Tokenize URL / e-mail / phone, strip noise tags and zero-width chars, tidy whitespace
def clean_text(s: str) -> str:
    """Return `s` cleaned for modelling; non-string input yields "".

    Applied in order: noise bracket tags -> space, URLs -> " <URL> ",
    e-mails -> " <EMAIL> ", phone numbers -> " <PHONE> ", zero-width joiner
    and variation selector removed, whitespace runs collapsed to one space,
    and finally the result is stripped.
    """
    if not isinstance(s, str):
        return ""
    substitutions = (
        (RE_NOISE_BRACKETS, " "),
        (RE_URL, " <URL> "),
        (RE_EMAIL, " <EMAIL> "),
        (RE_PHONE, " <PHONE> "),
        (RE_ZW, ""),
        (RE_MULTISPACE, " "),   # collapse excessive whitespace / line breaks
    )
    cleaned = s
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned.strip()

# Per-platform metadata extraction and normalization policy
def platform_process(platform: str, raw_text: str):
    """Extract mentions / hashtags and build a model-ready text.

    Mentions and hashtags are collected from the full text. For "instagram"
    the trailing hashtag-only block is split off via split_tail_hashtags():
    the remaining body becomes the working text and the tail tags are merged
    into the hashtag list (duplicates removed, first occurrence wins).

    Returns:
        dict with keys "mentions", "hashtags" and "text_for_model"; in the
        latter, mentions are replaced by a space and each hashtag by its bare
        word.
    """
    source = str(raw_text or "")
    found_mentions = RE_MENTION.findall(source)
    found_hashtags = RE_HASHTAG.findall(source)

    # instagram: detach the trailing hashtag block
    if platform == "instagram":
        body, trailing = split_tail_hashtags(source)
        source = body
        if trailing:
            found_hashtags = list(dict.fromkeys(found_hashtags + trailing))

    # model input: drop mentions, turn hashtags into plain words
    without_mentions = RE_MENTION.sub(" ", source)
    return {
        "mentions": found_mentions,
        "hashtags": found_hashtags,
        "text_for_model": RE_HASHTAG.sub(r"\1", without_mentions),
    }

# Hash (for duplicate detection)
def sha1_hex(s: str) -> str:
    """Return the SHA-1 hex digest of `s` encoded as UTF-8 (falsy -> "")."""
    payload = s if s else ""
    hasher = hashlib.sha1(payload.encode("utf-8"))
    return hasher.hexdigest()

def process_file(path: str):
    """Run the second-pass cleaning on one first-pass Excel file.

    Reads `path`, cleans 'title_norm' / 'content_norm' row by row
    (clean_text, mention removal, hashtags turned into plain words), extracts
    mentions and hashtags from the cleaned title + body, and computes the
    Hangul ratio and sentence count on the cleaned text.

    The result keeps whichever of the listed original columns exist, with
    'title_norm' / 'content_norm' replaced by the cleaned versions, followed
    by 'mentions', 'hashtags', 'ko_ratio' and 'len_sentences'.

    Note: the output name is "<stem>.xlsx" next to the input, so an ".xlsx"
    input is overwritten in place.

    Args:
        path: path of the Excel file to process; the platform is inferred
            from its name.

    Returns:
        The merged DataFrame that was written to disk.
    """
    platform = infer_platform(path)
    print(f"[INFO] Loading: {path} (platform={platform})")
    df = pd.read_excel(path)

    def safe_str(x):
        # Empty cells are read as NaN (float); treat anything non-str as "".
        return x if isinstance(x, str) else ""

    # Declared up front so that an input without rows still yields a frame
    # with these columns instead of failing later with a KeyError.
    out_columns = [
        "title_norm__updated", "content_norm__updated",
        "mentions", "hashtags", "ko_ratio", "len_sentences",
    ]

    out_rows = []
    for _, row in df.iterrows():
        title_raw = safe_str(row.get("title_norm", ""))
        content_raw = safe_str(row.get("content_norm", ""))

        title_clean_base = clean_text(title_raw)
        content_clean_base = clean_text(content_raw)

        # Mentions / hashtags are extracted from title and body together
        combined_for_meta = (title_clean_base + "\n" + content_clean_base).strip()
        meta = platform_process(platform, combined_for_meta)

        # Remove mentions, turn hashtags into plain words
        title_for_model = RE_MENTION.sub(" ", title_clean_base)
        title_for_model = re.sub(RE_HASHTAG, r"\1", title_for_model)
        title_final = clean_text(title_for_model)

        content_for_model = RE_MENTION.sub(" ", content_clean_base)
        content_for_model = re.sub(RE_HASHTAG, r"\1", content_for_model)
        content_final = clean_text(content_for_model)

        # Statistics are computed on title + body together
        text_for_stats = (title_final + ("\n" if title_final and content_final else "") + content_final).strip()
        ko_ratio = korean_ratio(text_for_stats)
        n_sents = sentence_count(text_for_stats)

        out_rows.append({
            # cleaned text that will replace the input columns
            "title_norm__updated": title_final,
            "content_norm__updated": content_final,

            # metadata / statistics
            "mentions": meta["mentions"],
            "hashtags": meta["hashtags"],
            "ko_ratio": ko_ratio,
            "len_sentences": n_sents,
        })

    out = pd.DataFrame(out_rows, columns=out_columns)

    # Keep the main original columns that are present
    wanted_cols = ["category", "title", "content", "title_norm", "content_norm", "date", "created_at", "url", "author", "likes", "retweets", "comments"]
    keep_cols = [col for col in wanted_cols if col in df.columns]

    merged = pd.concat([df[keep_cols].reset_index(drop=True), out], axis=1)

    # Overwrite title_norm / content_norm with the cleaned results
    merged["title_norm"] = merged["title_norm__updated"]
    merged["content_norm"] = merged["content_norm__updated"]
    merged = merged.drop(columns=["title_norm__updated", "content_norm__updated"])

    # Save
    p = Path(path)
    out_xlsx = str(p.with_name(p.stem + ".xlsx"))

    merged.to_excel(out_xlsx, index=False)
    print(f"[OK] Saved: {out_xlsx}\n")
    return merged

# Second-pass driver: process every first-pass output file of every part.
# Results are collected by file name; missing files are skipped and a failure
# in one file is reported without stopping the remaining files.
all_results = {}

for part in [1, 2, 3, 4]:
    print(f"\n=== PART {part} 시작 ===")
    INPUT_FILES = [
        f"preprocess_community_part_{part}.xlsx",
        f"preprocess_insta_part_{part}.xlsx",
        f"preprocess_twitter_part_{part}.xlsx",
        f"preprocess_blog_part_{part}.xlsx",
    ]

    for f in INPUT_FILES:
        if not Path(f).exists():
            print(f"[SKIP] Missing: {f}")
            continue
        try:
            all_results[Path(f).name] = process_file(f)
        except Exception as e:
            # Best-effort batch run: log the error and continue with the next file.
            print(f"[ERROR] {f}: {e}")

    print(f"=== PART {part} 완료 ===")

print("\n[전체 완료]")



=== PART 1 시작 ===
[INFO] Loading: preprocess_community_part_1.xlsx (platform=community)
[OK] Saved: preprocess_community_part_1.xlsx

[INFO] Loading: preprocess_insta_part_1.xlsx (platform=instagram)
[OK] Saved: preprocess_insta_part_1.xlsx

[INFO] Loading: preprocess_twitter_part_1.xlsx (platform=twitter)
[OK] Saved: preprocess_twitter_part_1.xlsx

[INFO] Loading: preprocess_blog_part_1.xlsx (platform=blog)
[OK] Saved: preprocess_blog_part_1.xlsx

=== PART 1 완료 ===

=== PART 2 시작 ===
[INFO] Loading: preprocess_community_part_2.xlsx (platform=community)
[OK] Saved: preprocess_community_part_2.xlsx

[INFO] Loading: preprocess_insta_part_2.xlsx (platform=instagram)
[OK] Saved: preprocess_insta_part_2.xlsx

[INFO] Loading: preprocess_twitter_part_2.xlsx (platform=twitter)
[OK] Saved: preprocess_twitter_part_2.xlsx

[INFO] Loading: preprocess_blog_part_2.xlsx (platform=blog)
[OK] Saved: preprocess_blog_part_2.xlsx

=== PART 2 완료 ===

=== PART 3 시작 ===
[INFO] Loading: preprocess_community_