In [1]:
# sentiment_analysis.py

import os
import re
import pandas as pd
import numpy as np
from konlpy.tag import Okt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense

    
    
# Project root: taken from the current working directory, so the script must
# be launched from the directory that contains `lib/` and `data/`.
BASE_DIR = os.getcwd()
LIB_DIR  = os.path.join(BASE_DIR, 'lib')

# 1) Load the stopword list (lib/korean_stopwords.txt), one word per line.
stopwords_path = os.path.join(LIB_DIR, 'korean_stopwords.txt')
# 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading BOM, which would
# otherwise stay attached to the first stopword and keep it from matching.
with open(stopwords_path, 'r', encoding='utf-8-sig') as stopword_file:
    # Strip surrounding whitespace and drop blank lines: tokens never carry
    # whitespace, so an unstripped entry could never match one.
    stopwords = {
        word
        for word in (line.strip() for line in stopword_file.read().splitlines())
        if word
    }

# Shared KoNLPy Okt tagger instance used by tokenize().
okt = Okt()

def clean_text(text: str) -> str:
    """Normalize *text* to Hangul syllables, digits, ASCII letters and spaces.

    The input is coerced with ``str()``. Every character outside the allowed
    set is replaced by a space, runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is removed.
    """
    only_allowed = re.sub(r"[^가-힣0-9a-zA-Z\s]", " ", str(text))
    single_spaced = re.sub(r"\s+", " ", only_allowed)
    return single_spaced.strip()

def tokenize(text: str) -> list[str]:
    """Return the content-word tokens of *text*, in order.

    The text is cleaned with clean_text() and tagged with
    ``okt.pos(norm=True, stem=True)``. Only tokens tagged 'Noun', 'Verb' or
    'Adjective' that are not in the module-level ``stopwords`` set are kept.
    """
    kept_tags = ('Noun', 'Verb', 'Adjective')
    kept_words = []
    for word, tag in okt.pos(clean_text(text), norm=True, stem=True):
        if tag not in kept_tags:
            continue
        if word in stopwords:
            continue
        kept_words.append(word)
    return kept_words

def auto_label(df: pd.DataFrame) -> pd.DataFrame:
    """Attach a simple keyword-based sentiment label to every row of *df*.

    The title ('제목') and body ('본문') columns are joined with a space,
    cleaned with clean_text(), and scored as the number of positive-keyword
    occurrences minus the number of negative-keyword occurrences. Matching
    is by substring count (``str.count``).

    Args:
        df: Posts to label. A missing '제목' or '본문' column is treated as
            empty text, NaN cells count as empty strings, and non-string
            cells are converted with ``str``.

    Returns:
        The same DataFrame object, mutated in place, with a new 'label'
        column: '긍정' when the score is > 0, '부정' when it is < 0, and
        '중립' otherwise.
    """
    pos = ['좋','행복','재밌','최고','감동','훌륭','만족','버프','상향',
           '쎔','강함','상위','최상위','신이다','고트','신창섭','떡상','부활',
          '득템']
    # NOTE(review): '누워' and '접어' appear twice below, so each occurrence
    # weighs double; the trailing symbol keyword can never match because
    # clean_text() replaces those characters with spaces; and '안좋' also hits
    # the positive '좋', netting to zero. Confirm whether these are intended.
    neg = ['별로','싫','안좋','짜증','역겹','불편','실패','누워','버려',
           '눈물','열받','너프','접어','떡락','븅','쓰레기','씨발','하향',
           '누워','무덤','약한','약함','우울','손해','접을','접어','접게',
           '사망','허수딸','하위','조트','최하위','초상집','정상화','등신','병신','●▅▇█▇▆▅▄▇']

    def _text_column(name: str) -> pd.Series:
        # df.get(name, '') would hand back a plain str for a missing column,
        # and str has no .fillna(); build an aligned empty Series instead.
        if name in df.columns:
            return df[name].fillna('').astype(str)
        return pd.Series('', index=df.index, dtype=object)

    texts = (_text_column('제목') + ' ' + _text_column('본문')).apply(clean_text)

    def _label_for(text: str) -> str:
        score = sum(text.count(w) for w in pos) - sum(text.count(w) for w in neg)
        if score > 0:
            return '긍정'
        if score < 0:
            return '부정'
        return '중립'

    df['label'] = [_label_for(t) for t in texts]
    return df

def build_tokenizer(texts, num_words=15000):
    """Create a Keras ``Tokenizer`` and fit it on *texts*.

    Args:
        texts: Texts passed to ``Tokenizer.fit_on_texts``.
        num_words: Forwarded to ``Tokenizer(num_words=...)``.

    Returns:
        The fitted tokenizer, configured with "<OOV>" as its
        out-of-vocabulary token.
    """
    fitted = Tokenizer(oov_token="<OOV>", num_words=num_words)
    fitted.fit_on_texts(texts)
    return fitted

def texts_to_padded_sequences(tok, texts, maxlen=80):
    """Convert *texts* to integer sequences of uniform length.

    Args:
        tok: Fitted tokenizer providing ``texts_to_sequences``.
        texts: Texts to convert.
        maxlen: Target length handed to ``pad_sequences``.

    Returns:
        The result of ``pad_sequences`` with both padding and truncation
        applied at the front ('pre') of each sequence.
    """
    return pad_sequences(
        tok.texts_to_sequences(texts),
        maxlen=maxlen,
        padding='pre',
        truncating='pre',
    )

def read_csv_auto(path: str) -> pd.DataFrame:
    """Load a CSV file, trying several encodings in order.

    Args:
        path: Path of the CSV file to read.

    Returns:
        The DataFrame produced by the first encoding that decodes the file.

    Raises:
        UnicodeDecodeError: If none of the candidate encodings can decode
            the file. The error is chained from the last decode failure.
    """
    encodings = ['utf-8', 'utf-8-sig', 'cp949', 'euc-kr']
    last_error = None
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError as err:
            last_error = err
    # UnicodeDecodeError takes five arguments (encoding, object, start, end,
    # reason); calling it with a single message raises TypeError instead.
    # Reuse the position details of the last failure and put the summary
    # message in the reason field.
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        f"지원하는 인코딩({encodings})으로도 읽을 수 없습니다: {path}",
    ) from last_error

def main():
    """Run the whole pipeline: label, tokenize, train, predict and save.

    Reads ``data/posts_label.csv`` under BASE_DIR, labels it with
    auto_label(), trains an Embedding -> LSTM -> Dropout -> softmax classifier
    on an 80/20 stratified split, predicts a label for every row, prints the
    result and writes ``data/posts_labeled_pred.csv``.

    Returns:
        tuple: ``(model, tokenizer, le, df)`` - the trained Keras model, the
        fitted text tokenizer, the fitted LabelEncoder, and the DataFrame
        including the 'label' and 'pred' columns. Returned so callers (for
        example a later notebook cell) can inspect them; previously they were
        locals that were discarded when main() finished.
    """
    # Single source of truth for the vocabulary size and sequence length, so
    # the tokenizer, the padding and the Embedding layer cannot drift apart.
    num_words = 15000
    max_len = 80

    # 1) Load the CSV and attach keyword-based labels.
    data_path = os.path.join(BASE_DIR, 'data', 'posts_label.csv')
    df = read_csv_auto(data_path)
    df = auto_label(df)

    # 2) Preprocess and tokenize.
    df['text']    = (
        df['제목'].fillna('') + ' ' +
        df['본문'].fillna('')
    ).apply(clean_text)
    df['tokens']  = df['text'].apply(tokenize)
    df['cleaned'] = df['tokens'].apply(lambda t: " ".join(t))

    # 3) Encode labels as integers, then one-hot via identity-matrix rows.
    le     = LabelEncoder()
    y_idx  = le.fit_transform(df['label'])
    y      = np.eye(len(le.classes_))[y_idx]

    # 4) Train/validation split, stratified on the label.
    X_train, X_val, y_train, y_val = train_test_split(
        df['cleaned'], y,
        test_size=0.2,
        stratify=df['label'],
        random_state=42
    )

    # 5) Fit the tokenizer on the training texts only, then pad both splits.
    tokenizer    = build_tokenizer(X_train, num_words=num_words)
    X_train_pad   = texts_to_padded_sequences(tokenizer, X_train, maxlen=max_len)
    X_val_pad     = texts_to_padded_sequences(tokenizer, X_val, maxlen=max_len)

    # 6) Define and train the model.
    model = Sequential([
        Embedding(input_dim=num_words, output_dim=128, input_length=max_len),
        LSTM(128),
        Dropout(0.5),
        Dense(len(le.classes_), activation='softmax')
    ])
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    # NOTE(review): runs all 200 epochs with no early-stopping callback.
    model.fit(
        X_train_pad, y_train,
        validation_data=(X_val_pad, y_val),
        epochs=200, batch_size=32
    )

    # 7) Predict on the full dataset (training rows included).
    all_pad = texts_to_padded_sequences(tokenizer, df['cleaned'], maxlen=max_len)
    probs   = model.predict(all_pad)
    pred    = le.inverse_transform(np.argmax(probs, axis=1))
    df['pred'] = pred

    # 8) Show and save the results.
    print(df[['제목','본문','label','pred']])
    out_path = os.path.join(BASE_DIR, 'data', 'posts_labeled_pred.csv')
    df.to_csv(out_path, index=False, encoding='utf-8-sig', errors='replace')
    print(f"✅ 완료: {out_path}")

    return model, tokenizer, le, df

if __name__ == '__main__':
    # Bind the artifacts at module scope so that code run afterwards in the
    # same session (e.g. `model.summary()`) can reach them.
    model, tokenizer, label_encoder, df = main()

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [2]:
# Print the summary of `model`; the name must already be bound at
# notebook/module scope when this cell runs.
model.summary()

NameError: name 'model' is not defined