In [96]:
import pandas as pd
import numpy as np
import glob
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from konlpy.tag import Okt
import re
import pickle
import json

In [62]:

# GPU 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# JSON 파일에서 vocab 불러오기
vocab_path = r"C:\Users\kdp\Desktop\KDW\EX_FLASK\project\vocab.json"
with open(vocab_path, 'r', encoding='utf-8') as f:
    vocab = json.load(f)
vocab_size = len(vocab) + 1

# 파일 경로 지정
file_paths = [
    r"C:\Users\kdp\Desktop\KDW\EX_FLASK\project\data\건강_clean.csv",
    r"C:\Users\kdp\Desktop\KDW\EX_FLASK\project\data\군대_clean.csv"
]

all_dataframes = []
for file in file_paths:
    df = pd.read_csv(file, encoding='utf-8-sig')  
    df.columns = ['id','category', 'text']
    all_dataframes.append(df)

# 데이터프레임 통합
DF = pd.concat(all_dataframes, ignore_index=True)

# 결과 출력
print(DF)


           id category                                text
0           0       건강         뭐 서양 약 들 로 처방 받으면 건강 더 안 좋아
1           1       건강                  남성 쪽 으로 건강 안 좋나 보네
2           2       건강                         뭔 데 자신감 넘치네
3           3       건강                    쟤 한테 물어보면 될 거 아냐
4           4       건강                 난 탈모 약 좀 먹어야 될 거 같아
...       ...      ...                                 ...
117794  55012       군대   여자친구 고 무신 거꾸로 안 신고 이쁘게 사랑 했으면 좋겠네
117795  55013       군대            다른사람 만나지 않게 휴가 나와서 잘해야 지
117796  55014       군대           추억 생각나고 풋풋 해보여서 까지 기분 좋더라
117797  55015       군대          군생활 중 별 통보 제일 슬프다던데 잘 해줘야지
117798  55016       군대  군대 생활 에서 여자친구 생각 하면서 버티면 그나마 힘 될꺼야

[117799 rows x 3 columns]


In [63]:

# 텍스트 데이터와 라벨 추출
texts = DF['text'].tolist()
labels = DF['category'].tolist()

# 라벨 인코딩 (라벨을 숫자로 변환)
label_to_index = {label: idx for idx, label in enumerate(sorted(set(labels)))}
indexed_labels = [label_to_index[label] for label in labels]


In [64]:
# 텍스트 인코딩
max_len = 20
# 텍스트 인코딩 수정
encoded_texts = []
for sentence in texts:
    sentence = sentence.split()  # 각 문장을 단어 단위로 분리 (필요에 따라 조정)
    encoded = [vocab.get(word, 0) for word in sentence]  # 단어 인코딩
    if len(encoded) < max_len:
        encoded += [0] * (max_len - len(encoded))  # 패딩
    else:
        encoded = encoded[:max_len]
    encoded_texts.append(encoded)





[[649,
  15812,
  1495,
  11,
  596,
  6178,
  1130,
  2733,
  235,
  5,
  632,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [8989, 3236, 388, 2733, 5, 8308, 3007, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [541, 201, 7340, 27990, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [997, 242, 22945, 420, 116, 637, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [40,
  15596,
  1495,
  1041,
  1302,
  420,
  116,
  354,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [3429,
  15596,
  1495,
  2559,
  15596,
  28493,
  6590,
  184,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [7911, 1787, 596, 7767, 1025, 2891, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [908, 1616, 2252, 2733, 5, 632, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [908,
  282,
  3576,
  157,
  28494,
  2441,
  967,
  2716,
  328,
  908,
  4040,
  2252,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [908,
  11691,
  2716,
  809,
  2716,
  388,
  908,
  28495,
  147,
  235,
  2716,
  143,
  9286,
  0,
  0

In [65]:

# PyTorch Dataset 정의
class CommentDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]


In [66]:

# 데이터셋 및 데이터로더 생성
X_train, X_test, y_train, y_test = train_test_split(encoded_texts, indexed_labels, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

train_dataset = CommentDataset(X_train, y_train)
valid_dataset = CommentDataset(X_valid, y_valid)
test_dataset = CommentDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


In [67]:

# 모델 정의
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=3, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)  # 양방향 LSTM의 두 방향 결합
        out = self.fc(hidden)
        return out


In [68]:

# 모델, 손실 함수 및 옵티마이저 초기화
embedding_dim = 300  
hidden_dim = 128
output_dim = 1 

model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, verbose=True)

# Early Stopping 설정
best_loss = float('inf')
patience, trigger = 3, 0




In [69]:

# 모델 학습
epochs = 100
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    correct_train, total_train = 0, 0
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)

        outputs = model(texts)
        loss = criterion(outputs, labels.unsqueeze(1).float())  # 라벨 차원을 맞춤

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        predicted = (outputs > 0.5).float()  # 이진 분류에서는 0.5 기준으로 예측
        total_train += labels.size(0)
        correct_train += (predicted == labels.unsqueeze(1)).sum().item()

    avg_train_loss = running_loss / len(train_loader)
    train_accuracy = correct_train / total_train

    # 검증 손실 계산
    model.eval()
    val_loss = 0.0
    correct_val, total_val = 0, 0
    with torch.no_grad():
        for texts, labels in valid_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs, labels.unsqueeze(1).float())  # 라벨 차원 맞춤
            val_loss += loss.item()

            predicted = (outputs > 0.5).float()
            total_val += labels.size(0)
            correct_val += (predicted == labels.unsqueeze(1)).sum().item()

    avg_val_loss = val_loss / len(valid_loader)
    val_accuracy = correct_val / total_val

    print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

    scheduler.step(avg_val_loss)

    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        trigger = 0
        print(f'Validation loss : {best_loss:.4f}')
        torch.save(model.state_dict(), 'best_lstm_model.pth')
    else:
        trigger += 1
        if trigger >= patience:
            print(f'{patience} 에포크 동안 성능 향상이 없어 학습을 종료합니다.')
            break


Epoch [1/100], Train Loss: 0.3696, Train Accuracy: 0.8042, Validation Loss: 0.3167, Validation Accuracy: 0.8302
Validation loss : 0.3167
Epoch [2/100], Train Loss: 0.2591, Train Accuracy: 0.8761, Validation Loss: 0.3148, Validation Accuracy: 0.8511
Validation loss : 0.3148
Epoch [3/100], Train Loss: 0.2184, Train Accuracy: 0.9005, Validation Loss: 0.3379, Validation Accuracy: 0.8478
Epoch [4/100], Train Loss: 0.2010, Train Accuracy: 0.9083, Validation Loss: 0.3455, Validation Accuracy: 0.8424
Epoch [5/100], Train Loss: 0.1875, Train Accuracy: 0.9128, Validation Loss: 0.3684, Validation Accuracy: 0.8402
3 에포크 동안 성능 향상이 없어 학습을 종료합니다.


In [97]:

# 최종 모델 평가
model.load_state_dict(torch.load('best_lstm_model.pth'))
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += (predicted == labels.unsqueeze(1)).sum().item()

test_accuracy = correct / total
print(f'Test Accuracy: {test_accuracy:.4f}')


Test Accuracy: 0.8511


In [102]:
# 예시 텍스트
input_text = "허리 아파"

# 텍스트 전처리
input_text_processed = input_text.split()  # 간단하게 단어 단위로 분리
encoded_input = [vocab.get(word, 0) for word in input_text_processed]  # 단어 인코딩
max_len = 20  # 패딩할 최대 길이
if len(encoded_input) < max_len:
    encoded_input += [0] * (max_len - len(encoded_input))  # 패딩 추가
else:
    encoded_input = encoded_input[:max_len]  # 텍스트 자르기

# PyTorch 텐서로 변환
input_tensor = torch.tensor([encoded_input], dtype=torch.long).to(device)  # 배치를 위한 차원 추가

# 모델 예측
model.eval()
with torch.no_grad():
    output = model(input_tensor)
    prediction = (output > 0.5).float()  # 0.5를 기준으로 이진 분류

# 결과 출력
if prediction.item() == 1.0:
    print("주제: 기타")
else:
    print("주제: 건강")

주제: 건강
