In [2]:
import pandas as pd
import numpy as np
import glob
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from konlpy.tag import Okt
import re
import pickle


In [3]:

# GPU 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 전처리된 파일들 통합
folder_path = r"C:\Users\kdp\Desktop\KDW\EX_자연어처리\D1010\무한도전"
file_pattern = os.path.join(folder_path, '*.xlsx')

all_dataframes = []
for file in glob.glob(file_pattern):
    df = pd.read_excel(file, header=None)
    df.columns = ['episode', 'comment']  # 0번 컬럼을 'episode', 1번 컬럼을 'comment'로 설정
    all_dataframes.append(df)

combined_df = pd.concat(all_dataframes, ignore_index=True)


# 텍스트 데이터와 라벨 추출
texts = combined_df['comment'].tolist()
labels = combined_df['episode'].tolist()


In [4]:
# 단어사전 만들기
vocab = {}  # 어휘 사전
for sentence in texts:
    for word in sentence:
        if word not in vocab:
            vocab[word] = len(vocab) + 1  # 새로운 단어를 vocab에 추가하고 인덱스 부여

vocab_size = len(vocab) + 1  # vocab 크기 설정
print("Vocabulary size:", vocab_size)



Vocabulary size: 1541


In [7]:
# json 파일로 저장
import json

with open("vocab.json", "w") as f:
    json.dump(vocab, f)


with open("vocab.json", "r") as f:
    vocab = json.load(f)

vocab_dict = {}

for idx, word in enumerate(vocab):
    vocab_dict[word] = idx
print(vocab_dict)


{'정': 0, '형': 1, '돈': 2, ' ': 3, '도': 4, '진': 5, '짜': 6, '유': 7, '재': 8, '석': 9, '공': 10, '격': 11, '수': 12, '장': 13, '난': 14, '뒤': 15, '포': 16, '트': 17, '만': 18, '본': 19, '인': 20, '작': 21, '웃': 22, '기': 23, '행': 24, '혼': 25, '자': 26, '땀': 27, '범': 28, '벅': 29, '저': 30, '걸': 31, '느': 32, '님': 33, '당': 34, '신': 35, '대': 36, '체': 37, '부': 38, '분': 39, '준': 40, '하': 41, '말': 42, '투': 43, '존': 44, '똑': 45, '깜': 46, '짝': 47, '압': 48, '개': 49, '그': 50, '연': 51, '평': 52, '소': 53, '무': 54, '스': 55, '라': 56, '이': 57, '커': 58, '박': 59, '명': 60, '사': 61, '실': 62, '월': 63, '능': 64, '력': 65, '팀': 66, '위': 67, '해': 68, '에': 69, '피': 70, '드': 71, '안': 72, '냥': 73, '심': 74, '감': 75, '탄': 76, '간': 77, '방': 78, '송': 79, '최': 80, '고': 81, '다': 82, '레': 83, '전': 84, '리': 85, '액': 86, '션': 87, '억': 88, '지': 89, '옹': 90, '듯': 91, '넋': 92, '깔': 93, '거': 94, '게': 95, '요': 96, '얼': 97, '마': 98, '나': 99, '관': 100, '찰': 101, '렙': 102, '항': 103, '것': 104, '별': 105, '조': 106, '합': 107, '컨': 108, '텐': 109, '츠': 110,

In [5]:

max_len = 20
encoded_texts = []
for sentence in texts:
    encoded = [vocab[word] for word in sentence]
    if len(encoded) < max_len:
        encoded += [0] * (max_len - len(encoded))  # 패딩
    else:
        encoded = encoded[:max_len]
    encoded_texts.append(encoded)

# 라벨을 문자열로 변환하여 타입 일관성 유지
labels = [str(label) for label in labels]

# 라벨 인코딩
label_to_index = {label: idx for idx, label in enumerate(sorted(set(labels)))}
indexed_labels = [label_to_index[label] for label in labels]

# PyTorch Dataset 정의
class CommentDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# 데이터셋 및 데이터로더 생성
X_train, X_test, y_train, y_test = train_test_split(encoded_texts, indexed_labels, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

train_dataset = CommentDataset(X_train, y_train)
valid_dataset = CommentDataset(X_valid, y_valid)
test_dataset = CommentDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 모델 정의
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=3, batch_first=True, bidirectional=True)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)  # 양방향 LSTM의 두 방향 결합
  
        out = self.fc(hidden)
        return out

# 모델, 손실 함수 및 옵티마이저 초기화
embedding_dim = 300  
hidden_dim = 512  # 히든 유닛 수 증가
output_dim = len(label_to_index)

model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, verbose=True)

# Early Stopping 설정
best_loss = float('inf')
patience, trigger = 3, 0

# 모델 학습
epochs = 100
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    correct_train, total_train = 0, 0
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)

        
        outputs = model(texts)
        loss = criterion(outputs, labels)

        # 역전파 및 옵티마이저 스텝
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # 정확도 계산
        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    avg_train_loss = running_loss / len(train_loader)
    train_accuracy = correct_train / total_train

    # 검증 손실 계산
    model.eval()
    val_loss = 0.0
    correct_val, total_val = 0, 0
    with torch.no_grad():
        for texts, labels in valid_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # 정확도 계산
            _, predicted = torch.max(outputs.data, 1)
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

    avg_val_loss = val_loss / len(valid_loader)
    val_accuracy = correct_val / total_val

    # 에포크별 결과 출력
    print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

    # 학습률 스케줄러 업데이트
    scheduler.step(avg_val_loss)

    # Early Stopping 체크
    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        trigger = 0
        print(f'Validation loss : {best_loss:.4f}')
        torch.save(model.state_dict(), 'best_lstm_model.pth')
    else:
        trigger += 1
        if trigger >= patience:
            print(f'{patience}에포크 만큼 성능 향상이 되지 않아 학습을 종료합니다.')
            break


# 최종 모델 평가
model.load_state_dict(torch.load('best_lstm_model.pth'))
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_accuracy = correct / total
print(f'Test Accuracy: {test_accuracy:.4f}')




Epoch [1/100], Train Loss: 2.8325, Train Accuracy: 0.2636, Validation Loss: 3.0305, Validation Accuracy: 0.2755
Validation loss : 3.0305
Epoch [2/100], Train Loss: 2.9618, Train Accuracy: 0.2619, Validation Loss: 3.2131, Validation Accuracy: 0.2755
Epoch [3/100], Train Loss: 2.8486, Train Accuracy: 0.2583, Validation Loss: 3.4028, Validation Accuracy: 0.1145
Epoch [4/100], Train Loss: 2.7140, Train Accuracy: 0.2584, Validation Loss: 3.4682, Validation Accuracy: 0.2755
3에포크 만큼 성능 향상이 되지 않아 학습을 종료합니다.
Test Accuracy: 0.2887


In [10]:
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

# 데이터 준비 (벡터라이저와 벡터 데이터 로드)
with open('tfidf_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

# 회차별 라벨 정의
episode_names = [
    "무인도 특집", "무한상사 면접편", "무한상사 야유회", 
    "방콕 특집", "서해안고속도로가요제", "인생극장 특집",
    "죄와 길", "짝꿍 특집", "네멋대로해라", "명수는 12살"
]

# 라벨 인덱스와 회차 이름 매핑
label_to_index = {name: idx for idx, name in enumerate(episode_names)}
index_to_label = {v: k for k, v in label_to_index.items()}

# LSTM 모델 정의 (가중치 로드를 위해 정의만 포함)
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=2, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        # LSTM expects a 3D tensor (batch_size, seq_len, input_dim)
        x = x.unsqueeze(1)  # (batch_size, 1, input_dim) 형태로 변환
        _, (hidden, _) = self.lstm(x)
        hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)  # 양방향 LSTM 결합
        out = self.fc(hidden)
        return out

# 모델 생성
input_dim = 1000  # TF-IDF 벡터의 feature dimension (적절한 값으로 설정)
hidden_dim = 256
output_dim = len(label_to_index)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(input_dim, hidden_dim, output_dim).to(device)

# 학습된 가중치 로드 (가중치 파일 이름을 적절하게 지정하세요)
model.load_state_dict(torch.load('best_lstm_model.pth', map_location=device))
model.eval()  # 예측 모드로 전환

# 예측 함수 정의
def predict_episode(comment):
    # TF-IDF 벡터화 및 텐서 변환
    comment_vector = vectorizer.transform([comment])
    comment_tensor = torch.tensor(comment_vector.toarray(), dtype=torch.float32).to(device)

    # 예측 수행
    with torch.no_grad():
        output = model(comment_tensor)
        probabilities = F.softmax(output, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()

    return index_to_label[predicted_class]

# 사용자로부터 입력받아 예측 수행
new_comment = input("댓글을 입력하세요: ")
predicted_episode = predict_episode(new_comment)
print(f"이 댓글은 '{predicted_episode}'과(와) 관련이 있습니다.")


RuntimeError: Error(s) in loading state_dict for LSTMClassifier:
	Unexpected key(s) in state_dict: "embedding.weight", "lstm.weight_ih_l2", "lstm.weight_hh_l2", "lstm.bias_ih_l2", "lstm.bias_hh_l2", "lstm.weight_ih_l2_reverse", "lstm.weight_hh_l2_reverse", "lstm.bias_ih_l2_reverse", "lstm.bias_hh_l2_reverse", "lstm.weight_ih_l3", "lstm.weight_hh_l3", "lstm.bias_ih_l3", "lstm.bias_hh_l3", "lstm.weight_ih_l3_reverse", "lstm.weight_hh_l3_reverse", "lstm.bias_ih_l3_reverse", "lstm.bias_hh_l3_reverse", "lstm.weight_ih_l4", "lstm.weight_hh_l4", "lstm.bias_ih_l4", "lstm.bias_hh_l4", "lstm.weight_ih_l4_reverse", "lstm.weight_hh_l4_reverse", "lstm.bias_ih_l4_reverse", "lstm.bias_hh_l4_reverse". 
	size mismatch for lstm.weight_ih_l0: copying a param with shape torch.Size([1024, 400]) from checkpoint, the shape in current model is torch.Size([1024, 1000]).
	size mismatch for lstm.weight_ih_l0_reverse: copying a param with shape torch.Size([1024, 400]) from checkpoint, the shape in current model is torch.Size([1024, 1000]).
	size mismatch for fc.weight: copying a param with shape torch.Size([1201, 512]) from checkpoint, the shape in current model is torch.Size([10, 512]).
	size mismatch for fc.bias: copying a param with shape torch.Size([1201]) from checkpoint, the shape in current model is torch.Size([10]).