In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from nltk.corpus import stopwords
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
# 파일 경로 설정
files_comedy = [
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\A Midsummer Nights Dream.csv",
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\As You Like It.csv",
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\The Merchant of Venice.csv",
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\The Taming of the Shrew.csv",
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\Twelfth Night.csv"
]

files_tragedy = [
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\Hamlet.csv",
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\Macbeth.csv",
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\Othello.csv",
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\Romeo and Juliet.csv",
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\The Tragedy of King Lear.csv"
]

# 데이터를 저장할 리스트 초기화
texts = []
labels = []

nltk.download('stopwords')

custom_stopwords=r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\STOPWORDS.txt"
with open(custom_stopwords, "r", encoding="utf-8") as f:
    custom_stopwords = [line.strip().lower() for line in f.read().split(",")]

# NLTK 기본 Stopwords
nltk_stopwords = stopwords.words("english")

# 통합 Stopwords 리스트
all_stopwords = set(nltk_stopwords + custom_stopwords)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kdp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
# 전처리하는 함수 정의
def process_text(text):
    # 소문자 변환
    text = text.lower()
    # 구두점 제거
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Stopwords 제거
    words = text.split()
    words = [word for word in words if word not in all_stopwords]
    return " ".join(words)


In [31]:
# 희극과 비극 데이터를 불러와서 라벨링
for file_path in files_comedy:
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()  # 공백 제거
            if line:  # 빈 줄이 아닌 경우만 추가
                cleaned_line = process_text(line)  # 텍스트 정리
                if cleaned_line:  # Stopwords만 남은 텍스트 제거
                    texts.append(cleaned_line)
                    labels.append(0)  # 희극 : 0

for file_path in files_tragedy:
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()  # 공백 제거
            if line:  # 빈 줄이 아닌 경우만 추가
                cleaned_line = process_text(line)  # 텍스트 정리
                if cleaned_line:  # Stopwords만 남은 텍스트 제거
                    texts.append(cleaned_line)
                    labels.append(1)  # 비극 : 1

# 데이터프레임 생성
df = pd.DataFrame({'text': texts, 'label': labels})
  
# Stopwords 제거 및 데이터 재정렬
df = df[["text", "label"]]  # 필요한 열만 유지
# 결과 확인
df.head()

Unnamed: 0,text,label
0,﻿texts,0
1,room palace,0
2,fair nuptial hour,0
3,draws apace four happy days bring,0
4,another moon oh slow,0


In [32]:

tfidf=TfidfVectorizer()
X = tfidf.fit_transform(df['text'])

X_train, X_test, y_train, y_test = train_test_split(X, labels, stratify=labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2, random_state=42)

# PyTorch 텐서로 변환
X_train_tensor = torch.FloatTensor(X_train.toarray())
X_test_tensor = torch.FloatTensor(X_test.toarray())
y_train_tensor = torch.LongTensor(y_train)
y_test_tensor = torch.LongTensor(y_test)
x_v_tensor = torch.FloatTensor(X_val.toarray())
y_v_tensor = torch.LongTensor(y_val)

In [33]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size, num_layers=1, dropout=0.2):

        super(LSTMClassifier, self).__init__()
        layers=[]
        in_features=input_size

        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(in_features, hidden_size))
            layers.append(nn.ReLU())  # 활성화 함수로 ReLU 추가
            in_features = hidden_size  # 다음 레이어의 입력 크기는 현재 레이어의 출력 크기

        # Dropout과 마지막 출력층 추가
        layers.append(nn.Dropout(0.3))  # Dropout 추가
        layers.append(nn.Linear(in_features, output_size))

        self.network = nn.Sequential(*layers) 

    def forward(self, x):
        return self.network(x)
