In [4]:
import os
import pandas as pd
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score
import torch
import torch.nn as nn
import torch.optim as optim
import pickle
import numpy as np
import matplotlib.pyplot as plt
import json
import re

In [5]:
# stopwords 불러오기
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
stop_words = set(stopwords.words('english'))

# 내가 설정한 stopwords 추가
custom_stopwords = set()
with open(r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\STOPWORDS.txt", 'r', encoding='utf-8') as f:
    custom_stopwords = {word.strip().lower() for line in f for word in line.split(', ')}
stop_words.update(custom_stopwords)
stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kdp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kdp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


<WordListCorpusReader in 'C:\\Users\\kdp\\AppData\\Roaming\\nltk_data\\corpora\\stopwords'>

In [6]:
# 파일 경로 설정 (희극 및 비극 폴더 내 파일 경로)
files_comedy = [
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\A Midsummer Nights Dream.csv",
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\As You Like It.csv",
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\The Merchant of Venice.csv",
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\The Taming of the Shrew.csv",
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\Twelfth Night.csv"
]

files_tragedy = [
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\Hamlet.csv",
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\Macbeth.csv",
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\Othello.csv",
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\Romeo and Juliet.csv",
    r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\The Tragedy of King Lear.csv"
]

# 데이터를 저장할 리스트 초기화
texts = []
labels = []

# 희극과 비극 데이터를 불러와서 라벨링
for file_path in files_comedy:
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()  # 공백 제거
            if line:  # 빈 줄이 아닌 경우만 추가
                texts.append(line)
                labels.append(0)  # 희극 : 0

for file_path in files_tragedy:
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()  # 공백 제거
            if line:  # 빈 줄이 아닌 경우만 추가
                texts.append(line)
                labels.append(1)  # 비극 : 1

# 데이터프레임 생성
df = pd.DataFrame({'text': texts, 'label': labels})
df=df.drop(0, axis=0)
df

Unnamed: 0,text,label
1,ACT I,0
2,SCENE I. Athens. A room in the Palace of Theseus,0
3,"""Enter Theseus, Hippolyta, Philostrate and Att...",0
4,THESEUS.,0
5,"""Now, fair Hippolyta, our nuptial hour""",0
...,...,...
38251,The weight of this sad time we must obey;,1
38252,"""Speak what we feel, not what we ought to say.""",1
38253,The oldest hath borne most; we that are young,1
38254,"""Shall never see so much, nor live so long.""",1


In [7]:
# 전처리하는 함수 정의
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # 특수문자 제거
    text = text.lower()  # 소문자로 변환
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]  # stopwords 제거
    return ' '.join(filtered_words)

# Stopwords를 제거하는 함수 정의
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Stopwords 제거하여 전처리된 텍스트 추가
df['text'] = df['text'].apply(remove_stopwords)
df['text']=df['text'].apply(preprocess_text)
df

Unnamed: 0,text,label
1,,0
2,room palace,0
3,,0
4,,0
5,fair nuptial hour,0
...,...,...
38251,weight sad time must obey,1
38252,speak feel ought say,1
38253,oldest hath borne young,1
38254,shall never see much live long,1


In [8]:
df = df.dropna(subset=['text'])  # 'text' 열에서 비어 있는 행 제거
df = df[df['text'].str.strip() != '']  # 빈 문자열 제거
df=df.reset_index(drop=True)
df

Unnamed: 0,text,label
0,room palace,0
1,fair nuptial hour,0
2,draws apace four happy days bring,0
3,another moon oh slow,0
4,moon wanes lingers desires,0
...,...,...
31355,weight sad time must obey,1
31356,speak feel ought say,1
31357,oldest hath borne young,1
31358,shall never see much live long,1


In [9]:
# 단어 사전 생성 및 저장
vectorizer = TfidfVectorizer()
vectorizer.fit(df['text'])
word_index = vectorizer.vocabulary_

# 단어 사전을 JSON 형식으로 저장
with open(r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\texts\vocab.json", 'w', encoding='utf-8') as f:
    json.dump(word_index, f, ensure_ascii=False)



print(f"단어 사전 크기: {len(word_index)}")

단어 사전 크기: 14050


In [10]:
# 단어 사전 불러오기 및 모델 학습

# 저장된 단어 사전 불러오기
with open(r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\texts\vocab.json", 'r', encoding='utf-8') as f:
    word_index = json.load(f)

In [11]:
vectorizer = TfidfVectorizer(vocabulary=word_index)
X = vectorizer.fit_transform(df['text']).toarray()
y = df['label'].values

# 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:

# 로지스틱 회귀 모델 학습
model = LogisticRegression()
model.fit(X_train, y_train)

# 예측 및 정확도 평가
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# 검증
scores = cross_val_score(model, X, y, cv=5)
print("Cross-validation scores:", scores)
print(f"Accuracy: {scores.mean() * 100:.2f}%")
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1 Score: {f1:.2f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Test Accuracy: 69.05%
Cross-validation scores: [0.59805485 0.62946429 0.60299745 0.63966837 0.63536352]
Accuracy: 62.11%
F1 Score: 0.68


In [13]:
# 모델과 TfidfVectorizer 저장 경로
model_path = 'shakespeare_full_model_logistic.pkl'
vectorizer_path = 'tfidf_vectorizer.pkl'

# 모델과 벡터라이저 저장
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

with open(vectorizer_path, 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [14]:
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)

with open(vectorizer_path, 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)


In [15]:
# 파일로부터 텍스트 읽기
def read_text_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read().strip()
    return text

# 텍스트 전처리 및 Tfidf 변환
def preprocess_and_vectorize(text, vectorizer):
    tfidf_features = vectorizer.transform([text])  # 텍스트를 TF-IDF로 변환
    return tfidf_features

# 텍스트 분류 함수
def classify_text(file_path, vectorizer, model):
    text = read_text_from_file(file_path)  # 파일에서 텍스트 읽기
    input_tensor = preprocess_and_vectorize(text, vectorizer)  # TF-IDF 변환
    prediction = model.predict(input_tensor)[0]  # Scikit-learn 모델로 예측
    return "희극" if prediction == 0 else "비극"

# 평가 실행
file_path = r"C:\Users\kdp\Desktop\Work_권도운\개인프로젝트\texts\All's well that ends well.txt"  
result = classify_text(file_path, vectorizer, model)
print(f"분석 결과: {result}")

분석 결과: 비극
