모듈 세팅

In [25]:
%pip install pandas numpy scikit-learn nltk matplotlib seaborn

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Download necessary NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('vader_lexicon', quiet=True)

Note: you may need to restart the kernel to use updated packages.


True

데이터 호출 및 전처리

In [34]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# 데이터 불러오기
df = pd.read_csv('wine_review.csv')

# 필요한 열 선택
df = df[['name', 'reviews.rating', 'reviews.text']]

# 결측값 제거
df = df.dropna()

# NLTK 불용어 다운로드
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

# NLTK 불용어 설정
stop_words = set(stopwords.words('english'))

# 텍스트 전처리 함수 정의
def preprocess_text(text):
    # 소문자 변환
    text = text.lower()
    # 특수 문자 제거
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # 토큰화
    tokens = word_tokenize(text)
    # 불용어 제거
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # 표제어 추출
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # 리스트를 공백으로 결합하여 문자열로 반환
    return ' '.join(lemmatized_tokens)

# 리뷰 텍스트 전처리
df['cleaned_text'] = df['reviews.text'].apply(preprocess_text)

# 필요한 열 선택
df = df[['name', 'reviews.rating', 'cleaned_text']]

# 전처리된 데이터를 CSV 파일로 저장
output_filename = 'preprocessed_wine_reviews.csv'
df.to_csv(output_filename, index=False)
print(f"Preprocessed data saved to {output_filename}")

# 감정 분석기 초기화
sid = SentimentIntensityAnalyzer()

# 감정 분석 함수 정의
def analyze_sentiment(text):
    # NaN 값이면 중립(0)으로 반환
    if isinstance(text, float) and np.isnan(text):
        return 0
    
    # 감정 분석 수행
    sentiment_score = sid.polarity_scores(text)
    if sentiment_score['compound'] >= 0.05:
        return 1  # Positive
    elif sentiment_score['compound'] <= -0.05:
        return -1  # Negative
    else:
        return 0  # Neutral

# 전처리된 데이터 불러오기
df = pd.read_csv('preprocessed_wine_reviews.csv')

# NaN 값이 있는 행 제거
df = df.dropna(subset=['cleaned_text'])

# 'cleaned_text' 열에 대해 감정 분석을 수행하여 'sentiment' 열 추가
df['sentiment'] = df['cleaned_text'].apply(analyze_sentiment)

# 필요한 열 선택
df = df[['name', 'cleaned_text', 'reviews.rating', 'sentiment']]

# 전처리된 데이터를 CSV 파일로 저장
output_sentiment_filename = 'preprocessed_wine_reviews_with_sentiment.csv'
df.to_csv(output_sentiment_filename, index=False)
print(f"Preprocessed data with sentiment saved to {output_sentiment_filename}")


[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Preprocessed data saved to preprocessed_wine_reviews.csv
Preprocessed data with sentiment saved to preprocessed_wine_reviews_with_sentiment.csv


전처리한 데이터셋 컬럼확인

In [35]:
# 'processed_wine_reviews.csv' 파일을 로드
#df = pd.read_csv('preprocessed_wine_reviews.csv')
df = pd.read_csv('preprocessed_wine_reviews_with_sentiment.csv')
# 데이터프레임의 컬럼 확인
print(df.columns)


Index(['name', 'cleaned_text', 'reviews.rating', 'sentiment'], dtype='object')


In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# 전처리된 데이터 불러오기
df = pd.read_csv('preprocessed_wine_reviews_with_sentiment.csv')
# 특성과 타겟 설정
X = df['cleaned_text']
y = df['sentiment']

# 학습 세트와 테스트 세트로 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# TF-IDF 벡터화와 SVM 모델을 파이프라인으로 구성
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC(random_state=42))
])

# 하이퍼파라미터 그리드 설정
param_grid = {
    'tfidf__max_features': [1000, 2000, 3000, None],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'svm__C': [0.1, 1, 10, 100],
    'svm__kernel': ['linear', 'rbf']
}

# 그리드 서치를 사용하여 모델 튜닝
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# 최적의 하이퍼파라미터 출력
print("Best parameters found:", grid_search.best_params_)
# 교차 검증을 통해 모델 평가
cv_results = cross_val_score(grid_search.best_estimator_, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-validation scores:", cv_results)
print("Mean accuracy:", cv_results.mean())
# 테스트 세트로 최종 평가
test_accuracy = grid_search.best_estimator_.score(X_test, y_test)
print("Test accuracy:", test_accuracy)
import joblib

# 최적의 모델을 저장
model_filename = 'wine_review_sentiment_svm_model.pkl'
joblib.dump(grid_search.best_estimator_, model_filename)
print(f"Best model saved to {model_filename}")


Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best parameters found: {'svm__C': 10, 'svm__kernel': 'rbf', 'tfidf__max_features': 3000, 'tfidf__ngram_range': (1, 1)}
Cross-validation scores: [0.85421995 0.84654731 0.85933504 0.83887468 0.85384615]
Mean accuracy: 0.8505646271886681
Test accuracy: 0.8548057259713702
Best model saved to wine_review_sentiment_svm_model.pkl
