모듈 세팅

In [None]:
%pip install pandas numpy scikit-learn nltk matplotlib seaborn

import pandas as pd
import numpy as np
import nltk
import time

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('vader_lexicon')

데이터 호출 및 전처리

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# 데이터 불러오기
df = pd.read_csv('wine_review.csv')

# 필요없는 열 제거
df = df[['name', 'reviews.rating', 'reviews.text']]

# 결측값 처리
df = df.dropna()

# 불용어 제거를 위한 NLTK 다운로드 (한 번 다운로드 후 주석 처리할 것)
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# 불용어 목록 다운로드
stop_words = set(stopwords.words('english'))

# 텍스트 전처리 함수 정의
def preprocess_text(text):
    # 소문자로 변환
    text = text.lower()
    # 특수 문자 제거
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # 토큰화
    tokens = word_tokenize(text)
    # 불용어 제거
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # 표제어 추출
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # 리스트를 공백으로 결합하여 문자열로 변환
    preprocessed_text = ' '.join(lemmatized_tokens)
    return preprocessed_text

# 리뷰 텍스트 전처리
df['cleaned_text'] = df['reviews.text'].apply(preprocess_text)

# 필요한 열 선택
df = df[['name', 'reviews.rating', 'cleaned_text']]

# 전처리된 데이터를 새 CSV 파일로 저장
output_filename = 'preprocessed_wine_reviews.csv'
df.to_csv(output_filename, index=False)

print(f"Preprocessed data saved to {output_filename}")


전처리한 데이터셋 컬럼확인

In [None]:
# 'processed_wine_reviews.csv' 파일을 로드
df = pd.read_csv('preprocessed_wine_reviews.csv')

# 데이터프레임의 컬럼 확인
print(df.columns)

Vader를 사용한 모델 생성 및 튜닝

In [13]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# 데이터 불러오기
df = pd.read_csv('preprocessed_wine_reviews.csv')

# VADER 모델 다운로드
nltk.download('vader_lexicon')

# VADER 초기화
sid = SentimentIntensityAnalyzer()

# VADER 모델을 이용한 감정 분석 함수
def analyze_sentiment(text):
    scores = sid.polarity_scores(text)
    compound_score = scores['compound']
    if compound_score >= 0.05:
        return 1  # Positive
    elif compound_score <= -0.05:
        return -1  # Negative
    else:
        return 0  # Neutral

# NaN 값 처리
df['sentiment'] = df[df['cleaned_text'].notna()]['cleaned_text'].apply(analyze_sentiment)

# NaN이 포함된 행을 NaN으로 설정
df.loc[df['cleaned_text'].isna(), 'sentiment'] = np.nan

# 필요한 열 선택
df = df[['name', 'cleaned_text', 'reviews.rating', 'sentiment']]

# 데이터 분할
X = df['cleaned_text']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 파이프라인 설정
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SentimentIntensityAnalyzer())
])

# 학습
pipeline.fit(X_train, y_train)

# 예측
y_pred = pipeline.predict(X_test)

# 평가
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


TypeError: Last step of Pipeline should implement fit or be the string 'passthrough'. '<nltk.sentiment.vader.SentimentIntensityAnalyzer object at 0x788f17d52cb0>' (type <class 'nltk.sentiment.vader.SentimentIntensityAnalyzer'>) doesn't