In [667]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [668]:
# 데이터 불러오기
file_path = 'wine_review.csv'
df = pd.read_csv(file_path)

In [669]:
# 필요한 열 선택 및 결측치 제거
data = data[['reviews.rating', 'reviews.doRecommend','reviews.text']].dropna()  # 'reviews.rating', 'reviews.doRecommend'가 직접적인 평가수치라고 판단, 결측치를 없앱니다

In [670]:
print(df.columns)

Index(['id', 'asins', 'brand', 'categories', 'dateAdded', 'dateUpdated',
       'descriptions', 'dimension', 'ean', 'flavors', 'keys', 'manufacturer',
       'manufacturerNumber', 'name', 'reviews.date', 'reviews.dateAdded',
       'reviews.dateSeen', 'reviews.didPurchase', 'reviews.doRecommend',
       'reviews.id', 'reviews.numHelpful', 'reviews.rating',
       'reviews.sourceURLs', 'reviews.text', 'reviews.title',
       'reviews.userCity', 'reviews.userProvince', 'reviews.username', 'sizes',
       'sourceURLs', 'upc', 'weight'],
      dtype='object')


In [671]:
# 노이즈 제거
df['reviews.text'] = df['reviews.text'].fillna('')  # 결측값을 빈 문자열로 채움
df['reviews.text'] = df['reviews.text'].astype(str)  # 문자열로 변환
df['reviews.text'] = df['reviews.text'].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))  # 특수 문자 제거
df['reviews.text'] = df['reviews.text'].apply(lambda x: re.sub(r'\d+', '', x))  # 숫자 제거
df['reviews.text'] = df['reviews.text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())  # 불필요한 공백 제거

In [672]:
# NLTK 데이터 다운로드
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [673]:
# 불용어 제거 및 토큰화
stop_words = set(stopwords.words('english'))

def tokenize_and_remove_stopwords(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(tokens)

df['cleaned_text'] = df['reviews.text'].apply(lambda x: tokenize_and_remove_stopwords(x))

In [674]:
# 스테밍 및 표제어 추출
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_text(text):
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(stemmed_tokens)

def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized_tokens)

df['stemmed_text'] = df['cleaned_text'].apply(lambda x: stem_text(x))
df['lemmatized_text'] = df['cleaned_text'].apply(lambda x: lemmatize_text(x))

In [675]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer() # VADER 감정 분석기 초기화

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [676]:
# 감성 분석 함수 정의
def get_sentiment_score(text):
    return analyzer.polarity_scores(text)['compound']

# 감성 분석 적용
df['sentiment'] = df['lemmatized_text'].apply(lambda x: get_sentiment_score(x))

# 결과 확인
print(df[['lemmatized_text', 'sentiment']].head())

                                                                                                                                                          lemmatized_text  \
0                                                                                                                                           fantastic white wine occasion   
1                                                                                                                                     Tart sweetvery refreshing delicious   
2                                                                                              given wine delightful surprise find flavorful delicious taste new favorite   
3                                                                                                                                        phenomenal wine new favorite red   
4  ml bottle price two way le packaging YES PLEASE nervous good true wouldnt like wine fantastic go bit way get live near state line is

In [677]:
#  리뷰의 별점과 라벨링된 감정 점수가 일치하는지 확인하는 함수
def check_labeling(review_rating, sentiment_label):
    if review_rating >= 2 and sentiment_label == 1:
        return "Match"
    elif review_rating < 2 and sentiment_label == 0:
        return "Match"
    else:
        return "Mismatch"

# 감정 점수에 따라 라벨링 (감정 점수 0 이상이면 긍정(1), 0 미만이면 부정(0))
df['sentiment_label'] = df['sentiment'].apply(lambda x: 1 if x >= 0 else 0)

# review_rating과 라벨링된 감정 점수 비교
df['Check_Labeling'] = df.apply(lambda row: check_labeling(row['reviews.rating'], row['sentiment_label']), axis=1)

# 결과 출력
print(df[['reviews.rating', 'sentiment_label', 'Check_Labeling']])


      reviews.rating  sentiment_label Check_Labeling
0                5.0                1          Match
1                5.0                1          Match
2                5.0                1          Match
3                5.0                1          Match
4                5.0                1          Match
...              ...              ...            ...
2885             5.0                1          Match
2886             5.0                1          Match
2887             5.0                1          Match
2888             5.0                1          Match
2889             5.0                1          Match

[2890 rows x 3 columns]


In [678]:
# Match로 출력되는 비율 계산
match_count = (df['Check_Labeling'] == 'Match').sum()  # 'Check_Labeling' 열에서 'Match'인 항목의 수를 세고 match_count에 저장
total_count = len(df)  # DataFrame의 총 행 수를 total_count에 저장
match_ratio = match_count / total_count  # 'Match'로 출력되는 비율을 계산하여 match_ratio에 저장
print("matching ratio:", match_ratio)  # 'Match'로 출력되는 비율 출력


matching ratio: 0.769204152249135


In [679]:
# pandas 옵션 설정
pd.set_option('display.max_colwidth', None)

# 불일치하는 리뷰 추출
mismatches = df[df['Check_Labeling'] == 'Mismatch']

# 불일치하는 리뷰 출력
print(mismatches[['reviews.rating', 'sentiment', 'sentiment_label', 'Check_Labeling', 'reviews.text']])


      reviews.rating  sentiment  sentiment_label Check_Labeling  \
16               2.0    -0.0829                0       Mismatch   
17               5.0    -0.1779                0       Mismatch   
30               5.0    -0.1200                0       Mismatch   
45               4.0    -0.8430                0       Mismatch   
66               5.0    -0.3400                0       Mismatch   
...              ...        ...              ...            ...   
2860             3.0    -0.4767                0       Mismatch   
2861             1.0     0.7124                1       Mismatch   
2874             2.0    -0.0258                0       Mismatch   
2875             2.0    -0.6381                0       Mismatch   
2876             1.0     0.7074                1       Mismatch   

                                                                                                                                                                                                   

In [680]:
# 특성과 라벨을 분리
X = df['sentiment'].values.reshape(-1, 1)  # 감성 점수
y = df['sentiment_label']  # 감성 점수 라벨

# 분리된 특성과 라벨 확인
print("Features (X):", X[:5])
print("Labels (y):", y[:5])

Features (X): [[0.5574]
 [0.5719]
 [0.9118]
 [0.4588]
 [0.7865]]
Labels (y): 0    1
1    1
2    1
3    1
4    1
Name: sentiment_label, dtype: int64


In [681]:
%pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import numpy as np

Note: you may need to restart the kernel to use updated packages.


In [682]:
# 학습 데이터와 테스트 데이터로 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [683]:
# 별점을 이진 분류로 변환
y_train = np.where(y_train >= 3, 1, 0)
y_test = np.where(y_test >= 3, 1, 0)

In [684]:
# SVR 모델과 하이퍼파라미터 그리드 정의
svr = SVR()
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto']
}

In [685]:
# GridSearchCV 설정 및 실행
grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

In [686]:
# 최적 하이퍼파라미터 출력
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


In [687]:
# 최적 모델로 예측
best_svr = grid_search.best_estimator_
y_pred = best_svr.predict(X_test)

In [688]:
# 예측 결과를 이진 분류 라벨로 변환
y_pred_label = np.where(y_pred >= 0.5, 1, 0)  # 예측 값이 0.5 이상이면 1, 미만이면 0

In [689]:
# y_test와 y_pred_label의 값 분포 확인
print("y_test 값 분포:", np.unique(y_test, return_counts=True))
print("y_pred_label 값 분포:", np.unique(y_pred_label, return_counts=True))


y_test 값 분포: (array([0]), array([578], dtype=int64))
y_pred_label 값 분포: (array([0]), array([578], dtype=int64))


In [690]:
# 학습 데이터와 테스트 데이터의 클래스 분포 확인
print("Train data label distribution:", np.unique(y_train, return_counts=True))
print("Test data label distribution:", np.unique(y_test, return_counts=True))

Train data label distribution: (array([0]), array([2312], dtype=int64))
Test data label distribution: (array([0]), array([578], dtype=int64))


In [691]:
# 모델 평가
accuracy = accuracy_score(y_test, y_pred_label)
precision = precision_score(y_test, y_pred_label, average='binary')
recall = recall_score(y_test, y_pred_label, average='binary')
f1 = f1_score(y_test, y_pred_label, average='binary')


print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 1.0
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [692]:
from collections import Counter

# SMOTE 적용 후 학습 데이터의 클래스 분포 확인
print('Original training dataset shape %s' % Counter(y_train))
print('Resampled training dataset shape %s' % Counter(y_train_resampled))

# 테스트 데이터의 클래스 분포 확인
print('Test dataset shape %s' % Counter(y_test))

Original training dataset shape Counter({0: 2312})


NameError: name 'y_train_resampled' is not defined