# 데이터 파일 확인 및 분석 시작

In [1]:
import pandas as pd
import time

# 데이터 로드
start = time.time()
df = pd.read_csv('./wine_review.csv')
end = time.time()

print("Loading took " + str(round(end - start, 2)) + " seconds")

Loading took 0.11 seconds


# 데이터 구조 및 샘플 분석

총 32개의 컬럼과 2890개의 레코드가 있습니다.

주요 텍스트 컬럼은 reviews.text로, 이는 감성 분석에 사용될 텍스트 데이터를 포함합니다.

일부 컬럼에 결측값이 많이 포함되어 있습니다 (예: asins, dimension, flavors 등).

In [2]:
# 데이터 구조 확인
df.info()

# 데이터 샘플 확인
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2890 entries, 0 to 2889
Data columns (total 32 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    2890 non-null   object 
 1   asins                 870 non-null    object 
 2   brand                 2825 non-null   object 
 3   categories            2890 non-null   object 
 4   dateAdded             2890 non-null   object 
 5   dateUpdated           2890 non-null   object 
 6   descriptions          2738 non-null   object 
 7   dimension             1838 non-null   object 
 8   ean                   2166 non-null   object 
 9   flavors               151 non-null    object 
 10  keys                  2890 non-null   object 
 11  manufacturer          849 non-null    object 
 12  manufacturerNumber    2457 non-null   object 
 13  name                  2890 non-null   object 
 14  reviews.date          2552 non-null   object 
 15  reviews.dateAdded    

Unnamed: 0,id,asins,brand,categories,dateAdded,dateUpdated,descriptions,dimension,ean,flavors,...,reviews.sourceURLs,reviews.text,reviews.title,reviews.userCity,reviews.userProvince,reviews.username,sizes,sourceURLs,upc,weight
0,AV13ClKCGV-KLJ3akN68,,Gallo,"Food & Beverage,Beverages,Wine, Beer & Liquor,...",2017-07-24T23:59:11Z,2018-01-10T18:06:28Z,,1.0 in x 1.0 in x 1.0 in,,,...,https://redsky.target.com/groot-domain-api/v1/...,This a fantastic white wine for any occasion!,My Favorite White Wine,,,Bjh,,http://redsky.target.com/v1/plp/search?kwr=y&c...,492130000000.0,1.0 lbs
1,AV13CsvW-jtxr-f38AQO,,Fresh Craft Co.,"Food & Beverage,Beverages,Wine, Beer & Liquor,...",2017-07-24T23:59:42Z,2018-01-10T05:38:33Z,"[{""dateSeen"":[""2017-12-21T05:43:00.000Z"",""2017...",4.25 in x 4.25 in x 5.25 in,,,...,https://redsky.target.com/groot-domain-api/v1/...,"Tart, not sweet...very refreshing and delicious!",Yum!!,,,Wino,,http://redsky.target.com/v1/plp/search?kwr=y&c...,83120003441.0,2.45 lbs
2,AV13CVI_glJLPUi8O7Po,,1000 Stories,"Food & Beverage,Beverages,Wine, Beer & Liquor,...",2017-07-24T23:58:05Z,2018-01-10T05:38:31Z,,3.3 in x 3.3 in x 11.79 in,,,...,https://redsky.target.com/groot-domain-api/v1/...,I was given this wine so it was a delightful s...,A New Favorite!,,,Bama Mom,,http://redsky.target.com/v1/plp/search?kwr=y&c...,82896001453.0,3.09 lbs
3,AV13CVI_glJLPUi8O7Po,,1000 Stories,"Food & Beverage,Beverages,Wine, Beer & Liquor,...",2017-07-24T23:58:05Z,2018-01-10T05:38:31Z,,3.3 in x 3.3 in x 11.79 in,,,...,https://redsky.target.com/groot-domain-api/v1/...,This is a phenomenal wine and my new favorite ...,"Bold, Flavorful, Aromatic, Delicious",,,Av Dub,,http://redsky.target.com/v1/plp/search?kwr=y&c...,82896001453.0,3.09 lbs
4,AV13CYL4-jtxr-f37_-t,,Wine Cube153,"Food & Beverage,Beverages,Wine, Beer & Liquor,...",2017-07-24T23:58:18Z,2018-01-10T18:06:29Z,"[{""dateSeen"":[""2017-12-21T05:43:00.000Z"",""2017...",1.0 in x 1.0 in x 1.0 in,,,...,https://redsky.target.com/groot-domain-api/v1/...,4 750ml bottles for the price of two With way ...,"Yum! Plus, Environmentally Friendly!",Overland Park,,Chelseamay,,https://redsky.target.com/groot-domain-api/v1/...,85200600465.0,1.0 lbs


# 데이터 전처리
데이터 전처리를 위해 다음 단계를 수행하겠습니다:

불필요한 컬럼 제거: 감성 분석에 필요하지 않은 컬럼을 제거합니다.

결측값 처리: 주요 텍스트 컬럼 (reviews.text)의 결측값을 처리합니다.

텍스트 데이터 정리: 텍스트 정규화, 토큰화, 불용어 제거, 스테밍 또는 표제어 추출을 수행합니다.


In [3]:
# 감성 분석에 필요한 컬럼만 선택
df = df[['reviews.text']]

# 결측값 확인
df.isnull().sum()

reviews.text    1
dtype: int64

In [4]:
# 결측값 제거
df = df.dropna(subset=['reviews.text'])

In [5]:
%pip install nltk


Note: you may need to restart the kernel to use updated packages.


In [6]:
import re
import nltk as nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('stopwords')

# 텍스트 정리 함수
def preprocess_text(text):
    # 소문자 변환
    text = text.lower()
    # 특수문자 제거
    text = re.sub(r'\W', ' ', text)
    # 숫자 제거
    text = re.sub(r'\d', ' ', text)
    # 공백 제거
    text = re.sub(r'\s+', ' ', text).strip()
    
    # 토큰화
    tokens = word_tokenize(text)
    
    # 불용어 제거
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # 스테밍
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    
    return ' '.join(tokens)

# 텍스트 데이터 전처리 적용
# df['reviews.text'] = df['reviews.text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
nltk.download('stopwords')

# Remove stop words - can be slow for a lot of text!
# Ryan Han (ryanxjhan on Kaggle) has a great post measuring performance of different stop words removal approaches
# https://www.kaggle.com/ryanxjhan/fast-stop-words-removal # using the approach that Ryan recommends
start = time.time()
cache = set(stopwords.words("english"))
def remove_stopwords(review):
    text = " ".join([word for word in review.split() if word not in cache])
    return text

# Remove the stop words from both columns
df.Negative_Review = df.Negative_Review.apply(remove_stopwords)   
df.Positive_Review = df.Positive_Review.apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


AttributeError: 'DataFrame' object has no attribute 'Negative_Review'