In [2]:
import pandas as pd
import time
import ast # ast 모듈은 Python 코드 구문 트리를 파싱하고 분석하는데 사용

In [4]:
# Load the hotel reviews from CSV
start = time.time()
df = pd.read_csv('./wine_review.csv')

In [5]:
# 데이터 구조 확인
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2890 entries, 0 to 2889
Data columns (total 32 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    2890 non-null   object 
 1   asins                 870 non-null    object 
 2   brand                 2825 non-null   object 
 3   categories            2890 non-null   object 
 4   dateAdded             2890 non-null   object 
 5   dateUpdated           2890 non-null   object 
 6   descriptions          2738 non-null   object 
 7   dimension             1838 non-null   object 
 8   ean                   2166 non-null   object 
 9   flavors               151 non-null    object 
 10  keys                  2890 non-null   object 
 11  manufacturer          849 non-null    object 
 12  manufacturerNumber    2457 non-null   object 
 13  name                  2890 non-null   object 
 14  reviews.date          2552 non-null   object 
 15  reviews.dateAdded    

In [8]:
# dropping columns we will not use:
# df.drop(["id", "asins", "dataAdded", "de", "", "", "", "", ], axis = 1, inplace=True)

df = df[["categories", "name", "reviews.rating", "reviews.text", "reviews.title"]]

In [11]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2890 entries, 0 to 2889
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   categories      2890 non-null   object 
 1   name            2890 non-null   object 
 2   reviews.rating  2445 non-null   float64
 3   reviews.text    2889 non-null   object 
 4   reviews.title   2846 non-null   object 
dtypes: float64(1), object(4)
memory usage: 113.0+ KB


Unnamed: 0,categories,name,reviews.rating,reviews.text,reviews.title
0,"Food & Beverage,Beverages,Wine, Beer & Liquor,...",Ecco Domani174 Pinot Grigio - 750ml Bottle,5.0,This a fantastic white wine for any occasion!,My Favorite White Wine
1,"Food & Beverage,Beverages,Wine, Beer & Liquor,...",Fresh Craft174 Mango Citrus - 4pk / 250ml Bottle,5.0,"Tart, not sweet...very refreshing and delicious!",Yum!!
2,"Food & Beverage,Beverages,Wine, Beer & Liquor,...",1000 Stories174 Zinfandel - 750ml Bottle,5.0,I was given this wine so it was a delightful s...,A New Favorite!
3,"Food & Beverage,Beverages,Wine, Beer & Liquor,...",1000 Stories174 Zinfandel - 750ml Bottle,5.0,This is a phenomenal wine and my new favorite ...,"Bold, Flavorful, Aromatic, Delicious"
4,"Food & Beverage,Beverages,Wine, Beer & Liquor,...",Pink Moscato - 3l Bottle - Wine Cube153,5.0,4 750ml bottles for the price of two With way ...,"Yum! Plus, Environmentally Friendly!"


In [12]:
# 대괄호와 따옴표를 제거

# remove opening and closing brackets)
df.categories = df.categories.str.strip("[']")
# remove all quotes too
df.categories = df.categories.str.replace(" ', '", ",", regex = False)

In [13]:
print (df["categories"])

0       Food & Beverage,Beverages,Wine, Beer & Liquor,...
1       Food & Beverage,Beverages,Wine, Beer & Liquor,...
2       Food & Beverage,Beverages,Wine, Beer & Liquor,...
3       Food & Beverage,Beverages,Wine, Beer & Liquor,...
4       Food & Beverage,Beverages,Wine, Beer & Liquor,...
                              ...                        
2885             Wine Shop,Kitchen and Food,View All Food
2886             Wine Shop,Kitchen and Food,View All Food
2887             Wine Shop,Kitchen and Food,View All Food
2888             Wine Shop,Kitchen and Food,View All Food
2889             Wine Shop,Kitchen and Food,View All Food
Name: categories, Length: 2890, dtype: object


In [15]:
# Now split the strings into a list
tag_list_df = df.categories.str.split(',', expand = True)

In [16]:
# Remove leading and trailing spaces
df["Tag_1"] = tag_list_df[0].str.strip()
df["Tag_2"] = tag_list_df[1].str.strip()
df["Tag_3"] = tag_list_df[2].str.strip()
df["Tag_4"] = tag_list_df[3].str.strip()
df["Tag_5"] = tag_list_df[4].str.strip()
df["Tag_6"] = tag_list_df[5].str.strip()

In [17]:
# Merge the 6 columns into one with melt
df_tags = df.melt(value_vars=["Tag_1", "Tag_2", "Tag_3", "Tag_4", "Tag_5", "Tag_6"])

In [26]:
# Submitted from a mobile device됨과 같은 일반적인 태그 중 일부는 쓸모가 없으므로 제거
# length of stay 태그 제거
# 객실, 스위트, 스튜디오, 아파트 등 매우 다양한 종류의 숙소가 있지만 거의 같은 의미이며 목표와 관련이 없으므로 고려 대상에서 제외

# Get the value counts
tag_vc = df_tags.value.value_counts()
# print(tag_vc)
print("The shape of the tags with no filtering:", str(df_tags.shape))
# Drop rooms, suites, and length of stay, mobile device and anything with less count than a 1000
df_tags = df_tags[~df_tags.value.str.contains("Beer|Beverages|Food|Wholesale Lots|Grocery|Wine Shop", na=False, case=False)]
tag_vc = df_tags.value.value_counts().reset_index(name="count").query("count > 50")
# Print the top 10 (there should only be 9 and we'll use these in the filtering section)
print(tag_vc[:10])

The shape of the tags with no filtering: (5796, 2)
             value  count
0   Wine & Spirits   2277
1  Cocktail Mixers    622
2             Wine    601
3          Spirits    414
4   Meal Solutions     77
5           Drinks     65
6    Juice & Cider     65
7   Cocktail Mixes     62
8   Party Supplies     57


In [29]:
# 마지막 단계는 이러한 각 태그에 관한 새 필드를 만드는 것
# 다음 모든 리뷰 레코드에서 태그 필드가 새 필드 중 하나와 일치하면 1을 추가하고, 일치하지 않으면 0을 추가
# 최종 결과는 비즈니스와 레저, 반려동물 동반 등의 이유로 이 호텔을 선택한 리뷰어의 수(총합)
# 이는 호텔을 추천할 때 유용한 정보로 활용할 수 있음

# Process the Tags into new columns
# The file Hotel_Reviews_Tags.py, identifies the most important tags
# Leisure trip, Couple, Solo traveler, Business trip, Group combined with Travelers with friends, 
# Family with young children, Family with older children, With a pet
df["Wine & Spirits"] = df.categories.apply(lambda tag: 1 if "Wine & Spirits" in tag else 0)
df["Cocktail Mixers"] = df.categories.apply(lambda tag: 1 if "Cocktail Mixers" in tag else 0)
df["Wine"] = df.categories.apply(lambda tag: 1 if "Wine" in tag else 0)
df["Spirits"] = df.categories.apply(lambda tag: 1 if "Spirits" in tag else 0)
df["Meal Solutions"] = df.categories.apply(lambda tag: 1 if "Meal Solutions" in tag or "Travelers with friends" in tag else 0)
df["Drinks"] = df.categories.apply(lambda tag: 1 if "Drinks" in tag else 0)
df["Juice & Cider"] = df.categories.apply(lambda tag: 1 if "Juice & Cider" in tag else 0)
df["Cocktail Mixes"] = df.categories.apply(lambda tag: 1 if "Cocktail Mixes" in tag else 0)

In [30]:
# Saving new data file with calculated columns
print("Saving results to Hotel_Reviews_Filtered.csv")
df.to_csv(r'./Hotel_Reviews_Filtered.csv', index = False)
end = time.time()
print("Filtering took " + str(round(end - start, 2)) + " seconds")

Saving results to Hotel_Reviews_Filtered.csv
Filtering took 1358.41 seconds


In [31]:
%pip install nltk
import time
import pandas as pd
import nltk as nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer # nltk의 VADER 감정 분석기를 가져옮
nltk.download('vader_lexicon') # VADER 감정 분석기에 필요한 어휘 자료를 다운로드

# Load the filtered hotel reviews from CSV
df = pd.read_csv('./Hotel_Reviews_Filtered.csv')

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/codespace/nltk_data...


In [32]:
nltk.download('stopwords')

# Remove stop words - can be slow for a lot of text!
# Ryan Han (ryanxjhan on Kaggle) has a great post measuring performance of different stop words removal approaches
# https://www.kaggle.com/ryanxjhan/fast-stop-words-removal # using the approach that Ryan recommends
start = time.time()
cache = set(stopwords.words("english"))
def remove_stopwords(review):
    text = " ".join([word for word in review.split() if word not in cache])
    return text

# Remove the stop words from both columns
df.Negative_Review = df.Negative_Review.apply(remove_stopwords)   
df.Positive_Review = df.Positive_Review.apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


AttributeError: 'DataFrame' object has no attribute 'Negative_Review'