In [1]:
import pandas as pd
import numpy as np

In [5]:
reviews = pd.read_csv('./datasets/original_reviews.csv', index_col=0)
reviews.head()

Unnamed: 0,property_id,rating,badge,timestamp,content
0,106005,10.0,Exceptional,2020-07-05,Beautiful hotel with great views and easy loca...
1,106005,10.0,Exceptional,2020-06-23,Spectacular hotel and location. Very well appo...
2,106005,10.0,Exceptional,2020-06-20,"Great service, everyone was friendly and respe..."
3,106005,10.0,Exceptional,2020-05-17,The lady that checked me was very kind and hel...
4,106005,10.0,Exceptional,2020-03-23,Impeccable! Attentive staff and relaxing room ...


In [6]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81492 entries, 0 to 81491
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   property_id  81492 non-null  int64  
 1   rating       81492 non-null  float64
 2   badge        81492 non-null  object 
 3   timestamp    81492 non-null  object 
 4   content      57401 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 3.7+ MB


In [7]:
no_content_reviews = reviews[reviews.content.isna()]
no_content_reviews.shape

(24091, 5)

In [8]:
text_reviews = reviews.drop(index = no_content_reviews.index)
text_reviews.head()

Unnamed: 0,property_id,rating,badge,timestamp,content
0,106005,10.0,Exceptional,2020-07-05,Beautiful hotel with great views and easy loca...
1,106005,10.0,Exceptional,2020-06-23,Spectacular hotel and location. Very well appo...
2,106005,10.0,Exceptional,2020-06-20,"Great service, everyone was friendly and respe..."
3,106005,10.0,Exceptional,2020-05-17,The lady that checked me was very kind and hel...
4,106005,10.0,Exceptional,2020-03-23,Impeccable! Attentive staff and relaxing room ...


In [9]:
text_reviews.shape

(57401, 5)

In [10]:
from langdetect import detect, detect_langs, DetectorFactory
DetectorFactory.seed = 0

def get_error_row(df):
    """
    rows cannot be processed as language
    """
    
    error_row_index = []
    for text in df['content']:
        try:
            language = detect(text)
        except:
            language = "error"
            index = df[df['content']== text].index.tolist()
            for i in index:
                error_row_index.append(i)
    return error_row_index

In [11]:
error_row_index = get_error_row(text_reviews)
len(error_row_index), len(set(error_row_index))

(76, 32)

In [12]:
for ind in list(set(error_row_index)):
    print(text_reviews.loc[ind, 'content'])

—————————-.................................................................................
Good property...………………………………………………………………………...…………………….…………..
.
0
19/9-21/9
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>.
                                                      
 
-
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-
..,..........................................................................
....................................................................................................................................... 
 
 
👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍
 
..................................................................................................................................

In [15]:
# drop error rows
text_reviews = text_reviews.drop(set(error_row_index))

In [16]:
def get_non_eng_reviews(df):
    """
    this notebook will only process English reviews. 
    """
    non_eng_review_index = []
    
    for text in df['content']:
        result_lang = detect(text)
        if result_lang != 'en':
            index = df[df['content']== text].index.tolist()
            for i in index:
                non_eng_review_index.append(i)
            
    return non_eng_review_index

In [17]:
non_eng_review_index = get_non_eng_reviews(text_reviews)
for ind in list(set(non_eng_review_index)):
    print(text_reviews.loc[ind, 'content'])

Friendly staff
Breakfast average. Not value for money.
It’s quite ok except parking
All good 
Superb
지하철역과 거리가 좀 있지만 지하철역 및 공항 사이 버스가 자주 있어서 접근성이 용이함
여행비용 절약과 현지음식 체험을 위해
조리기구가 있는 호텔을 찾던중 AEA호텔을 발견 후
이용하였습니다.

* 장점
1. 공항과 가까움(도보 25분)
2. 도심외곽에 위치한 현지느낌 
3. 넓은 숙소 
4. 조리에 유용한 주방조리기구
5. 깨끗한 숙소내부
6. 2섹션으로 나뉘어진 옷장
7. 호텔입구, 현관 2중 카드보안

* 단점
1. 도심과 멀다(관광시 교통비, 시간소요)
2. 수영장, 레스토랑 등 외부시설이 없다
3. 도로변이 복잡함(교통이용시 일정시간 이동해야함)
4. 외곽위치
5. 1층입구 계단
6. 벨보이없음
7. 인포 안내 시간(늦은오전~ 저녁)

* 후기
가성비 대비 넓고 쾌적한 숙소였습니다. 조리가 가능하여 직접 마트에서 구매 후 음식을 구매하였고 여행경비를 절약할 수 있었습니다. 
Rất yên tĩnh ,nhân viên phục vụ nhiệt tình  ,nói chung rất tốt .
マスコット地区にある空港に比較的近いアパートメントタイプのホテルです。空港からのアクセスは４００番のバスに乗れば約１５分程で付近のバス停に着くので費用を抑えて旅を楽しみたい方には便利です。ホテル付近からCBDへのアクセスもホテル付近からの通りから数分ごとにバスが出ているのでそこまで不便は感じません。部屋は大変広くてひとりでは持て余しそうな大きさでした。キッチンもあり近くの商店街にマーケットもあるので簡単な料理もできます。レセプションの方も親切で前日にメールをくれ飛行機の到着時間を伝えたところ、アーリーチェックインもさせてもらいました。街中のホテルも便利ですが、シドニー 郊外の落ち着いた雰囲気を味わいながらのんびりした時間を過ごしたいならオススメです。
環境不錯，空間大，只是垃圾要自己丟掉
Very good
スタッフの方はフレンドリーで良い方達でした。6人部屋はとても広

In [18]:
# only take english reviews for now
text_reviews = text_reviews.drop(set(non_eng_review_index))

text_reviews.reset_index(drop = True, inplace = True)
text_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52003 entries, 0 to 52002
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   property_id  52003 non-null  int64  
 1   rating       52003 non-null  float64
 2   badge        52003 non-null  object 
 3   timestamp    52003 non-null  object 
 4   content      52003 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 2.0+ MB


In [19]:
text_reviews.to_csv('./datasets/text_reviews.csv')