# Step 0: Initialization

In [17]:
! git config --global user.name "Hao-Lin Yeh"
! git config --global user.email r11323045@ntu.edu.tw

# Step 1: Web Crawling

In [18]:
import requests
import time
import math
from bs4 import BeautifulSoup
import pandas as pd

In [19]:
def crawler(url):
    # 讓程式暫停2秒，不要讓booking覺得我們在攻擊他們
    time.sleep(2)

    headers = {
    #User-Agent的值表示這個請求是由Chrome 51瀏覽器（User-Agent字符串中的信息）發送，而且是在一個Chrome OS x86_64的環境下
    'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
    #表示使用者首選的語言是美式英文（en-US），優先權為0.5。
    #若伺服器能夠提供美式英文內容，就會優先返回該內容，如果不行，則默認使用一般的英文內容（en）
    'Accept-Language': 'en-US, en;q=0.5'}
    # Reference
    # https://medium.com/@tetianakushniruk/web-scraping-booking-com-with-beautiful-soup-for-hotel-data-analysis-95f874820c18
    # https://www.zenrows.com/blog/python-requests-user-agent#what-is
    
    # 訪問該網站
    response  = requests.get(url, headers=headers)
    # 解析網站內容
    soup = BeautifulSoup(response.text, 'html.parser')
    
    return soup

In [20]:
def find_value(hotel, tag, class_):
    try:
        # hotel是soup抓下來的內容，找他的文字檔
        # tag和class_是在讀取html文字檔中用到的東西
        txt = hotel.find(tag, class_).text
    except AttributeError:
        # 有時候某間旅館可能沒有全部資料，這時直接設成None，之後再清掉
        txt = None
    return txt

In [21]:
def booking(location, check_in_date, check_out_date):
    # location是我們要的旅館所在地，check_in_date, check_out_date分別是住宿和退宿日期
    
    # 先找到有幾家旅館
    first_page_url = f'https://www.booking.com/searchresults.zh-tw.html?ss={location}&ssne={location}&ssne_untouched={location}&label=gen173nr-1BCAEoggI46AdIM1gEaOcBiAEBmAEwuAEXyAEM2AEB6AEBiAIBqAIDuAK2yuuqBsACAdICJDUyY2M5OTkzLTRhMmItNDZmNS04MDI1LWFlMzRiNzZlMzlhMNgCBeACAQ&sid=eaa1f2af7556a3de60c00d3cf49ff2df&aid=304142&lang=zh-tw&sb=1&src_elem=sb&src=searchresults&dest_type=city&checkin={check_in_date}&checkout={check_out_date}&group_adults=2&no_rooms=1&group_children=0'
    first_page_soup = crawler(first_page_url)
    # 找到旅館總數
    hotel_numbers = first_page_soup.find('div', class_='efdb2b543b e4b7a69a57').text.lstrip(f'{location}：找到 ').rstrip(' 間住宿')
    # 一頁會顯示25家旅館，以旅館總數算取頁數
    pages = math.ceil(int(hotel_numbers) / 25) 

    #先設6個空的list，用來裝資料
    name, address, price, rating, distance, comment = [],[],[],[],[],[]
    for page in range(pages):
        url = f'https://www.booking.com/searchresults.zh-tw.html?ss={location}&ssne={location}&ssne_untouched={location}&label=gen173nr-1BCAEoggI46AdIM1gEaOcBiAEBmAEwuAEXyAEM2AEB6AEBiAIBqAIDuAK2yuuqBsACAdICJDUyY2M5OTkzLTRhMmItNDZmNS04MDI1LWFlMzRiNzZlMzlhMNgCBeACAQ&sid=eaa1f2af7556a3de60c00d3cf49ff2df&aid=304142&lang=zh-tw&sb=1&src_elem=sb&src=searchresults&dest_type=city&checkin={check_in_date}&checkout={check_out_date}&group_adults=2&no_rooms=1&group_children=0&offset={page*25}'
        soup = crawler(url)
        # 過濾其他內容，找到該頁全部的旅館資訊
        hotels = soup.find_all('div', class_ = 'c82435a4b8 a178069f51 a6ae3c2b40 a18aeea94d d794b7a0f7 f53e278e95 c6710787a4')
        for hotel in hotels:
            # 把需要資料加進list
            name.append(find_value(hotel, 'div', 'f6431b446c a15b38c233'))
            address.append(find_value(hotel, 'span', 'aee5343fdb def9bc142a'))
            price.append(find_value(hotel, 'span', 'f6431b446c fbfd7c1165 e84eb96b1f'))
            rating.append(find_value(hotel,'div','a3b8729ab1 d86cee9b25'))
            comment.append(find_value(hotel, 'div', 'a3b8729ab1 e6208ee469 cb2cbb3ccb'))

            # 有其他資料和distance共用同樣的tag和class，要先過濾掉
            for_distance = hotel.find_all('span', class_='aee5343fdb')[2]
            distance.append(find_value(for_distance, 'span', 'f419a93f12'))

    #把爬完的資料裝進DataFrame裡面
    df = pd.DataFrame({
        'name':name,
        'location':address,
        'price':price,
        'rating':rating,
        'distance':distance,
        'comment':comment})
    return df

In [22]:
# 檢查用code
#df = booking('台北', '2023-11-25', '2023-11-26')

In [23]:
# 檢查抓下來的內容
#len(df)

In [24]:
# 檢查是否有NA，等一下直接刪掉
# df.isna().any()

# Step 2: Data Cleaning

In [25]:
#參考作業一助教的code
def clean(df):
    # 把沒有資料的row直接刪掉，重設index
    df = df.dropna().reset_index(drop=True)

    #把price前面的TWD刪掉，並把價格中間的','刪掉，最後轉成float
    df['price'] = df['price'].str.replace('TWD', '').str.replace(',', '').astype(float)
    #把rating轉成float
    df['rating'] = df['rating'].astype(float)
    
    #剛好我們要的數字前後都有空格，以空格區隔
    def convert_distance(distance_s):
        if '公尺' in distance_s:
            distance = float(distance_s.split(' ')[1]) / 1000
        elif '公里' in distance_s:
            distance = float(distance_s.split(' ')[1])
        else:
            distance = None
        return distance
    df['distance'] = df['distance'].apply(convert_distance).astype(float)

    # 再清一次NA資料
    df = df.dropna().reset_index(drop=True)
    return df

# Step 3: Data Visualization

In [26]:
import plotly.express as px

In [27]:
def draw(df):
    # 把DataFrame中的資料匯出圖片，橫軸為price，縱軸為rating，以rating做顏色區分
    # 滑鼠移到點上可以顯示name、price、distance和rating
    fig = px.scatter(df, x='price', y='distance', color='rating', hover_name='name',
                    title = 'Hotel price and Distance Scatter Plot')
    return fig

# Step 4 Dash Application

In [28]:
from datetime import date, datetime, timedelta
from dash import Dash, dcc, html, Input, Output, callback

In [29]:
# Reference: 
# https://app.datacamp.com/learn/courses/building-dashboards-with-dash-and-plotly

# Create an app object
app = Dash(__name__)
app.layout = html.Div([
    # 輸入文字檔的地方
    dcc.Input(
        id='my_input',
        type='text',
        placeholder="Enter your text",
        debounce=True),
    
    # 用日曆選日期的地方，最小設今天日期，最大設一年後
    dcc.DatePickerRange(
        id='my-date-picker-range',
        min_date_allowed=datetime.now(),
        max_date_allowed=datetime.now()+timedelta(days=365),
        initial_visible_month=datetime.now(),
    ),
    
    # 輸出圖片的地方
    html.Div(dcc.Graph(id='picture'))
])

#設定回傳資料
@callback(
    Output('picture', 'figure'),
    Input('my_input', 'value'),
    Input('my-date-picker-range', 'start_date'),
    Input('my-date-picker-range', 'end_date'))
def update_output(my_input, start_date, end_date):
    if my_input  and start_date and end_date is not None:

        # 把日曆選取的日期轉成booking網址一樣的格式
        start_date_object = date.fromisoformat(start_date)
        start_date_string = start_date_object.strftime('%Y-%m-%d')
        end_date_object = date.fromisoformat(end_date)
        end_date_string = end_date_object.strftime('%Y-%m-%d')

        # 執行爬蟲
        df=booking(my_input, start_date_string, end_date_string)
        # Ensure the DataFrame is not overwritten
        df_1 = df.copy(deep=True)
        df_1 = clean(df_1)
        fig = draw(df_1)

    return fig


# http://127.0.0.1:8050
# running the server
if __name__ == '__main__':
    app.run_server(debug=True)

[1;31m---------------------------------------------------------------------------[0m
[1;31mUnboundLocalError[0m                         Traceback (most recent call last)
[1;31mUnboundLocalError[0m: cannot access local variable 'fig' where it is not associated with a value

[1;31m---------------------------------------------------------------------------[0m
[1;31mUnboundLocalError[0m                         Traceback (most recent call last)
[1;31mUnboundLocalError[0m: cannot access local variable 'fig' where it is not associated with a value

[1;31m---------------------------------------------------------------------------[0m
[1;31mUnboundLocalError[0m                         Traceback (most recent call last)
[1;31mUnboundLocalError[0m: cannot access local variable 'fig' where it is not associated with a value

[1;31m---------------------------------------------------------------------------[0m
[1;31mUnboundLocalError[0m                         Traceback (most rece