In [1]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
from bs4 import BeautifulSoup
import requests
import time
from googletrans import Translator

In [0]:
def get_url(url):
    url_split = url.split('-Reviews-')
    for i in range(max_page):
        page = i*10
        if page == 0:
            url_page = url
        else:
            url_page = url_split[0] + '-Reviews-or{}-'.format(page) + url_split[1]
            

In [31]:
def get_reviews(response):
    soup = BeautifulSoup(response.text, 'lxml')
    reviews = []
    review_container = soup.find_all(class_='review-container')
    for i in range(len(review_container)):
        review = review_container[i].find_all("p", class_='partial_entry')[0].text
        reviews.append(review)
    return reviews

In [32]:
def translate_reviews(reviews):
    translator = Translator(service_urls=['translate.googleapis.com'])
    reviews_translated = []
    translations = translator.translate(reviews, dest='ja')
    for translation in translations:
        reviews_translated.append(translation.text)
    return reviews_translated

In [42]:
def get_translated_reviews(url, max_page = 10):
    """
    対象のURLにアクセスする関数
    アクセスできない等のエラーが発生したら例外を投げる
    """
    # 接続確立の待機時間、応答待機時間を10秒とし、それぞれの値を超えた場合は例外が発生（ConnectTimeout）
    url_split = url.split('-Reviews-')
    reviews_all = []
    for i in range(max_page):
        page = i*10
        if page == 0:
            url_page = url
        else:
            url_page = url_split[0] + '-Reviews-or{}-'.format(page) + url_split[1]
        print(url_page)
        data = requests.get(url_page, timeout=10)
        data.encoding = data.apparent_encoding
        # アクセス過多を避けるため、2秒スリープ
        time.sleep(2)

        # レスポンスのステータスコードが正常(200番台)以外の場合は、例外を発生させる(HTTPError)
        if data.status_code != requests.codes.ok:
            break
        else:
            reviews = get_reviews(data)
            reviews_all.append(reviews)
    reviews_all = [item for sublist in reviews_all for item in sublist]
    reviews_translated = translate_reviews(reviews_all)
    zipped = zip(reviews_all, reviews_translated)
    trip_advisor_reviews_df = pd.DataFrame(set(zipped), columns=["en", "jp"])
    return trip_advisor_reviews_df

In [43]:
url = 'https://www.tripadvisor.com/Restaurant_Review-g187147-d10085290-Reviews-Kodawari_Ramen_Yokocho-Paris_Ile_de_France.html'

In [44]:
trip_advisor_reviews_df = get_translated_reviews(url, max_page = 2)

https://www.tripadvisor.com/Restaurant_Review-g187147-d10085290-Reviews-Kodawari_Ramen_Yokocho-Paris_Ile_de_France.html
https://www.tripadvisor.com/Restaurant_Review-g187147-d10085290-Reviews-or10-Kodawari_Ramen_Yokocho-Paris_Ile_de_France.html


In [45]:
trip_advisor_reviews_df

Unnamed: 0,en,jp
0,"I’ve been there many time, I tried pretty much...",私はそこに何度も行っており、さまざまな種類のラーメンを試しました。\n全部好き。場所は本当に...
1,We decided to try due to the long queue we saw...,ある夜、オンラインで良いレビューを見て長い列ができていたので、試してみることにしました。食べ...
2,"We arrived for the opening time of 12h00, wait...",私たちは 12 時の開店時間に到着し、待ち行列で 25 分待ってから入りました。最終的に入る...
3,The ramens are really good. Its all about FISH...,ラーメンは本当にうまい。それはすべて FISH についてです。彼らは魚のスープを使用します。...
4,"Very good food, close to a real Japanese exper...",本物の日本体験に近い、とてもおいしい料理。長期滞在にはあまり快適ではありませんが、雰囲気は良...
5,Real dissappointment total tourist trap. Food ...,本当の失望の総観光客の罠。食べ物は本当に塩辛く、味がしませんでした。他の 5 つ星のレビュー...
6,The atmosphere of this restaurant was great an...,このレストランの雰囲気は素晴らしく、すべての従業員はとても快適でした。唯一の問題は、彼らのラ...
7,Worth the wait! Had to queue for about 1 hour ...,待つ価値あり！土曜日の夜に約 1 時間待ち行列に入れなければなりませんでした。本物の日本の雰...
8,Definitely ramen worth putting on a bucket lis...,バケツリストに載せる価値のあるラーメン。すべての素材の組み合わせによる味と香りの完璧さは、神...
9,This is a small ramen restaurant. I waited so ...,小さなラーメン屋です。冬の間、私は長い間家族と並んで待っていたので、実際に通りの反対側にある...


In [3]:
response = connect_url(url)

In [14]:
url_split = url.split('-Reviews-')

In [17]:
url_split[0] + "-Reviews-or10-" + url_split[1]
"-Reviews-or10-"

'https://www.tripadvisor.com/Restaurant_Review-g187147-d10085290-Reviews-or10-Kodawari_Ramen_Yokocho-Paris_Ile_de_France.html'

In [27]:
for i in range(10):
    page = i*10
    if page == 0:
        url_page = url
    url_page = url_split[0] + '-Reviews-or{}-'.format(page) + url_split[1]
    print(url_page)

https://www.tripadvisor.com/Restaurant_Review-g187147-d10085290-Reviews-or0-Kodawari_Ramen_Yokocho-Paris_Ile_de_France.html
https://www.tripadvisor.com/Restaurant_Review-g187147-d10085290-Reviews-or10-Kodawari_Ramen_Yokocho-Paris_Ile_de_France.html
https://www.tripadvisor.com/Restaurant_Review-g187147-d10085290-Reviews-or20-Kodawari_Ramen_Yokocho-Paris_Ile_de_France.html
https://www.tripadvisor.com/Restaurant_Review-g187147-d10085290-Reviews-or30-Kodawari_Ramen_Yokocho-Paris_Ile_de_France.html
https://www.tripadvisor.com/Restaurant_Review-g187147-d10085290-Reviews-or40-Kodawari_Ramen_Yokocho-Paris_Ile_de_France.html
https://www.tripadvisor.com/Restaurant_Review-g187147-d10085290-Reviews-or50-Kodawari_Ramen_Yokocho-Paris_Ile_de_France.html
https://www.tripadvisor.com/Restaurant_Review-g187147-d10085290-Reviews-or60-Kodawari_Ramen_Yokocho-Paris_Ile_de_France.html
https://www.tripadvisor.com/Restaurant_Review-g187147-d10085290-Reviews-or70-Kodawari_Ramen_Yokocho-Paris_Ile_de_France.html
h

In [21]:
'-Reviews-or{}-'.format("10")

'-Reviews-or10-'

In [4]:
soup = BeautifulSoup(response.text, 'lxml')

In [5]:
reviews = []
review_container = soup.find_all(class_='review-container')
for i in range(len(review_container)):
    review = review_container[i].find_all("p", class_='partial_entry')[0].text
    reviews.append(review)

In [8]:
translator = Translator(service_urls=['translate.googleapis.com'])

In [9]:
reviews_translated = []
translations = translator.translate(reviews, dest='ja')
for translation in translations:
    reviews_translated.append(translation.text)

In [10]:
zipped = zip(reviews, reviews_translated)

In [11]:
trip_advisor_reviews_df = pd.DataFrame(set(zipped), columns=["en", "jp"])

In [12]:
# Write recipe outputs
your_trip_advisor = dataiku.Dataset("your_trip_advisor")
your_trip_advisor.write_with_schema(trip_advisor_reviews_df)

10 rows successfully written (6VdZp2MatJ)


In [13]:
trip_advisor_reviews_df

Unnamed: 0,en,jp
0,"We arrived for the opening time of 12h00, wait...","We arrived for the opening time of 12h00, wait..."
1,"Really excellent food, and fab service. Also r...","Really excellent food, and fab service. Also r..."
2,We decided to try due to the long queue we saw...,We decided to try due to the long queue we saw...
3,This is a small ramen restaurant. I waited so ...,This is a small ramen restaurant. I waited so ...
4,"Ambiance is great. The food however, needs som...","Ambiance is great. The food however, needs som..."
5,Worth the wait! Had to queue for about 1 hour ...,Worth the wait! Had to queue for about 1 hour ...
6,The atmosphere of this restaurant was great an...,The atmosphere of this restaurant was great an...
7,We didn't get to go to Japan like we usually d...,We didn't get to go to Japan like we usually d...
8,"The best ramen I’ve ever had. Tender meat, fre...","The best ramen I’ve ever had. Tender meat, fre..."
9,"I’ve been there many time, I tried pretty much...","I’ve been there many time, I tried pretty much..."
