# Scraping Hotel Reviews from TripAdvisor

#### Scraped English reviews from hotels based in Seoul: https://www.tripadvisor.in/Hotels-g294197-Seoul-Hotels.html
#### Top 240 hotels were selected in order of the Best Value as listed on TripAdvisor as of March 6, 2023.

In [1]:
# importing required libraries

import time
import pandas as pd
import selenium
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException

#### Reading Hotel Dataset (Hotel Name and it's URL)

In [2]:
df = pd.read_excel('Tripadvisor_HotelsList.xlsx')
hotels = df['Hotel'].tolist()
urls = df['URL'].tolist() 

### Complete Code for Scraping Title, Date, Reviewer, Location, Rating and the Review Text

##### This code will scrape the data for all the hotels provided through a list (Approximately takes 10.5 hours)

In [None]:
for hotel,url in zip(hotels,urls):
    
    time.sleep(30)
    
    hotelname = hotel
    hotelurl = url
    
    
    
    Title = []
    Date = []
    Reviewer = []
    Location = []
    Rating = []
    Review = []

    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--incognito')
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

    #getting the link of the hotel

    driver.get(url)

    # defining the function to extract reviews
    
    def getreviews():
        
        # clicks on'Read More' button to expand each review to get full review
        element_list = driver.find_elements(By.XPATH, "//span[@class='Ignyf _S Z']") 
        
        if len(element_list) > 0:
            print(f'there is an element: {element_list}')
            driver.execute_script("arguments[0].click();", element_list[0])
        else:
            print('theres no element')
            time.sleep(2)

        # we will go with one review at a time. It finds the element which contains all data including title,
        # date, reviewer, location,rating, review text for each review.
        
        containers = driver.find_elements(By.XPATH, "//div[@class='YibKl MC R2 Gi z Z BB pBbQr']")

        # looping through the elements found above
        
        for items in containers:

            try:
                title = items.find_element(By.XPATH,".//div[contains(@data-test-target, 'review-title')]")
                Title.append(title.text)

            except NoSuchElementException:
                Title.append("N/A")


            try:
                dates = items.find_element(By.XPATH,".//span[@class = 'teHYY _R Me S4 H3']" )
                Date.append(dates.text)
            except NoSuchElementException:
                Date.append("N/A")


            try:
                reviewer = items.find_element(By.XPATH, ".//div[@class='cRVSd']")
                Reviewer.append(reviewer.text)
            except NoSuchElementException:
                Reviewer.append("N/A")


            try:
                reviewer_loc = items.find_element(By.XPATH, ".//span[@class='RdTWF']")
                Location.append(reviewer_loc.text)
            except NoSuchElementException:
                Location.append("N/A")         


            try:
                for div in items.find_elements(By.XPATH,".//div[@class = 'Hlmiy F1']"):
                    rating = div.find_element(By.TAG_NAME, "span")
                    Rating.append(rating.get_attribute("class").split("_")[-1])
            except NoSuchElementException:
                Rating.append("N/A")


            try:
                review = items.find_element(By.XPATH,".//q[@class='QewHA H4 _a']")
                Review.append(review.text)
            except NoSuchElementException:
                Review.append("N/A")

   #running a while loop to loop through all the pages of a hotel unless there is no "Next" button to click.

    i = 1
    nextbn = True
    while True:
        try:
            time.sleep(3)
            getreviews()
            driver.find_element(By.XPATH,f"//div/div[3]/div[13]/div/a[{i}]").click()
            i= i+1
            if i>2:
                while nextbn:
                    try: 
                        # clicks the "Next" button
                        driver.find_element(By.XPATH,f"//div/div[3]/div[13]/div/a[2]").click()
                        time.sleep(3) 
                        getreviews()
                    except NoSuchElementException:
                        nextbn = False
                break     
        except NoSuchElementException:
            break


    # Prepare CSV file

    Hotel = [hotelname for i in range(len(Review))]
    dict = {'Hotel': Hotel,'Title': Title,'Date':Date,'Reviewer': Reviewer,'Location': Location,'Rating': Rating,'Review':Review}
    df = pd.DataFrame(dict)
    df.to_csv(f"{hotelname}.csv",index = False)   


### Saving the Data

In [None]:
# We have 240 files, one for each Hotel. Reading all the CSV files in this project directory.

import os
import pandas as pd
import glob

#listing all the files (CSV only)

hotels_file_list = glob.glob('*.{}'.format('csv'))

print(len(hotels_file_list))
hotels_file_list

In [7]:
#Merging all the files 

df_final = pd.concat([pd.read_csv(f) for f in hotels_file_list ], ignore_index=True)
df_final.shape

(50178, 7)

In [9]:
#saving the dataframe to the file

df_final.to_csv("TripAdvisor_English_Reviews_Seoul_Hotels.csv", index = False)
pd.read_csv("TripAdvisor_English_Reviews_Seoul_Hotels.csv").head()

Unnamed: 0,Hotel,Title,Date,Reviewer,Location,Rating,Review
0,9Brick_Hotel,Love the look,Date of stay: December 2022,ChristelleNg wrote a review Jan 2023,"Singapore, Singapore",40,Location not too bad. Near the Hongik station....
1,9Brick_Hotel,Very bad experience,Date of stay: December 2022,Molly123 wrote a review Dec 2022,,10,I asked the hotel to provide another set of co...
2,9Brick_Hotel,Chic design,Date of stay: October 2019,wamysdottir wrote a review Apr 2020,"Copenhagen, Denmark",40,A clean boutique hotel conveniently located wi...
3,9Brick_Hotel,Great location but weak service standards,Date of stay: December 2019,worldtraveller wrote a review Dec 2019,,30,Let me start with what I like about the hotel....
4,9Brick_Hotel,Never better,Date of stay: November 2019,Hui Yingg wrote a review Nov 2019,"Sydney, Australia",40,This hotel exceeded my expectations in terms o...
