In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

def create_driver(driver_path="chromedriver.exe"):
    """Build the Chrome WebDriver used by the scraper.

    Args:
        driver_path: Path of the chromedriver executable handed to
            ``Service``. The default, ``"chromedriver.exe"``, is the value
            this scraper has always used (chromedriver in the working
            directory or on PATH), so ``create_driver()`` behaves as before.
            Pass a different path on machines where the binary lives
            elsewhere or has no ``.exe`` suffix.

    Returns:
        A ``webdriver.Chrome`` instance started with the options set below.
    """
    options = Options()
    # Fixed window size.
    # NOTE(review): presumably chosen so the desktop page layout is served - confirm.
    options.add_argument("--window-size=1920,1080")
    # NOTE(review): presumably meant to make the browser look less like an
    # automated session to the site - confirm it is still needed.
    options.add_argument("--disable-blink-features=AutomationControlled")
    return webdriver.Chrome(service=Service(driver_path), options=options)

def scroll_to_bottom(driver, scroll_times=3, pause=2):
    """Scroll the current page to its bottom several times.

    After each scroll the function sleeps so the page has time to react
    before the next scroll is issued.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver).
        scroll_times: How many scroll/sleep rounds to perform. Zero or a
            negative number performs none.
        pause: Seconds to sleep after each scroll. Defaults to 2, the delay
            that used to be hard-coded, so existing calls are unaffected.
    """
    for _ in range(scroll_times):
        # document.body.scrollHeight is re-read by the browser on every
        # round, so each scroll targets the bottom of the page as it is then.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)

def wait_for_comment_elements(driver, timeout=10):
    """Wait until at least one review-comment element is present on the page.

    Args:
        driver: Selenium WebDriver showing the review page.
        timeout: Maximum number of seconds to wait. Defaults to 10, the
            value that used to be hard-coded.

    Returns:
        True if elements matching ``span.JguWG`` appeared within the
        timeout, False if the wait failed for any ordinary reason
        (timeout or another driver error).
    """
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "span.JguWG"))
        )
    except Exception:
        # Best effort: any failure counts as "no comments loaded".
        # ``Exception`` rather than a bare ``except`` so that Ctrl+C
        # (KeyboardInterrupt) and SystemExit are not swallowed here.
        return False
    return True

# Sub-rating categories collected for every review, in output-column order.
_RATING_CATEGORIES = (
    "Legroom",
    "Seat comfort",
    "In-flight Entertainment",
    "Customer service",
    "Value for money",
    "Cleanliness",
    "Check-in and boarding",
    "Food and Beverage",
)

# XPath fragments shared by the overall rating and the per-category ratings.
_BUBBLE_SVG_XPATH = ".//*[name()='svg' and @data-automation='bubbleRatingImage']"
_SVG_TITLE_XPATH = ".//*[name()='title']"


def _best_effort(extract, *args):
    """Return ``extract(*args)``, or None if it raises an ordinary exception."""
    try:
        return extract(*args)
    except Exception:
        # Every review field is optional: a missing element or unparsable
        # text yields None. KeyboardInterrupt/SystemExit are deliberately
        # not caught so the scrape can still be stopped.
        return None


def _parse_bubble_rating(title_text):
    """Turn a bubble-rating title into a number taken from its first token.

    Whole values are returned as ``int``, including when written with a
    decimal point ("5.0"), which a plain ``int()`` call would reject.
    Any other value is returned as ``float`` instead of being discarded.
    NOTE(review): the title is assumed to start with the score, e.g.
    "5 of 5 bubbles" - confirm against the live markup.
    """
    value = float(title_text.split()[0])
    return int(value) if value.is_integer() else value


def _read_bubble_rating(scope):
    """Read the rating of the first bubble-rating SVG found under *scope*."""
    svg = scope.find_element(By.XPATH, _BUBBLE_SVG_XPATH)
    title_text = svg.find_element(By.XPATH, _SVG_TITLE_XPATH).get_attribute("textContent")
    return _parse_bubble_rating(title_text)


def _first_text(scope, xpath):
    """Return the stripped text of the first element under *scope* matching *xpath*."""
    return scope.find_element(By.XPATH, xpath).text.strip()


def _read_travel_date(review):
    """Return the text following "Date of travel:" in *review*, or None."""
    candidates = review.find_elements(By.XPATH, ".//span[contains(text(), 'Date of travel')]")
    for candidate in candidates:
        text = candidate.text
        if "Date of travel" in text:
            return text.replace("Date of travel:", "").strip()
    return None


def _read_location(review):
    """Return the space-joined non-empty ``thpSa`` span texts, or None if there are none."""
    spans = review.find_elements(By.XPATH, ".//span[@class='thpSa']")
    # Fetch each span's text once; every ``.text`` access is a driver call.
    texts = (span.text.strip() for span in spans)
    # ``or None`` keeps a missing location consistent with the other fields
    # instead of reporting an empty string.
    return " ".join(text for text in texts if text) or None


def _read_category_ratings(review):
    """Return a dict mapping every known category to its rating or None."""
    ratings = dict.fromkeys(_RATING_CATEGORIES)
    try:
        blocks = review.find_elements(By.XPATH, ".//div[contains(@class, 'msVPq')]")
    except Exception:
        return ratings
    for block in blocks:
        try:
            category = block.find_element(
                By.XPATH, ".//div[contains(@class, 'biGQs') and contains(@class, 'pZUbB')]"
            ).text.strip()
            # Only parse the rating for categories that are actually recorded.
            if category in ratings:
                ratings[category] = _read_bubble_rating(block)
        except Exception:
            # A malformed block must not prevent reading the remaining ones.
            continue
    return ratings


def _expand_read_more(driver):
    """Click every "Read more" toggle so truncated comments are fully shown."""
    try:
        buttons = driver.find_elements(By.XPATH, "//span[text()='Read more']")
    except Exception:
        return
    for button in buttons:
        try:
            # Clicked through JavaScript rather than ``button.click()``.
            driver.execute_script("arguments[0].click();", button)
            time.sleep(0.2)
        except Exception:
            continue


def scrape_comments_and_ratings(driver):
    """Collect every review visible on the current page.

    "Read more" toggles are expanded first. Then, for each comment element
    (``span.JguWG``), the enclosing review container is located and the
    individual fields are read from it. Each field is extracted on a
    best-effort basis: a field that cannot be found or parsed is recorded
    as None. A comment whose review container cannot be found is skipped.

    Args:
        driver: Selenium WebDriver positioned on a review page.

    Returns:
        A list of dicts, one per review, with the keys "Reviewer Name",
        "Review Title", "General Rating", "Date of Travel",
        "Review Location", "Comment", followed by one key per rating
        category (see ``_RATING_CATEGORIES``), in that order.
    """
    _expand_read_more(driver)

    data = []
    for comment_elem in driver.find_elements(By.CSS_SELECTOR, "span.JguWG"):
        comment_text = comment_elem.text.strip()

        try:
            parent = comment_elem.find_element(
                By.XPATH,
                "./ancestor::div[contains(@class, 'lwGaE') and contains(@class, 'A')]",
            )
        except Exception:
            # Without the review container none of the other fields can be read.
            continue

        row = {
            "Reviewer Name": _best_effort(
                _first_text,
                parent,
                ".//span[contains(@class, 'fiohW')]//a[starts-with(@href, '/Profile/')]",
            ),
            "Review Title": _best_effort(
                _first_text,
                parent,
                ".//div[contains(@class, 'biGQs') and contains(@class, 'fiohW')]",
            ),
            "General Rating": _best_effort(_read_bubble_rating, parent),
            "Date of Travel": _best_effort(_read_travel_date, parent),
            "Review Location": _best_effort(_read_location, parent),
            "Comment": comment_text,
        }
        row.update(_read_category_ratings(parent))
        data.append(row)

    return data

def go_to_next_page(driver, timeout=10, pause=2):
    """Click the "Next page" button and give the next page time to load.

    Args:
        driver: Selenium WebDriver showing a review page.
        timeout: Maximum seconds to wait for the button to become
            clickable. Defaults to 10, the value that used to be hard-coded.
        pause: Seconds to sleep after the click. Defaults to 2, the value
            that used to be hard-coded.

    Returns:
        True if the button was clicked, False if it could not be found or
        clicked within the timeout (or another ordinary error occurred).
    """
    try:
        next_button = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, "//button[@aria-label='Next page']"))
        )
        # Clicked through JavaScript rather than ``next_button.click()``.
        driver.execute_script("arguments[0].click();", next_button)
        time.sleep(pause)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl+C
        # (KeyboardInterrupt) is not mistaken for "no next page".
        return False
    return True

def main(
    url="https://www.tripadvisor.com/Airline_Review-d8729073-Reviews-Ethiopian-Airlines",
    max_pages=900,
    output_path="../cleaned_data/tripadvisor_ethiopian_airlines_reviews_raw.csv",
):
    """Scrape the reviews page by page and write them to a CSV file.

    Args:
        url: Review page to start from.
        max_pages: Upper bound on the number of pages to visit; scraping
            stops earlier when no next page can be reached.
        output_path: Destination CSV file. Missing parent directories are
            created before writing.

    The browser is always closed, even if scraping fails. If the run is
    interrupted with Ctrl+C, the reviews collected so far are still saved.
    Any other exception propagates after the browser has been closed.
    """
    # Imported locally so this function does not depend on the file's import block.
    from pathlib import Path

    all_data = []
    driver = create_driver()
    try:
        driver.get(url)
        time.sleep(5)  # give the first page time to load before scrolling

        for page in range(1, max_pages + 1):
            print(f"Scraping page {page}...")
            scroll_to_bottom(driver, scroll_times=5)
            if wait_for_comment_elements(driver):
                data = scrape_comments_and_ratings(driver)
                print(f"Found {len(data)} reviews on page {page}")
                all_data.extend(data)
            else:
                print("No comments loaded.")
            if not go_to_next_page(driver):
                print("Reached last page.")
                break
    except KeyboardInterrupt:
        # Fall through to the save step with whatever has been collected.
        print("\nInterrupted; saving the reviews collected so far.")
    finally:
        # Never leave a browser process behind, whatever happened above.
        driver.quit()

    df = pd.DataFrame(all_data)
    # Create the target directory first so a long scrape is not lost to a
    # missing folder when the CSV is written.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"\nDone! Saved {len(df)} reviews to {output_path}")

# Start the scrape only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
