In [2]:
import time
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import traceback

# --- Configuration ---
# Airline identifiers to scrape (each is interpolated into the review URL path),
# and the single list that every scraped review row is appended to.
airlines_to_scrape = ["air-india", "air-india-express"]
all_reviews_data = []

# Chrome launch options, consumed when the Chrome driver is created.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
# Command-line arguments: run without a visible window and send a desktop user-agent string.
chrome_options.add_argument("--headless")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/5.0 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

print(f"Starting scraper for: {', '.join(airlines_to_scrape)}")

# ---------------------------------
# --- REVIEW PARSING HELPERS ---
# ---------------------------------
def _split_verification(raw_text):
    """Split a review's raw text into ``(verification_status, review_body)``.

    Parameters:
        raw_text: the stripped text of a review's ``text_content`` element.

    Returns:
        A tuple ``(status, body)`` where ``status`` is "Trip Verified",
        "Not Verified" or "Unknown", and ``body`` is the review text with the
        verification marker removed when one was recognised.
    """
    prefix, separator, remainder = raw_text.partition('|')
    if separator:
        # Only the text BEFORE the first '|' may act as the verification marker.
        # Searching the whole review would mislabel reviews that merely mention
        # the phrase in their body and would discard the text before the '|'.
        for label in ("Trip Verified", "Not Verified"):
            if label in prefix:
                return label, remainder.strip()
        return "Unknown", raw_text

    # No separator at all: fall back to removing the marker phrase in place.
    if "Trip Verified" in raw_text:
        return "Trip Verified", raw_text.replace("✅ Trip Verified", "").strip()
    if "Not Verified" in raw_text:
        return "Not Verified", raw_text.replace("Not Verified", "").strip()
    return "Unknown", raw_text


def _parse_review(review, airline_name):
    """Build one result row from a review ``<article>`` tag.

    Parameters:
        review: the parsed review container tag.
        airline_name: value stored in the row's 'Airline' field.

    Returns:
        A dict with keys 'Airline', 'Rating', 'Review', 'Verification', or
        ``None`` when the review has no rating block, no rating value, or no text.

    Raises:
        ValueError: if the rating text cannot be converted to ``int``.
    """
    rating_element = review.find('div', class_='rating-10')
    if rating_element is None:
        return None  # no rating block -> skip this review

    rating_value = rating_element.find('span', itemprop='ratingValue')
    if rating_value is None:
        return None  # rating block present but no value inside it

    text_element = review.find('div', class_="text_content")
    if text_element is None:
        return None  # no review text -> skip this review

    verification_status, cleaned_text = _split_verification(text_element.text.strip())
    return {
        'Airline': airline_name,
        'Rating': int(rating_value.text.strip()),
        'Review': cleaned_text,
        'Verification': verification_status,
    }


# ---------------------------------
# --- OUTER LOOP FOR AIRLINES ---
# ---------------------------------
# Start Chrome once and reuse it for every page: launching a new browser (and
# re-running the driver-manager install step) per page is slow and gains nothing.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
try:
    for airline_name in airlines_to_scrape:

        base_url = f"https://www.airlinequality.com/airline-reviews/{airline_name}/"
        page = 1

        print(f"\n\n--- Scraping Airline: {airline_name} ---")

        # ---------------------------------
        # --- INNER LOOP FOR PAGES (ALL OF THEM) ---
        # ---------------------------------
        while True:

            url = f"{base_url}page/{page}/"
            print(f"\n--- Scraping Page {page} ---")

            html_source = None

            try:
                driver.get(url)
                print("Waiting for page JavaScript to load reviews...")
                wait = WebDriverWait(driver, 20)
                wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "text_content")))
                print("Reviews loaded! Grabbing HTML...")
                html_source = driver.page_source

            except Exception as e:
                # Any failure ends this airline (a wait timeout is the expected
                # "past the last page" signal), but report what actually happened
                # so a network/driver problem is not mistaken for the end of data.
                reason_lines = str(e).strip().splitlines()
                print(f"Error on page {page}. Assuming no more pages. Moving to next airline.")
                print(f"  Reason: {type(e).__name__}" + (f": {reason_lines[0]}" if reason_lines else ""))
                break

            # ---------------------------------
            # --- PARSING LOGIC (ALL REVIEWS) ---
            # ---------------------------------
            if html_source:
                soup = BeautifulSoup(html_source, 'html.parser')

                # Review containers are <article> tags whose class starts with "review-".
                review_containers = soup.find_all("article", class_=lambda v: v and v.startswith("review-"))
                page_reviews_found = 0
                page_reviews_skipped = 0

                if not review_containers:
                    print("No review containers found on this page. Stopping loop for this airline.")
                    break

                for review in review_containers:
                    try:
                        record = _parse_review(review, airline_name)
                    except Exception as e:
                        # Best-effort: a malformed review must not abort the page,
                        # but it is reported instead of being silently dropped.
                        page_reviews_skipped += 1
                        print(f"Skipping malformed review on page {page}: {type(e).__name__}: {e}")
                        continue

                    if record is None:
                        continue  # review lacks a rating or text

                    all_reviews_data.append(record)
                    page_reviews_found += 1

                print(f"Successfully scraped {page_reviews_found} total reviews from this page.")
                if page_reviews_skipped:
                    print(f"Skipped {page_reviews_skipped} malformed reviews on this page.")

                # If page is empty (0 reviews), stop
                if page_reviews_found == 0:
                    print("Found 0 reviews, assuming end of pages.")
                    break

                page += 1
                time.sleep(1)  # Be polite
            else:
                print("No HTML source found. Stopping loop for this airline.")
                break
finally:
    # Always shut the browser down, even if scraping is interrupted or fails.
    driver.quit()

# ---------------------------------
# --- FINAL RESULTS ---
# ---------------------------------
print("\n\n--- Scraping Complete for All Airlines ---")
print(f"Successfully scraped a total of {len(all_reviews_data)} reviews.")

# Build the final DataFrame; `df` is defined on both paths.
if not all_reviews_data:
    print("No reviews were scraped.")
    df = pd.DataFrame()  # empty placeholder frame
else:
    df = pd.DataFrame(data=all_reviews_data)

    # Three summary tables: per airline, per verification status, per rating
    # (the rating table is ordered by rating value rather than by count).
    print("\n--- Review Count by Airline ---", df['Airline'].value_counts(), sep="\n")
    print("\n--- Review Count by Verification Status ---", df['Verification'].value_counts(), sep="\n")
    print(
        "\n--- Review Count by Star Rating (All Reviews) ---",
        df['Rating'].value_counts().sort_index(),
        sep="\n",
    )

    # Persist every scraped row to CSV, without the index column.
    df.to_csv("all_airline_reviews_full.csv", index=False)
    print("\nSuccessfully saved all reviews to 'all_airline_reviews_full.csv'")

    print("\n--- DataFrame 'df' is now created. Displaying .head() ---")

    # Final output of the cell: a preview of the first rows of `df`.
    display(df.head())

Starting scraper for: air-india, air-india-express


--- Scraping Airline: air-india ---

--- Scraping Page 1 ---
Waiting for page JavaScript to load reviews...
Reviews loaded! Grabbing HTML...
Successfully scraped 10 total reviews from this page.

--- Scraping Page 2 ---
Waiting for page JavaScript to load reviews...
Reviews loaded! Grabbing HTML...
Successfully scraped 10 total reviews from this page.

--- Scraping Page 3 ---
Waiting for page JavaScript to load reviews...
Reviews loaded! Grabbing HTML...
Successfully scraped 10 total reviews from this page.

--- Scraping Page 4 ---
Waiting for page JavaScript to load reviews...
Reviews loaded! Grabbing HTML...
Successfully scraped 10 total reviews from this page.

--- Scraping Page 5 ---
Waiting for page JavaScript to load reviews...
Reviews loaded! Grabbing HTML...
Successfully scraped 10 total reviews from this page.

--- Scraping Page 6 ---
Waiting for page JavaScript to load reviews...
Reviews loaded! Grabbing HTML...
Successfull

Unnamed: 0,Airline,Rating,Review,Verification
0,air-india,1,"Worst airline ever. Delayed my flight 8 hours,...",Not Verified
1,air-india,2,No vegetarian meal available even after pre bo...,Trip Verified
2,air-india,1,I booked a $5000 business class ticket and the...,Trip Verified
3,air-india,1,Had flights with Air India from Melbourne to N...,Trip Verified
4,air-india,1,This has been my worst with an airline ever. I...,Trip Verified


In [3]:
# Show the reviews DataFrame `df` assigned by the scraping cell above.
df

Unnamed: 0,Airline,Rating,Review,Verification
0,air-india,1,"Worst airline ever. Delayed my flight 8 hours,...",Not Verified
1,air-india,2,No vegetarian meal available even after pre bo...,Trip Verified
2,air-india,1,I booked a $5000 business class ticket and the...,Trip Verified
3,air-india,1,Had flights with Air India from Melbourne to N...,Trip Verified
4,air-india,1,This has been my worst with an airline ever. I...,Trip Verified
...,...,...,...,...
1598,air-india-express,4,DXB-IXE. Boarded with apprehension but quite p...,Unknown
1599,air-india-express,3,I am regular passenger in UAE-Kerala route. Fl...,Unknown
1600,air-india-express,4,Singapore to Calcutta return. An adequate perf...,Unknown
1601,air-india-express,3,TRZ to KUL via Chennai return. The aircraft wa...,Unknown
