<a href="https://www.kaggle.com/code/nandhinipremkumar/web-scarping-british-airways-data-science?scriptVersionId=227377470" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import csv
import time
from bs4 import BeautifulSoup
import requests

# Define the CSV file path
csv_file_path = "british_airways_reviews.csv"

# Scrape settings.
pages = 100  # Adjust this as needed
page_size = 100
request_timeout = 30  # seconds; without a timeout a stalled connection hangs forever
request_delay = 15  # seconds to wait between page requests

# Placeholder written when a field is absent from a review.
MISSING = 'NA'

# CSV columns, in output order.
FIELDNAMES = [
    'User Name', 'Overall Rating', 'Review', 'Trip Status', 'Detail Review',
    'Type of Traveller', 'Seat Type', 'Route', 'Date Flown',
    'Seat Comfort Rating', 'Staff Service Rating', 'Food & Beverages Rating',
    'Inflight Entertainment Rating', 'Ground Service Rating', 'Value For Money Rating', 'Recommended']

# (CSV column, class suffix of the header cell) for rows whose value is plain text.
TEXT_FIELDS = (
    ('Type of Traveller', 'type_of_traveller'),
    ('Seat Type', 'cabin_flown'),
    ('Route', 'route'),
    ('Date Flown', 'date_flown'),
    ('Recommended', 'recommended'),
)

# (CSV column, class suffix of the header cell) for rows whose value is a star widget.
STAR_FIELDS = (
    ('Seat Comfort Rating', 'seat_comfort'),
    ('Staff Service Rating', 'cabin_staff_service'),
    ('Food & Beverages Rating', 'food_and_beverages'),
    ('Inflight Entertainment Rating', 'inflight_entertainment'),
    ('Ground Service Rating', 'ground_service'),
    ('Value For Money Rating', 'value_for_money'),
)


def tag_text(tag):
    """Return the stripped text of *tag*, or ``'NA'`` when *tag* is ``None``."""
    return tag.text.strip() if tag is not None else MISSING


def split_trip_status(text):
    """Split review body text into ``(trip_status, detail_review)``.

    The text is cut at the first ``|``. When there is no ``|`` the whole
    text is returned as the detail and the status is ``"N/A"`` (this
    placeholder differs from the ``'NA'`` used for other fields and is kept
    as-is so existing consumers of the CSV see the same values).
    """
    status, separator, detail = text.partition("|")
    if not separator:
        return "N/A", status.strip()
    return status.strip(), detail.strip()


def header_value(review, field_class):
    """Return the text of the cell following the ``review-rating-header`` cell.

    Returns ``'NA'`` when either the header cell or the following ``td``
    cannot be found.
    """
    header = review.find('td', class_=f"review-rating-header {field_class}")
    if header is None:
        return MISSING
    return tag_text(header.find_next('td'))


def star_rating(review, rating_class):
    """Return the number of filled stars for *rating_class*, or ``'NA'``."""
    header = review.find('td', class_=f"review-rating-header {rating_class}")
    if header is None:
        return MISSING
    stars = header.find_next('td', class_="review-rating-stars stars")
    if stars is None:
        return MISSING
    return len(stars.find_all('span', class_='star fill'))  # Count the filled stars


def parse_review(review):
    """Build one CSV row (a dict keyed by ``FIELDNAMES``) from a review tag."""
    body = tag_text(review.find('div', class_="text_content"))
    trip_status, detail_review = split_trip_status(body)

    row = {
        'User Name': tag_text(review.find('span', itemprop="name")),
        'Overall Rating': tag_text(review.find('span', itemprop="ratingValue")),
        # The h2.text_header element is the review headline, not the body.
        'Review': tag_text(review.find('h2', class_="text_header")),
        'Trip Status': trip_status,
        'Detail Review': detail_review,
    }
    for column, field_class in TEXT_FIELDS:
        row[column] = header_value(review, field_class)
    for column, rating_class in STAR_FIELDS:
        row[column] = star_rating(review, rating_class)
    return row


def fetch_reviews(session, page, per_page):
    """Download one listing page and return its review ``article`` tags.

    Raises ``requests.RequestException`` on connection problems, timeouts,
    or a non-2xx HTTP status. Returns an empty list when the page does not
    contain the reviews container.
    """
    url = f"https://www.airlinequality.com/airline-reviews/british-airways/page/{page}/?sortby=post_date%3ADesc&pagesize={per_page}"
    response = session.get(url, timeout=request_timeout)
    response.raise_for_status()  # do not parse an error page as if it held reviews
    soup = BeautifulSoup(response.text, 'html.parser')

    container = soup.find('article', class_="comp comp_reviews-airline querylist position-content")
    if container is None:
        return []
    return container.find_all("article")


def scrape_reviews(path, max_pages, per_page):
    """Scrape up to *max_pages* listing pages into the CSV file at *path*.

    Stops early when a request fails or a page yields no reviews. Rows
    already written are kept. Returns the number of review rows written.
    """
    written = 0
    with open(path, mode='w', newline='', encoding='utf-8') as file, \
            requests.Session() as session:
        writer = csv.DictWriter(file, fieldnames=FIELDNAMES)
        writer.writeheader()  # Write the header row

        for page in range(1, max_pages + 1):
            print(f"Scraping page {page}")
            try:
                reviews = fetch_reviews(session, page, per_page)
            except requests.RequestException as e:
                print(f"An error occurred: {e}")
                break

            if not reviews:
                # Past the last page of results; further requests are pointless.
                print(f"No reviews found on page {page}; stopping.")
                break

            for review in reviews:
                writer.writerow(parse_review(review))
                written += 1
            file.flush()  # keep finished pages on disk if the run is interrupted

            # Pause between requests, but not after the final one.
            if page < max_pages:
                time.sleep(request_delay)
    return written


if __name__ == "__main__":
    try:
        scrape_reviews(csv_file_path, pages, page_size)
    except Exception as e:
        # Top-level boundary: report unexpected failures instead of a traceback.
        print(f"An error occurred: {e}")

    print(f"Reviews have been written to {csv_file_path}")

Scraping page 1


KeyboardInterrupt: 