# Import necessary dependencies

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Web Scraper

In [2]:
# Columns of the frame returned by ``ba_scraper``, in output order.
_BA_COLUMNS = [
    "review_rating", "review_title", "username_loc_date", "verified",
    "review_text", "aircraft", "traveller_type", "seat_type",
    "route", "date_flown", "seat_comfort", "cabin_staff_service",
    "food_beverages", "inflight_entertainment", "ground_service",
    "wifi", "value_for_money", "recommended",
]

# (output column, CSS class of the header cell, True when the value is a star rating)
_BA_TABLE_FIELDS = [
    ("aircraft", "aircraft", False),
    ("traveller_type", "type_of_traveller", False),
    ("seat_type", "cabin_flown", False),
    ("route", "route", False),
    ("date_flown", "date_flown", False),
    ("seat_comfort", "seat_comfort", True),
    ("cabin_staff_service", "cabin_staff_service", True),
    ("food_beverages", "food_and_beverages", True),
    ("inflight_entertainment", "inflight_entertainment", True),
    ("ground_service", "ground_service", True),
    ("wifi", "wifi_and_connectivity", True),
    ("value_for_money", "value_for_money", True),
    ("recommended", "recommended", False),
]


def _ba_rating(article):
    """Return the stripped text of the ``div.rating-10`` badge of a review, or NaN if absent."""
    badge = article.select_one('div.rating-10')
    return np.nan if badge is None else badge.text.strip()


def _ba_verified(body):
    """Return the text of the first ``<strong>`` of a review body, or NaN.

    A body with fewer than two direct children is treated as carrying no
    verification marker.
    """
    if len(list(body.children)) < 2:
        return np.nan
    marker = body.find('strong')
    return np.nan if marker is None else marker.text


def _ba_table_value(table, css_class, is_stars):
    """Read one field of a review table.

    Looks up the ``td.review-rating-header.<css_class>`` header cell and reads
    the cell that follows it: the number of descendants carrying the ``fill``
    class when ``is_stars`` is true, the cell text otherwise. Returns NaN when
    the header cell or its following sibling is missing.
    """
    header = table.select_one(f'td.review-rating-header.{css_class}')
    if header is None:
        return np.nan
    cell = header.find_next_sibling()
    if cell is None:
        return np.nan
    if is_stars:
        # A present row with no filled star yields 0, not NaN.
        return sum('fill' in child.get_attribute_list('class')
                   for child in cell.find_all())
    return cell.text


def ba_scraper(url, timeout=30):
    """Scrape one page of reviews into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the review page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per scraped review, with the columns listed in ``_BA_COLUMNS``.
        Fields that are absent on the page are NaN; star ratings are the
        count of filled stars.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page-wide queries return different numbers of items, which
        would misalign the columns.
    """
    # Close the session when done and never wait forever on a stalled server.
    with requests.Session() as session:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")

    #################### GET TEXT CONTENT ##############################

    data = {
        "review_rating": [
            _ba_rating(article)
            for article in soup.find_all('article', attrs={'itemprop': 'review'})
        ],
        "review_title": [
            header.text.strip()
            for header in soup.find_all(attrs={'class': 'text_header'})
        ],
        "username_loc_date": [
            header.text.strip()
            for header in soup.select('h3.text_sub_header.userStatusWrapper')
        ],
        "verified": [
            _ba_verified(body)
            for body in soup.find_all('div', attrs={'itemprop': 'reviewBody',
                                                    'class': 'text_content'})
        ],
        "review_text": [
            body.text
            for body in soup.find_all('div', attrs={'itemprop': 'reviewBody'})
        ],
    }

    #################### GET TABLE ELEMENTS ##############################

    # The first table on the page is skipped; presumably it is a page-level
    # summary rather than a per-review table -- TODO confirm against the site.
    review_tables = soup.find_all("table")[1:]
    for column, css_class, is_stars in _BA_TABLE_FIELDS:
        data[column] = [_ba_table_value(table, css_class, is_stars)
                        for table in review_tables]

    # Each column comes from an independent page-wide query, so verify they
    # line up before building the frame.
    lengths = {column: len(values) for column, values in data.items()}
    if len(set(lengths.values())) > 1:
        raise ValueError(
            f"Scraped columns have mismatched lengths for {url}: {lengths}"
        )

    return pd.DataFrame(data, columns=_BA_COLUMNS)

# Get data

In [3]:
base_url = 'https://www.airlinequality.com/airline-reviews/british-airways/'
page_size = 352  # number of result pages to fetch (page 1 is base_url itself)
reviews = []
data = ba_scraper(base_url)

reviews.append(data)

for i in range(2, page_size+1):
    # Create URL to collect links from paginated data.
    # base_url already ends with "/", so strip it to avoid ".../british-airways//page/N/".
    url = f"{base_url.rstrip('/')}/page/{i}/"

    # Collect HTML data from this page
    df = ba_scraper(url)
    
    reviews.append(df)

# Stack the per-page frames; each keeps its own row labels until the index is reset.
final_df = pd.concat(reviews)

In [4]:
# Give the combined frame a fresh 0..n-1 index; the old per-page row labels
# are kept as a new "index" column.
final_df.reset_index(inplace=True)

In [5]:
# Discard the old row labels stored in the "index" column.
# errors="ignore" keeps the cell re-runnable once the column is already gone.
final_df.drop(columns=["index"], errors="ignore", inplace=True)

In [6]:
# Save the scraped reviews. The DataFrame index is written as the first
# (unnamed) CSV column, and the "data" directory must already exist.
final_df.to_csv('data/british_airways.csv')

In [7]:
# (rows, columns) of the combined frame.
final_df.shape

(3520, 18)

In [8]:
# Number of missing values in each column.
final_df.isna().sum()

review_rating                0
review_title                 0
username_loc_date            0
verified                  1508
review_text                  0
aircraft                  1680
traveller_type             759
seat_type                    3
route                      763
date_flown                 767
seat_comfort               101
cabin_staff_service        111
food_beverages             345
inflight_entertainment    1059
ground_service             828
wifi                      2978
value_for_money              0
recommended                  0
dtype: int64

In [9]:
# Percentage share of each overall rating value, rounded to 2 decimals.
round(final_df.review_rating.value_counts(normalize=True) * 100, 2)

1/10     21.99
2/10     11.42
3/10     11.19
8/10     10.34
10/10     8.86
9/10      8.78
7/10      8.66
4/10      6.73
5/10      6.48
6/10      5.40
na        0.14
Name: review_rating, dtype: float64

In [10]:
# Percentage of reviews whose aircraft string contains "Boeing";
# reviews with a missing aircraft are excluded from the base.
round(final_df.aircraft.str.contains('Boeing').value_counts(dropna=True, normalize=True) * 100, 2)

True     55.0
False    45.0
Name: aircraft, dtype: float64

In [11]:
# Frequency of each distinct aircraft string (missing values excluded).
final_df.aircraft.value_counts(dropna=True)

A320                   340
Boeing 777             264
Boeing 747-400         182
A380                   160
Boeing 777-200         123
                      ... 
A319 / Boeing 787-9      1
Airbus A32               1
A320 / Boeing 787-9      1
Boeing 787-9, A380       1
Airbus 319               1
Name: aircraft, Length: 192, dtype: int64

In [12]:
# Preview the first 10 reviews.
final_df.head(10)

Unnamed: 0,review_rating,review_title,username_loc_date,verified,review_text,aircraft,traveller_type,seat_type,route,date_flown,seat_comfort,cabin_staff_service,food_beverages,inflight_entertainment,ground_service,wifi,value_for_money,recommended
0,2/10,"""It was a nightmare""",Guadalupe Carlos-Alarcon (United States) 18th ...,Not Verified,Not Verified | They changed our Flights from ...,,Couple Leisure,Economy Class,Brussels to Los Angeles via London Heathrow,April 2023,1.0,4.0,1.0,3.0,1.0,1.0,1,no
1,5/10,"""Abysmal service""",Patrick Sparks (United States) 18th April 2023,Not Verified,Not Verified | At Copenhagen the most chaotic...,,Couple Leisure,Economy Class,Copenhagen to London,April 2023,1.0,4.0,2.0,,2.0,,2,no
2,1/10,"""trained to give you the runaround""",T Cayle (United States) 17th April 2023,Trip Verified,✅ Trip Verified | Worst experience of my life...,,Family Leisure,Economy Class,London to Denver,March 2023,2.0,5.0,2.0,3.0,4.0,1.0,1,no
3,3/10,"""they only had one choice of meal""",1 reviews\n\n\n\nAndrew Pybus (Hong Kong) 17th...,Trip Verified,✅ Trip Verified | Due to code sharing with Ca...,,Solo Leisure,Economy Class,Hong Kong to London,April 2023,2.0,1.0,1.0,1.0,2.0,1.0,2,no
4,4/10,"""relentless BA cost cutting""",M Edwards (United Kingdom) 16th April 2023,Trip Verified,✅ Trip Verified | LHR check in was quick at t...,A320,Business,Business Class,Heathrow to Malaga,April 2023,3.0,3.0,4.0,,3.0,1.0,2,no
5,3/10,"""I wouldn't recommend British Airways""",Luis Rojas (United Kingdom) 15th April 2023,Trip Verified,✅ Trip Verified | I wouldn't recommend Britis...,,Family Leisure,Economy Class,Santorini to Gatwick,March 2023,4.0,4.0,,1.0,3.0,3.0,1,no
6,1/10,"""Absolutely horrible experience""",C Maire (United States) 15th April 2023,Trip Verified,✅ Trip Verified | Absolutely horrible experie...,,Solo Leisure,Economy Class,Madrid to Belfast via London,January 2023,3.0,3.0,2.0,,1.0,,1,no
7,1/10,"""This is the worst airline""",Megan Campbell (United Kingdom) 14th April 2023,Not Verified,Not Verified | This is the worst airline. Not...,,Business,Economy Class,Dublin to London,April 2023,1.0,1.0,1.0,1.0,1.0,1.0,1,no
8,2/10,"""never fly British Airways again""",L Horten (United States) 13th April 2023,Trip Verified,✅ Trip Verified | I will never fly British Ai...,,Couple Leisure,Business Class,London to Venice,March 2023,1.0,2.0,2.0,1.0,2.0,,2,no
9,1/10,"""seats were cramped and uncomfortable""",L Keane (United Kingdom) 12th April 2023,Trip Verified,✅ Trip Verified | Worst aircraft I have ever ...,A350,Couple Leisure,Economy Class,Las Vegas to London,April 2023,1.0,1.0,1.0,4.0,4.0,1.0,1,no


In [13]:
# Frequency of each value_for_money star count.
final_df.value_for_money.value_counts()

1    1089
3     660
4     660
5     558
2     552
0       1
Name: value_for_money, dtype: int64

In [14]:
# Inspect reviews scored 0 for value_for_money: the rating row was present
# on the page but none of its stars carried the "fill" class.
final_df[final_df.value_for_money == 0]

Unnamed: 0,review_rating,review_title,username_loc_date,verified,review_text,aircraft,traveller_type,seat_type,route,date_flown,seat_comfort,cabin_staff_service,food_beverages,inflight_entertainment,ground_service,wifi,value_for_money,recommended
3517,6/10,British Airways customer review,W Benson (United Kingdom) 29th August 2012,,HKG-LHR in New Club World on Boeing 777-300 - ...,,,Business Class,,,4.0,3.0,3.0,4.0,,,0,yes
