# Web Scraping

Scrape the reviews of British Airways (BA) from https://www.airlinequality.com/airline-reviews/british-airways

In [1]:
! pip install pyarrow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from collections import defaultdict
import numpy as np

### Scrape function

In [3]:
def df_from_parse(parsed_content):
    """
    Returns a dataframe of the reviews given a html parse of the scraped page.

    One row is produced per ``<article itemprop="review">`` element. A field
    that is missing for an article (no text block, no overall rating, no
    ratings table, an empty star row, ...) is left as NaN instead of raising.

        Parameters:
            parsed_content (parsed html): Beautiful Soup parsing of html

        Returns:
            df (dataframe): Dataframe with a ``reviews`` column (review text),
                an ``overall_rating`` column (when at least one article has
                one) and one column per row label of the ``review-ratings``
                table (lower-cased). Values are plain strings or NaN.

    """
    # Scrape the parsed content for articles
    articles = parsed_content.find_all("article", {"itemprop": "review"})

    n = len(articles)

    def def_value():
        # Every column starts all-NaN so articles lacking that field stay NaN
        return [np.nan] * n

    def to_text(node_string):
        # ``.string`` yields a NavigableString (or None). Convert to a plain
        # str so the dataframe does not keep the whole parse tree alive.
        return np.nan if node_string is None else str(node_string)

    df_dict = defaultdict(def_value)

    for i, article in enumerate(articles):
        # Find the review text (NaN when the article has no text block)
        paragraph = article.find_all("div", {"class": "text_content"})
        df_dict['reviews'][i] = paragraph[0].get_text() if len(paragraph) > 0 else np.nan

        # Get overall rating
        rating_query = article.find_all('span', {"itemprop": "ratingValue"})
        if len(rating_query) > 0:
            df_dict['overall_rating'][i] = to_text(rating_query[0].string)

        # Find the review ratings; an article without the table keeps NaN
        review_ratings = article.find_all("table", {"class": "review-ratings"})
        if len(review_ratings) == 0:
            continue

        # each tr contains a review rating name and value via the num of stars
        for row in review_ratings[0].find_all("tr"):
            cells = row.find_all("td")

            # Skip malformed rows: need exactly (name, value) and a text label
            if len(cells) != 2 or cells[0].string is None:
                continue

            name_tag, value_tag = cells
            name = name_tag.string.lower()

            # .get avoids a KeyError when the value cell carries no class
            if value_tag.get('class') == ['review-rating-stars', 'stars']:
                filled_stars = value_tag.find_all('span', {"class": "star fill"})

                # Keep only numeric star labels; the row can be empty
                star_labels = [
                    str(star.string)
                    for star in filled_stars
                    if star.string is not None and str(star.string).strip().isdecimal()
                ]

                # Highest filled star is the score; compare as numbers, not text
                value = max(star_labels, key=int) if star_labels else np.nan

            else:
                value = to_text(value_tag.string)

            df_dict[name][i] = value

    df = pd.DataFrame.from_dict(df_dict)
    return df

### Scraping Loop

In [4]:
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 35
page_size = 100

# Not used by this cell; kept because other cells may reference the name
reviews = []

# One slot per page, filled in order so dfs[k] holds page k+1
dfs = [None] * pages

for i in range(1, pages + 1):

    print(f"Scraping page {i}")

    # Create URL to collect links from paginated data
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"
    print(url)

    # Collect HTML data from this page. Without a timeout requests can wait
    # forever on a stalled connection.
    response = requests.get(url, timeout=30)

    # Stop on an HTTP error instead of parsing an error page into an empty
    # frame and silently losing that page's reviews.
    response.raise_for_status()

    # Parse content
    content = response.content
    parsed_content = BeautifulSoup(content, 'html.parser')
    dfs[i-1] = df_from_parse(parsed_content)

Scraping page 1
https://www.airlinequality.com/airline-reviews/british-airways/page/1/?sortby=post_date%3ADesc&pagesize=100
Scraping page 2
https://www.airlinequality.com/airline-reviews/british-airways/page/2/?sortby=post_date%3ADesc&pagesize=100
Scraping page 3
https://www.airlinequality.com/airline-reviews/british-airways/page/3/?sortby=post_date%3ADesc&pagesize=100
Scraping page 4
https://www.airlinequality.com/airline-reviews/british-airways/page/4/?sortby=post_date%3ADesc&pagesize=100
Scraping page 5
https://www.airlinequality.com/airline-reviews/british-airways/page/5/?sortby=post_date%3ADesc&pagesize=100
Scraping page 6
https://www.airlinequality.com/airline-reviews/british-airways/page/6/?sortby=post_date%3ADesc&pagesize=100
Scraping page 7
https://www.airlinequality.com/airline-reviews/british-airways/page/7/?sortby=post_date%3ADesc&pagesize=100
Scraping page 8
https://www.airlinequality.com/airline-reviews/british-airways/page/8/?sortby=post_date%3ADesc&pagesize=100
Scraping

### Save Scraped Data

In [7]:
# Folder (on the mounted Drive) that receives the scraped data
file_path = '/content/drive/MyDrive/Colab Notebooks/BA/data'

# Stack the per-page frames into one frame with a fresh 0..n-1 index,
# then swap the spaces in the column names for underscores
df = pd.concat(dfs, ignore_index=True)
df.columns = df.columns.str.replace(" ","_")

# Persist the combined frame as parquet
df.to_parquet(f"{file_path}/reviews.parquet")

### Evaluate

In [11]:
# Show the reviews that were scraped without an overall rating
df.loc[df["overall_rating"].isna()]

Unnamed: 0,reviews,overall_rating,aircraft,type_of_traveller,seat_type,route,date_flown,seat_comfort,cabin_staff_service,food_&_beverages,inflight_entertainment,ground_service,wifi_&_connectivity,value_for_money,recommended
2879,Cabin crew polite unfortunately BA ran out of ...,,,,Economy Class,,,3,5,1,1,,,2,no
3012,Phoenix to London - outbound a wonderful and e...,,,,First Class,,,1,5,4,2,,,2,no
3034,On past experience I chose BA for our long hau...,,,,Economy Class,,,3,3,2,1,,,3,no
3268,LHR-CPH-LHR Business Class. This is a joke. Sc...,,,,Business Class,,,1,1,1,1,,,1,no
3295,I flew with British Airways with my mother fro...,,,,Economy Class,,,1,3,1,1,,,1,no


### Test

In [9]:
# Load the file that was just written to confirm it reads back
import_df = pd.read_parquet(f"{file_path}/reviews.parquet")
import_df

Unnamed: 0,reviews,overall_rating,aircraft,type_of_traveller,seat_type,route,date_flown,seat_comfort,cabin_staff_service,food_&_beverages,inflight_entertainment,ground_service,wifi_&_connectivity,value_for_money,recommended
0,✅ Trip Verified | A rather empty and quiet fl...,9,Boeing 787-8,Family Leisure,Economy Class,London to Tel Aviv,December 2022,4,5,4,3,4,4,5,yes
1,✅ Trip Verified | Easy check in and staff mem...,9,Boeing 777-200,Couple Leisure,Business Class,London Heathrow to Cape Town,January 2023,3,5,5,5,4,5,4,yes
2,✅ Trip Verified | Being a silver flyer and bo...,1,,Solo Leisure,Economy Class,Gatwick to Dublin,January 2023,3,5,,,1,,1,no
3,Not Verified | I find BA incredibly tacky and...,3,,Solo Leisure,Premium Economy,London Heathrow to Athens Greece,April 2022,3,4,1,1,3,1,1,no
4,✅ Trip Verified | Flew ATL to LHR 8th Jan 202...,4,Boeing 777-200,Family Leisure,Economy Class,Atlanta to London,January 2023,1,1,1,3,1,,2,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3453,LHR-HKG on Boeing 747 - 23/08/12. Much has bee...,4,,,Economy Class,,,2,3,2,3,,,3,no
3454,LHR to HAM. Purser addresses all club passenge...,9,,,Business Class,,,4,5,4,,,,3,yes
3455,My son who had worked for British Airways urge...,5,,,Economy Class,,,,,,,,,4,yes
3456,London City-New York JFK via Shannon on A318 b...,4,,,Premium Economy,,,1,3,5,,,,1,no


In [10]:
# Drop the reloaded copy; it is not used again in the visible cells
del import_df