# Web Scraping using requests and BeautifulSoup

In [1]:
# Importing Libraries 
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# Requesting all pages

# One parsed BeautifulSoup document per successfully downloaded results page.
all_reviews = []

# NOTE(review): range(40) requests pages 0-39. If the site numbers its pages
# from 1, page 0 may be a duplicate/invalid page and the final page is never
# requested -- confirm against the site before changing the range.
for i in range(40):
    url = f'https://www.airlinequality.com/airline-reviews/british-airways/page/{i}/?sortby=post_date%3ADesc&pagesize=100'
    # A timeout keeps a single stalled connection from hanging the whole run.
    r = requests.get(url, timeout=30)
    # Do not parse HTTP error pages as if they were review pages.
    if not r.ok:
        print(f'Skipping page {i}: HTTP {r.status_code}')
        continue
    # Named page_html (not flight_details) because flight_details is used
    # later in the notebook for a different kind of object.
    page_html = r.text
    html = BeautifulSoup(page_html, 'html.parser')
    all_reviews.append(html)




In [3]:
# Creating prototype for function
# Collect the text of every <h2 class="text_header"> across all downloaded pages.
header_prototype = [
    head.text
    for doc in all_reviews
    for head in doc.find_all('h2', class_='text_header')
]

# Printing accumulated result
print('we have', len(header_prototype), 'data to be scraped')


3921


In [4]:
# Defining function to extract tags
def find(container, tag, cls=None):
    """
    Collect every occurrence of an HTML tag across a list of parsed documents.

    Each document's ``find_all`` method is called for ``tag``. When ``cls`` is
    truthy the search is additionally restricted with ``class_=cls``; otherwise
    ``find_all`` is called with the tag name only. Matches are returned in
    document order, one flat list for all documents.

    Parameters:
    container (list): Documents to search; each must provide ``find_all``
        (BeautifulSoup objects in this notebook).
    tag (str): Name of the HTML tag to look for.
    cls (str, optional): Class value used to filter the tags. Defaults to None.

    Returns:
    list: All matching tags from every document, flattened into one list.
    """
    # Only forward class_ when a class was actually requested, so the
    # unfiltered search is issued exactly as find_all(tag).
    search_kwargs = {'class_': cls} if cls else {}
    return [
        match
        for doc in container
        for match in doc.find_all(tag, **search_kwargs)
    ]


# Defining function to extract content of tags
def extract(container, tag, cls=None):
    """
    Return the whitespace-stripped text of every matching tag in a list of documents.

    Matching tags are located with ``find(container, tag, cls)``; the ``.text``
    of each one is stripped of leading and trailing whitespace and collected
    in the order the tags were found.

    Parameters:
    container (list): A list of BeautifulSoup objects representing the documents to search through.
    tag (str): The HTML tag to search for in the documents.
    cls (str, optional): The class attribute to filter the tags by. Defaults to None.

    Returns:
    list: One stripped string per tag that matches the specified tag and class.

    Example:
    >>> from bs4 import BeautifulSoup
    >>> html_doc = "<html><body><div class='content'> Example </div></body></html>"
    >>> soup = BeautifulSoup(html_doc, 'html.parser')
    >>> extract([soup], 'div', 'content')
    ['Example']
    """
    texts = []
    for element in find(container, tag, cls):
        texts.append(element.text.strip())
    return texts


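The helper functions above follow the 'DRY' (Don't Repeat Yourself) and 'DOT' (Do One Thing) principles: the tag search lives in one place, and each function has a single job.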

In [5]:
# Testing function
# Look up every ratings table and show the first match as a quick sanity check.
result = find(container=all_reviews, tag='table', cls='review-ratings')
print(result[0])

<table class="review-ratings">
<tr>
<td class="review-rating-header food-beverages">Food &amp; Beverages</td>
<td class="review-rating-stars stars">
<span class="star fill">1</span><span class="star fill">2</span><span class="star fill">3</span><span class="star">4</span><span class="star">5</span> </td>
</tr>
<tr>
<td class="review-rating-header inflight-entertainment">Inflight Entertainment</td>
<td class="review-rating-stars stars">
<span class="star fill">1</span><span class="star fill">2</span><span class="star fill">3</span><span class="star">4</span><span class="star">5</span> </td>
</tr>
<tr>
<td class="review-rating-header seat-comfort">Seat Comfort</td>
<td class="review-rating-stars stars">
<span class="star fill">1</span><span class="star fill">2</span><span class="star fill">3</span><span class="star">4</span><span class="star">5</span> </td>
</tr>
<tr>
<td class="review-rating-header staff-service">Staff Service</td>
<td class="review-rating-stars stars">
<span class="sta

In [6]:
# Extracting header and subheader
header = extract(all_reviews, 'h2', 'text_header')

# Strip the '\n\n\n\nN' run from each sub-header while collecting them.
sub_header = [
    text.replace('\n\n\n\nN', '')
    for text in extract(all_reviews, 'h3', "text_sub_header userStatusWrapper")
]

print(len(header))

3921


In [7]:
# Importing library
import numpy as np
import re

# Specifying pattern
# date_pattern matches text such as "5th July 2024"; location_pattern captures
# whatever sits between the first pair of parentheses.
date_pattern = r'\d{1,2}(st|nd|rd|th) [a-zA-Z]+ \d{4}'
location_pattern = r'\((.*?)\)'


def _first_match(pattern, text, group=0):
    """
    Return one group of the first match of `pattern` in `text`.

    Parameters:
    pattern (str): Regular expression to search for.
    text (str): Text to search in.
    group (int, optional): Match group to return. Defaults to 0 (whole match).

    Returns:
    str or float: The requested group, or np.nan when the pattern does not match.
    """
    # Search once and reuse the match object instead of running the regex twice.
    match = re.search(pattern, text)
    return match.group(group) if match else np.nan


# Extracting date and location from subheading
date = [_first_match(date_pattern, text) for text in sub_header]
location = [_first_match(location_pattern, text, 1) for text in sub_header]
# Printing output
print(len(location))
print(date[0], '\t', location[0])


3921
5th July 2024 	 United Kingdom


Regex patterns are used to extract the review date and the reviewer's country from each sub-header.

In [8]:
# Extracting review
# Pull the stripped text of every <div class="text_content"> and preview the second one.
review = extract(container=all_reviews, tag='div', cls='text_content')
print(review[1])

Not Verified |   Had to cancel my flight months in advance due to a change of schedule. I was flying with American Airlines and British Airways. Selected seats and when I attempted to get a refund on the selected seats, AA refunded me with no issue when BA refused despite both companies having similar policies. Avoid them, fly with anyone else. Customer service is a joke, taking over 2 weeks to respond to my email and giving me no further justification than what I was told on the phone.


In [9]:
# Collect the 'review-value' cells of every per-review ratings table.
flight_details = []
for doc in all_reviews:
    table_tag = doc.find_all('table', class_='review-ratings')
    # Skip the first 'review-ratings' table of each page. The previous version
    # removed every 101st table from the flattened list (1 leading table + 100
    # reviews per page), which only lines up when every page returns exactly
    # 101 tables. Dropping it per page is identical on full pages and stays
    # aligned when a page is short or empty.
    # NOTE(review): presumably that leading table is the airline-level summary
    # rather than a review -- confirm against the page markup.
    for table in table_tag[1:]:
        value_tags = table.find_all('td', class_='review-value')
        value = [val.text.strip() for val in value_tags]
        flight_details.append(value)

print(len(flight_details))

print(flight_details[0])


3921
['Couple Leisure', 'Economy Class', 'London to Corfu', 'September 2023', 'no']


In [10]:
# Position of each field inside a `detail` list, keyed by how many values the
# list holds. Tuple order is:
#   (Aircraft, Travellers_type, Seat_type, Route, Date_flown, Recommended)
# None means the field is recorded as missing for that length.
# NOTE(review): which fields are present is inferred only from the number of
# values, so a review missing a different field than assumed here would be
# mislabelled -- confirm against the page markup.
_LAYOUT_BY_LENGTH = {
    6: (0, 1, 2, 3, 4, -1),
    5: (None, 0, 1, 2, 3, -1),
    4: (None, 0, 1, 2, None, -1),
    3: (None, 0, 1, None, None, -1),
    2: (None, None, 0, None, None, -1),
    1: (None, None, None, None, None, 0),
}
# Any other length (including an empty list) yields None for every field.
_ALL_MISSING = (None,) * 6

Aircraft, Travellers_type, Seat_type, Route, Date_flown, Recommended = [], [], [], [], [], []
_columns = (Aircraft, Travellers_type, Seat_type, Route, Date_flown, Recommended)

for detail in flight_details:
    layout = _LAYOUT_BY_LENGTH.get(len(detail), _ALL_MISSING)
    for column, position in zip(_columns, layout):
        column.append(None if position is None else detail[position])

print('Aircraft: ', Aircraft[0], '\n', 'Travellers_type: ', Travellers_type[0], '\n', 'Seat_type: ', Seat_type[0], '\n', 'Route: ', Route[0], '\n', 'Date_flown: ', Date_flown[0], '\n', 'Recommendation: ', Recommended[0])
print(len(Aircraft))

Aircraft:  None 
 Travellers_type:  Couple Leisure 
 Seat_type:  Economy Class 
 Route:  London to Corfu 
 Date_flown:  September 2023 
 Recommendation:  no
3921


In [11]:
# For every per-review ratings table, collect one list per rating row holding
# the text of its filled stars (e.g. ['1', '2', '3'] for a 3-star rating).
review_ratings = []
for doc in all_reviews:
    table_tag = doc.find_all('table', class_='review-ratings')
    # Skip the first 'review-ratings' table of each page. The previous version
    # removed every 101st table from the flattened list (1 leading table + 100
    # reviews per page), which only lines up when every page returns exactly
    # 101 tables. Dropping it per page is identical on full pages and stays
    # aligned when a page is short or empty.
    # NOTE(review): presumably that leading table is the airline-level summary
    # rather than a review -- confirm against the page markup.
    for table in table_tag[1:]:
        value_tags = table.find_all('td', class_='review-rating-stars stars')
        review_values = [
            [star.text.strip() for star in td.find_all('span', class_='star fill')]
            for td in value_tags
        ]
        review_ratings.append(review_values)

print(len(review_ratings))


3921


In [12]:
# Importing numpy 
import numpy as np
# Define a helper function to extract max value or return None if the list is empty
def get_max_value(star_list, index):
    """
    Returns the maximum integer value from a nested list at a specified index.
    If the sublist at the specified index is empty or the index is past the end
    of the list, the function returns None.

    Parameters:
    star_list (list of list of str): The nested list from which to extract the maximum value.
    index (int): The index of the sublist to search for the maximum value.

    Returns:
    int or None: The maximum integer value from the sublist at the specified index, or None
    if the sublist is empty or the index is past the end of the list.

    Example:
    >>> star_list = [['1', '2', '3'], ['4', '5', '6'], ['7', '8', '9']]
    >>> get_max_value(star_list, 1)
    6
    >>> get_max_value(star_list, 3) is None
    True
    >>> get_max_value([[], ['2']], 0) is None
    True
    """
    # No sublist at this position.
    if index >= len(star_list):
        return None
    stars = star_list[index]
    # An empty sublist has no maximum.
    if not stars:
        return None
    # Values are stored as strings, so convert before comparing.
    return max(map(int, stars))

# Build one rating column per position in each review's list of star rows
# NOTE(review): ratings are assigned purely by row position (0-4); this assumes
# every review lists its rating rows in this order with none missing -- TODO confirm.
def _rating_column(position):
    """Return get_max_value(stars, position) for every entry of review_ratings."""
    return [get_max_value(stars, position) for stars in review_ratings]


seat_comfort = _rating_column(0)
staff_service = _rating_column(1)
food_beverage = _rating_column(2)
ground_service = _rating_column(3)
pricing = _rating_column(4)

# Print results: first six values and the total length of each column
for rating_label, rating_values in (
    ("Seat Comfort:", seat_comfort),
    ("Staff Service:", staff_service),
    ("Food & Beverage:", food_beverage),
    ("Ground Service:", ground_service),
    ("Pricing:", pricing),
):
    print(rating_label, rating_values[:6], len(rating_values))



Seat Comfort: [1, 1, 1, 4, 2, 3] 3921
Staff Service: [1, None, 1, 5, 2, 3] 3921
Food & Beverage: [1, None, 1, 3, 1, 3] 3921
Ground Service: [1, None, 1, 1, 2, 3] 3921
Pricing: [None, None, 1, 3, 2, 1] 3921


In [13]:
# Confirming data length uniformity
# All four lengths are printed on one line so they can be compared at a glance.
print(*(len(column) for column in (header, date, location, review)))

3921 3921 3921 3921


In [16]:
import pandas as pd

# Preparing dictionary of list for Dataframe creation
# Each dictionary maps a column name to the list holding that column's values.
reviews = {
    'title': header,
    'date': date,
    'country': location,
    'review': review,
}
flight_info = {
    'aircraft': Aircraft,
    'travellers_type': Travellers_type,
    'Seat_type': Seat_type,
    'route': Route,
    'date_flown': Date_flown,
}
ratings = {
    'aircraft': Aircraft,
    'date_flown': Date_flown,
    'seat_comfort': seat_comfort,
    'staff_service': staff_service,
    'food_beverage': food_beverage,
    'ground_service': ground_service,
    'pricing': pricing,
    'recommended': Recommended,
}

# Creating Dataframes
reviews_df, flight_info_df, ratings_df = (
    pd.DataFrame(columns) for columns in (reviews, flight_info, ratings)
)

# Displaying datasets
display(reviews_df.head(5), flight_info_df.head(), ratings_df.head())

Unnamed: 0,title,date,country,review
0,“customer service was awful”,5th July 2024,United Kingdom,Not Verified | The WORST customer experience! ...
1,"""over 2 weeks to respond""",1st July 2024,Canada,Not Verified | Had to cancel my flight month...
2,“wholly inadequate customer service”,30th June 2024,United Kingdom,✅ Trip Verified | Flight cancelled with no rea...
3,“the cabin crew were great”,26th June 2024,United Kingdom,✅ Trip Verified | This is a route I fly regula...
4,"""cannot recommend BA""",23rd June 2024,Canada,✅ Trip Verified | While BA may have made some...


Unnamed: 0,aircraft,travellers_type,Seat_type,route,date_flown
0,,Couple Leisure,Economy Class,London to Corfu,September 2023
1,,Solo Leisure,Economy Class,Toronto to London,July 2024
2,,Couple Leisure,Economy Class,London Gatwick to Verona,June 2024
3,A320,Business,Business Class,London Heathrow to Paris CDG,June 2024
4,A350-1000,Solo Leisure,Economy Class,London Heathrow to Vancouver,June 2024


Unnamed: 0,aircraft,date_flown,seat_comfort,staff_service,food_beverage,ground_service,pricing,recommended
0,,September 2023,1,1.0,1.0,1.0,,no
1,,July 2024,1,,,,,no
2,,June 2024,1,1.0,1.0,1.0,1.0,no
3,A320,June 2024,4,5.0,3.0,1.0,3.0,yes
4,A350-1000,June 2024,2,2.0,1.0,2.0,2.0,no


Data has been successfully scraped

In [18]:
# Saving datasets
# Each DataFrame is written to its own CSV file (index column included).
for frame, csv_name in (
    (reviews_df, 'B_Airways_reviews.csv'),
    (flight_info_df, 'B_Airways_flight_info.csv'),
    (ratings_df, 'B_Airways_ratings.csv'),
):
    frame.to_csv(csv_name)

The datasets have now been saved as CSV files.

# Dataset Description
B_Airways_ratings.csv; Reviewers' ratings

index: Index column

aircraft: Aircraft type
date_flown: Date of the flight
seat_comfort: Rating of the seat comfort (1-5)
staff_service: Rating of the staff service (1-5)
food_beverage: Rating of the food and beverage service (1-5)
ground_service: Rating of the ground service (1-5)
pricing: Rating of value for money (1-5)
recommended: Whether the reviewer recommended the airline (yes/no)

B_Airways_flight_info.csv; Flight details of reviewer

index: Index column

date_flown: Date of the flight
travellers_type: Type of travelers (e.g., Business, Leisure)
route: Flight route
aircraft: Aircraft type
Seat_type: Type of seat (e.g., Economy, Business)

B_Airways_reviews.csv; Scraped reviews dataset

index: Index column

title: Title of review
date: Date of review
country: Country of reviewer
review: Actual review

# Challenges 
During the data extraction process, we encountered irregularities in the structure of the flight information. The fields for aircraft type, traveller type, seat type, route, and date flown were not consistently populated. This required a dynamic approach to ensure all possible data points were captured accurately.


# Recommendation
To further enhance the web scraping process and the usability of the data, the following steps are recommended:
1.	Automated Data Cleaning: Implement automated scripts to clean and normalize the data, ensuring consistency and accuracy.
2.	Incremental Scraping: Develop a system to incrementally scrape new reviews, keeping the dataset up-to-date without redundant data collection.
3.	Advanced Sentiment Analysis: Utilize advanced natural language processing (NLP) techniques to perform in-depth sentiment analysis and topic modeling.
4.	Interactive Dashboards: Create interactive dashboards to visualize the data, making it easier for stakeholders to interpret and act upon the insights.

# Conclusion
This web scraping project has successfully demonstrated the power of data in understanding customer experiences. By addressing the challenges and employing robust methodologies, I have laid the foundation for comprehensive analysis and actionable insights. Future work can build upon this dataset to drive meaningful improvements in the airline industry.