In [1]:
import math
import time
import traceback

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Helper functions for fetching and parsing web pages
def fetch_webpage(url, timeout=30):
    """Download a page and return its body text, or None on any failure.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        None (a diagnostic message is printed in that case).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a bad
        # status code so that callers can simply retry on None.
        print(f"Failed to fetch webpage. Error: {exc}")
        return None

    if response.status_code == 200:
        return response.text

    print(f"Failed to fetch webpage. Status code: {response.status_code}")
    return None

def parse_html(html_content):
    """Parse an HTML string into a BeautifulSoup tree using 'html.parser'."""
    soup = BeautifulSoup(html_content, 'html.parser')
    return soup

In [4]:
# Get the total number of Scoot reviews from the first listing page.
# The count is used afterwards to work out how many pages must be scraped.
first_url = "https://www.airlinequality.com/airline-reviews/scoot/?sortby=post_date%3ADesc&pagesize=100"
num_reviews = 0
success = False
for attempt in range(10):
    # Fetch inside the loop so that every iteration is a genuine retry.
    first_page = fetch_webpage(first_url)
    if not first_page:
        time.sleep(1)  # short pause before the next attempt
        continue

    soup = parse_html(first_page)
    reviews_html = soup.find_all('div', class_='pagination-total')
    if not reviews_html:
        # The page loaded but the review counter is missing; try again
        # instead of crashing on an empty result list.
        time.sleep(1)
        continue

    # Keep the last integer token found in the counter's text.
    for word in reviews_html[0].get_text().split():
        try:
            num_reviews = int(word)
        except ValueError:
            continue
    success = True
    break
if not success:
    print("Failed at getting first page in 10 tries")

print(num_reviews)

682


In [5]:
# Build the URL of every listing page that has to be scraped.
PAGE_SIZE = 100  # reviews per page; also sent as the pagesize= query parameter
MAX_PAGES = math.ceil(num_reviews / PAGE_SIZE)
list_url = [
    f'https://www.airlinequality.com/airline-reviews/scoot/page/{page}/?sortby=post_date%3ADesc&pagesize={PAGE_SIZE}'
    for page in range(1, MAX_PAGES + 1)
]

In [6]:
# Display the generated page URLs as a sanity check before scraping.
list_url

['https://www.airlinequality.com/airline-reviews/scoot/page/1/?sortby=post_date%3ADesc&pagesize=100',
 'https://www.airlinequality.com/airline-reviews/scoot/page/2/?sortby=post_date%3ADesc&pagesize=100',
 'https://www.airlinequality.com/airline-reviews/scoot/page/3/?sortby=post_date%3ADesc&pagesize=100',
 'https://www.airlinequality.com/airline-reviews/scoot/page/4/?sortby=post_date%3ADesc&pagesize=100',
 'https://www.airlinequality.com/airline-reviews/scoot/page/5/?sortby=post_date%3ADesc&pagesize=100',
 'https://www.airlinequality.com/airline-reviews/scoot/page/6/?sortby=post_date%3ADesc&pagesize=100',
 'https://www.airlinequality.com/airline-reviews/scoot/page/7/?sortby=post_date%3ADesc&pagesize=100']

In [7]:
# Empty frame declaring the review fields that are collected for each comment
comments_data = pd.DataFrame(
    columns=[
        'Date Published',
        'Overall Rating',
        'Passenger Country',
        'Trip Verified',
        'Review Title',
        'Review',
        'Aircraft',
        'Type Of Traveller',
        'Seat Type',
        'Origin',
        'Destination',
        'Layover',
        'Date Flown',
        'Seat Comfort',
        'Cabin Staff Service',
        'Food & Beverages',
        'Ground Service',
        'Value For Money',
        'Recommended',
    ]
)
# One dict per scraped review is appended here by the scraping loop
comments_data_list = list()

# Lookup from the CSS class of a ratings-table header cell to the column
# label used for that flight detail or sub-rating
class_to_label = dict(
    aircraft='Aircraft',
    type_of_traveller='Type Of Traveller',
    cabin_flown='Seat Type',
    route='Route',
    date_flown='Date Flown',
    seat_comfort='Seat Comfort',
    cabin_staff_service='Cabin Staff Service',
    food_and_beverages='Food & Beverages',
    inflight_entertainment='Inflight Entertainment',
    ground_service='Ground Service',
    wifi_and_connectivity='Wifi & Connectivity',
    value_for_money='Value For Money',
    recommended='Recommended',
)

In [8]:
# Scrape every review from every listing page into comments_data_list.
MAX_ATTEMPTS = 10        # fetch attempts per page before giving up
RETRY_DELAY_SECONDS = 1  # pause between two failed attempts


def _split_route(route):
    """Split a route string into (origin, destination, layover).

    Recognised shapes are "A to B", "A - B", "A to B via C" (or
    "A - B via C") and "A to C then to B". Every split is limited to the
    first separator, so a value containing a separator more than once no
    longer raises and discards the whole review. Parts that cannot be
    identified are returned as empty strings; all parts are stripped.
    """
    origin = destination = layover = ""
    if " via " in route:
        route, layover = route.split(" via ", 1)
    elif " then to " in route:
        route, destination = route.split(" then to ", 1)
        if " to " in route:
            origin, layover = route.split(" to ", 1)
        else:
            origin = route
        return origin.strip(), destination.strip(), layover.strip()

    for separator in (" to ", " - "):
        if separator in route:
            origin, destination = route.split(separator, 1)
            break
    return origin.strip(), destination.strip(), layover.strip()


def _parse_ratings_table(comment):
    """Collect the flight details and sub-ratings of one review as a dict."""
    ratings_table = comment.find('table', class_='review-ratings')
    # Restrict the row search to the ratings table when it exists; fall back
    # to the whole review so nothing is lost if the table class is absent.
    container = ratings_table if ratings_table is not None else comment

    table_data = {}
    for row in container.find_all('tr'):
        header_cell = row.find('td', class_='review-rating-header')
        if header_cell is None:
            continue
        # The second CSS class of the header cell identifies the field.
        header_classes = header_cell.get('class', [])
        if len(header_classes) < 2:
            continue
        data_label = class_to_label.get(header_classes[1])
        if not data_label:
            # Unknown field: skip it rather than storing it under an empty key.
            continue

        value_cell = row.find('td', class_='review-value')
        stars_cell = row.find('td', class_='review-rating-stars')

        # Details of trip (plain text value)
        if value_cell is not None:
            value = value_cell.text.strip()
            if data_label == 'Route':
                origin, destination, layover = _split_route(value)
                table_data['Origin'] = origin
                table_data['Destination'] = destination
                table_data["Layover"] = layover
            else:
                table_data[data_label] = value

        # Subratings (number of filled stars)
        if stars_cell is not None:
            filled_star_spans = stars_cell.find_all('span', class_='star fill')
            table_data[data_label] = len(filled_star_spans)
    return table_data


def _parse_review(comment):
    """Turn one review <article> element into a flat dict of column values."""
    date_published = comment.find('meta', itemprop='datePublished')['content']

    rating_tag = comment.find('span', itemprop='ratingValue')
    rating = rating_tag.text if rating_tag is not None else ""

    text_header = comment.find('h2', class_='text_header').text

    # The country is taken from the last parenthesised part of the sub header.
    sub_header = comment.find('h3', class_='text_sub_header userStatusWrapper').get_text(strip=True)
    country = sub_header.split('(')[-1].split(')')[0]

    body = comment.find('div', class_='text_content', itemprop='reviewBody')
    # Element containing 'Not Verified' or 'Trip Verified', when present
    verification_tag = body.find('strong')
    verification = verification_tag.text.strip() if verification_tag is not None else ""
    review_text = body.text.strip()
    if '|' in review_text:
        # Drop only the prefix before the first '|'; later '|' characters
        # belong to the review text itself and must be kept.
        review_text = review_text.split('|', 1)[1].strip()

    return {
        'Date Published': date_published,
        'Overall Rating': rating,
        'Passenger Country': country,
        'Trip Verified': verification,
        'Review Title': text_header,
        'Review': review_text,
        **_parse_ratings_table(comment),
    }


for url in list_url:
    success = False
    html_content = None
    for attempt in range(MAX_ATTEMPTS):
        # Fetch inside the loop so that every attempt is a genuine retry.
        html_content = fetch_webpage(url)
        if html_content:
            success = True
            break
        if attempt < MAX_ATTEMPTS - 1:
            time.sleep(RETRY_DELAY_SECONDS)
    if not success:
        # Reported once per page, after all attempts are exhausted.
        print(f"Fetching {url} failed in {MAX_ATTEMPTS} attempts")
        continue

    soup = parse_html(html_content)

    # Find all comment elements
    comments = soup.find_all('article', itemprop='review')
    for index, comment in enumerate(comments):
        try:
            comments_data_list.append(_parse_review(comment))
        except Exception:
            # Best effort: log the broken review and carry on with the rest.
            print(f'Error in the comment: -> {index}')
            traceback.print_exc()

In [9]:
# Build the final DataFrame in a single step from the collected review dicts
comments_data = pd.DataFrame(data=comments_data_list)

In [10]:
# Write the scraped reviews to CSV as UTF-8, leaving out the index column
comments_data.to_csv(
    'scoot_reviews.csv',
    index=False,
    encoding='utf-8',
)

In [11]:
# Print a summary of the scraped frame (entries, columns, non-null counts, dtypes).
comments_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682 entries, 0 to 681
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Date Published          682 non-null    object 
 1   Overall Rating          682 non-null    object 
 2   Passenger Country       682 non-null    object 
 3   Trip Verified           682 non-null    object 
 4   Review Title            682 non-null    object 
 5   Review                  682 non-null    object 
 6   Type Of Traveller       501 non-null    object 
 7   Seat Type               627 non-null    object 
 8   Origin                  500 non-null    object 
 9   Destination             500 non-null    object 
 10  Layover                 500 non-null    object 
 11  Date Flown              500 non-null    object 
 12  Seat Comfort            631 non-null    float64
 13  Cabin Staff Service     627 non-null    float64
 14  Food & Beverages        456 non-null    fl