# Skytrax Scraper

run all:

In [None]:
# Import required libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import unicodedata
import numpy as np 
import os
import time

In [None]:
# Collect every airline name listed on the Skytrax A-Z index page.
airline_names = []

# Custom User-Agent so the request looks like it comes from a browser.
# This dict is reused by the review-scraping cell further down.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36"
}

# The per-letter "#a2z-ldr-X" suffix used previously is a URL fragment: it is
# resolved client-side and never sent to the server, so every request returned
# the very same page. Download the index once and reuse the parsed document.
url = "https://www.airlinequality.com/review-pages/a-z-airline-reviews/"
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
html_text = response.text

# Parse the HTML content using BeautifulSoup with the "lxml" parser
soup = BeautifulSoup(html_text, "lxml")

# Each letter has its own <div id="a2z-ldr-X"> holding the airlines as <li> items.
for code_point in range(ord('A'), ord('Z') + 1):
    initial = chr(code_point)
    uri = f"a2z-ldr-{initial}"

    container = soup.find("div", {"id": uri})
    if container is None:
        # No section for this letter on the page: nothing to collect.
        continue

    airline_names.extend(entry.text for entry in container.find_all("li"))

In [None]:
# Keep only MUST airlines, i.e. the scraped names that also appear
# (case-insensitively) in the 'Current_Airline_Name' column of segmentation.csv.
segmentation_df = pd.read_csv("C:/Users/jadea/Documents/Mémoire M2 docs/segmentation.csv", encoding='latin-1')

# .str.lower() leaves missing values as NaN instead of raising on them.
segmentation_df['Current_Airline_Name'] = segmentation_df['Current_Airline_Name'].str.lower()

# Build the lookup set once; membership tests are then O(1) per airline
# instead of re-creating and scanning a list for every name.
segmentation_names = set(segmentation_df['Current_Airline_Name'].dropna())

matching_airlines = [airline for airline in airline_names if airline.lower() in segmentation_names]
print(len(matching_airlines))
matching_airlines

In [None]:
# Build the first review-page URL (100 reviews per page, newest first)
# for every airline in 'matching_airlines'.
start = "https://www.airlinequality.com/airline-reviews/"
end = "?sortby=post_date%3ADesc&pagesize=100"
airline_url = []
for name in matching_airlines:
    # Lower-case, then decompose accented characters and drop whatever
    # cannot be expressed in ASCII.
    ascii_name = unicodedata.normalize('NFKD', name.lower()).encode('ASCII', 'ignore').decode('utf-8')
    # Every single space becomes a hyphen.
    slug = "-".join(ascii_name.split(" "))
    airline_url.append(f"{start}{slug}/{end}")

In [None]:
# to convert names of airlines into url
def generate_url(airline, page):
    """Return the review-listing URL for ``airline`` at result page ``page``.

    The airline name is lower-cased, accented characters are reduced to
    their ASCII base letters (anything else non-ASCII is dropped) and
    spaces are replaced by hyphens. Page 1 uses the bare airline path;
    any other page number is inserted as ``/page/<page>/``. The query
    string always asks for 100 reviews per page, newest first.
    """
    folded = unicodedata.normalize('NFKD', airline.lower())
    slug = folded.encode('ASCII', 'ignore').decode('utf-8').replace(" ", "-")

    base = "https://www.airlinequality.com/airline-reviews/" + slug
    query = "?sortby=post_date%3ADesc&pagesize=100"
    page_part = "/" if page == 1 else f"/page/{page}/"
    return base + page_part + query

Run the scraper:

In [None]:
%%time

reviews = []
path = "C:/Users/jadea/Notebooks/Mémoire M2 notebooks/skytrax"  # path prefix of the CSV files holding the scraped reviews
# Labels of the per-review rating table, in the order they are stored in a row.
columns = ["Aircraft","Type Of Traveller","Seat Type","Route","Date Flown","Seat Comfort","Cabin Staff Service","Food & Beverages","Ground Service","Inflight Entertainment","Wifi & Connectivity","Value For Money","Recommended"]
df_columns = ["Airline Name","Overall_Rating","Review_Title","Review Date","Verified","Review","Aircraft","Type Of Traveller","Seat Type","Route","Date Flown","Seat Comfort","Cabin Staff Service","Food & Beverages","Ground Service","Inflight Entertainment","Wifi & Connectivity","Value For Money","Recommended"]
file_num = 1


def _parse_rating_table(table):
    """Return the rating-table values of one review, ordered like ``columns``."""
    tab = [None] * len(columns)
    if table is None:
        return tab
    for row in table.find_all("tr"):
        cells = row.find_all("td")
        if not cells:
            continue
        label = cells[0].text
        if label not in columns:
            # Unknown category: skip it rather than abort the whole scrape.
            continue
        ind = columns.index(label)
        for td in cells[1:]:
            if td.find("span") is None:
                # Plain text value
                tab[ind] = td.text
            else:
                # Star rating: the value is the number of filled stars.
                tab[ind] = len(td.find_all("span", {"class": "star fill"}))
    return tab


def _parse_review(article, airline):
    """Turn one review <article> into a row ordered like ``df_columns``."""
    # Overall rating: keep everything before the first "/" so a two-digit
    # score is not cut down to its first character.
    rating = article.find("div", {"class": "rating-10"})
    rating = rating.text.strip().split("/")[0].strip() if rating else None

    title = article.find("h2")
    title = title.text if title else None

    # Named 'review_date' so the imported 'time' module is not overwritten.
    heading = article.find("h3")
    date_tag = heading.find("time") if heading is not None else None
    review_date = date_tag.text if date_tag else None

    # The body may start with a status prefix separated by "|". Split only
    # once so a "|" inside the review text does not truncate it.
    verified = False
    review = None
    body = article.find("div", {"class": "text_content"})
    if body is not None:
        parts = body.text.split("|", 1)
        if len(parts) == 1:
            review = parts[0]
        else:
            verified = parts[0] == '✅ Trip Verified '
            review = parts[1]

    tab = _parse_rating_table(article.find("table"))
    return [airline, rating, title, review_date, verified, review] + tab


def _split_route(route):
    """Return (origin, destination) with spaces removed, or (None, None)."""
    # Split on the word " to " (with spaces); a bare "to" also matches
    # inside city names such as "Toronto".
    if not isinstance(route, str) or ' to ' not in route:
        return None, None
    origin, destination = route.split(' to ', 1)
    return origin.replace(' ', ''), destination.replace(' ', '')


def _save_batch(rows, file_number):
    """Write ``rows`` to '<path><file_number>.csv' with origin/destination columns."""
    df = pd.DataFrame(rows, columns=df_columns)
    pairs = [_split_route(route) for route in df['Route']]
    # Routes that cannot be split are blanked out.
    df['Route'] = [route if pair != (None, None) else None for route, pair in zip(df['Route'], pairs)]
    df['origin'] = [pair[0] for pair in pairs]
    df['destination'] = [pair[1] for pair in pairs]

    path_csv = path + '{}.csv'.format(file_number)
    df.to_csv(path_csv)


# Walk through the result pages of every airline in 'matching_airlines'
for airline in matching_airlines:
    page = 1
    while True:
        url = generate_url(airline, page)
        page += 1
        # Fetch the review page; the timeout keeps a stalled connection from
        # blocking the scrape forever.
        html = requests.get(url, headers=headers, timeout=30).text
        bs = BeautifulSoup(html, "html.parser")

        # A page without any review marks the end of this airline's listing.
        if bs.find("article", {"itemprop": "review"}) is None:
            print(f'end {airline}')
            break

        # Container that holds the review articles of the page
        container = bs.find("article", {"class": "comp comp_reviews-airline querylist position-content"})
        if container:
            for items in container.find_all("article"):
                reviews.append(_parse_review(items, airline))

    # Flush to disk once enough rows have accumulated (checked per airline).
    if len(reviews) >= 10000:
        _save_batch(reviews, file_num)
        file_num += 1

        print(len(reviews),'saved')
        reviews = []

print('done')

# NOTE(review): 'reviews' now only holds the rows gathered since the last CSV
# flush (fewer than 10000); this cell does not write them to disk. Confirm they
# are saved elsewhere.
