## Web Scraping

We gathered data from www.skytrax.com, scraping around 2000 reviews for each of seven major airlines. This data forms the basis of our project, aimed at understanding passenger feedback and improving air travel experiences.

In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests

In [11]:
# URL pattern for airline reviews
url = "https://www.airlinequality.com/airline-reviews/flight/page/page_number/?sortby=post_date%3ADesc&pagesize=100"

# Initialize an empty list to store DataFrames
dfs = []

# List of pre-decided airlines to scrape reviews for
airlines = ['ryanair', 'delta-air-lines', 'spirit-airlines', 'united-airlines', 'emirates', 'qatar-airways', 'etihad-airways']

for airline in airlines:
    
    # Replace 'flight' in the URL with the airline name
    urllink = url.replace('flight', airline)
    
    # Loop through the first 20 pages
    for i in range(1, 20):  
        
        response = requests.get(urllink.replace('page_number', str(i)))
        
        # Check if the request was successful
        if response.status_code == 200:  
            
            soup = BeautifulSoup(response.text, "html.parser")
            reviews = soup.find_all("article", itemprop="review")  
            
            # Find all review attributes:
            for item in reviews:
                my_dict = {
                    "rating": "",
                    "country": "",
                    "date": "",
                    "review": "",
                    "Aircraft": '',
                    "Type Of Traveller": '',
                    "Seat Type": '',
                    "Route": '',
                    "Date Flown": '',
                    "Seat Comfort": '',
                    "Cabin Staff Service": '',
                    "Food & Beverages": '',
                    "Inflight Entertainment": '',
                    "Ground Service": '',
                    "Wifi & Connectivity": '',
                    "Value For Money": '',
                    "Recommended": '',
                    "Airlines": airline
                }

                try:
                    # Extract the rating
                    my_dict['rating'] = item.find("span", itemprop="ratingValue").text.strip()
                except AttributeError:
                    pass

                try:
                    # Extract the review text
                    my_dict['review'] = item.find("div", class_="text_content").text.strip()
                except AttributeError:
                    pass

                try:
                    # Extract author, country, and date
                    temp = item.find("h3", class_="text_sub_header").text.strip()
                    my_dict['author'] = temp.split('(')[0].strip()
                    my_dict['country'] = temp.split('(')[1].split(')')[0].strip()
                    my_dict['date'] = temp.split(')')[1].strip()
                except AttributeError:
                    pass

                # Extract the Aircraft field
                my_dict['Aircraft'] = item.find("td", class_='')

                # Extract the remaining review fields
                for i in item.find_all('td', 'review-value'):
                    my_dict[i.previous_sibling.get_text().strip()] = i.get_text().strip()

                # Extract the rating fields
                for j in item.find_all('td', 'review-rating-stars'):
                    category = j.parent.find('td', 'review-rating-header').text.strip()
                    my_dict[category] = str(len(j.parent.find_all('span', class_="fill")))

                # Append the review dictionary to the list of DataFrames
                dfs.append(pd.DataFrame([my_dict]))

# Concatenate all DataFrames into a single DataFrame
output = pd.concat(dfs, ignore_index=True)

In [6]:
output

Unnamed: 0,rating,country,date,review,Aircraft,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Wifi & Connectivity,Value For Money,Recommended,Airlines,author
0,1,Jordan,8th April 2024,✅ Trip Verified | Ryanair lost my luggage on a...,,Business,Economy Class,"Charleroi (Brussels South) to Amman, Jordan",April 2024,2,1,,,1,,1,no,ryanair,Alan Robinson
1,1,Switzerland,8th April 2024,✅ Trip Verified | Booked Basel to Dublin 11.1...,,Family Leisure,Economy Class,Basel to Dublin,April 2024,1,1,,,1,,1,no,ryanair,T Maysan
2,6,Germany,5th April 2024,✅ Trip Verified | You get what you pay. Had ...,Boeing 737-800,Couple Leisure,Economy Class,Bologna to Cologne,April 2024,3,3,3,,3,,5,yes,ryanair,55 reviews\n\n\n\nR Darnel
3,3,Italy,1st April 2024,Not Verified | Very cheeky check-in system: t...,,Couple Leisure,Economy Class,Manchester to Milan,March 2024,1,2,,1,2,,2,no,ryanair,Y Chen
4,1,Spain,28th March 2024,Not Verified | Terrible customer service. Han...,A321,Family Leisure,Economy Class,Marrakech to Sevilla,March 2024,2,4,,1,1,1,3,no,ryanair,Diego Perez
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13160,9,Switzerland,5th August 2013,Flew AUH-GVA A333. The Business class product ...,,,Business Class,,,5,4,4,5,,,4,yes,etihad-airways,David Monteiro
13161,2,United Kingdom,5th August 2013,March 2013 MAN-AUH-DEL in Economy Class. While...,,,Economy Class,,,2,2,3,2,,,2,no,etihad-airways,V Saxena
13162,8,Greece,4th August 2013,For some reason First Class is not always avai...,,,First Class,,,5,5,5,5,,,5,yes,etihad-airways,S Nkavoyannis
13163,2,Australia,1st August 2013,Very disappointing flight (DUB-AUH-SYD) recent...,,,Economy Class,,,3,1,2,3,,,3,no,etihad-airways,Colin Walsh


In [12]:
# save the data in csv format
output.to_csv('web_scraped_data.csv', index=False)