In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil import parser
import matplotlib.pyplot as plt
from textblob import TextBlob
from collections import Counter
from openpyxl import Workbook

## Try out for British Airways

In [8]:
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pagesize = 100 
pages = 37
airline = "BA"
region = "Europe"

# Create empty lists for ratings
categories = ['Seat Comfort', 'Cabin Staff Service', 'Food & Beverages', 
              'Inflight Entertainment', 'Ground Service', 'Wifi & Connectivity','Value For Money']
ratings = {category: [] for category in categories}

# Create an empty DataFrame
db = pd.DataFrame(columns=['Date_of_Review', 'Region', 'Airline', 'Title', 'Rating', 'Date Flown', 'Aircraft','Type_of_Traveller',
                           'Seat_Type','Route','Recommended', 'Review'] + categories)

for i in range(1, pages + 1):
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={pagesize}"
    response = requests.get(url)
    content = response.content
    soup = BeautifulSoup(content, 'html.parser')

    for div in soup.find_all("article", itemprop="review"):
        rating = div.find("span", itemprop="ratingValue")
        rating = rating.text if rating else "N/A"
        
        title = div.find("h2", class_="text_header").get_text().strip('"')
        
        date = div.find("time", itemprop="datePublished")
        date = parser.parse(date.text).strftime('%Y-%m-%d') if date else "N/A"        
        
        review = div.find("div", class_="text_content").get_text().strip('✅')
        
        aircraft = div.find("td", class_="review-rating-header aircraft")
        aircraft = aircraft.find_next("td", class_="review-value").text if aircraft else "N/A"
        
        traveller = div.find("td", class_="review-rating-header type_of_traveller")
        traveller = traveller.find_next("td", class_="review-value").text if traveller else "N/A"
        
        seat_type = div.find("td", class_="review-rating-header cabin_flown")
        seat_type = seat_type.find_next("td", class_="review-value").text if seat_type else "N/A"
        
        route = div.find("td", class_="review-rating-header route")
        route = route.find_next("td", class_="review-value").text if route else "N/A"

        date_flown = div.find("td", text="Date Flown")
        date_flown = parser.parse(date_flown.find_next("td", class_="review-value").text).strftime('%Y-%m') if date_flown else "N/A"

        recommended = div.find("td", class_="review-rating-header recommended")
        recommended = recommended.find_next("td", class_=lambda x: x and x.startswith("review-value rating-")).text if recommended else "N/A"

        category_values = {}

        for category in categories:
            category_values[category] = "N/A"

        for row in div.find_all('tr'):
            header = row.find('td', class_='review-rating-header').text
            value = len(row.find_all('span', class_='star fill')) if row.find_all('span', class_='star fill') else "N/A"
            if header in categories:
                category_values[header] = value

        # Append the data to the DataFrame
        db_new_row = pd.DataFrame({
            'Aircraft':[aircraft],
            'Type_of_Traveller':[traveller],
            'Seat_Type':[seat_type],
            'Route':[route],
            'Region': [region],
            'Airline': [airline],
            'Rating': [rating],
            'Title': [title],
            'Date_of_Review': [date],
            'Review': [review],
            'Date Flown': [date_flown],
            'Recommended': [recommended],
            **category_values
        })

        db = pd.concat([db, db_new_row], ignore_index=True)


Unnamed: 0,Date_of_Review,Region,Airline,Title,Rating,Date Flown,Aircraft,Type_of_Traveller,Seat_Type,Route,Recommended,Review,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Wifi & Connectivity,Value For Money
0,2023-10-31,Europe,BA,“BA clearly does not care”,1,2023-10,,Couple Leisure,Economy Class,Anchorage to Heathrow via Barcelona,no,Trip Verified | 1. Ground crew in Heathrow ...,3,3,3,3,1,3,1
1,2023-10-28,Europe,BA,reflect a downward trend of the airline,6,2023-10,A320,Solo Leisure,Economy Class,London Heathrow to Gibraltar,no,Trip Verified | London Heathrow to Gibraltar...,3,1,1,,3,1,2
2,2023-10-24,Europe,BA,One of my better flights,8,2023-10,,Couple Leisure,Economy Class,London to Philadelphia,yes,Trip Verified | First time flying British Air...,4,5,4,3,5,,5
3,2023-10-23,Europe,BA,"I would still recommend BA""",7,2023-10,A321 neo,Business,Economy Class,London Heathrow to Cairo,yes,Not Verified | I flew London to Cairo and ret...,3,5,3,,5,,2
4,2023-10-22,Europe,BA,the worst experience ever,1,2023-10,,Couple Leisure,Economy Class,Seattle to Porto via Heathrow,no,Not Verified | Absolutely the worst experienc...,2,3,3,3,1,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3685,2012-08-29,Europe,BA,British Airways customer review,5,,,,Business Class,,yes,Flew return in CW from LHR to BKK in August 20...,4,3,2,4,,,3
3686,2012-08-28,Europe,BA,British Airways customer review,9,,,,Business Class,,yes,LHR to HAM. Purser addresses all club passenge...,4,5,4,,,,3
3687,2011-10-12,Europe,BA,British Airways customer review,5,,,,Economy Class,,yes,My son who had worked for British Airways urge...,,,,,,,4
3688,2011-10-11,Europe,BA,British Airways customer review,4,,,,Premium Economy,,no,London City-New York JFK via Shannon on A318 b...,1,3,5,,,,1


In [9]:
csv_file_name = f'{airline}_reviews.csv'
db.to_csv(csv_file_name, index=False)

## Define a function to implement the scrapping

In [15]:


def scrape_airline_reviews(base_url, pagesize, pages, airline, region):
    # Create empty lists for ratings
    categories = ['Seat Comfort', 'Cabin Staff Service', 'Food & Beverages', 
                  'Inflight Entertainment', 'Ground Service', 'Wifi & Connectivity','Value For Money']
    ratings = {category: [] for category in categories}

    # Create an empty DataFrame
    db = pd.DataFrame(columns=['Date_of_Review', 'Region', 'Airline', 'Title', 'Rating', 'Date Flown', 'Aircraft','Type_of_Traveller',
                               'Seat_Type','Route','Recommended', 'Review'] + categories)

    for i in range(1, pages + 1):
        url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={pagesize}"
        response = requests.get(url)
        content = response.content
        soup = BeautifulSoup(content, 'html.parser')

        for div in soup.find_all("article", itemprop="review"):
            rating = div.find("span", itemprop="ratingValue")
            rating = rating.text if rating else "N/A"
            
            title = div.find("h2", class_="text_header").get_text().strip('"')
            
            date = div.find("time", itemprop="datePublished")
            date = parser.parse(date.text).strftime('%Y-%m-%d') if date else "N/A"        
            
            review = div.find("div", class_="text_content").get_text().strip('✅')
            
            aircraft = div.find("td", class_="review-rating-header aircraft")
            aircraft = aircraft.find_next("td", class_="review-value").text if aircraft else "N/A"
            
            traveller = div.find("td", class_="review-rating-header type_of_traveller")
            traveller = traveller.find_next("td", class_="review-value").text if traveller else "N/A"
            
            seat_type = div.find("td", class_="review-rating-header cabin_flown")
            seat_type = seat_type.find_next("td", class_="review-value").text if seat_type else "N/A"
            
            route = div.find("td", class_="review-rating-header route")
            route = route.find_next("td", class_="review-value").text if route else "N/A"

            date_flown = div.find("td", text="Date Flown")
            date_flown = parser.parse(date_flown.find_next("td", class_="review-value").text).strftime('%Y-%m') if date_flown else "N/A"

            recommended = div.find("td", class_="review-rating-header recommended")
            recommended = recommended.find_next("td", class_=lambda x: x and x.startswith("review-value rating-")).text if recommended else "N/A"

            category_values = {}

            for category in categories:
                category_values[category] = "N/A"

            for row in div.find_all('tr'):
                header = row.find('td', class_='review-rating-header').text
                value = len(row.find_all('span', class_='star fill')) if row.find_all('span', class_='star fill') else "N/A"
                if header in categories:
                    category_values[header] = value

            # Append the data to the DataFrame
            db_new_row = pd.DataFrame({
                'Aircraft':[aircraft],
                'Type_of_Traveller':[traveller],
                'Seat_Type':[seat_type],
                'Route':[route],
                'Region': [region],
                'Airline': [airline],
                'Rating': [rating],
                'Title': [title],
                'Date_of_Review': [date],
                'Review': [review],
                'Date Flown': [date_flown],
                'Recommended': [recommended],
                **category_values
            })

            db = pd.concat([db, db_new_row], ignore_index=True)  

    return db

### List all the airlines

In [16]:
# List of airlines with their respective information
airlines = [
    {
        "name": "British_Airways",
        "url": "https://www.airlinequality.com/airline-reviews/british-airways",
        "pagesize": 100,
        "pages": 37,
        "region": "Europe"
    },
    {        
        "name": "Lufthansa",
        "url": "https://www.airlinequality.com/airline-reviews/lufthansa",
        "pagesize": 100,
        "pages": 24,
        "region": "Europe"
    },
    {        
        "name": "Qatar_Airways",
        "url": "https://www.airlinequality.com/airline-reviews/qatar-airways",
        "pagesize": 100,
        "pages": 24,
        "region": "Middle_East"
    },
    {        
        "name": "Southwest_Airlines",
        "url": "https://www.airlinequality.com/airline-reviews/southwest-airlines",
        "pagesize": 100,
        "pages": 18,
        "region": "North_America"
    },
    {        
        "name": "Etihad_Airways",
        "url": "https://www.airlinequality.com/airline-reviews/etihad-airways",
        "pagesize": 100,
        "pages": 18,
        "region": "Middle_East"
    },
        {        
        "name": "Jetblue_Airways",
        "url": "https://www.airlinequality.com/airline-reviews/jetblue-airways",
        "pagesize": 100,
        "pages": 16,
        "region": "North_America"
    },    
    {        
        "name": "KLM_Royal_Dutch_Airlines",
        "url": "https://www.airlinequality.com/airline-reviews/klm-royal-dutch-airlines",
        "pagesize": 100,
        "pages": 16,
        "region": "Europe"
    },
    {        
        "name": "Singapore_Airlines",
        "url": "https://www.airlinequality.com/airline-reviews/singapore-airlines",
        "pagesize": 100,
        "pages": 16,
        "region": "Asia"
    },
        {        
        "name": "TAP_Portugal",
        "url": "https://www.airlinequality.com/airline-reviews/tap-portugal",
        "pagesize": 100,
        "pages": 15,
        "region": "Europe"
    }
]


### Run the function and concatenate them into a single DataFrame

In [17]:
all_reviews = pd.DataFrame()

# Loop through each airline
for airline_info in airlines:
    airline_name = airline_info["name"]
    base_url = airline_info["url"]
    pagesize = airline_info["pagesize"]
    pages = airline_info["pages"]
    region = airline_info["region"]

    # Call the function for scraping reviews
    reviews = scrape_airline_reviews(base_url, pagesize, pages, airline_name, region)

    # Concatenate the reviews to the combined DataFrame
    all_reviews = pd.concat([all_reviews, reviews], ignore_index=True)

# Save the combined DataFrame to a CSV file
all_reviews.to_csv('combined_airline_reviews.csv', index=False)

In [18]:
all_reviews

Unnamed: 0,Date_of_Review,Region,Airline,Title,Rating,Date Flown,Aircraft,Type_of_Traveller,Seat_Type,Route,Recommended,Review,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Wifi & Connectivity,Value For Money
0,2023-10-31,Europe,British_Airways,“BA clearly does not care”,1,2023-10,,Couple Leisure,Economy Class,Anchorage to Heathrow via Barcelona,no,Trip Verified | 1. Ground crew in Heathrow ...,3,3,3,3,1,3,1
1,2023-10-28,Europe,British_Airways,reflect a downward trend of the airline,6,2023-10,A320,Solo Leisure,Economy Class,London Heathrow to Gibraltar,no,Trip Verified | London Heathrow to Gibraltar...,3,1,1,,3,1,2
2,2023-10-24,Europe,British_Airways,One of my better flights,8,2023-10,,Couple Leisure,Economy Class,London to Philadelphia,yes,Trip Verified | First time flying British Air...,4,5,4,3,5,,5
3,2023-10-23,Europe,British_Airways,"I would still recommend BA""",7,2023-10,A321 neo,Business,Economy Class,London Heathrow to Cairo,yes,Not Verified | I flew London to Cairo and ret...,3,5,3,,5,,2
4,2023-10-22,Europe,British_Airways,the worst experience ever,1,2023-10,,Couple Leisure,Economy Class,Seattle to Porto via Heathrow,no,Not Verified | Absolutely the worst experienc...,2,3,3,3,1,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17992,2012-01-18,Europe,TAP_Portugal,TAP Portugal customer review,9,,,,Economy Class,,yes,Flew from MAN to LIS on 9th Jan 2012 and retur...,4,5,3,,,,4
17993,2012-01-17,Europe,TAP_Portugal,TAP Portugal customer review,6,,,,Economy Class,,no,Lisbon to Recife but returned after an hour du...,3,2,2,2,,,1
17994,2012-01-17,Europe,TAP_Portugal,TAP Portugal customer review,9,,,,Economy Class,,yes,LGW-OPO-LGW. On the flight out I was upgraded ...,4,3,5,3,,,4
17995,2012-01-10,Europe,TAP_Portugal,TAP Portugal customer review,2,,,,Economy Class,,no,My wife and I flew to Madeira via Lisbon Dec 2...,2,1,1,1,,,2
