In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import time

In [2]:
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
page_size = 10
total_pages = 500


In [4]:
# Create a list of page numbers
page_numbers = list(range(1, total_pages + 1))
# Shuffle the page order before scraping
random.shuffle(page_numbers)


In [5]:
#create an empty list to store all reviews
reviews = []

#create an empty list to store all stars
ratings = []

#create an empty list to store all date
date = []

#create an empty list to store all location
country = []

In [6]:
# Set a delay between requests to avoid overloading the website
request_delay = 1  # 1 second


In [7]:
# Loop through the pages in random order
for i in page_numbers:
    print(f"Scraping page {i}")
    
    # Create URL to collect links from paginated data
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"
    
    # Collect HTML data from this page
    response = requests.get(url)
    
    if response.status_code == 200:
        # Parse content
        content = response.content
        parsed_content = BeautifulSoup(content, 'html.parser')
        
        # Extract data and append to lists
        for para in parsed_content.find_all("div", {"class": "text_content"}):
            reviews.append(para.get_text())
        
        for item in parsed_content.find_all("div", class_="rating-10"):
            try:
                ratings.append(item.span.text)
            except:
                print(f"Error on page {i}")
                ratings.append("None")
        
        for item in parsed_content.find_all("time"):
            date.append(item.text)
        
        for item in parsed_content.find_all("h3"):
            country.append(item.span.next_sibling.text.strip(" ()"))
    else:
        print(f"Failed to retrieve page {i}. Status code: {response.status_code}")
    
    # Add a delay to avoid overloading the website
    time.sleep(request_delay + random.uniform(0, 0.5))

Scraping page 67
Scraping page 348
Scraping page 161
Scraping page 34
Scraping page 450
Scraping page 46
Scraping page 461
Scraping page 342
Scraping page 291
Scraping page 109
Scraping page 158
Scraping page 125
Scraping page 228
Scraping page 298
Scraping page 81
Scraping page 137
Scraping page 42
Scraping page 367
Scraping page 362
Scraping page 464
Scraping page 181
Scraping page 142
Scraping page 378
Scraping page 453
Scraping page 154
Scraping page 431
Scraping page 23
Scraping page 496
Scraping page 185
Scraping page 474
Scraping page 64
Scraping page 151
Scraping page 397
Scraping page 294
Scraping page 299
Scraping page 422
Scraping page 172
Scraping page 4
Scraping page 102
Scraping page 255
Scraping page 469
Scraping page 128
Scraping page 457
Scraping page 381
Scraping page 318
Scraping page 373
Scraping page 359
Scraping page 319
Scraping page 222
Scraping page 310
Scraping page 326
Scraping page 27
Scraping page 416
Scraping page 361
Scraping page 160
Scraping page 327
Er

In [8]:
len(reviews)

3690

In [9]:
len(country)

3690

In [10]:
len(date)

3690

In [11]:
len(ratings)

4190

In [12]:
# Filter out extra ratings
filtered_ratings = ratings[:len(reviews)]

In [13]:
data = {
    'review': reviews,
    'rating': filtered_ratings,
    'date': date,
    'location': country
}

In [14]:
df = pd.DataFrame(data)

In [15]:
df.shape

(3690, 4)

In [16]:
df.head()

Unnamed: 0,review,rating,date,location
0,✅ Trip Verified | London to Glasgow with Brit...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,14th December 2019,United Kingdom
1,✅ Trip Verified | London to San Francisco. Thi...,1,13th December 2019,United States
2,Not Verified | Heathrow to Belfast. Great ser...,9,11th December 2019,United Kingdom
3,Not Verified | London to Singapore. I was app...,1,10th December 2019,Ireland
4,✅ Trip Verified | Toulouse to London. I'm a ab...,9,8th December 2019,United Kingdom


In [17]:
# 10. Count the unique values in the 'Location' column
df['date'].value_counts()

19th January 2015     26
20th November 2014    18
28th October 2014     14
6th September 2014    12
12th October 2014     12
                      ..
5th May 2014           1
8th January 2017       1
9th January 2017       1
12th January 2017      1
3rd July 2023          1
Name: date, Length: 1812, dtype: int64

In [18]:
# 10. Count the unique values in the 'Location' column
location_counts = df['location'].value_counts()
location_counts

United Kingdom    2321
United States      406
Australia          162
Canada             119
Germany             64
                  ... 
Bahrain              1
Laos                 1
Indonesia            1
Barbados             1
Jordan               1
Name: location, Length: 72, dtype: int64

In [23]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

# Initialize the WordNetLemmatizer
lemma = WordNetLemmatizer()

# Define a function for text preprocessing
def preprocess_text(text):
    # Remove non-alphabet characters and convert to lowercase
    text = re.sub('[^a-zA-Z]', ' ', text).lower()
    # Tokenize the text into words
    words = text.split()
    # Lemmatize words and remove stopwords
    words = [lemma.lemmatize(word) for word in words if word not in set(stopwords.words("english"))]
    # Join the words back into a string
    return " ".join(words)

# Apply the preprocessing function to the reviews in the DataFrame
df['cleaned_reviews'] = df['review'].apply(preprocess_text)

In [25]:
import os

cwd = os.getcwd()
df.to_csv(cwd+ "/BA_review.csv")