In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 10
page_size = 366
reviews = []
for i in range(1, pages + 1):
    print(f"Scraping page {i}")
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"
    response = requests.get(url)
    content = response.content
    parsed_content = BeautifulSoup(content, 'html.parser')
    for para in parsed_content.find_all("div", class_= "text_content"):
        reviews.append(para.get_text())
    print(f"   ---> {len(reviews)} total reviews")

Scraping page 1
   ---> 366 total reviews
Scraping page 2
   ---> 732 total reviews
Scraping page 3
   ---> 1098 total reviews
Scraping page 4
   ---> 1464 total reviews
Scraping page 5
   ---> 1830 total reviews
Scraping page 6
   ---> 2196 total reviews
Scraping page 7
   ---> 2562 total reviews
Scraping page 8
   ---> 2928 total reviews
Scraping page 9
   ---> 3294 total reviews
Scraping page 10
   ---> 3651 total reviews


In [3]:
df = pd.DataFrame()
df["reviews"] = reviews
df.head()

Unnamed: 0,reviews
0,✅ Trip Verified | Check in and security cleara...
1,Not Verified | British Airways has confirmed ...
2,✅ Trip Verified | Worst BA experience. I was s...
3,✅ Trip Verified | My daughter and I were deni...
4,✅ Trip Verified | Despite boarding being the u...


In [9]:
df.to_csv("Data/British_Airway_Reviews.csv", index=False)

In [23]:
df = pd.read_csv('Data/British_Airway_Reviews.csv')
text_to_remove =  "✅ Trip Verified"
df['reviews'] = df['reviews'].str.replace(text_to_remove, '')

In [24]:
!pip install textblob



In [25]:
#Analyzing data
#Sentiment Analysis
from textblob import TextBlob
def get_overall_sentiment(reviews):
    total_polarity = 0
    total_subjectivity = 0
    for review in reviews:
        analysis = TextBlob(review)
        total_polarity += analysis.sentiment.polarity
        total_subjectivity += analysis.sentiment.subjectivity
    avg_polarity = total_polarity / len(reviews)
    avg_subjectivity = total_subjectivity / len(reviews)
    if avg_polarity > 0:
        sentiment = "Positive"
    elif avg_polarity < 0:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"
    return sentiment, avg_polarity, avg_subjectivity

overall_sentiment, avg_polarity, avg_subjectivity = get_overall_sentiment(reviews)
print("Overall Sentiment:", overall_sentiment)
print("Average Polarity:", avg_polarity)
print("Average Subjectivity:", avg_subjectivity)

Overall Sentiment: Positive
Average Polarity: 0.09269223253462028
Average Subjectivity: 0.4892789502777161


In [26]:
#TO get top 20 repeated words
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
all_text = ' '.join(reviews)
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = nltk.word_tokenize(text)
    return tokens
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
tokens = preprocess_text(all_text)
filtered_tokens = [word for word in tokens if word not in stop_words]
word_freq = Counter(filtered_tokens)
top_20_freq_rep_words = word_freq.most_common(20)
print("Top 20 Frequently repeated Words:")
for word, freq in top_20_freq_rep_words:
    print(f"{word}: {freq}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\malav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\malav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Top 20 Frequently repeated Words:
flight: 6518
ba: 4430
service: 2779
seat: 2349
food: 2322
london: 2157
british: 2125
crew: 2120
verified: 2113
airways: 2109
seats: 2077
good: 2050
time: 2010
cabin: 1947
class: 1849
trip: 1608
one: 1579
staff: 1555
business: 1546
heathrow: 1506


In [27]:
#Frequency of Mentioned Destinations:
from collections import defaultdict
destination_counts = defaultdict(int)
destinations_list = ["London", "Heathrow", "Gatwick", "LAX", "JFK", "New York", "Los Angeles", "Paris", "Dubai"]
destination_pattern = r'\b(?:' + '|'.join(destinations_list) + r')\b'
for review in df['reviews']:
    destinations = re.findall(destination_pattern, review, flags=re.IGNORECASE)
    for destination in destinations:
        destination_counts[destination] += 1
sorted_destinations = sorted(destination_counts.items(), key=lambda x: x[1], reverse=True)
top_destinations = sorted_destinations[:10]  
for destination, count in top_destinations:
    print(f"{destination}: {count} mentions")

London: 2201 mentions
Heathrow: 1518 mentions
Gatwick: 529 mentions
JFK: 150 mentions
LAX: 97 mentions
New York: 95 mentions
Paris: 79 mentions
Dubai: 67 mentions
Los Angeles: 48 mentions
heathrow: 3 mentions


In [28]:
#Customer Demographics
df = pd.read_csv('Data/British_Airway_Reviews.csv')
age_groups = {'child': 0, 'teen': 0, 'young_adult': 0, 'adult': 0, 'senior': 0}
genders = {'male': 0, 'female': 0, 'other': 0}
locations = {}
age_keywords = { 'child': r'\b(?:child|kid|son|daughter)\b',
    'teen': r'\b(?:teen|adolescent)\b',
    'young_adult': r'\b(?:young adult|youth|college student)\b',
    'adult': r'\b(?:adult|grown-up|parent)\b',
    'senior': r'\b(?:senior citizen|elderly|retired)\b'}
gender_keywords = {'male': r'\b(?:male|man|boy|father|husband)\b',
    'female': r'\b(?:female|woman|girl|mother|wife)\b',
    'other': r'\b(?:other|non-binary|genderqueer|prefer not to say)\b'}
for review in df['reviews']:
    for age_group, pattern in age_keywords.items():
        if re.search(pattern, review, re.IGNORECASE):
            age_groups[age_group] += 1
    for gender, pattern in gender_keywords.items():
        if re.search(pattern, review, re.IGNORECASE):
            genders[gender] += 1
print("Age Demographics:")
for age_group, count in age_groups.items():
    print(f"{age_group.capitalize()}: {count}")
print("\nGender Demographics:")
for gender, count in genders.items():
    print(f"{gender.capitalize()}: {count}")

Age Demographics:
Child: 106
Teen: 0
Young_adult: 1
Adult: 13
Senior: 27

Gender Demographics:
Male: 183
Female: 272
Other: 835


In [29]:
#Service Categories:
df = pd.read_csv('Data/British_Airway_Reviews.csv')
service_categories = defaultdict(int)
service_keywords = {
    'flight': r'\b(?:flight|boarding|check-in)\b',
    'customer_service': r'\b(?:customer service|staff|assistance|help)\b',
    'baggage': r'\b(?:baggage|luggage|lost luggage)\b',
    'seating': r'\b(?:seating|seat|legroom|comfort)\b',
    'food': r'\b(?:food|meal|menu)\b',
    'entertainment': r'\b(?:entertainment|IFE|in-flight entertainment)\b',
    'delay': r'\b(?:delay|cancellation|on-time)\b',
    'cleanliness': r'\b(?:cleanliness|hygiene)\b',
    'policies': r'\b(?:policy|refund|compensation)\b',
    'booking': r'\b(?:booking|reservation|ticket)\b'}
for review in df['reviews']:
    for category, pattern in service_keywords.items():
        if re.search(pattern, review, re.IGNORECASE):
            service_categories[category] += 1
print("Service Category Counts:")
for category, count in service_categories.items():
    print(f"{category.capitalize()}: {count}")

Service Category Counts:
Flight: 2935
Seating: 1536
Delay: 332
Customer_service: 1440
Booking: 419
Policies: 289
Food: 2021
Baggage: 600
Entertainment: 834
Cleanliness: 15


In [30]:
#Customer Suggestions
df = pd.read_csv('Data/British_Airway_Reviews.csv')
customer_suggestions = []
suggestion_keywords = {
    'suggest': r'\b(?:suggest|recommend|advise|propose)\b',
    'improve': r'\b(?:improve|enhance|better)\b',
    'wish': r'\b(?:wish|hope|would like)\b',
    'request': r'\b(?:request|ask for|please)\b',}
for review in df['reviews']:
    suggestions_in_review = []
    for keyword, pattern in suggestion_keywords.items():
        matches = re.findall(pattern, review, re.IGNORECASE)
        if matches:
            suggestions_in_review.extend(matches)
    if suggestions_in_review:
        customer_suggestions.append({
            'review': review,
            'suggestions': suggestions_in_review})
for suggestion_data in customer_suggestions:
    print("Review:")
    print(suggestion_data['review'])
    print("Suggestions:")
    print(suggestion_data['suggestions'])
    print("\n")

Review:
 | Despite boarding being the usual free for all at LHR with groups 1-3 being called to board all at once. Whilst those passengers who had agreed to check in a bag in at the gate, getting to board before anyone else! However, the service on board was impeccable. Superb crew, friendly, attentive, interacting with passengers and generally appearing to be enjoying their job. Like BA in the old days and at its best. Food choice and quality seems to have improved too (no beef cheeks thankfully) and even the Club kitchen appears to have returned. Old style Club seating, but is at least better, and feels less cramped, on the 787's.
Suggestions:
['better']


Review:
 |  4/4 flights we booked this holiday were delayed about 1-2 hours. No wifi available on 2 of 4 flights Food and drinks very basic and expensive. Cannot recommend
Suggestions:
['recommend']


Review:
 |  British Airways absolutely does not care. My reserved seat was change from an aisle to center seat for this overnight, f