# Data Collection
  - Web scrapping from the website [Skytrax](https://www.airlinequality.com/airline-reviews/british-airways), clean, and analysis.

In [58]:
# imports necessary modules

import os
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import datetime as dt

from wordcloud import WordCloud, STOPWORDS

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [41]:
# collect all reviews, ratings stars, date, reviewer country
reviews, stars, date, country  = [], [], [], []

In [42]:
for i in range(1, 36):
    page = requests.get(f"https://www.airlinequality.com/airline-reviews/british-airways/page/{i}/?sortby=post_date%3ADesc&pagesize=100")

    soup = BeautifulSoup(page.content, "html5")

    for item in soup.find_all("div", class_="text_content"):
        reviews.append(item.text)

    for item in soup.find_all("div", class_ = "rating-10"):
        try:
            stars.append(item.span.text)
        except:
            print(f"Error on page {i}")
            stars.append("None")

    #date
    for item in soup.find_all("time"):
        date.append(item.text)

    #country
    for item in soup.find_all("h3"):
        country.append(item.span.next_sibling.text.strip(" ()"))

Error on page 34
Error on page 35
Error on page 35


In [43]:
# total reviews extracted
len(reviews)

3500

In [44]:
# total reviewer country
len(country)

3500

In [45]:
# total star length
len(stars)

3535

In [46]:
# total date
len(date)

3500

In [47]:
# fix the length of stars
stars = stars[:3500]
len(stars)

3500

In [48]:
#create  a dataframe from these collected lists of data
df = pd.DataFrame({"reviews":reviews,"stars": stars, "date":date, "country": country})

In [49]:
# first five rows
df.head()

Unnamed: 0,reviews,stars,date,country
0,"✅ Trip Verified | The seats were excellent, ...",\n\t\t\t\t\t\t\t\t\t\t\t\t5,28th February 2025,United Kingdom
1,✅ Trip Verified | After the nightmare of get...,9,27th February 2025,United Kingdom
2,✅ Trip Verified | Prior to boarding a gate a...,5,21st February 2025,United Kingdom
3,✅ Trip Verified | I flew from Amsterdam to L...,3,18th February 2025,Netherlands
4,"✅ Trip Verified | First the good news, the clu...",1,14th February 2025,United Kingdom


In [50]:
df.shape

(3500, 4)

In [51]:
# export the data in csv format
df.to_csv("data/BA_reviews.csv")

## Data Cleaning

In [52]:
# import data
df = pd.read_csv("data/BA_reviews.csv", index_col=0)

In [53]:
df.head()

Unnamed: 0,reviews,stars,date,country
0,"✅ Trip Verified | The seats were excellent, ...",5.0,28th February 2025,United Kingdom
1,✅ Trip Verified | After the nightmare of get...,9.0,27th February 2025,United Kingdom
2,✅ Trip Verified | Prior to boarding a gate a...,5.0,21st February 2025,United Kingdom
3,✅ Trip Verified | I flew from Amsterdam to L...,3.0,18th February 2025,Netherlands
4,"✅ Trip Verified | First the good news, the clu...",1.0,14th February 2025,United Kingdom


In [54]:
# column which mentions if the user is verified or not
df['verified'] = df.reviews.str.contains("Trip Verified")

In [55]:
df['verified']

Unnamed: 0,verified
0,True
1,True
2,True
3,True
4,True
...,...
3495,False
3496,False
3497,False
3498,False


In [60]:
#extract the column of reviews into a separate dataframe and clean it for semantic analysis
#for lemmatization of words we will use nltk library
lemma = WordNetLemmatizer()
import nltk
nltk.download('stopwords')
# Download the 'wordnet' dataset
nltk.download('wordnet')


reviews_data = df.reviews.str.strip("✅ Trip Verified |")

#create an empty list to collect cleaned data corpus
corpus =[]

#loop through each review, remove punctuations, small case it, join it and add it to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [61]:
# add the corpus to the original dataframe
df['corpus'] = corpus

In [62]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,"✅ Trip Verified | The seats were excellent, ...",5.0,28th February 2025,United Kingdom,True,seat excellent feel much room official seat pi...
1,✅ Trip Verified | After the nightmare of get...,9.0,27th February 2025,United Kingdom,True,nightmare getting triple security nairobi airp...
2,✅ Trip Verified | Prior to boarding a gate a...,5.0,21st February 2025,United Kingdom,True,prior boarding gate agent seemed pick elderly ...
3,✅ Trip Verified | I flew from Amsterdam to L...,3.0,18th February 2025,Netherlands,True,flew amsterdam la vega layover heathrow novemb...
4,"✅ Trip Verified | First the good news, the clu...",1.0,14th February 2025,United Kingdom,True,first good news club suite huge improvement ol...


In [63]:
df.dtypes

Unnamed: 0,0
reviews,object
stars,float64
date,object
country,object
verified,bool
corpus,object


In [65]:
# convert the date to datetime format
df.date = pd.to_datetime(df.date, format='mixed')

In [66]:
df.date.head()

Unnamed: 0,date
0,2025-02-28
1,2025-02-27
2,2025-02-21
3,2025-02-18
4,2025-02-14


In [67]:
#check for unique ratings
df.stars.unique()

array([ 5.,  9.,  3.,  1.,  4.,  7.,  2.,  8., 10.,  6., nan])

In [69]:
# Convert the 'stars' column to string type before using .str accessor
df.stars = df.stars.astype(str).str.strip("\n\t\t\t\t\t\t\t\t\t\t\t\t\t")

In [70]:
df.stars.value_counts()

Unnamed: 0_level_0,count
stars,Unnamed: 1_level_1
1.0,893
2.0,407
3.0,402
8.0,335
10.0,280
7.0,271
9.0,265
5.0,245
4.0,234
6.0,167


In [71]:
# drop the rows where the value of ratings is None
df.drop(df[df.stars == "None"].index, axis=0, inplace=True)

In [72]:
#check the unique values again
df.stars.unique()

array(['5.0', '9.0', '3.0', '1.0', '4.0', '7.0', '2.0', '8.0', '10.0',
       '6.0', 'nan'], dtype=object)

In [73]:
# checking null value on all dataset
df.isnull().value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,count
reviews,stars,date,country,verified,corpus,Unnamed: 6_level_1
False,False,False,False,False,False,3499
False,False,False,True,False,False,1


In [74]:
df.country.isnull().value_counts()

Unnamed: 0_level_0,count
country,Unnamed: 1_level_1
False,3499
True,1


In [75]:
# drop the rows using index where the country value is null
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)

In [76]:
df.shape

(3499, 6)

In [77]:
#resetting the index
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,"✅ Trip Verified | The seats were excellent, ...",5.0,2025-02-28,United Kingdom,True,seat excellent feel much room official seat pi...
1,✅ Trip Verified | After the nightmare of get...,9.0,2025-02-27,United Kingdom,True,nightmare getting triple security nairobi airp...
2,✅ Trip Verified | Prior to boarding a gate a...,5.0,2025-02-21,United Kingdom,True,prior boarding gate agent seemed pick elderly ...
3,✅ Trip Verified | I flew from Amsterdam to L...,3.0,2025-02-18,Netherlands,True,flew amsterdam la vega layover heathrow novemb...
4,"✅ Trip Verified | First the good news, the clu...",1.0,2025-02-14,United Kingdom,True,first good news club suite huge improvement ol...
...,...,...,...,...,...,...
3494,On past experience I chose BA for our long hau...,9.0,2014-11-25,United Kingdom,False,past experience chose ba long haul return flig...
3495,BA16 Singapore to London. B777 World Traveller...,10.0,2014-11-20,Singapore,False,ba singapore london b world traveller cabin on...
3496,LHR-LCA in Club Europe. The First class lounge...,10.0,2014-11-20,United Kingdom,False,lhr lca club europe first class lounge fairly ...
3497,LHR – LAX Club World A380 return a week later ...,7.0,2014-11-20,United Kingdom,False,lhr lax club world return week later lax lhr f...


In [78]:
# export the cleaned data
df.to_csv("data/cleaned-BA-reviews.csv")